diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1775,13 +1775,15 @@ WorklistInserter AddNodes(*this); + DAG.AssignTopologicalOrder(); + // Add all the dag nodes to the worklist. // // Note: All nodes are not added to PruningList here, this is because the only // nodes which can be deleted are those which have no uses and all other nodes // which would otherwise be added to the worklist by the first call to // getNextWorklistEntry are already present in it. - for (SDNode &Node : DAG.allnodes()) + for (SDNode &Node : reverse(DAG.allnodes())) AddToWorklist(&Node, /* IsCandidateForPruning */ Node.use_empty()); // Create a dummy node (which is not added to allnodes), that adds a reference diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll @@ -925,14 +925,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -944,14 +944,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -963,14 +963,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -982,14 +982,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] 
+; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1001,14 +1001,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] @@ -1675,14 +1675,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -1694,14 +1694,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -1713,14 +1713,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -1732,14 +1732,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: 
atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1751,14 +1751,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll @@ -557,8 +557,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O0: adds x2, x0, x9 ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -572,8 +572,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O0: adds x2, x0, x9 ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -587,8 +587,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_release: ; -O0: adds x2, x0, x9 ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -602,8 +602,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O0: adds x2, x0, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -617,8 +617,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O0: adds x2, x0, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] @@ -1132,8 +1132,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O0: subs x2, x0, x9 ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -1147,8 +1147,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O0: subs x2, x0, x9 ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -1162,8 +1162,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: ; -O0: subs x2, x0, x9 ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -1177,8 +1177,8 @@ ; -O0-LABEL: 
atomicrmw_sub_i128_aligned_acq_rel: ; -O0: subs x2, x0, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1192,8 +1192,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O0: subs x2, x0, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll @@ -925,14 +925,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -944,14 +944,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -963,14 +963,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -982,14 +982,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1001,14 +1001,14 @@ define 
dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] @@ -1675,14 +1675,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -1694,14 +1694,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -1713,14 +1713,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -1732,14 +1732,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1751,14 +1751,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; 
-O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll @@ -925,14 +925,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -944,14 +944,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -963,14 +963,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -982,14 +982,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1001,14 +1001,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, 
x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] @@ -1675,14 +1675,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -1694,14 +1694,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -1713,14 +1713,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -1732,14 +1732,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1751,14 +1751,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: 
subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll @@ -925,14 +925,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -944,14 +944,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -963,14 +963,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -982,14 +982,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1001,14 +1001,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, 
x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] @@ -1675,14 +1675,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -1694,14 +1694,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -1713,14 +1713,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -1732,14 +1732,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1751,14 +1751,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; 
-O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll @@ -118,8 +118,8 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_unordered: ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_unordered: ; -O1: ldxp xzr, x8, [x2] @@ -131,8 +131,8 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_monotonic: ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_monotonic: ; -O1: ldxp xzr, x8, [x2] @@ -144,8 +144,8 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_release: ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_release: ; -O1: ldxp xzr, x8, [x2] @@ -157,8 +157,8 @@ define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_seq_cst: ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_seq_cst: ; -O1: ldaxp xzr, x8, [x2] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll @@ -117,13 +117,13 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_unordered: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_unordered: ; -O1: ldxp xzr, x8, [x2] @@ -134,13 +134,13 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_monotonic: ; -O1: ldxp xzr, x8, [x2] @@ -151,13 +151,13 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr 
%ptr) { ; -O0-LABEL: store_atomic_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_release: ; -O1: ldxp xzr, x8, [x2] @@ -168,13 +168,13 @@ define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_seq_cst: ; -O1: ldaxp xzr, x8, [x2] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll @@ -117,13 +117,13 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_unordered: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_unordered: ; -O1: ldxp xzr, x8, [x2] @@ -134,13 +134,13 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_monotonic: ; -O1: ldxp xzr, x8, [x2] @@ -151,13 +151,13 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_release: ; -O1: ldxp xzr, x8, [x2] @@ -168,13 +168,13 @@ define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs 
x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_seq_cst: ; -O1: ldaxp xzr, x8, [x2] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll @@ -305,13 +305,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic: ; -O1: ldxp x1, x8, [x0] @@ -322,13 +322,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire: ; -O1: ldaxp x1, x8, [x0] @@ -339,13 +339,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release: ; -O1: ldxp x1, x8, [x0] @@ -356,13 +356,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: ; -O1: ldaxp x1, x8, [x0] @@ -373,13 +373,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: 
subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: ; -O1: ldaxp x1, x8, [x0] @@ -945,14 +945,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -964,14 +964,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -983,14 +983,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1002,14 +1002,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1021,14 +1021,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -1710,14 +1710,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr 
%ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -1729,14 +1729,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -1748,14 +1748,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1767,14 +1767,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1786,14 +1786,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -2475,15 +2475,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp 
w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -2496,15 +2496,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -2517,15 +2517,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_release: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -2538,15 +2538,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -2559,15 +2559,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -3300,17 +3300,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; 
-O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -3325,17 +3325,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -3350,17 +3350,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_release: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -3375,17 +3375,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -3400,17 +3400,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; 
-O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -4165,15 +4165,15 @@
define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -4186,15 +4186,15 @@
define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_or_i128_aligned_acquire:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -4207,15 +4207,15 @@
define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_or_i128_aligned_release:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -4228,15 +4228,15 @@
define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -4249,15 +4249,15 @@
define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -4950,15 +4950,15 @@
define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xor_i128_aligned_monotonic:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -4971,15 +4971,15 @@
define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xor_i128_aligned_acquire:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -4992,15 +4992,15 @@
define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xor_i128_aligned_release:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -5013,15 +5013,15 @@
define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -5034,15 +5034,15 @@
define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -5795,16 +5795,16 @@
define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -5818,16 +5818,16 @@
define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_max_i128_aligned_acquire:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -5841,16 +5841,16 @@
define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_max_i128_aligned_release:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -5864,16 +5864,16 @@
define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -5887,16 +5887,16 @@
define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -6720,16 +6720,16 @@
define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, ge
-; -O0: csel x14, x11, x8, ge
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, ge
+; -O0: csel x14, x10, x8, ge
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -6743,16 +6743,16 @@
define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_min_i128_aligned_acquire:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, ge
-; -O0: csel x14, x11, x8, ge
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, ge
+; -O0: csel x14, x10, x8, ge
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_min_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -6766,16 +6766,16 @@
define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_min_i128_aligned_release:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, ge
-; -O0: csel x14, x11, x8, ge
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, ge
+; -O0: csel x14, x10, x8, ge
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_min_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -6789,16 +6789,16 @@
define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, ge
-; -O0: csel x14, x11, x8, ge
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, ge
+; -O0: csel x14, x10, x8, ge
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -6812,16 +6812,16 @@
define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, ge
-; -O0: csel x14, x11, x8, ge
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, ge
+; -O0: csel x14, x10, x8, ge
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -7645,16 +7645,16 @@
define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lo
-; -O0: csel x14, x11, x8, lo
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lo
+; -O0: csel x14, x10, x8, lo
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -7668,16 +7668,16 @@
define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lo
-; -O0: csel x14, x11, x8, lo
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lo
+; -O0: csel x14, x10, x8, lo
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -7691,16 +7691,16 @@
define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_umax_i128_aligned_release:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lo
-; -O0: csel x14, x11, x8, lo
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lo
+; -O0: csel x14, x10, x8, lo
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_umax_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -7714,16 +7714,16 @@
define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lo
-; -O0: csel x14, x11, x8, lo
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lo
+; -O0: csel x14, x10, x8, lo
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -7737,16 +7737,16 @@
define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lo
-; -O0: csel x14, x11, x8, lo
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lo
+; -O0: csel x14, x10, x8, lo
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -8570,16 +8570,16 @@
define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, hs
-; -O0: csel x14, x11, x8, hs
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, hs
+; -O0: csel x14, x10, x8, hs
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -8593,16 +8593,16 @@
define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, hs
-; -O0: csel x14, x11, x8, hs
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, hs
+; -O0: csel x14, x10, x8, hs
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -8616,16 +8616,16 @@
define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_umin_i128_aligned_release:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, hs
-; -O0: csel x14, x11, x8, hs
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, hs
+; -O0: csel x14, x10, x8, hs
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_umin_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -8639,16 +8639,16 @@
define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, hs
-; -O0: csel x14, x11, x8, hs
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, hs
+; -O0: csel x14, x10, x8, hs
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -8662,16 +8662,16 @@
define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, hs
-; -O0: csel x14, x11, x8, hs
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, hs
+; -O0: csel x14, x10, x8, hs
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll
@@ -146,8 +146,8 @@
define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
; -O1: ldxp x1, x8, [x0]
@@ -159,8 +159,8 @@
define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire:
; -O0: bl __aarch64_cas16_acq
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire:
; -O1: ldaxp x1, x8, [x0]
@@ -172,8 +172,8 @@
define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xchg_i128_aligned_release:
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_xchg_i128_aligned_release:
; -O1: ldxp x1, x8, [x0]
@@ -185,8 +185,8 @@
define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
; -O1: ldaxp x1, x8, [x0]
@@ -198,8 +198,8 @@
define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
; -O1: ldaxp x1, x8, [x0]
@@ -527,8 +527,8 @@
; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic:
; -O0: adds x3, x1, x9
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -542,8 +542,8 @@
; -O0-LABEL: atomicrmw_add_i128_aligned_acquire:
; -O0: adds x3, x1, x9
; -O0: bl __aarch64_cas16_acq
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_add_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -557,8 +557,8 @@
; -O0-LABEL: atomicrmw_add_i128_aligned_release:
; -O0: adds x3, x1, x9
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_add_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -572,8 +572,8 @@
; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel:
; -O0: adds x3, x1, x9
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -587,8 +587,8 @@
; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst:
; -O0: adds x3, x1, x9
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -1102,8 +1102,8 @@
; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic:
; -O0: subs x3, x1, x9
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -1117,8 +1117,8 @@
; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire:
; -O0: subs x3, x1, x9
; -O0: bl __aarch64_cas16_acq
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -1132,8 +1132,8 @@
; -O0-LABEL: atomicrmw_sub_i128_aligned_release:
; -O0: subs x3, x1, x9
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_sub_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -1147,8 +1147,8 @@
; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel:
; -O0: subs x3, x1, x9
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -1162,8 +1162,8 @@
; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst:
; -O0: subs x3, x1, x9
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -1718,8 +1718,8 @@
; -O0: and x2, x0, x9
; -O0: and x3, x1, x8
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -1735,8 +1735,8 @@
; -O0: and x2, x0, x9
; -O0: and x3, x1, x8
; -O0: bl __aarch64_cas16_acq
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -1752,8 +1752,8 @@
; -O0: and x2, x0, x9
; -O0: and x3, x1, x8
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_and_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -1769,8 +1769,8 @@
; -O0: and x2, x0, x9
; -O0: and x3, x1, x8
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -1786,8 +1786,8 @@
; -O0: and x2, x0, x9
; -O0: and x3, x1, x8
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -2460,8 +2460,8 @@
; -O0: mvn x2, x9
; -O0: mvn x3, x8
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -2481,8 +2481,8 @@
; -O0: mvn x2, x9
; -O0: mvn x3, x8
; -O0: bl __aarch64_cas16_acq
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -2502,8 +2502,8 @@
; -O0: mvn x2, x9
; -O0: mvn x3, x8
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -2523,8 +2523,8 @@
; -O0: mvn x2, x9
; -O0: mvn x3, x8
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -2544,8 +2544,8 @@
; -O0: mvn x2, x9
; -O0: mvn x3, x8
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -3093,8 +3093,8 @@
; -O0: orr x2, x0, x9
; -O0: orr x3, x1, x8
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -3110,8 +3110,8 @@
; -O0: orr x2, x0, x9
; -O0: orr x3, x1, x8
; -O0: bl __aarch64_cas16_acq
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -3127,8 +3127,8 @@
; -O0: orr x2, x0, x9
; -O0: orr x3, x1, x8
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -3144,8 +3144,8 @@
; -O0: orr x2, x0, x9
; -O0: orr x3, x1, x8
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -3161,8 +3161,8 @@
; -O0: orr x2, x0, x9
; -O0: orr x3, x1, x8
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -3608,8 +3608,8 @@
; -O0: eor x2, x0, x9
; -O0: eor x3, x1, x8
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -3625,8 +3625,8 @@
; -O0: eor x2, x0, x9
; -O0: eor x3, x1, x8
; -O0: bl __aarch64_cas16_acq
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -3642,8 +3642,8 @@
; -O0: eor x2, x0, x9
; -O0: eor x3, x1, x8
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -3659,8 +3659,8 @@
; -O0: eor x2, x0, x9
; -O0: eor x3, x1, x8
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -3676,8 +3676,8 @@
; -O0: eor x2, x0, x9
; -O0: eor x3, x1, x8
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -4344,8 +4344,8 @@
; -O0: csel x2, x0, x9, lt
; -O0: csel x3, x1, x8, lt
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -4363,8 +4363,8 @@
; -O0: csel x2, x0, x9, lt
; -O0: csel x3, x1, x8, lt
; -O0: bl __aarch64_cas16_acq
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -4382,8 +4382,8 @@
; -O0: csel x2, x0, x9, lt
; -O0: csel x3, x1, x8, lt
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -4401,8 +4401,8 @@
; -O0: csel x2, x0, x9, lt
; -O0: csel x3, x1, x8, lt
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -4420,8 +4420,8 @@
; -O0: csel x2, x0, x9, lt
; -O0: csel x3, x1, x8, lt
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -5199,8 +5199,8 @@
; -O0: csel x2, x0, x9, ge
; -O0: csel x3, x1, x8, ge
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -5218,8 +5218,8 @@
; -O0: csel x2, x0, x9, ge
; -O0: csel x3, x1, x8, ge
; -O0: bl __aarch64_cas16_acq
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_min_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -5237,8 +5237,8 @@
; -O0: csel x2, x0, x9, ge
; -O0: csel x3, x1, x8, ge
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_min_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -5256,8 +5256,8 @@
; -O0: csel x2, x0, x9, ge
; -O0: csel x3, x1, x8, ge
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -5275,8 +5275,8 @@
; -O0: csel x2, x0, x9, ge
; -O0: csel x3, x1, x8, ge
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -6054,8 +6054,8 @@
; -O0: csel x2, x0, x9, lo
; -O0: csel x3, x1, x8, lo
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -6073,8 +6073,8 @@
; -O0: csel x2, x0, x9, lo
; -O0: csel x3, x1, x8, lo
; -O0: bl __aarch64_cas16_acq
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -6092,8 +6092,8 @@
; -O0: csel x2, x0, x9, lo
; -O0: csel x3, x1, x8, lo
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_umax_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -6111,8 +6111,8 @@
; -O0: csel x2, x0, x9, lo
; -O0: csel x3, x1, x8, lo
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -6130,8 +6130,8 @@
; -O0: csel x2, x0, x9, lo
; -O0: csel x3, x1, x8, lo
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -6909,8 +6909,8 @@
; -O0: csel x2, x0, x9, hs
; -O0: csel x3, x1, x8, hs
; -O0: bl __aarch64_cas16_relax
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -6928,8 +6928,8 @@
; -O0: csel x2, x0, x9, hs
; -O0: csel x3, x1, x8, hs
; -O0: bl __aarch64_cas16_acq
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -6947,8 +6947,8 @@
; -O0: csel x2, x0, x9, hs
; -O0: csel x3, x1, x8, hs
; -O0: bl __aarch64_cas16_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_umin_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -6966,8 +6966,8 @@
; -O0: csel x2, x0, x9, hs
; -O0: csel x3, x1, x8, hs
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -6985,8 +6985,8 @@
; -O0: csel x2, x0, x9, hs
; -O0: csel x3, x1, x8, hs
; -O0: bl __aarch64_cas16_acq_rel
-; -O0: subs x10, x10, x11
-; -O0: ccmp x8, x9, #0, eq
+; -O0: subs x9, x0, x9
+; -O0: ccmp x1, x8, #0, eq
;
; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll
@@ -305,13 +305,13 @@
define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
; -O1: ldxp x1, x8, [x0]
@@ -322,13 +322,13 @@
define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire:
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire:
; -O1: ldaxp x1, x8, [x0]
@@ -339,13 +339,13 @@
define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xchg_i128_aligned_release:
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xchg_i128_aligned_release:
; -O1: ldxp x1, x8, [x0]
@@ -356,13 +356,13 @@
define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
; -O1: ldaxp x1, x8, [x0]
@@ -373,13 +373,13 @@
define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
; -O1: ldaxp x1, x8, [x0]
@@ -945,14 +945,14 @@
define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic:
-; -O0: adds x14, x11, x10
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: adds x14, x10, x9
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -964,14 +964,14 @@
define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_add_i128_aligned_acquire:
-; -O0: adds x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: adds x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_add_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -983,14 +983,14 @@
define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_add_i128_aligned_release:
-; -O0: adds x14, x11, x10
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: adds x14, x10, x9
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_add_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -1002,14 +1002,14 @@
define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel:
-; -O0: adds x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: adds x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -1021,14 +1021,14 @@
define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst:
-; -O0: adds x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: adds x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -1710,14 +1710,14 @@
define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic:
-; -O0: subs x14, x11, x10
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x14, x10, x9
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -1729,14 +1729,14 @@
define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire:
-; -O0: subs x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -1748,14 +1748,14 @@
define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_sub_i128_aligned_release:
-; -O0: subs x14, x11, x10
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x14, x10, x9
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_sub_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -1767,14 +1767,14 @@
define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel:
-; -O0: subs x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -1786,14 +1786,14 @@
define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst:
-; -O0: subs x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -2475,15 +2475,15 @@
define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic:
-; -O0: and x15, x13, x10
-; -O0: and x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: and x15, x11, x9
+; -O0: and x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -2496,15 +2496,15 @@
define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_and_i128_aligned_acquire:
-; -O0: and x15, x13, x10
-; -O0: and x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: and x15, x11, x9
+; -O0: and x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -2517,15 +2517,15 @@
define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_and_i128_aligned_release:
-; -O0: and x15, x13, x10
-; -O0: and x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: and x15, x11, x9
+; -O0: and x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_and_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -2538,15 +2538,15 @@
define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel:
-; -O0: and x15, x13, x10
-; -O0: and x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: and x15, x11, x9
+; -O0: and x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -2559,15 +2559,15 @@
define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst:
-; -O0: and x15, x13, x10
-; -O0: and x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: and x15, x11, x9
+; -O0: and x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -3300,17 +3300,17 @@
define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_nand_i128_aligned_monotonic:
-; -O0: and x8, x11, x8
-; -O0: and x10, x13, x10
-; -O0: mvn x15, x10
+; -O0: and x8, x10, x8
+; -O0: and x9, x11, x9
+; -O0: mvn x15, x9
; -O0: mvn x14, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -3325,17 +3325,17 @@
define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_nand_i128_aligned_acquire:
-; -O0: and x8, x11, x8
-; -O0: and x10, x13, x10
-; -O0: mvn x15, x10
+; -O0: and x8, x10, x8
+; -O0: and x9, x11, x9
+; -O0: mvn x15, x9
; -O0: mvn x14, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -3350,17 +3350,17 @@
define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_nand_i128_aligned_release:
-; -O0: and x8, x11, x8
-; -O0: and x10, x13, x10
-; -O0: mvn x15, x10
+; -O0: and x8, x10, x8
+; -O0: and x9, x11, x9
+; -O0: mvn x15, x9
; -O0: mvn x14, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -3375,17 +3375,17 @@
define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
-; -O0: and x8, x11, x8
-; -O0: and x10, x13, x10
-; -O0: mvn x15, x10
+; -O0: and x8, x10, x8
+; -O0: and x9, x11, x9
+; -O0: mvn x15, x9
; -O0: mvn x14, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -3400,17 +3400,17 @@
define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
-; -O0: and x8, x11, x8
-; -O0: and x10, x13, x10
-; -O0: mvn x15, x10
+; -O0: and x8, x10, x8
+; -O0: and x9, x11, x9
+; -O0: mvn x15, x9
; -O0: mvn x14, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -4165,15 +4165,15 @@
define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -4186,15 +4186,15 @@
define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_or_i128_aligned_acquire:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -4207,15 +4207,15 @@
define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_or_i128_aligned_release:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -4228,15 +4228,15 @@
define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -4249,15 +4249,15 @@
define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -4950,15 +4950,15 @@
define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xor_i128_aligned_monotonic:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -4971,15 +4971,15 @@
define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xor_i128_aligned_acquire:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -4992,15 +4992,15 @@
define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xor_i128_aligned_release:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -5013,15 +5013,15 @@
define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
; -O1: ldaxp x1, x0, [x8]
@@ -5034,15 +5034,15 @@
define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
; -O1: ldaxp x1, x0, [x8]
@@ -5795,16 +5795,16 @@
define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic:
; -O1: ldxp x1, x0, [x8]
@@ -5818,16 +5818,16 @@
define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_max_i128_aligned_acquire:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_acquire:
; -O1: ldaxp x1, x0, [x8]
@@ -5841,16 +5841,16 @@
define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_max_i128_aligned_release:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
;
; -O1-LABEL: atomicrmw_max_i128_aligned_release:
; -O1: ldxp x1, x0, [x8]
@@ -5864,16 +5864,16 @@
define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5887,16 +5887,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -6720,16 +6720,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -6743,16 +6743,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -6766,16 +6766,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -6789,16 +6789,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 
-; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -6812,16 +6812,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -7645,16 +7645,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -7668,16 +7668,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -7691,16 +7691,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, 
x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -7714,16 +7714,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -7737,16 +7737,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -8570,16 +8570,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -8593,16 +8593,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -8616,16 +8616,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] 
-; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -8639,16 +8639,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -8662,16 +8662,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll @@ -305,13 +305,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic: ; -O1: ldxp x1, x8, [x0] @@ -322,13 +322,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; 
-O1-LABEL: atomicrmw_xchg_i128_aligned_acquire: ; -O1: ldaxp x1, x8, [x0] @@ -339,13 +339,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release: ; -O1: ldxp x1, x8, [x0] @@ -356,13 +356,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: ; -O1: ldaxp x1, x8, [x0] @@ -373,13 +373,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: ; -O1: ldaxp x1, x8, [x0] @@ -945,14 +945,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -964,14 +964,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -983,14 +983,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: 
stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1002,14 +1002,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1021,14 +1021,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -1710,14 +1710,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -1729,14 +1729,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -1748,14 +1748,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, 
[x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1767,14 +1767,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1786,14 +1786,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -2475,15 +2475,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -2496,15 +2496,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -2517,15 +2517,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_release: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, 
x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -2538,15 +2538,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -2559,15 +2559,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -3300,17 +3300,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -3325,17 +3325,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -3350,17 +3350,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_release: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; 
-O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -3375,17 +3375,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -3400,17 +3400,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -4165,15 +4165,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -4186,15 +4186,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -4207,15 +4207,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_release: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp 
x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -4228,15 +4228,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -4249,15 +4249,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -4950,15 +4950,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -4971,15 +4971,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -4992,15 +4992,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_release: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldxp 
x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -5013,15 +5013,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5034,15 +5034,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -5795,16 +5795,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -5818,16 +5818,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -5841,16 +5841,16 @@ define dso_local 
i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -5864,16 +5864,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5887,16 +5887,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -6720,16 +6720,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -6743,16 +6743,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq 
+; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -6766,16 +6766,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -6789,16 +6789,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -6812,16 +6812,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -7645,16 +7645,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -7668,16 +7668,16 @@ define 
dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -7691,16 +7691,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -7714,16 +7714,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -7737,16 +7737,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -8570,16 +8570,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; 
-O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -8593,16 +8593,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -8616,16 +8616,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -8639,16 +8639,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -8662,16 +8662,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] diff 
--git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll
@@ -305,13 +305,13 @@
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic:
 ; -O1: ldxp x1, x8, [x0]
@@ -322,13 +322,13 @@
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire:
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire:
 ; -O1: ldaxp x1, x8, [x0]
@@ -339,13 +339,13 @@
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release:
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release:
 ; -O1: ldxp x1, x8, [x0]
@@ -356,13 +356,13 @@
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel:
 ; -O1: ldaxp x1, x8, [x0]
@@ -373,13 +373,13 @@
 
 define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst:
 ; -O1: ldaxp x1, x8, [x0]
@@ -945,14 +945,14 @@
 
 define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic:
-; -O0: adds x14, x11, x10
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: adds x14, x10, x9
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic:
 ; -O1: ldxp x1, x0, [x8]
@@ -964,14 +964,14 @@
 
 define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire:
-; -O0: adds x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: adds x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire:
 ; -O1: ldaxp x1, x0, [x8]
@@ -983,14 +983,14 @@
 
 define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_add_i128_aligned_release:
-; -O0: adds x14, x11, x10
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: adds x14, x10, x9
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_add_i128_aligned_release:
 ; -O1: ldxp x1, x0, [x8]
@@ -1002,14 +1002,14 @@
 
 define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel:
-; -O0: adds x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: adds x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel:
 ; -O1: ldaxp x1, x0, [x8]
@@ -1021,14 +1021,14 @@
 
 define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst:
-; -O0: adds x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: adds x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst:
 ; -O1: ldaxp x1, x0, [x8]
@@ -1710,14 +1710,14 @@
 
 define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic:
-; -O0: subs x14, x11, x10
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x14, x10, x9
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic:
 ; -O1: ldxp x1, x0, [x8]
@@ -1729,14 +1729,14 @@
 
 define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire:
-; -O0: subs x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire:
 ; -O1: ldaxp x1, x0, [x8]
@@ -1748,14 +1748,14 @@
 
 define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_sub_i128_aligned_release:
-; -O0: subs x14, x11, x10
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x14, x10, x9
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_aligned_release:
 ; -O1: ldxp x1, x0, [x8]
@@ -1767,14 +1767,14 @@
 
 define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel:
-; -O0: subs x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel:
 ; -O1: ldaxp x1, x0, [x8]
@@ -1786,14 +1786,14 @@
 
 define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst:
-; -O0: subs x14, x11, x10
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x14, x10, x9
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst:
 ; -O1: ldaxp x1, x0, [x8]
@@ -2475,15 +2475,15 @@
 
 define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic:
-; -O0: and x15, x13, x10
-; -O0: and x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: and x15, x11, x9
+; -O0: and x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic:
 ; -O1: ldxp x1, x0, [x8]
@@ -2496,15 +2496,15 @@
 
 define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_acquire:
-; -O0: and x15, x13, x10
-; -O0: and x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: and x15, x11, x9
+; -O0: and x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire:
 ; -O1: ldaxp x1, x0, [x8]
@@ -2517,15 +2517,15 @@
 
 define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_release:
-; -O0: and x15, x13, x10
-; -O0: and x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: and x15, x11, x9
+; -O0: and x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_release:
 ; -O1: ldxp x1, x0, [x8]
@@ -2538,15 +2538,15 @@
 
 define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel:
-; -O0: and x15, x13, x10
-; -O0: and x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: and x15, x11, x9
+; -O0: and x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel:
 ; -O1: ldaxp x1, x0, [x8]
@@ -2559,15 +2559,15 @@
 
 define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst:
-; -O0: and x15, x13, x10
-; -O0: and x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: and x15, x11, x9
+; -O0: and x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst:
 ; -O1: ldaxp x1, x0, [x8]
@@ -3300,17 +3300,17 @@
 
 define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_nand_i128_aligned_monotonic:
-; -O0: and x8, x11, x8
-; -O0: and x10, x13, x10
-; -O0: mvn x15, x10
+; -O0: and x8, x10, x8
+; -O0: and x9, x11, x9
+; -O0: mvn x15, x9
 ; -O0: mvn x14, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic:
 ; -O1: ldxp x1, x0, [x8]
@@ -3325,17 +3325,17 @@
 
 define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_nand_i128_aligned_acquire:
-; -O0: and x8, x11, x8
-; -O0: and x10, x13, x10
-; -O0: mvn x15, x10
+; -O0: and x8, x10, x8
+; -O0: and x9, x11, x9
+; -O0: mvn x15, x9
 ; -O0: mvn x14, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire:
 ; -O1: ldaxp x1, x0, [x8]
@@ -3350,17 +3350,17 @@
 
 define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_nand_i128_aligned_release:
-; -O0: and x8, x11, x8
-; -O0: and x10, x13, x10
-; -O0: mvn x15, x10
+; -O0: and x8, x10, x8
+; -O0: and x9, x11, x9
+; -O0: mvn x15, x9
 ; -O0: mvn x14, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_release:
 ; -O1: ldxp x1, x0, [x8]
@@ -3375,17 +3375,17 @@
 
 define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
-; -O0: and x8, x11, x8
-; -O0: and x10, x13, x10
-; -O0: mvn x15, x10
+; -O0: and x8, x10, x8
+; -O0: and x9, x11, x9
+; -O0: mvn x15, x9
 ; -O0: mvn x14, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel:
 ; -O1: ldaxp x1, x0, [x8]
@@ -3400,17 +3400,17 @@
 
 define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
-; -O0: and x8, x11, x8
-; -O0: and x10, x13, x10
-; -O0: mvn x15, x10
+; -O0: and x8, x10, x8
+; -O0: and x9, x11, x9
+; -O0: mvn x15, x9
 ; -O0: mvn x14, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst:
 ; -O1: ldaxp x1, x0, [x8]
@@ -4165,15 +4165,15 @@
 
 define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic:
 ; -O1: ldxp x1, x0, [x8]
@@ -4186,15 +4186,15 @@
 
 define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_acquire:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire:
 ; -O1: ldaxp x1, x0, [x8]
@@ -4207,15 +4207,15 @@
 
 define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_release:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_release:
 ; -O1: ldxp x1, x0, [x8]
@@ -4228,15 +4228,15 @@
 
 define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel:
 ; -O1: ldaxp x1, x0, [x8]
@@ -4249,15 +4249,15 @@
 
 define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst:
-; -O0: orr x15, x13, x10
-; -O0: orr x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: orr x15, x11, x9
+; -O0: orr x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst:
 ; -O1: ldaxp x1, x0, [x8]
@@ -4950,15 +4950,15 @@
 
 define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xor_i128_aligned_monotonic:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic:
 ; -O1: ldxp x1, x0, [x8]
@@ -4971,15 +4971,15 @@
 
 define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xor_i128_aligned_acquire:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire:
 ; -O1: ldaxp x1, x0, [x8]
@@ -4992,15 +4992,15 @@
 
 define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xor_i128_aligned_release:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_release:
 ; -O1: ldxp x1, x0, [x8]
@@ -5013,15 +5013,15 @@
 
 define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel:
 ; -O1: ldaxp x1, x0, [x8]
@@ -5034,15 +5034,15 @@
 
 define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
-; -O0: eor x15, x13, x10
-; -O0: eor x14, x11, x8
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: eor x15, x11, x9
+; -O0: eor x14, x10, x8
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst:
 ; -O1: ldaxp x1, x0, [x8]
@@ -5795,16 +5795,16 @@
 
 define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic:
 ; -O1: ldxp x1, x0, [x8]
@@ -5818,16 +5818,16 @@
 
 define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire:
 ; -O1: ldaxp x1, x0, [x8]
@@ -5841,16 +5841,16 @@
 
 define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_max_i128_aligned_release:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_max_i128_aligned_release:
 ; -O1: ldxp x1, x0, [x8]
@@ -5864,16 +5864,16 @@
 
 define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel:
 ; -O1: ldaxp x1, x0, [x8]
@@ -5887,16 +5887,16 @@
 
 define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lt
-; -O0: csel x14, x11, x8, lt
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lt
+; -O0: csel x14, x10, x8, lt
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst:
 ; -O1: ldaxp x1, x0, [x8]
@@ -6720,16 +6720,16 @@
 
 define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, ge
-; -O0: csel x14, x11, x8, ge
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, ge
+; -O0: csel x14, x10, x8, ge
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic:
 ; -O1: ldxp x1, x0, [x8]
@@ -6743,16 +6743,16 @@
 
 define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, ge
-; -O0: csel x14, x11, x8, ge
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, ge
+; -O0: csel x14, x10, x8, ge
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire:
 ; -O1: ldaxp x1, x0, [x8]
@@ -6766,16 +6766,16 @@
 
 define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_min_i128_aligned_release:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, ge
-; -O0: csel x14, x11, x8, ge
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, ge
+; -O0: csel x14, x10, x8, ge
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_min_i128_aligned_release:
 ; -O1: ldxp x1, x0, [x8]
@@ -6789,16 +6789,16 @@
 
 define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, ge
-; -O0: csel x14, x11, x8, ge
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, ge
+; -O0: csel x14, x10, x8, ge
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel:
 ; -O1: ldaxp x1, x0, [x8]
@@ -6812,16 +6812,16 @@
 
 define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, ge
-; -O0: csel x14, x11, x8, ge
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, ge
+; -O0: csel x14, x10, x8, ge
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst:
 ; -O1: ldaxp x1, x0, [x8]
@@ -7645,16 +7645,16 @@
 
 define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lo
-; -O0: csel x14, x11, x8, lo
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lo
+; -O0: csel x14, x10, x8, lo
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic:
 ; -O1: ldxp x1, x0, [x8]
@@ -7668,16 +7668,16 @@
 
 define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lo
-; -O0: csel x14, x11, x8, lo
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lo
+; -O0: csel x14, x10, x8, lo
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire:
 ; -O1: ldaxp x1, x0, [x8]
@@ -7691,16 +7691,16 @@
 
 define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umax_i128_aligned_release:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lo
-; -O0: csel x14, x11, x8, lo
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lo
+; -O0: csel x14, x10, x8, lo
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_aligned_release:
 ; -O1: ldxp x1, x0, [x8]
@@ -7714,16 +7714,16 @@
 
 define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lo
-; -O0: csel x14, x11, x8, lo
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lo
+; -O0: csel x14, x10, x8, lo
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel:
 ; -O1: ldaxp x1, x0, [x8]
@@ -7737,16 +7737,16 @@
 
 define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, lo
-; -O0: csel x14, x11, x8, lo
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, lo
+; -O0: csel x14, x10, x8, lo
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst:
 ; -O1: ldaxp x1, x0, [x8]
@@ -8570,16 +8570,16 @@
 
 define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, hs
-; -O0: csel x14, x11, x8, hs
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, hs
+; -O0: csel x14, x10, x8, hs
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic:
 ; -O1: ldxp x1, x0, [x8]
@@ -8593,16 +8593,16 @@
 
 define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, hs
-; -O0: csel x14, x11, x8, hs
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stxp w8, x14, x15, [x9]
-; -O0: stxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, hs
+; -O0: csel x14, x10, x8, hs
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stxp w12, x14, x15, [x13]
+; -O0: stxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire:
 ; -O1: ldaxp x1, x0, [x8]
@@ -8616,16 +8616,16 @@
 
 define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umin_i128_aligned_release:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, hs
-; -O0: csel x14, x11, x8, hs
-; -O0: ldxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, hs
+; -O0: csel x14, x10, x8, hs
+; -O0: ldxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_aligned_release:
 ; -O1: ldxp x1, x0, [x8]
@@ -8639,16 +8639,16 @@
 
 define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, hs
-; -O0: csel x14, x11, x8, hs
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, hs
+; -O0: csel x14, x10, x8, hs
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel:
 ; -O1: ldaxp x1, x0, [x8]
@@ -8662,16 +8662,16 @@
 
 define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) {
 ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst:
-; -O0: subs x12, x8, x11
-; -O0: csel x15, x13, x10, hs
-; -O0: csel x14, x11, x8, hs
-; -O0: ldaxp x10, x12, [x9]
-; -O0: cmp x10, x11
-; -O0: cmp x12, x13
-; -O0: stlxp w8, x14, x15, [x9]
-; -O0: stlxp w8, x10, x12, [x9]
-; -O0: subs x12, x12, x13
-; -O0: ccmp x10, x11, #0, eq
+; -O0: subs x12, x8, x10
+; -O0: csel x15, x11, x9, hs
+; -O0: csel x14, x10, x8, hs
+; -O0: ldaxp x8, x9, [x13]
+; -O0: cmp x8, x10
+; -O0: cmp x9, x11
+; -O0: stlxp w12, x14, x15, [x13]
+; -O0: stlxp w12, x8, x9, [x13]
+; -O0: subs x11, x9, x11
+; -O0: ccmp x8, x10, #0, eq
 ;
 ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst:
 ; -O1: ldaxp x1, x0, [x8]
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -73,8 +73,13 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ldr d0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: uabdl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: uaddlv s0, v0.8h
+; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: abs v1.4s, v1.4s
+; CHECK-NEXT: abs v0.4s, v0.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: addv s0, v0.4s
 ; CHECK-NEXT: fmov w0, s0
 ; CHECK-NEXT: ret
 entry:
@@ -93,16 +98,16 @@
 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
 
 define i32 @oversized_ADDV_512(ptr %arr) {
-; SDAG-LABEL: oversized_ADDV_512:
-; SDAG: // %bb.0:
-; SDAG-NEXT: ldp q0, q1, [x0, #32]
-; SDAG-NEXT: ldp q3, q2, [x0]
-; SDAG-NEXT: add v0.4s, v3.4s, v0.4s
-; SDAG-NEXT: add v1.4s, v2.4s, v1.4s
-; SDAG-NEXT: add v0.4s, v0.4s, v1.4s
-; SDAG-NEXT: addv s0, v0.4s
-; SDAG-NEXT: fmov w0, s0
-; SDAG-NEXT: ret
+; SDAG-LABEL: oversized_ADDV_512:
+; SDAG: // %bb.0:
+; SDAG-NEXT: ldp q0, q1, [x0, #32]
+; SDAG-NEXT: ldp q3, q2, [x0]
+; SDAG-NEXT: add v0.4s, v3.4s, v0.4s
+; SDAG-NEXT: add v1.4s, v2.4s, v1.4s
+; SDAG-NEXT: add v0.4s, v0.4s, v1.4s
+; SDAG-NEXT: addv s0, v0.4s
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
 ;
 ; GISEL-LABEL: oversized_ADDV_512:
 ; GISEL: // %bb.0:
@@ -148,19 +153,19 @@
 }
 
 define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) {
-; SDAG-LABEL: addv_combine_i32:
-; SDAG: // %bb.0: // %entry
-; SDAG-NEXT: add v0.4s, v0.4s, v1.4s
-; SDAG-NEXT: addv s0, v0.4s
-; SDAG-NEXT: fmov w0, s0
-; SDAG-NEXT: ret
+; SDAG-LABEL: addv_combine_i32:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: add v0.4s, v0.4s, v1.4s
+; SDAG-NEXT: addv s0, v0.4s
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
 ;
 ; GISEL-LABEL: addv_combine_i32:
 ; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: addv s0, v0.4s
-; GISEL-NEXT: addv s1, v1.4s
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: addv s0, v0.4s
+; GISEL-NEXT: addv s1, v1.4s
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
 ; GISEL-NEXT: add w0, w8, w9
 ; GISEL-NEXT: ret
 entry:
@@ -171,19 +176,19 @@
 }
 
 define i64 @addv_combine_i64(<2 x i64> %a1, <2 x i64> %a2) {
-; SDAG-LABEL: addv_combine_i64:
-; SDAG: // %bb.0: // %entry
-; SDAG-NEXT: add v0.2d, v0.2d, v1.2d
-; SDAG-NEXT: addp d0, v0.2d
-; SDAG-NEXT: fmov x0, d0
-; SDAG-NEXT: ret
+; SDAG-LABEL: addv_combine_i64:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: add v0.2d, v0.2d, v1.2d
+; SDAG-NEXT: addp d0, v0.2d
+; SDAG-NEXT: fmov x0, d0
+; SDAG-NEXT: ret
 ;
 ; GISEL-LABEL: addv_combine_i64:
 ; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: addp d0, v0.2d
-; GISEL-NEXT: addp d1, v1.2d
-; GISEL-NEXT: fmov x8, d0
-; GISEL-NEXT: fmov x9, d1
+; GISEL-NEXT: addp d0, v0.2d
+; GISEL-NEXT: addp d1, v1.2d
+; GISEL-NEXT: fmov x8, d0
+; GISEL-NEXT: fmov x9, d1
 ; GISEL-NEXT: add x0, x8, x9
 ; GISEL-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll b/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll
@@ -51,7 +51,8 @@
 define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
 ; CHECK-LABEL: test_vbfdot_laneq_f32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3]
+; CHECK-NEXT: dup v2.2s, v2.s[3]
+; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h
 ; CHECK-NEXT: ret
 entry:
 %.cast = bitcast <8 x bfloat> %b to <4 x float>
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -194,8 +194,9 @@
 ; CHECK-NEXT: cmtst v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT: dup v1.8h, w8
 ; CHECK-NEXT: cmeq v1.8h, v1.8h, #0
-; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: uzp1 v1.16b, v1.16b, v1.16b
+; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: str q0, [x1]
 ; CHECK-NEXT: ret
 %tmp = xor <16 x i1> zeroinitializer,
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -320,8 +320,8 @@
 ; CHECK-LABEL: insert_vec_v12i16_uaddlv_from_v4i16:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: movi.2d v0, #0000000000000000
-; CHECK-NEXT: stp xzr, xzr, [x0, #16]
 ; CHECK-NEXT: stp xzr, xzr, [x0, #32]
+; CHECK-NEXT: stp xzr, xzr, [x0, #16]
 ; CHECK-NEXT: uaddlv.4h s1, v0
 ; CHECK-NEXT: mov.h v0[0], v1[0]
 ; CHECK-NEXT: ushll.4s v0, v0, #0
diff --git a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
--- a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
@@ -9,21 +9,32 @@
 ; CHECK-NEXT: add x8, sp, #40
 ; CHECK-NEXT: add x0, sp, #40
 ; CHECK-NEXT: stp x30, x18, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp x3, x4, [sp, #56]
-; CHECK-NEXT: stp x1, x2, [sp, #40]
-; CHECK-NEXT: stp x5, x6, [sp, #72]
-; CHECK-NEXT: str x7, [sp, #88]
+; CHECK-NEXT: stp x6, x7, [sp, #80]
+; CHECK-NEXT: stp x4, x5, [sp, #64]
+; CHECK-NEXT: stp x2, x3, [sp, #48]
+; CHECK-NEXT: str x1, [sp, #40]
 ; CHECK-NEXT: str x8, [sp, #8]
 ; CHECK-NEXT: bl other_func
 ; CHECK-NEXT: ldp x30, x18, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT: add sp, sp, #96
 ; CHECK-NEXT: ret
 ;
+; DARWIN-LABEL: pass_va:
 ; DARWIN: ; %bb.0: ; %entry
-; DARWIN-DAG: stp x3, x4, [sp, #56]
-; DARWIN-DAG: stp x1, x2, [sp, #40]
-; DARWIN-DAG: stp x5, x6, [sp, #72]
-; DARWIN-DAG: str x7, [sp, #88]
+; DARWIN-NEXT: str x18, [sp, #-96]! ; 8-byte Folded Spill
+; DARWIN-NEXT: add x8, sp, #8
+; DARWIN-NEXT: add x9, sp, #40
+; DARWIN-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; DARWIN-NEXT: stp x1, x2, [sp, #40]
+; DARWIN-NEXT: str x9, [x8]
+; DARWIN-NEXT: ldr x0, [sp, #8]
+; DARWIN-NEXT: stp x3, x4, [sp, #56]
+; DARWIN-NEXT: stp x5, x6, [sp, #72]
+; DARWIN-NEXT: str x7, [sp, #88]
+; DARWIN-NEXT: bl _other_func
+; DARWIN-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; DARWIN-NEXT: ldr x18, [sp], #96 ; 8-byte Folded Reload
+; DARWIN-NEXT: ret
 entry:
 %ap = alloca ptr, align 8
 call void @llvm.va_start(ptr %ap)
@@ -47,15 +58,15 @@
 ; CHECK-NEXT: ldr x18, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
 ;
-; DARWIN-LABEL: _f9:
-; DARWIN: ; %bb.0: ; %entry
-; DARWIN-NEXT: str x18, [sp, #-16]! ; 8-byte Folded Spill
-; DARWIN-NEXT: add x8, sp, #8
-; DARWIN-NEXT: add x9, sp, #24
-; DARWIN-NEXT: str x9, [x8]
-; DARWIN-NEXT: ldr x0, [sp, #8]
-; DARWIN-NEXT: ldr x18, [sp], #16 ; 8-byte Folded Reload
-; DARWIN-NEXT: ret
+; DARWIN-LABEL: f9:
+; DARWIN: ; %bb.0: ; %entry
+; DARWIN-NEXT: str x18, [sp, #-16]! ; 8-byte Folded Spill
+; DARWIN-NEXT: add x8, sp, #8
+; DARWIN-NEXT: add x9, sp, #24
+; DARWIN-NEXT: str x9, [x8]
+; DARWIN-NEXT: ldr x0, [sp, #8]
+; DARWIN-NEXT: ldr x18, [sp], #16 ; 8-byte Folded Reload
+; DARWIN-NEXT: ret
 entry:
 %ap = alloca ptr, align 8
 call void @llvm.va_start(ptr %ap)
@@ -73,15 +84,15 @@
 ; CHECK-NEXT: ldr x18, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
 ;
-; DARWIN-LABEL: _f8:
-; DARWIN: ; %bb.0: ; %entry
-; DARWIN-NEXT: str x18, [sp, #-16]! ; 8-byte Folded Spill
-; DARWIN-NEXT: add x8, sp, #8
-; DARWIN-NEXT: add x9, sp, #16
-; DARWIN-NEXT: str x9, [x8]
-; DARWIN-NEXT: ldr x0, [sp, #8]
-; DARWIN-NEXT: ldr x18, [sp], #16 ; 8-byte Folded Reload
-; DARWIN-NEXT: ret
+; DARWIN-LABEL: f8:
+; DARWIN: ; %bb.0: ; %entry
+; DARWIN-NEXT: str x18, [sp, #-16]! ; 8-byte Folded Spill
+; DARWIN-NEXT: add x8, sp, #8
+; DARWIN-NEXT: add x9, sp, #16
+; DARWIN-NEXT: str x9, [x8]
+; DARWIN-NEXT: ldr x0, [sp, #8]
+; DARWIN-NEXT: ldr x18, [sp], #16 ; 8-byte Folded Reload
+; DARWIN-NEXT: ret
 entry:
 %ap = alloca ptr, align 8
 call void @llvm.va_start(ptr %ap)
@@ -100,16 +111,16 @@
 ; CHECK-NEXT: ldr x18, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
 ;
-; DARWIN-LABEL: _f7:
-; DARWIN: ; %bb.0: ; %entry
-; DARWIN-NEXT: str x18, [sp, #-32]! ; 8-byte Folded Spill
-; DARWIN-NEXT: add x8, sp, #8
-; DARWIN-NEXT: add x9, sp, #24
-; DARWIN-NEXT: str x7, [sp, #24]
-; DARWIN-NEXT: str x9, [x8]
-; DARWIN-NEXT: ldr x0, [sp, #8]
-; DARWIN-NEXT: ldr x18, [sp], #32 ; 8-byte Folded Reload
-; DARWIN-NEXT: ret
+; DARWIN-LABEL: f7:
+; DARWIN: ; %bb.0: ; %entry
+; DARWIN-NEXT: str x18, [sp, #-32]! ; 8-byte Folded Spill
+; DARWIN-NEXT: add x8, sp, #8
+; DARWIN-NEXT: add x9, sp, #24
+; DARWIN-NEXT: str x7, [sp, #24]
+; DARWIN-NEXT: str x9, [x8]
+; DARWIN-NEXT: ldr x0, [sp, #8]
+; DARWIN-NEXT: ldr x18, [sp], #32 ; 8-byte Folded Reload
+; DARWIN-NEXT: ret
 entry:
 %ap = alloca ptr, align 8
 call void @llvm.va_start(ptr %ap)
diff --git a/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll b/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll
--- a/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll
+++ b/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -frame-pointer=all -mtriple=arm64-windows | FileCheck %s
 
 ; Test generated from C code:
@@ -15,18 +16,59 @@
 declare ptr @llvm.addressofreturnaddress()
 
 define dso_local ptr @"foo"() {
+; CHECK-LABEL: foo:
+; CHECK: .seh_proc foo
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr_x 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .seh_set_fp
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: add x0, x29, #8
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr_x 16
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
 entry:
 %0 = call ptr @llvm.addressofreturnaddress()
 ret ptr %0
 
-; CHECK-LABEL: foo
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK: mov x29, sp
-; CHECK: add x0, x29, #8
-; CHECK: ldp x29, x30, [sp], #16
 }
 
 define dso_local i32 @"bar"(ptr %x, ...) {
+; CHECK-LABEL: bar:
+; CHECK: .seh_proc bar
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: .seh_stackalloc 96
+; CHECK-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 16
+; CHECK-NEXT: add x29, sp, #16
+; CHECK-NEXT: .seh_add_fp 16
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: add x9, x29, #24
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: str x1, [x29, #24]
+; CHECK-NEXT: add x1, x29, #8
+; CHECK-NEXT: stp x6, x7, [x29, #64]
+; CHECK-NEXT: stp x9, x0, [sp]
+; CHECK-NEXT: add x0, x29, #24
+; CHECK-NEXT: stp x4, x5, [x29, #48]
+; CHECK-NEXT: stp x2, x3, [x29, #32]
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: add w0, w0, #1
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 16
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: .seh_stackalloc 96
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
 entry:
 %x.addr = alloca ptr, align 8
 %y = alloca ptr, align 8
@@ -39,12 +81,4 @@
 %add = add nsw i32 %call, 1
 ret i32 %add
 
-; CHECK-LABEL: bar
-; CHECK: sub sp, sp, #96
-; CHECK: stp x29, x30, [sp, #16]
-; CHECK: add x29, sp, #16
-; CHECK: stp x1, x2, [x29, #24]
-; CHECK: add x1, x29, #8
-; CHECK: ldp x29, x30, [sp, #16]
-; CHECK: add sp, sp, #96
 }
diff --git a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
--- a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
+++ b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
@@ -385,12 +385,12 @@
 ; CHECK-NEXT: bl return_in_block
 ; CHECK-NEXT: adrp x8, in_block_store
 ; CHECK-NEXT: add x8, x8, :lo12:in_block_store
-; CHECK-NEXT: str d0, [x8]
-; CHECK-NEXT: str d1, [x8, #8]
-; CHECK-NEXT: str d2, [x8, #16]
 ; CHECK-NEXT: str d3, [x8, #24]
-; CHECK-NEXT: str d4, [x8, #32]
 ; CHECK-NEXT: str d5, [x8, #40]
+; CHECK-NEXT: str d4, [x8, #32]
+; CHECK-NEXT: str d2, [x8, #16]
+; CHECK-NEXT: str d1, [x8, #8]
+; CHECK-NEXT: str d0, [x8]
 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
 %1 = call %T_IN_BLOCK @return_in_block()
@@ -457,17 +457,21 @@
 ; CHECK-NEXT: .cfi_offset w30, -16
 ; CHECK-NEXT: add x8, sp, #8
 ; CHECK-NEXT: bl return_in_memory
-; CHECK-NEXT: ldur q0, [sp, #24]
+; CHECK-NEXT: ldr d0, [sp, #24]
 ; CHECK-NEXT: adrp x8, in_memory_store
 ; CHECK-NEXT: add x8, x8, :lo12:in_memory_store
-; CHECK-NEXT: ldur q1, [sp, #8]
+; CHECK-NEXT: ldr d1, [sp, #48]
 ; CHECK-NEXT: ldur q2, [sp, #56]
-; CHECK-NEXT: ldur q3, [sp, #40]
-; CHECK-NEXT: ldr d4, [sp, #72]
-; CHECK-NEXT: stp q1, q0, [x8]
+; CHECK-NEXT: ldur q3, [sp, #32]
+; CHECK-NEXT: ldur q4, [sp, #8]
+; CHECK-NEXT: ldr d5, [sp, #72]
+; CHECK-NEXT: str q2, [x8, #48]
 ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT: stp q3, q2, [x8, #32]
-; CHECK-NEXT: str d4, [x8, #64]
+; CHECK-NEXT: stur q3, [x8, #24]
+; CHECK-NEXT: str q4, [x8]
+; CHECK-NEXT: str d5, [x8, #64]
+; CHECK-NEXT: str d1, [x8, #40]
+; CHECK-NEXT: str d0, [x8, #16]
 ; CHECK-NEXT: add sp, sp, #96
 ; CHECK-NEXT: ret
 %1 = call %T_IN_MEMORY @return_in_memory()
@@ -540,10 +544,10 @@
 ; CHECK-NEXT: bl return_no_block
 ; CHECK-NEXT: adrp x8, no_block_store
 ; CHECK-NEXT: add x8, x8, :lo12:no_block_store
-; CHECK-NEXT: str d0, [x8]
 ; CHECK-NEXT: str w0, [x8, #8]
-; CHECK-NEXT: str d1, [x8, #16]
 ; CHECK-NEXT: str w1, [x8, #24]
+; CHECK-NEXT: str d1, [x8, #16]
+; CHECK-NEXT: str d0, [x8]
 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
 %1 = call %T_NO_BLOCK @return_no_block()
diff --git a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
--- a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -14,15 +14,14 @@
 ; CHECK-NEXT: stp w6, w5, [sp, #36]
 ; CHECK-NEXT: str w7, [sp, #32]
 ; CHECK-NEXT: str w8, [x0]
-; CHECK-NEXT: add x8, sp, #72
-; CHECK-NEXT: add x8, x8, #8
 ; CHECK-NEXT: ldr w9, [sp, #72]
-; CHECK-NEXT: str w9, [sp, #20]
-; CHECK-NEXT: ldr w9, [x8], #8
-; CHECK-NEXT: str w9, [sp, #16]
-; CHECK-NEXT: ldr w9, [x8], #8
+; CHECK-NEXT: ldr w8, [sp, #80]
+; CHECK-NEXT: stp w8, w9, [sp, #16]
+; CHECK-NEXT: add x8, sp, #72
+; CHECK-NEXT: add x8, x8, #24
 ; CHECK-NEXT: str x8, [sp, #24]
-; CHECK-NEXT: str w9, [sp, #12]
+; CHECK-NEXT: ldr w8, [sp, #88]
+; CHECK-NEXT: str w8, [sp, #12]
 ; CHECK-NEXT: add sp, sp, #64
 ; CHECK-NEXT: ret
 %1 = alloca i32, align 4
@@ -64,37 +63,37 @@
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: sub sp, sp, #96
 ; CHECK-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: mov w8, #2
+; CHECK-NEXT: mov w9, #1 ; =0x1
+; CHECK-NEXT: mov w8, #2 ; =0x2
 ; CHECK-NEXT: stp w8, w9, [sp, #72]
-; CHECK-NEXT: mov w9, #3
-; CHECK-NEXT: mov w8, #4
+; CHECK-NEXT: mov w9, #3 ; =0x3
+; CHECK-NEXT: mov w8, #4 ; =0x4
 ; CHECK-NEXT: stp w8, w9, [sp, #64]
-; CHECK-NEXT: mov w9, #5
-; CHECK-NEXT: mov w8, #6
+; CHECK-NEXT: mov w9, #5 ; =0x5
+; CHECK-NEXT: mov w8, #6 ; =0x6
 ; CHECK-NEXT: stp w8, w9, [sp, #56]
-; CHECK-NEXT: mov w9, #7
-; CHECK-NEXT: mov w8, #8
+; CHECK-NEXT: mov w9, #7 ; =0x7
+; CHECK-NEXT: mov w8, #8 ; =0x8
 ; CHECK-NEXT: stp w8, w9, [sp, #48]
-; CHECK-NEXT: mov w8, #9
-; CHECK-NEXT: mov w9, #10
+; CHECK-NEXT: mov w8, #9 ; =0x9
+; CHECK-NEXT: mov w9, #10 ; =0xa
 ; CHECK-NEXT: stp w9, w8, [sp, #40]
-; CHECK-NEXT: mov w10, #11
-; CHECK-NEXT: mov w11, #12
+; CHECK-NEXT: mov w10, #11 ; =0xb
+; CHECK-NEXT: mov w11, #12 ; =0xc
 ; CHECK-NEXT: stp w11, w10, [sp, #32]
 ; CHECK-NEXT: stp x10, x11, [sp, #16]
 ; CHECK-NEXT: str x9, [sp, #8]
 ; CHECK-NEXT: str w8, [sp]
 ; CHECK-NEXT: add x0, sp, #76
-; CHECK-NEXT: mov w1, #2
-; CHECK-NEXT: mov w2, #3
-; CHECK-NEXT: mov w3, #4
-; CHECK-NEXT: mov w4, #5
-; CHECK-NEXT: mov w5, #6
-; CHECK-NEXT: mov w6, #7
-; CHECK-NEXT: mov w7, #8
+; CHECK-NEXT: mov w1, #2 ; =0x2
+; CHECK-NEXT: mov w2, #3 ; =0x3
+; CHECK-NEXT: mov w3, #4 ; =0x4
+; CHECK-NEXT: mov w4, #5 ; =0x5
+; CHECK-NEXT: mov w5, #6 ; =0x6
+; CHECK-NEXT: mov w6, #7 ; =0x7
+; CHECK-NEXT: mov w7, #8 ; =0x8
 ; CHECK-NEXT: bl _fn9
-; CHECK-NEXT: mov w0, #0
+; CHECK-NEXT: mov w0, #0 ; =0x0
 ; CHECK-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
 ; CHECK-NEXT: add sp, sp, #96
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
--- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -24,7 +24,7 @@
 define <8 x i16> @build_all_zero(<8 x i16> %a) #1 {
 ; CHECK-LABEL: build_all_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #44672
+; CHECK-NEXT: mov w8, #44672 // =0xae80
 ; CHECK-NEXT: fmov s1, w8
 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
@@ -56,9 +56,9 @@
 define void @widen_f16_build_vector(ptr %addr) {
 ; CHECK-LABEL: widen_f16_build_vector:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #13294
-; CHECK-NEXT: movk w8, #13294, lsl #16
-; CHECK-NEXT: str w8, [x0]
+; CHECK-NEXT: mov w8, #13294 // =0x33ee
+; CHECK-NEXT: dup v0.8h, w8
+; CHECK-NEXT: str s0, [x0]
 ; CHECK-NEXT: ret
 store <2 x half> , ptr %addr, align 2
 ret void
@@ -68,7 +68,7 @@
 define <1 x i64> @single_element_vector_i64(<1 x i64> %arg) {
 ; CHECK-LABEL: single_element_vector_i64:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: mov w8, #1 // =0x1
 ; CHECK-NEXT: fmov d1, x8
 ; CHECK-NEXT: add d0, d0, d1
 ; CHECK-NEXT: ret
@@ -94,7 +94,7 @@
 ; CHECK-LABEL: convert_single_fp_vector_constant:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: tst w0, #0x1
-; CHECK-NEXT: mov x8, #4607182418800017408
+; CHECK-NEXT: mov x8, #4607182418800017408 // =0x3ff0000000000000
 ; CHECK-NEXT: csetm x9, ne
 ; CHECK-NEXT: fmov d0, x8
 ; CHECK-NEXT: fmov d1, x9
@@ -120,7 +120,7 @@
 define <2 x double> @negzero_v2f64(<2 x double> %a) {
 ; CHECK-LABEL: negzero_v2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-9223372036854775808
+; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
@@ -141,7 +141,7 @@
 define <1 x double> @negzero_v1f64(<1 x double> %a) {
 ; CHECK-LABEL: negzero_v1f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-9223372036854775808
+; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
 ; CHECK-NEXT: fmov d1, x8
 ; CHECK-NEXT: fmul d0, d0, d1
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -14,7 +14,7 @@
 ; CHECK-NEXT: bl _foo
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT: LBB0_2: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
 ; CHECK-NEXT: ret
 entry:
 %cmp = icmp eq i32 %a, 5
@@ -42,7 +42,7 @@
 ; SDISEL-NEXT: bl _foo
 ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; SDISEL-NEXT: LBB1_2: ; %if.end
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
 ; SDISEL-NEXT: ret
 ;
 ; GISEL-LABEL: single_different:
@@ -55,7 +55,7 @@
 ; GISEL-NEXT: bl _foo
 ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; GISEL-NEXT: LBB1_2: ; %if.end
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
 ; GISEL-NEXT: ret
 entry:
 %cmp = icmp sle i32 %a, 5
@@ -88,7 +88,7 @@
 ; SDISEL-NEXT: bl _foo
 ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; SDISEL-NEXT: LBB2_3: ; %if.end
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
 ; SDISEL-NEXT: ret
 ;
 ; GISEL-LABEL: single_flagclobber:
@@ -106,7 +106,7 @@
 ; GISEL-NEXT: bl _foo
 ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; GISEL-NEXT: LBB2_3: ; %if.end
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
 ; GISEL-NEXT: ret
 entry:
 %cmp = icmp eq i32 %a, 5
@@ -144,7 +144,7 @@
 ; CHECK-NEXT: bl _foo
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT: LBB3_3: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
 ; CHECK-NEXT: ret
 entry:
 %cmp = icmp eq i32 %a, 5
@@ -178,13 +178,13 @@
 ; SDISEL-NEXT: ccmp w8, #16, #0, ge
 ; SDISEL-NEXT: b.le LBB4_2
 ; SDISEL-NEXT: ; %bb.1: ; %if.end
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
 ; SDISEL-NEXT: ret
 ; SDISEL-NEXT: LBB4_2: ; %if.then
 ; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; SDISEL-NEXT: bl _foo
 ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
 ; SDISEL-NEXT: ret
 ;
 ; GISEL-LABEL: speculate_division:
@@ -194,13 +194,13 @@
 ; GISEL-NEXT: ccmp w8, #17, #0, gt
 ; GISEL-NEXT: b.lt LBB4_2
 ; GISEL-NEXT: ; %bb.1: ; %if.end
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
 ; GISEL-NEXT: ret
 ; GISEL-NEXT: LBB4_2: ; %if.then
 ; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; GISEL-NEXT: bl _foo
 ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
 ; GISEL-NEXT: ret
 entry:
 %cmp = icmp sgt i32 %a, 0
@@ -230,13 +230,13 @@
 ; SDISEL-NEXT: fccmp s0, s1, #8, ge
 ; SDISEL-NEXT: b.ge LBB5_2
 ; SDISEL-NEXT: ; %bb.1: ; %if.end
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
 ; SDISEL-NEXT: ret
 ; SDISEL-NEXT: LBB5_2: ; %if.then
 ; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; SDISEL-NEXT: bl _foo
 ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; SDISEL-NEXT: mov w0, #7
+; SDISEL-NEXT: mov w0, #7 ; =0x7
 ; SDISEL-NEXT: ret
 ;
 ; GISEL-LABEL: single_fcmp:
@@ -248,13 +248,13 @@
 ; GISEL-NEXT: fccmp s0, s1, #8, gt
 ; GISEL-NEXT: b.ge LBB5_2
 ; GISEL-NEXT: ; %bb.1: ; %if.end
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
 ; GISEL-NEXT: ret
 ; GISEL-NEXT: LBB5_2: ; %if.then
 ; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; GISEL-NEXT: bl _foo
 ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; GISEL-NEXT: mov w0, #7
+; GISEL-NEXT: mov w0, #7 ; =0x7
 ; GISEL-NEXT: ret
 entry:
 %cmp = icmp sgt i32 %a, 0
@@ -318,7 +318,7 @@
 ; CHECK-NEXT: bl _foo
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT: LBB7_2: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
 ; CHECK-NEXT: ret
 entry:
 %cmp = icmp eq i32 %a, 0
@@ -346,13 +346,13 @@
 ; CHECK-NEXT: cmp w1, #32
 ; CHECK-NEXT: b.eq LBB8_3
 ; CHECK-NEXT: ; %bb.2: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
 ; CHECK-NEXT: ret
 ; CHECK-NEXT: LBB8_3: ; %if.then
 ; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
 ; CHECK-NEXT: bl _foo
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
 ; CHECK-NEXT: ret
 entry:
 %cmp = icmp eq i32 %a, 5
@@ -380,7 +380,7 @@
 ; CHECK-NEXT: bl _foo
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT: LBB9_2: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
 ; CHECK-NEXT: ret
 entry:
 %cmp = icmp eq i32 %a, 0
@@ -408,7 +408,7 @@
 ; CHECK-NEXT: bl _foo
 ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
 ; CHECK-NEXT: LBB10_2: ; %if.end
-; CHECK-NEXT: mov w0, #7
+; CHECK-NEXT: mov w0, #7 ; =0x7
 ; CHECK-NEXT: ret
 entry:
 %cmp = icmp eq i32 %a, 0
@@ -466,7 +466,7 @@
 ;
 ; GISEL-LABEL: select_and:
 ; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #5
+; GISEL-NEXT: mov w8, #5 ; =0x5
 ; GISEL-NEXT: cmp w8, w1
 ; GISEL-NEXT: ccmp w0, w1, #0, ne
 ; GISEL-NEXT: csel x0, x2, x3, lt
@@ -488,7 +488,7 @@
 ;
 ; GISEL-LABEL: select_or:
 ; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #5
+; GISEL-NEXT: mov w8, #5 ; =0x5
 ; GISEL-NEXT: cmp w8, w1
 ; GISEL-NEXT: ccmp w0, w1, #8, eq
 ; GISEL-NEXT: csel x0, x2, x3, lt
@@ -510,7 +510,7 @@
 ;
 ; GISEL-LABEL: select_or_float:
 ; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #5
+; GISEL-NEXT: mov w8, #5 ; =0x5
 ; GISEL-NEXT: cmp w8, w1
 ; GISEL-NEXT: ccmp w0, w1, #8, eq
 ; GISEL-NEXT: fcsel s0, s0, s1, lt
@@ -528,13 +528,13 @@
 ; SDISEL-NEXT: cmp x0, #2
 ; SDISEL-NEXT: ccmp x0, #4, #4, ne
 ; SDISEL-NEXT: ccmp x1, #0, #0, eq
-; SDISEL-NEXT: mov w8, #1
+; SDISEL-NEXT: mov w8, #1 ; =0x1
 ; SDISEL-NEXT: cinc x0, x8, eq
 ; SDISEL-NEXT: ret
 ;
 ; GISEL-LABEL: gccbug:
 ; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #2
+; GISEL-NEXT: mov w8, #2 ; =0x2
 ; GISEL-NEXT: cmp x0, #2
 ; GISEL-NEXT: ccmp x0, #4, #4, ne
 ; GISEL-NEXT: ccmp x1, #0, #0, eq
@@ -592,7 +592,7 @@
 ; SDISEL-LABEL: select_andor32:
 ; SDISEL: ; %bb.0:
 ; SDISEL-NEXT: cmp w1, w2
-; SDISEL-NEXT: mov w8, #32
+; SDISEL-NEXT: mov w8, #32 ; =0x20
 ; SDISEL-NEXT: ccmp w0, w8, #4, lt
 ; SDISEL-NEXT: ccmp w0, w1, #0, eq
 ; SDISEL-NEXT: csel w0, w0, w1, eq
@@ -600,7 +600,7 @@
 ;
 ; GISEL-LABEL: select_andor32:
 ; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #32
+; GISEL-NEXT: mov w8, #32 ; =0x20
 ; GISEL-NEXT: cmp w1, w2
 ; GISEL-NEXT: ccmp w0, w8, #4, lt
 ; GISEL-NEXT: ccmp w0, w1, #0, eq
@@ -663,8 +663,7 @@
 ; SDISEL-NEXT: cmp x0, #0
 ; SDISEL-NEXT: ccmp x0, #13, #0, ge
 ; SDISEL-NEXT: cset w8, gt
-; SDISEL-NEXT: cmp w8, #0
-; SDISEL-NEXT: csel x0, xzr, x3, ne
+; SDISEL-NEXT: csel x0, xzr, x3, gt
 ; SDISEL-NEXT: sbfx w8, w8, #0, #1
 ; SDISEL-NEXT: adrp x9, _g@PAGE
 ; SDISEL-NEXT: str w8, [x9, _g@PAGEOFF]
@@ -701,11 +700,11 @@
 ; SDISEL-NEXT: ccmp w0, #13, #0, ge
 ; SDISEL-NEXT: cset w8, gt
 ; SDISEL-NEXT: cmp w0, #22
-; SDISEL-NEXT: mov w9, #44
+; SDISEL-NEXT: mov w9, #44 ; =0x2c
 ; SDISEL-NEXT: ccmp w0, w9, #0, ge
 ; SDISEL-NEXT: csel w8, wzr, w8, le
 ; SDISEL-NEXT: cmp w0, #99
-; SDISEL-NEXT: mov w9, #77
+; SDISEL-NEXT: mov w9, #77 ; =0x4d
 ; SDISEL-NEXT: ccmp w0, w9, #4, ne
 ; SDISEL-NEXT: cset w9, eq
 ; SDISEL-NEXT: tst w8, w9
diff --git a/llvm/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
--- a/llvm/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
@@ -14,7 +14,9 @@
 define void @test(ptr nocapture %su) {
 ; CHECK-LABEL: test:
 ; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: str wzr, [x0, #96]
+; CHECK-NEXT: ldrh w8, [x0, #100]
+; CHECK-NEXT: lsl x8, x8, #32
+; CHECK-NEXT: str w8, [x0, #96]
 ;
CHECK-NEXT: ret entry: %r1 = getelementptr inbounds %"struct.SU", ptr %su, i64 1, i32 5 diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll --- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll @@ -593,7 +593,7 @@ define ptr @test_v16i8_post_reg_st1_lane(<16 x i8> %in, ptr %addr) { ; CHECK-LABEL: test_v16i8_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: mov w8, #2 ; =0x2 ; CHECK-NEXT: st1.b { v0 }[3], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <16 x i8> %in, i32 3 @@ -619,7 +619,7 @@ define ptr @test_v8i16_post_reg_st1_lane(<8 x i16> %in, ptr %addr) { ; CHECK-LABEL: test_v8i16_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: mov w8, #4 ; =0x4 ; CHECK-NEXT: st1.h { v0 }[3], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <8 x i16> %in, i32 3 @@ -644,7 +644,7 @@ define ptr @test_v4i32_post_reg_st1_lane(<4 x i32> %in, ptr %addr) { ; CHECK-LABEL: test_v4i32_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: st1.s { v0 }[3], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <4 x i32> %in, i32 3 @@ -669,7 +669,7 @@ define ptr @test_v4f32_post_reg_st1_lane(<4 x float> %in, ptr %addr) { ; CHECK-LABEL: test_v4f32_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: st1.s { v0 }[3], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <4 x float> %in, i32 3 @@ -694,7 +694,7 @@ define ptr @test_v2i64_post_reg_st1_lane(<2 x i64> %in, ptr %addr) { ; CHECK-LABEL: test_v2i64_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: mov w8, #16 ; =0x10 ; CHECK-NEXT: st1.d { v0 }[1], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <2 x i64> %in, i64 1 @@ -719,7 +719,7 @@ define ptr @test_v2f64_post_reg_st1_lane(<2 x double> %in, ptr %addr) { ; CHECK-LABEL: test_v2f64_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: mov w8, #16 ; =0x10 ; CHECK-NEXT: st1.d { v0 }[1], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <2 x double> %in, i32 1 @@ -745,7 +745,7 @@ define ptr @test_v8i8_post_reg_st1_lane(<8 x i8> %in, ptr %addr) { ; CHECK-LABEL: test_v8i8_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: mov w8, #2 ; =0x2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: st1.b { v0 }[3], [x0], x8 ; CHECK-NEXT: ret @@ -772,7 +772,7 @@ define ptr @test_v4i16_post_reg_st1_lane(<4 x i16> %in, ptr %addr) { ; CHECK-LABEL: test_v4i16_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: mov w8, #4 ; =0x4 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: st1.h { v0 }[3], [x0], x8 ; CHECK-NEXT: ret @@ -799,7 +799,7 @@ define ptr @test_v2i32_post_reg_st1_lane(<2 x i32> %in, ptr %addr) { ; CHECK-LABEL: test_v2i32_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: st1.s { v0 }[1], [x0], x8 ; CHECK-NEXT: ret @@ -826,7 +826,7 @@ define ptr @test_v2f32_post_reg_st1_lane(<2 x float> %in, ptr %addr) { ; CHECK-LABEL: test_v2f32_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: st1.s { v0 }[1], [x0], x8 ; CHECK-NEXT: ret @@ -8271,8 +8271,9 @@ define <16 x i8> 
@@ -8271,8 +8271,9 @@ define <16 x i8> @test_v16i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
; CHECK-LABEL: test_v16i8_post_imm_ld1r:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1r.16b { v0 }, [x0], #1
+; CHECK-NEXT: ldrb w8, [x0], #1
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: dup.16b v0, w8
; CHECK-NEXT: ret
 %tmp1 = load i8, ptr %bar
 %tmp2 = insertelement <16 x i8> , i8 %tmp1, i32 0
@@ -8327,8 +8328,9 @@ define <8 x i8> @test_v8i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
; CHECK-LABEL: test_v8i8_post_imm_ld1r:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1r.8b { v0 }, [x0], #1
+; CHECK-NEXT: ldrb w8, [x0], #1
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: dup.8b v0, w8
; CHECK-NEXT: ret
 %tmp1 = load i8, ptr %bar
 %tmp2 = insertelement <8 x i8> , i8 %tmp1, i32 0
@@ -8367,8 +8369,9 @@ define <8 x i16> @test_v8i16_post_imm_ld1r(ptr %bar, ptr %ptr) {
; CHECK-LABEL: test_v8i16_post_imm_ld1r:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1r.8h { v0 }, [x0], #2
+; CHECK-NEXT: ldrh w8, [x0], #2
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: dup.8h v0, w8
; CHECK-NEXT: ret
 %tmp1 = load i16, ptr %bar
 %tmp2 = insertelement <8 x i16> , i16 %tmp1, i32 0
@@ -8408,8 +8411,9 @@ define <4 x i16> @test_v4i16_post_imm_ld1r(ptr %bar, ptr %ptr) {
; CHECK-LABEL: test_v4i16_post_imm_ld1r:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2
+; CHECK-NEXT: ldrh w8, [x0], #2
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: dup.4h v0, w8
; CHECK-NEXT: ret
 %tmp1 = load i16, ptr %bar
 %tmp2 = insertelement <4 x i16> , i16 %tmp1, i32 0
@@ -8441,8 +8445,9 @@ define <4 x i32> @test_v4i32_post_imm_ld1r(ptr %bar, ptr %ptr) {
; CHECK-LABEL: test_v4i32_post_imm_ld1r:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1r.4s { v0 }, [x0], #4
+; CHECK-NEXT: ldr w8, [x0], #4
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: dup.4s v0, w8
; CHECK-NEXT: ret
 %tmp1 = load i32, ptr %bar
 %tmp2 = insertelement <4 x i32> , i32 %tmp1, i32 0
@@ -8474,8 +8479,9 @@ define <2 x i32> @test_v2i32_post_imm_ld1r(ptr %bar, ptr %ptr) {
; CHECK-LABEL: test_v2i32_post_imm_ld1r:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1r.2s { v0 }, [x0], #4
+; CHECK-NEXT: ldr w8, [x0], #4
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: dup.2s v0, w8
; CHECK-NEXT: ret
 %tmp1 = load i32, ptr %bar
 %tmp2 = insertelement <2 x i32> , i32 %tmp1, i32 0
@@ -8503,8 +8509,9 @@ define <2 x i64> @test_v2i64_post_imm_ld1r(ptr %bar, ptr %ptr) {
; CHECK-LABEL: test_v2i64_post_imm_ld1r:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1r.2d { v0 }, [x0], #8
+; CHECK-NEXT: ldr x8, [x0], #8
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: dup.2d v0, x8
; CHECK-NEXT: ret
 %tmp1 = load i64, ptr %bar
 %tmp2 = insertelement <2 x i64> , i64 %tmp1, i32 0
@@ -8532,8 +8539,9 @@ define <4 x float> @test_v4f32_post_imm_ld1r(ptr %bar, ptr %ptr) {
; CHECK-LABEL: test_v4f32_post_imm_ld1r:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1r.4s { v0 }, [x0], #4
+; CHECK-NEXT: ldr s0, [x0], #4
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: ret
 %tmp1 = load float, ptr %bar
 %tmp2 = insertelement <4 x float> , float %tmp1, i32 0
@@ -8565,8 +8573,9 @@ define <2 x float> @test_v2f32_post_imm_ld1r(ptr %bar, ptr %ptr) {
; CHECK-LABEL: test_v2f32_post_imm_ld1r:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1r.2s { v0 }, [x0], #4
+; CHECK-NEXT: ldr s0, [x0], #4
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: dup.2s v0, v0[0]
; CHECK-NEXT: ret
 %tmp1 = load float, ptr %bar
 %tmp2 = insertelement <2 x float> , float %tmp1, i32 0
@@ -8594,8 +8603,9 @@ define <2 x double> @test_v2f64_post_imm_ld1r(ptr %bar, ptr %ptr) {
; CHECK-LABEL: test_v2f64_post_imm_ld1r:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1r.2d { v0 }, [x0], #8
+; CHECK-NEXT: ldr d0, [x0], #8
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: dup.2d v0, v0[0]
; CHECK-NEXT: ret
 %tmp1 = load double, ptr %bar
 %tmp2 = insertelement <2 x double> , double %tmp1, i32 0
@@ -8623,8 +8633,9 @@ define <16 x i8> @test_v16i8_post_imm_ld1lane(ptr %bar, ptr %ptr, <16 x i8> %A) {
; CHECK-LABEL: test_v16i8_post_imm_ld1lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1.b { v0 }[1], [x0], #1
+; CHECK-NEXT: ldrb w8, [x0], #1
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: mov.b v0[1], w8
; CHECK-NEXT: ret
 %tmp1 = load i8, ptr %bar
 %tmp2 = insertelement <16 x i8> %A, i8 %tmp1, i32 1
@@ -8649,10 +8660,11 @@ define <8 x i8> @test_v8i8_post_imm_ld1lane(ptr %bar, ptr %ptr, <8 x i8> %A) {
; CHECK-LABEL: test_v8i8_post_imm_ld1lane:
; CHECK: ; %bb.0:
+; CHECK-NEXT: ldrb w8, [x0], #1
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ld1.b { v0 }[1], [x0], #1
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: mov.b v0[1], w8
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
 %tmp1 = load i8, ptr %bar
 %tmp2 = insertelement <8 x i8> %A, i8 %tmp1, i32 1
@@ -8679,8 +8691,9 @@ define <8 x i16> @test_v8i16_post_imm_ld1lane(ptr %bar, ptr %ptr, <8 x i16> %A) {
; CHECK-LABEL: test_v8i16_post_imm_ld1lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1.h { v0 }[1], [x0], #2
+; CHECK-NEXT: ldrh w8, [x0], #2
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: mov.h v0[1], w8
; CHECK-NEXT: ret
 %tmp1 = load i16, ptr %bar
 %tmp2 = insertelement <8 x i16> %A, i16 %tmp1, i32 1
@@ -8706,10 +8719,11 @@ define <4 x i16> @test_v4i16_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x i16> %A) {
; CHECK-LABEL: test_v4i16_post_imm_ld1lane:
; CHECK: ; %bb.0:
+; CHECK-NEXT: ldrh w8, [x0], #2
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ld1.h { v0 }[1], [x0], #2
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: mov.h v0[1], w8
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
 %tmp1 = load i16, ptr %bar
 %tmp2 = insertelement <4 x i16> %A, i16 %tmp1, i32 1
@@ -8737,8 +8751,9 @@ define <4 x i32> @test_v4i32_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x i32> %A) {
; CHECK-LABEL: test_v4i32_post_imm_ld1lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1.s { v0 }[1], [x0], #4
+; CHECK-NEXT: ldr w8, [x0], #4
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: mov.s v0[1], w8
; CHECK-NEXT: ret
 %tmp1 = load i32, ptr %bar
 %tmp2 = insertelement <4 x i32> %A, i32 %tmp1, i32 1
@@ -8764,10 +8779,11 @@ define <2 x i32> @test_v2i32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x i32> %A) {
; CHECK-LABEL: test_v2i32_post_imm_ld1lane:
; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr w8, [x0], #4
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ld1.s { v0 }[1], [x0], #4
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: mov.s v0[1], w8
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
 %tmp1 = load i32, ptr %bar
 %tmp2 = insertelement <2 x i32> %A, i32 %tmp1, i32 1
@@ -8795,8 +8811,9 @@ define <2 x i64> @test_v2i64_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x i64> %A) {
; CHECK-LABEL: test_v2i64_post_imm_ld1lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1.d { v0 }[1], [x0], #8
+; CHECK-NEXT: ldr x8, [x0], #8
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: mov.d v0[1], x8
; CHECK-NEXT: ret
 %tmp1 = load i64, ptr %bar
 %tmp2 = insertelement <2 x i64> %A, i64 %tmp1, i32 1
@@ -8822,8 +8839,9 @@ define <4 x float> @test_v4f32_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x float> %A) {
; CHECK-LABEL: test_v4f32_post_imm_ld1lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1.s { v0 }[1], [x0], #4
+; CHECK-NEXT: ldr s1, [x0], #4
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: mov.s v0[1], v1[0]
; CHECK-NEXT: ret
 %tmp1 = load float, ptr %bar
 %tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1
@@ -8849,10 +8867,11 @@ define <2 x float> @test_v2f32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x float> %A) {
; CHECK-LABEL: test_v2f32_post_imm_ld1lane:
; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr s1, [x0], #4
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ld1.s { v0 }[1], [x0], #4
-; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: mov.s v0[1], v1[0]
+; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
 %tmp1 = load float, ptr %bar
 %tmp2 = insertelement <2 x float> %A, float %tmp1, i32 1
@@ -8880,8 +8899,9 @@ define <2 x double> @test_v2f64_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x double> %A) {
; CHECK-LABEL: test_v2f64_post_imm_ld1lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ld1.d { v0 }[1], [x0], #8
+; CHECK-NEXT: ldr d1, [x0], #8
; CHECK-NEXT: str x0, [x1]
+; CHECK-NEXT: mov.d v0[1], v1[0]
; CHECK-NEXT: ret
 %tmp1 = load double, ptr %bar
 %tmp2 = insertelement <2 x double> %A, double %tmp1, i32 1
@@ -9143,7 +9163,7 @@
; CHECK-LABEL: load_single_extract_variable_index_v3i32_small_align:
; CHECK: ; %bb.0:
; CHECK-NEXT: mov w9, w1
-; CHECK-NEXT: mov w8, #2
+; CHECK-NEXT: mov w8, #2 ; =0x2
; CHECK-NEXT: cmp x9, #2
; CHECK-NEXT: csel x8, x9, x8, lo
; CHECK-NEXT: ldr w0, [x0, x8, lsl #2]
@@ -9157,7 +9177,7 @@
; CHECK-LABEL: load_single_extract_variable_index_v3i32_default_align:
; CHECK: ; %bb.0:
; CHECK-NEXT: mov w9, w1
-; CHECK-NEXT: mov w8, #2
+; CHECK-NEXT: mov w8, #2 ; =0x2
; CHECK-NEXT: cmp x9, #2
; CHECK-NEXT: csel x8, x9, x8, lo
; CHECK-NEXT: ldr w0, [x0, x8, lsl #2]
diff --git a/llvm/test/CodeGen/AArch64/arm64-ld-from-st.ll b/llvm/test/CodeGen/AArch64/arm64-ld-from-st.ll
--- a/llvm/test/CodeGen/AArch64/arm64-ld-from-st.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ld-from-st.ll
@@ -319,9 +319,10 @@ define i16 @Str16Ldr16(ptr nocapture %P, i16 %v, i64 %n) {
; CHECK-LABEL: Str16Ldr16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: mov w0, w1
-; CHECK-NEXT: strh w1, [x8, #2]
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: mov x9, x0
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: strh w1, [x9, #2]
; CHECK-NEXT: ret
 entry:
 %arrayidx0 = getelementptr inbounds i16, ptr %P, i64 1
@@ -334,9 +335,8 @@ define i8 @Str16Ldr8_0(ptr nocapture %P, i16 %v, i64 %n) {
; CHECK-LABEL: Str16Ldr8_0:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: mov w0, w1
-; CHECK-NEXT: strh w1, [x8, #2]
+; CHECK-NEXT: strh w1, [x0, #2]
+; CHECK-NEXT: and w0, w1, #0xff
; CHECK-NEXT: ret
 entry:
 %arrayidx0 = getelementptr inbounds i16, ptr %P, i64 1
@@ -679,9 +679,10 @@ define i16 @Unscaled_Str16Ldr16(ptr nocapture %P, i16 %v, i64 %n) {
; CHECK-LABEL: Unscaled_Str16Ldr16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: mov w0, w1
-; CHECK-NEXT: sturh w1, [x8, #-2]
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: mov x9, x0
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: sturh w1, [x9, #-2]
; CHECK-NEXT: ret
 entry:
 %arrayidx0 = getelementptr inbounds i16, ptr %P, i64 -1
@@ -694,9 +695,8 @@ define i8 @Unscaled_Str16Ldr8_0(ptr nocapture %P, i16 %v, i64 %n) {
; CHECK-LABEL: Unscaled_Str16Ldr8_0:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: mov w0, w1
-; CHECK-NEXT: sturh w1, [x8, #-2]
+; CHECK-NEXT: sturh w1, [x0, #-2]
+; CHECK-NEXT: and w0, w1, #0xff
; CHECK-NEXT: ret
 entry:
 %arrayidx0 = getelementptr inbounds i16, ptr %P, i64 -1
diff --git a/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll
--- a/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll
@@ -19,12 +19,12 @@
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, src
; CHECK-NEXT: add x8, x8, :lo12:src
-; CHECK-NEXT: ldr x9, [x8]
+; CHECK-NEXT: ldur w9, [x8, #7]
; CHECK-NEXT: adrp x10, dst
; CHECK-NEXT: add x10, x10, :lo12:dst
-; CHECK-NEXT: str x9, [x10]
-; CHECK-NEXT: ldur w8, [x8, #7]
-; CHECK-NEXT: stur w8, [x10, #7]
+; CHECK-NEXT: stur w9, [x10, #7]
+; CHECK-NEXT: ldr x8, [x8]
+; CHECK-NEXT: str x8, [x10]
; CHECK-NEXT: mov w0, #0 // =0x0
; CHECK-NEXT: ret
 entry:
@@ -37,10 +37,10 @@
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .L.str1
; CHECK-NEXT: add x8, x8, :lo12:.L.str1
-; CHECK-NEXT: ldr q0, [x8]
-; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ldur q0, [x8, #15]
; CHECK-NEXT: stur q0, [x0, #15]
+; CHECK-NEXT: ldr q0, [x8]
+; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
 entry:
 tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str1, i64 31, i1 false)
@@ -55,8 +55,8 @@
; CHECK-NEXT: str w8, [x0, #32]
; CHECK-NEXT: adrp x8, .L.str2
; CHECK-NEXT: add x8, x8, :lo12:.L.str2
-; CHECK-NEXT: ldp q0, q1, [x8]
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x8]
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
 entry:
 tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false)
@@ -68,10 +68,10 @@
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .L.str3
; CHECK-NEXT: add x8, x8, :lo12:.L.str3
+; CHECK-NEXT: ldr x9, [x8, #16]
+; CHECK-NEXT: str x9, [x0, #16]
; CHECK-NEXT: ldr q0, [x8]
; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: ldr x8, [x8, #16]
-; CHECK-NEXT: str x8, [x0, #16]
; CHECK-NEXT: ret
 entry:
 tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str3, i64 24, i1 false)
@@ -113,12 +113,12 @@
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, .L.str6
; CHECK-NEXT: add x8, x8, :lo12:.L.str6
-; CHECK-NEXT: ldr x9, [x8]
+; CHECK-NEXT: ldur x9, [x8, #6]
; CHECK-NEXT: adrp x10, spool.splbuf
; CHECK-NEXT: add x10, x10, :lo12:spool.splbuf
-; CHECK-NEXT: str x9, [x10]
-; CHECK-NEXT: ldur x8, [x8, #6]
-; CHECK-NEXT: stur x8, [x10, #6]
+; CHECK-NEXT: stur x9, [x10, #6]
+; CHECK-NEXT: ldr x8, [x8]
+; CHECK-NEXT: str x8, [x10]
; CHECK-NEXT: ret
 entry:
 call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false)
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -2541,9 +2541,9 @@
; CHECK-NEXT: adrp x9, .LCPI196_0
; CHECK-NEXT: fmov d4, x0
; CHECK-NEXT: rev32 v5.8h, v0.8h
-; CHECK-NEXT: dup v1.8h, w8
+; CHECK-NEXT: dup v2.8h, w8
; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI196_0]
-; CHECK-NEXT: sqneg v2.8h, v1.8h
+; CHECK-NEXT: sqneg v1.8h, v2.8h
; CHECK-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
; CHECK-NEXT: sqdmull v2.4s, v0.4h, v4.h[0]
; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v4.h[0]
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1147,7 +1147,15 @@
; CHECK-LABEL: testDUP.v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v0.8b, v0.b[0]
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov v1.b[1], v0.b[0]
+; CHECK-NEXT: mov v1.b[2], v0.b[0]
+; CHECK-NEXT: mov v1.b[3], v0.b[0]
+; CHECK-NEXT: mov v1.b[4], v0.b[0]
+; CHECK-NEXT: mov v1.b[5], v0.b[0]
+; CHECK-NEXT: mov v1.b[6], v0.b[0]
+; CHECK-NEXT: mov v1.b[7], v0.b[0]
+; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ret
 %b = extractelement <1 x i8> %a, i32 0
 %c = insertelement <8 x i8> undef, i8 %b, i32 0
@@ -1165,7 +1173,15 @@
; CHECK-LABEL: testDUP.v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-NEXT: mov v1.h[2], v0.h[0]
+; CHECK-NEXT: mov v1.h[3], v0.h[0]
+; CHECK-NEXT: mov v1.h[4], v0.h[0]
+; CHECK-NEXT: mov v1.h[5], v0.h[0]
+; CHECK-NEXT: mov v1.h[6], v0.h[0]
+; CHECK-NEXT: mov v1.h[7], v0.h[0]
+; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret
 %b = extractelement <1 x i16> %a, i32 0
 %c = insertelement <8 x i16> undef, i16 %b, i32 0
@@ -1183,7 +1199,11 @@
; CHECK-LABEL: testDUP.v1i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov v1.s[1], v0.s[0]
+; CHECK-NEXT: mov v1.s[2], v0.s[0]
+; CHECK-NEXT: mov v1.s[3], v0.s[0]
+; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret
 %b = extractelement <1 x i32> %a, i32 0
 %c = insertelement <4 x i32> undef, i32 %b, i32 0
@@ -1196,7 +1216,15 @@ define <8 x i8> @getl(<16 x i8> %x) #0 {
; CHECK-LABEL: getl:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov v1.b[1], v0.b[1]
+; CHECK-NEXT: mov v1.b[2], v0.b[2]
+; CHECK-NEXT: mov v1.b[3], v0.b[3]
+; CHECK-NEXT: mov v1.b[4], v0.b[4]
+; CHECK-NEXT: mov v1.b[5], v0.b[5]
+; CHECK-NEXT: mov v1.b[6], v0.b[6]
+; CHECK-NEXT: mov v1.b[7], v0.b[7]
+; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ret
 %vecext = extractelement <16 x i8> %x, i32 0
 %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0
@@ -1310,7 +1338,11 @@
; CHECK-LABEL: test_dup_v1i64_v4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-NEXT: mov v1.h[2], v0.h[0]
+; CHECK-NEXT: mov v1.h[3], v0.h[0]
+; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ret
 entry:
 %x = extractelement <1 x i64> %a, i32 0
@@ -1326,7 +1358,8 @@
; CHECK-LABEL: test_dup_v1i64_v2i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: mov v0.s[1], v0.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
 entry:
 %x = extractelement <1 x i64> %a, i32 0
@@ -1388,7 +1421,11 @@ define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) {
; CHECK-LABEL: test_dup_v2i64_v4i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: mov v1.16b, v0.16b
+; CHECK-NEXT: mov v1.h[1], v0.h[0]
+; CHECK-NEXT: mov v1.h[2], v0.h[0]
+; CHECK-NEXT: mov v1.h[3], v0.h[0]
+; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ret
 entry:
 %x = extractelement <2 x i64> %a, i32 0
@@ -1403,7 +1440,8 @@ define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) {
; CHECK-LABEL: test_dup_v2i64_v2i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: mov v0.s[1], v0.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
 entry:
 %x = extractelement <2 x i64> %a, i32 0
@@ -1472,7 +1510,8 @@
; CHECK-LABEL: test_concat_same_v1i32_v1i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: mov v0.s[1], v0.s[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
 entry:
 %0 = extractelement <2 x i32> %a, i32 0
@@ -1515,7 +1554,16 @@
; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: mov v2.b[1], v0.b[1]
+; CHECK-NEXT: mov v2.b[2], v0.b[2]
+; CHECK-NEXT: mov v2.b[3], v0.b[3]
+; CHECK-NEXT: mov v2.b[4], v0.b[4]
+; CHECK-NEXT: mov v2.b[5], v0.b[5]
+; CHECK-NEXT: mov v2.b[6], v0.b[6]
+; CHECK-NEXT: mov v2.b[7], v0.b[7]
+; CHECK-NEXT: mov v2.d[1], v1.d[0]
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <8 x i8> %x, i32 0
@@ -1542,7 +1590,14 @@
; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v0.b[8], v1.b[0]
+; CHECK-NEXT: mov v0.b[9], v1.b[1]
+; CHECK-NEXT: mov v0.b[10], v1.b[2]
+; CHECK-NEXT: mov v0.b[11], v1.b[3]
+; CHECK-NEXT: mov v0.b[12], v1.b[4]
+; CHECK-NEXT: mov v0.b[13], v1.b[5]
+; CHECK-NEXT: mov v0.b[14], v1.b[6]
+; CHECK-NEXT: mov v0.b[15], v1.b[7]
; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <16 x i8> %x, i32 0
@@ -1584,8 +1639,24 @@
; CHECK-LABEL: test_concat_v16i8_v8i8_v8i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v2.16b, v0.16b
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v2.b[1], v0.b[1]
+; CHECK-NEXT: mov v2.b[2], v0.b[2]
+; CHECK-NEXT: mov v2.b[3], v0.b[3]
+; CHECK-NEXT: mov v2.b[4], v0.b[4]
+; CHECK-NEXT: mov v2.b[5], v0.b[5]
+; CHECK-NEXT: mov v2.b[6], v0.b[6]
+; CHECK-NEXT: mov v2.b[7], v0.b[7]
+; CHECK-NEXT: mov v2.b[8], v1.b[0]
+; CHECK-NEXT: mov v2.b[9], v1.b[1]
+; CHECK-NEXT: mov v2.b[10], v1.b[2]
+; CHECK-NEXT: mov v2.b[11], v1.b[3]
+; CHECK-NEXT: mov v2.b[12], v1.b[4]
+; CHECK-NEXT: mov v2.b[13], v1.b[5]
+; CHECK-NEXT: mov v2.b[14], v1.b[6]
+; CHECK-NEXT: mov v2.b[15], v1.b[7]
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <8 x i8> %x, i32 0
@@ -1637,7 +1708,12 @@
; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v2.16b, v0.16b
+; CHECK-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-NEXT: mov v2.d[1], v1.d[0]
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <4 x i16> %x, i32 0
@@ -1656,7 +1732,10 @@
; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v0.h[4], v1.h[0]
+; CHECK-NEXT: mov v0.h[5], v1.h[1]
+; CHECK-NEXT: mov v0.h[6], v1.h[2]
+; CHECK-NEXT: mov v0.h[7], v1.h[3]
; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <8 x i16> %x, i32 0
@@ -1682,8 +1761,16 @@
; CHECK-LABEL: test_concat_v8i16_v4i16_v4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v2.16b, v0.16b
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v2.h[1], v0.h[1]
+; CHECK-NEXT: mov v2.h[2], v0.h[2]
+; CHECK-NEXT: mov v2.h[3], v0.h[3]
+; CHECK-NEXT: mov v2.h[4], v1.h[0]
+; CHECK-NEXT: mov v2.h[5], v1.h[1]
+; CHECK-NEXT: mov v2.h[6], v1.h[2]
+; CHECK-NEXT: mov v2.h[7], v1.h[3]
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <4 x i16> %x, i32 0
@@ -1719,6 +1806,7 @@
; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov v0.s[1], v0.s[1]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
 entry:
@@ -1734,7 +1822,8 @@
; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-NEXT: mov v0.s[3], v1.s[1]
; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <4 x i32> %x, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll b/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll
@@ -4,10 +4,10 @@ define void @test(ptr %p1, ptr %p2) {
; CHECK-LABEL: test:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #3 // =0x3
-; CHECK-NEXT: mov w9, #1 // =0x1
-; CHECK-NEXT: str w8, [x0]
-; CHECK-NEXT: str w9, [x1]
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: st1 { v0.s }[1], [x0]
+; CHECK-NEXT: st1 { v0.s }[2], [x1]
; CHECK-NEXT: ret
 %tmp = shufflevector <1 x i32> , <1 x i32> undef, <3 x i32>
 %tmp2 = shufflevector <3 x i32> , <3 x i32> %tmp, <3 x i32>
diff --git a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
--- a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll
@@ -128,15 +128,9 @@ define void @i56_or(ptr %a) {
; CHECK-LABEL: i56_or:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: ldr w9, [x0]
-; CHECK-NEXT: ldrh w10, [x8, #4]!
-; CHECK-NEXT: ldrb w11, [x8, #2]
-; CHECK-NEXT: orr w9, w9, #0x180
-; CHECK-NEXT: orr w10, w10, w11, lsl #16
-; CHECK-NEXT: str w9, [x0]
-; CHECK-NEXT: strb w11, [x8, #2]
-; CHECK-NEXT: strh w10, [x8]
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: orr w8, w8, #0x180
+; CHECK-NEXT: str w8, [x0]
; CHECK-NEXT: ret
 %aa = load i56, ptr %a, align 1
 %b = or i56 %aa, 384
@@ -147,16 +141,10 @@ define void @i56_and_or(ptr %a) {
; CHECK-LABEL: i56_and_or:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: ldr w9, [x0]
-; CHECK-NEXT: ldrh w10, [x8, #4]!
-; CHECK-NEXT: ldrb w11, [x8, #2]
-; CHECK-NEXT: orr w9, w9, #0x180
-; CHECK-NEXT: and w9, w9, #0xffffff80
-; CHECK-NEXT: orr w10, w10, w11, lsl #16
-; CHECK-NEXT: strb w11, [x8, #2]
-; CHECK-NEXT: str w9, [x0]
-; CHECK-NEXT: strh w10, [x8]
+; CHECK-NEXT: ldr w8, [x0]
+; CHECK-NEXT: orr w8, w8, #0x180
+; CHECK-NEXT: and w8, w8, #0xffffff80
+; CHECK-NEXT: str w8, [x0]
; CHECK-NEXT: ret
 %b = load i56, ptr %a, align 1
 %c = and i56 %b, -128
@@ -168,17 +156,18 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
; CHECK-LABEL: i56_insert_bit:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: ldr w11, [x0]
-; CHECK-NEXT: ldrh w9, [x8, #4]!
-; CHECK-NEXT: ldrb w10, [x8, #2]
-; CHECK-NEXT: orr w9, w9, w10, lsl #16
-; CHECK-NEXT: strb w10, [x8, #2]
-; CHECK-NEXT: orr x11, x11, x9, lsl #32
-; CHECK-NEXT: and x11, x11, #0xffffffffffffdfff
-; CHECK-NEXT: strh w9, [x8]
-; CHECK-NEXT: orr w11, w11, w1, lsl #13
-; CHECK-NEXT: str w11, [x0]
+; CHECK-NEXT: ldrb w8, [x0, #6]
+; CHECK-NEXT: ldrh w9, [x0, #4]
+; CHECK-NEXT: ldr w10, [x0]
+; CHECK-NEXT: orr w8, w9, w8, lsl #16
+; CHECK-NEXT: orr x8, x10, x8, lsl #32
+; CHECK-NEXT: and x8, x8, #0xffffffffffffdfff
+; CHECK-NEXT: lsr x9, x8, #48
+; CHECK-NEXT: lsr x10, x8, #32
+; CHECK-NEXT: orr w8, w8, w1, lsl #13
+; CHECK-NEXT: strb w9, [x0, #6]
+; CHECK-NEXT: strh w10, [x0, #4]
+; CHECK-NEXT: str w8, [x0]
; CHECK-NEXT: ret
 %extbit = zext i1 %bit to i56
 %b = load i56, ptr %a, align 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll
--- a/llvm/test/CodeGen/AArch64/arm64-rev.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll
@@ -60,8 +60,7 @@
; CHECK-LABEL: test_rev_w_srl16_load:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: rev w8, w8
-; CHECK-NEXT: lsr w0, w8, #16
+; CHECK-NEXT: rev16 w0, w8
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_rev_w_srl16_load:
@@ -129,8 +128,7 @@
; CHECK-LABEL: test_rev_x_srl32_load:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: rev x8, x8
-; CHECK-NEXT: lsr x0, x8, #32
+; CHECK-NEXT: rev32 x0, x8
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_rev_x_srl32_load:
diff --git a/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll b/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll
--- a/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll
@@ -34,8 +34,9 @@ define signext i16 @extendedLeftShiftcharToshortBy8(i8 signext %a) nounwind readnone ssp {
; CHECK-LABEL: extendedLeftShiftcharToshortBy8:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: add w8, w0, #1
-; CHECK-NEXT: sbfiz w0, w8, #8, #8
+; CHECK-NEXT: lsl w8, w0, #8
+; CHECK-NEXT: add w8, w8, #256
+; CHECK-NEXT: sxth w0, w8
; CHECK-NEXT: ret
 entry:
 %inc = add i8 %a, 1
@@ -328,8 +329,9 @@ define i64 @sign_extend_inreg_isdef32(i64) {
; CHECK-LABEL: sign_extend_inreg_isdef32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: sbfx x8, x0, #32, #16
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: lsr x8, x0, #16
+; CHECK-NEXT: and w8, w8, #0xffff0000
+; CHECK-NEXT: asr w0, w8, #16
; CHECK-NEXT: ret
 %2 = lshr i64 %0, 32
 %3 = shl i64 %2, 16
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -49,12 +49,12 @@
; FALLBACK-NOT: remark:{{.*}} sabdl2_8h
 define <8 x i16> @sabdl2_8h(ptr %A, ptr %B) nounwind {
-; DAG-LABEL: sabdl2_8h:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr d0, [x0, #8]
-; DAG-NEXT: ldr d1, [x1, #8]
-; DAG-NEXT: sabdl.8h v0, v0, v1
-; DAG-NEXT: ret
+; DAG-LABEL: sabdl2_8h:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d0, [x0, #8]
+; DAG-NEXT: ldr d1, [x1, #8]
+; DAG-NEXT: sabdl.8h v0, v0, v1
+; DAG-NEXT: ret
;
; GISEL-LABEL: sabdl2_8h:
; GISEL: // %bb.0:
@@ -62,7 +62,7 @@
; GISEL-NEXT: ldr q1, [x1]
; GISEL-NEXT: ext.16b v0, v0, v0, #8
; GISEL-NEXT: ext.16b v1, v1, v0, #8
-; GISEL-NEXT: sabdl.8h v0, v0, v1
+; GISEL-NEXT: sabdl.8h v0, v0, v1
; GISEL-NEXT: ret
 %load1 = load <16 x i8>, ptr %A
 %load2 = load <16 x i8>, ptr %B
@@ -75,12 +75,12 @@
; FALLBACK-NOT: remark:{{.*}} sabdl2_4s
 define <4 x i32> @sabdl2_4s(ptr %A, ptr %B) nounwind {
-; DAG-LABEL: sabdl2_4s:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr d0, [x0, #8]
-; DAG-NEXT: ldr d1, [x1, #8]
-; DAG-NEXT: sabdl.4s v0, v0, v1
-; DAG-NEXT: ret
+; DAG-LABEL: sabdl2_4s:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d0, [x0, #8]
+; DAG-NEXT: ldr d1, [x1, #8]
+; DAG-NEXT: sabdl.4s v0, v0, v1
+; DAG-NEXT: ret
;
; GISEL-LABEL: sabdl2_4s:
; GISEL: // %bb.0:
@@ -101,12 +101,12 @@
; FALLBACK-NOT: remark:{{.*}} sabdl2_2d
 define <2 x i64> @sabdl2_2d(ptr %A, ptr %B) nounwind {
-; DAG-LABEL: sabdl2_2d:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr d0, [x0, #8]
-; DAG-NEXT: ldr d1, [x1, #8]
-; DAG-NEXT: sabdl.2d v0, v0, v1
-; DAG-NEXT: ret
+; DAG-LABEL: sabdl2_2d:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d0, [x0, #8]
+; DAG-NEXT: ldr d1, [x1, #8]
+; DAG-NEXT: sabdl.2d v0, v0, v1
+; DAG-NEXT: ret
;
; GISEL-LABEL: sabdl2_2d:
; GISEL: // %bb.0:
@@ -172,12 +172,12 @@
; FALLBACK-NOT: remark:{{.*}} uabdl2_8h
 define <8 x i16> @uabdl2_8h(ptr %A, ptr %B) nounwind {
-; DAG-LABEL: uabdl2_8h:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr d0, [x0, #8]
-; DAG-NEXT: ldr d1, [x1, #8]
-; DAG-NEXT: uabdl.8h v0, v0, v1
-; DAG-NEXT: ret
+; DAG-LABEL: uabdl2_8h:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d0, [x0, #8]
+; DAG-NEXT: ldr d1, [x1, #8]
+; DAG-NEXT: uabdl.8h v0, v0, v1
+; DAG-NEXT: ret
;
; GISEL-LABEL: uabdl2_8h:
; GISEL: // %bb.0:
@@ -185,7 +185,7 @@
; GISEL-NEXT: ldr q1, [x1]
; GISEL-NEXT: ext.16b v0, v0, v0, #8
; GISEL-NEXT: ext.16b v1, v1, v0, #8
-; GISEL-NEXT: uabdl.8h v0, v0, v1
+; GISEL-NEXT: uabdl.8h v0, v0, v1
; GISEL-NEXT: ret
 %load1 = load <16 x i8>, ptr %A
 %load2 = load <16 x i8>, ptr %B
@@ -199,12 +199,12 @@
; FALLBACK-NOT: remark:{{.*}} uabdl2_4s
 define <4 x i32> @uabdl2_4s(ptr %A, ptr %B) nounwind {
-; DAG-LABEL: uabdl2_4s:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr d0, [x0, #8]
-; DAG-NEXT: ldr d1, [x1, #8]
-; DAG-NEXT: uabdl.4s v0, v0, v1
-; DAG-NEXT: ret
+; DAG-LABEL: uabdl2_4s:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d0, [x0, #8]
+; DAG-NEXT: ldr d1, [x1, #8]
+; DAG-NEXT: uabdl.4s v0, v0, v1
+; DAG-NEXT: ret
;
; GISEL-LABEL: uabdl2_4s:
; GISEL: // %bb.0:
@@ -212,7 +212,7 @@
; GISEL-NEXT: ldr q1, [x1]
; GISEL-NEXT: ext.16b v0, v0, v0, #8
; GISEL-NEXT: ext.16b v1, v1, v0, #8
-; GISEL-NEXT: uabdl.4s v0, v0, v1
+; GISEL-NEXT: uabdl.4s v0, v0, v1
; GISEL-NEXT: ret
 %load1 = load <8 x i16>, ptr %A
 %load2 = load <8 x i16>, ptr %B
@@ -225,12 +225,12 @@
; FALLBACK-NOT: remark:{{.*}} uabdl2_2d
 define <2 x i64> @uabdl2_2d(ptr %A, ptr %B) nounwind {
-; DAG-LABEL: uabdl2_2d:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr d0, [x0, #8]
-; DAG-NEXT: ldr d1, [x1, #8]
-; DAG-NEXT: uabdl.2d v0, v0, v1
-; DAG-NEXT: ret
+; DAG-LABEL: uabdl2_2d:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d0, [x0, #8]
+; DAG-NEXT: ldr d1, [x1, #8]
+; DAG-NEXT: uabdl.2d v0, v0, v1
+; DAG-NEXT: ret
;
; GISEL-LABEL: uabdl2_2d:
; GISEL: // %bb.0:
@@ -238,7 +238,7 @@
; GISEL-NEXT: ldr q1, [x1]
; GISEL-NEXT: ext.16b v0, v0, v0, #8
; GISEL-NEXT: ext.16b v1, v1, v0, #8
-; GISEL-NEXT: uabdl.2d v0, v0, v1
+; GISEL-NEXT: uabdl.2d v0, v0, v1
; GISEL-NEXT: ret
 %load1 = load <4 x i32>, ptr %A
 %load2 = load <4 x i32>, ptr %B
@@ -276,9 +276,20 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: uabd16b_rdx_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: uabdl.8h v2, v0, v1
-; CHECK-NEXT: uabal2.8h v2, v0, v1
-; CHECK-NEXT: uaddlv.8h s0, v2
+; CHECK-NEXT: usubl.8h v2, v0, v1
+; CHECK-NEXT: usubl2.8h v0, v0, v1
+; CHECK-NEXT: sshll2.4s v1, v2, #0
+; CHECK-NEXT: sshll2.4s v3, v0, #0
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: sshll.4s v2, v2, #0
+; CHECK-NEXT: abs.4s v0, v0
+; CHECK-NEXT: abs.4s v3, v3
+; CHECK-NEXT: abs.4s v1, v1
+; CHECK-NEXT: abs.4s v2, v2
+; CHECK-NEXT: add.4s v1, v1, v3
+; CHECK-NEXT: add.4s v0, v2, v0
+; CHECK-NEXT: add.4s v0, v0, v1
+; CHECK-NEXT: addv.4s s0, v0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
 %aext = zext <16 x i8> %a to <16 x i32>
@@ -294,9 +305,20 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sabd16b_rdx_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sabdl.8h v2, v0, v1
-; CHECK-NEXT: sabal2.8h v2, v0, v1
-; CHECK-NEXT: uaddlv.8h s0, v2
+; CHECK-NEXT: ssubl.8h v2, v0, v1
+; CHECK-NEXT: ssubl2.8h v0, v0, v1
+; CHECK-NEXT: sshll2.4s v1, v2, #0
+; CHECK-NEXT: sshll2.4s v3, v0, #0
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: sshll.4s v2, v2, #0
+; CHECK-NEXT: abs.4s v0, v0
+; CHECK-NEXT: abs.4s v3, v3
+; CHECK-NEXT: abs.4s v1, v1
+; CHECK-NEXT: abs.4s v2, v2
+; CHECK-NEXT: add.4s v1, v1, v3
+; CHECK-NEXT: add.4s v0, v2, v0
+; CHECK-NEXT: add.4s v0, v0, v1
+; CHECK-NEXT: addv.4s s0, v0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
 %aext = sext <16 x i8> %a to <16 x i32>
@@ -1033,13 +1055,13 @@
; FALLBACK-NOT: remark:{{.*}} sabal2_8h
 define <8 x i16> @sabal2_8h(ptr %A, ptr %B, ptr %C) nounwind {
-; DAG-LABEL: sabal2_8h:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr q0, [x2]
-; DAG-NEXT: ldr d1, [x0, #8]
-; DAG-NEXT: ldr d2, [x1, #8]
-; DAG-NEXT: sabal.8h v0, v1, v2
-; DAG-NEXT: ret
+; DAG-LABEL: sabal2_8h:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: ldr d1, [x0, #8]
+; DAG-NEXT: ldr d2, [x1, #8]
+; DAG-NEXT: sabal.8h v0, v1, v2
+; DAG-NEXT: ret
;
; GISEL-LABEL: sabal2_8h:
; GISEL: // %bb.0:
@@ -1063,13 +1085,13 @@
; FALLBACK-NOT: remark:{{.*}} sabal2_4s
 define <4 x i32> @sabal2_4s(ptr %A, ptr %B, ptr %C) nounwind {
-; DAG-LABEL: sabal2_4s:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr q0, [x2]
-; DAG-NEXT: ldr d1, [x0, #8]
-; DAG-NEXT: ldr d2, [x1, #8]
-; DAG-NEXT: sabal.4s v0, v1, v2
-; DAG-NEXT: ret
+; DAG-LABEL: sabal2_4s:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: ldr d1, [x0, #8]
+; DAG-NEXT: ldr d2, [x1, #8]
+; DAG-NEXT: sabal.4s v0, v1, v2
+; DAG-NEXT: ret
;
; GISEL-LABEL: sabal2_4s:
; GISEL: // %bb.0:
@@ -1093,13 +1115,13 @@
; FALLBACK-NOT: remark:{{.*}} sabal2_2d
 define <2 x i64> @sabal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
-; DAG-LABEL: sabal2_2d:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr q0, [x2]
-; DAG-NEXT: ldr d1, [x0, #8]
-; DAG-NEXT: ldr d2, [x1, #8]
-; DAG-NEXT: sabal.2d v0, v1, v2
-; DAG-NEXT: ret
+; DAG-LABEL: sabal2_2d:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: ldr d1, [x0, #8]
+; DAG-NEXT: ldr d2, [x1, #8]
+; DAG-NEXT: sabal.2d v0, v1, v2
+; DAG-NEXT: ret
;
; GISEL-LABEL: sabal2_2d:
; GISEL: // %bb.0:
@@ -1201,13 +1223,13 @@
; FALLBACK-NOT: remark:{{.*}} uabal2_8h
 define <8 x i16> @uabal2_8h(ptr %A, ptr %B, ptr %C) nounwind {
-; DAG-LABEL: uabal2_8h:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr q0, [x2]
-; DAG-NEXT: ldr d1, [x0, #8]
-; DAG-NEXT: ldr d2, [x1, #8]
-; DAG-NEXT: uabal.8h v0, v1, v2
-; DAG-NEXT: ret
+; DAG-LABEL: uabal2_8h:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: ldr d1, [x0, #8]
+; DAG-NEXT: ldr d2, [x1, #8]
+; DAG-NEXT: uabal.8h v0, v1, v2
+; DAG-NEXT: ret
;
; GISEL-LABEL: uabal2_8h:
; GISEL: // %bb.0:
@@ -1231,13 +1253,13 @@
; FALLBACK-NOT: remark:{{.*}} uabal2_4s
 define <4 x i32> @uabal2_4s(ptr %A, ptr %B, ptr %C) nounwind {
-; DAG-LABEL: uabal2_4s:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr q0, [x2]
-; DAG-NEXT: ldr d1, [x0, #8]
-; DAG-NEXT: ldr d2, [x1, #8]
-; DAG-NEXT: uabal.4s v0, v1, v2
-; DAG-NEXT: ret
+; DAG-LABEL: uabal2_4s:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: ldr d1, [x0, #8]
+; DAG-NEXT: ldr d2, [x1, #8]
+; DAG-NEXT: uabal.4s v0, v1, v2
+; DAG-NEXT: ret
;
; GISEL-LABEL: uabal2_4s:
; GISEL: // %bb.0:
@@ -1261,13 +1283,13 @@
; FALLBACK-NOT: remark:{{.*}} uabal2_2d
 define <2 x i64> @uabal2_2d(ptr %A, ptr %B, ptr %C) nounwind {
-; DAG-LABEL: uabal2_2d:
-; DAG: // %bb.0:
-; DAG-NEXT: ldr q0, [x2]
-; DAG-NEXT: ldr d1, [x0, #8]
-; DAG-NEXT: ldr d2, [x1, #8]
-; DAG-NEXT: uabal.2d v0, v1, v2
-; DAG-NEXT: ret
+; DAG-LABEL: uabal2_2d:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: ldr d1, [x0, #8]
+; DAG-NEXT: ldr d2, [x1, #8]
+; DAG-NEXT: uabal.2d v0, v1, v2
+; DAG-NEXT: ret
;
; GISEL-LABEL: uabal2_2d:
; GISEL: // %bb.0:
@@ -1624,12 +1646,18 @@
; FALLBACK-NOT: remark:{{.*}} uabdl_from_extract_dup
 define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: uabdl_from_extract_dup:
-; CHECK: // %bb.0:
-; CHECK-NEXT: dup.2s v1, w0
+; DAG-LABEL: uabdl_from_extract_dup:
+; DAG: // %bb.0:
+; DAG-NEXT: dup.2s v1, w0
+; DAG-NEXT: uabdl.2d v0, v0, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: uabdl_from_extract_dup:
+; GISEL: // %bb.0:
+; GISEL-NEXT: dup.2s v1, w0
; GISEL-NEXT: ext.16b v0, v0, v0, #0
-; CHECK-NEXT: uabdl.2d v0, v0, v1
-; CHECK-NEXT: ret
+; GISEL-NEXT: uabdl.2d v0, v0, v1
+; GISEL-NEXT: ret
 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@@ -1642,11 +1670,11 @@
; FALLBACK-NOT: remark:{{.*}} uabdl2_from_extract_dup
 define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
-; DAG-LABEL: uabdl2_from_extract_dup:
-; DAG: // %bb.0:
-; DAG-NEXT: dup.4s v1, w0
-; DAG-NEXT: uabdl2.2d v0, v0, v1
-; DAG-NEXT: ret
+; DAG-LABEL: uabdl2_from_extract_dup:
+; DAG: // %bb.0:
+; DAG-NEXT: dup.4s v1, w0
+; DAG-NEXT: uabdl2.2d v0, v0, v1
+; DAG-NEXT: ret
;
; GISEL-LABEL: uabdl2_from_extract_dup:
; GISEL: // %bb.0:
@@ -1666,12 +1694,18 @@
; FALLBACK-NOT: remark:{{.*}} sabdl_from_extract_dup
 define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
-; CHECK-LABEL: sabdl_from_extract_dup:
-; CHECK: // %bb.0:
-; CHECK-NEXT: dup.2s v1, w0
+; DAG-LABEL: sabdl_from_extract_dup:
+; DAG: // %bb.0:
+; DAG-NEXT: dup.2s v1, w0
+; DAG-NEXT: sabdl.2d v0, v0, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: sabdl_from_extract_dup:
+; GISEL: // %bb.0:
+; GISEL-NEXT: dup.2s v1, w0
; GISEL-NEXT: ext.16b v0, v0, v0, #0
-; CHECK-NEXT: sabdl.2d v0, v0, v1
-; CHECK-NEXT: ret
+; GISEL-NEXT: sabdl.2d v0, v0, v1
+; GISEL-NEXT: ret
 %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
@@ -1684,11 +1718,11 @@
; FALLBACK-NOT: remark:{{.*}} sabdl2_from_extract_dup
 define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
-; DAG-LABEL: sabdl2_from_extract_dup:
-; DAG: // %bb.0:
-; DAG-NEXT: dup.4s v1, w0
-; DAG-NEXT: sabdl2.2d v0, v0, v1
-; DAG-NEXT: ret
+; DAG-LABEL: sabdl2_from_extract_dup:
+; DAG: // %bb.0:
+; DAG-NEXT: dup.4s v1, w0
+; DAG-NEXT: sabdl2.2d v0, v0, v1
+; DAG-NEXT: ret
;
; GISEL-LABEL: sabdl2_from_extract_dup:
; GISEL: // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -329,7 +329,11 @@ define void @testLowerToSRHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD8b:
; CHECK: // %bb.0:
-; CHECK-NEXT: srhadd.8b v0, v0, v1
+; CHECK-NEXT: sshll.8h v0, v0, #0
+; CHECK-NEXT: sshll.8h v1, v1, #0
+; CHECK-NEXT: mvn.16b v0, v0
+; CHECK-NEXT: sub.8h v0, v1, v0
+; CHECK-NEXT: shrn.8b v0, v0, #1
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
 %sextsrc1 = sext <8 x i8> %src1 to <8 x i16>
@@ -345,7 +349,11 @@ define void @testLowerToSRHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD4h:
; CHECK: // %bb.0:
-; CHECK-NEXT: srhadd.4h v0, v0, v1
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: sshll.4s v1, v1, #0
+; CHECK-NEXT: mvn.16b v0, v0
+; CHECK-NEXT: sub.4s v0, v1, v0
+; CHECK-NEXT: shrn.4h v0, v0, #1
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
 %sextsrc1 = sext <4 x i16> %src1 to <4 x i32>
@@ -361,7 +369,11 @@ define void @testLowerToSRHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD2s:
; CHECK: // %bb.0:
-; CHECK-NEXT: srhadd.2s v0, v0, v1
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: sshll.2d v1, v1, #0
+; CHECK-NEXT: mvn.16b v0, v0
+; CHECK-NEXT: sub.2d v0, v1, v0
+; CHECK-NEXT: shrn.2s v0, v0, #1
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
 %sextsrc1 = sext <2 x i32> %src1 to <2 x i64>
@@ -377,8 +389,17 @@ define void @testLowerToSRHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD16b:
; CHECK: // %bb.0:
-; CHECK-NEXT: srhadd.16b v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: sshll.8h v2, v0, #0
+; CHECK-NEXT: sshll.8h v3, v1, #0
+; CHECK-NEXT: sshll2.8h v0, v0, #0
+; CHECK-NEXT: mvn.16b v2, v2
+; CHECK-NEXT: sshll2.8h v1, v1, #0
+; CHECK-NEXT: sub.8h v2, v3, v2
+; CHECK-NEXT: mvn.16b v0, v0
+; CHECK-NEXT: sub.8h v0, v1, v0
+; CHECK-NEXT: shrn.8b v1, v2, #1
+; CHECK-NEXT: shrn2.16b v1, v0, #1
+; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret
 %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
 %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
@@ -393,8 +414,17 @@ define void @testLowerToSRHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD8h:
; CHECK: // %bb.0:
-; CHECK-NEXT: srhadd.8h v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: sshll.4s v2, v0, #0
+; CHECK-NEXT: sshll.4s v3, v1, #0
+; CHECK-NEXT: sshll2.4s v0, v0, #0
+; CHECK-NEXT: mvn.16b v2, v2
+; CHECK-NEXT: sshll2.4s v1, v1, #0
+; CHECK-NEXT: sub.4s v2, v3, v2
+; CHECK-NEXT: mvn.16b v0, v0
+; CHECK-NEXT: sub.4s v0, v1, v0
+; CHECK-NEXT: shrn.4h v1, v2, #1
+; CHECK-NEXT: shrn2.8h v1, v0, #1
+; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret
 %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
 %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
@@ -409,8 +439,17 @@ define void @testLowerToSRHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture writeonly %dest) {
; CHECK-LABEL: testLowerToSRHADD4s:
; CHECK: // %bb.0:
-; CHECK-NEXT: srhadd.4s v0, v0, v1
-; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: sshll.2d v2, v0, #0
+; CHECK-NEXT: sshll.2d v3, v1, #0
+; CHECK-NEXT: sshll2.2d v0, v0, #0
+; CHECK-NEXT: mvn.16b v2, v2
+; CHECK-NEXT: sshll2.2d v1, v1, #0
+; CHECK-NEXT: sub.2d v2, v3, v2
+; CHECK-NEXT: mvn.16b v0, v0
+; CHECK-NEXT: sub.2d v0, v1, v0
+; CHECK-NEXT: shrn.2s v1, v2, #1
+; CHECK-NEXT: shrn2.4s v1, v0, #1
+; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret
 %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
 %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
@@ -1004,7 +1043,9 @@
; CHECK-NEXT: shl.2s v1, v1, #24
; CHECK-NEXT: sshr.2s v0, v0, #24
; CHECK-NEXT: sshr.2s v1, v1, #24
-; CHECK-NEXT: srhadd.2s v0, v0, v1
+; CHECK-NEXT: mvn.8b v0, v0
+; CHECK-NEXT: sub.2s v0, v1, v0
+; CHECK-NEXT: sshr.2s v0, v0, #1
; CHECK-NEXT: ret
 %zextsrc1 = sext <2 x i8> %src1 to <2 x i16>
 %zextsrc2 = sext <2 x i8> %src2 to <2 x i16>
@@ -1020,7 +1061,9 @@
; CHECK-NEXT: movi d2, #0x0000ff000000ff
; CHECK-NEXT: and.8b v0, v0, v2
; CHECK-NEXT: and.8b v1, v1, v2
-; CHECK-NEXT: urhadd.2s v0, v0, v1
+; CHECK-NEXT: mvn.8b v0, v0
+; CHECK-NEXT: sub.2s v0, v1, v0
+; CHECK-NEXT: ushr.2s v0, v0, #1
; CHECK-NEXT: ret
 %zextsrc1 = zext <2 x i8> %src1 to <2 x i16>
 %zextsrc2 = zext <2 x i8> %src2 to <2 x i16>
@@ -1057,7 +1100,9 @@
; CHECK-NEXT: movi d2, #0x0000ff000000ff
; CHECK-NEXT: and.8b v0, v0, v2
; CHECK-NEXT: and.8b v1, v1, v2
-; CHECK-NEXT: urhadd.2s v0, v0, v1
+; CHECK-NEXT: mvn.8b v0, v0
+; CHECK-NEXT: sub.2s v0, v1, v0
+; CHECK-NEXT: ushr.2s v0, v0, #1
; CHECK-NEXT: ret
 %zextsrc1 = zext <2 x i8> %src1 to <2 x i16>
 %zextsrc2 = zext <2 x i8> %src2 to <2 x i16>
@@ -1304,6 +1349,7 @@
; CHECK: // %bb.0:
; CHECK-NEXT: movi.8b v2, #7
; CHECK-NEXT: xtn.8b v0, v0
+; CHECK-NEXT: bic.8h v1, #255, lsl #8
; CHECK-NEXT: xtn.8b v1, v1
; CHECK-NEXT: and.8b v0, v0, v2
; CHECK-NEXT: uhadd.8b v0, v0, v1
diff --git a/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll b/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll
--- a/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll
@@ -40,10 +40,10 @@
; CHECK-NEXT: stp x28, x27, [sp, #384] ; 16-byte Folded Spill
; CHECK-NEXT: .cfi_offset w27, -8
; CHECK-NEXT: .cfi_offset w28, -16
-; CHECK-NEXT: ldr q0, [x0, #272]
; CHECK-NEXT: ldr x8, [x0, #288]
-; CHECK-NEXT: stur q0, [sp, #216]
+; CHECK-NEXT: ldr q0, [x0, #272]
; CHECK-NEXT: str x8, [sp, #232]
+; CHECK-NEXT: stur q0, [sp, #216]
; CHECK-NEXT: ldp x28, x27, [sp, #384] ; 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #400
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -406,7 +406,7 @@ define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
; CHECK-LABEL: smlal2d_chain_with_constant:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #257
+; CHECK-NEXT: mov w8, #257 // =0x101
; CHECK-NEXT: dup.2d v3, x8
; CHECK-NEXT: smlal.2d v3, v0, v2
; CHECK-NEXT: mvn.8b v0, v2
@@ -475,7 +475,7 @@ define void @smlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
; CHECK-LABEL: smlsl2d_chain_with_constant:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #257
+; CHECK-NEXT: mov w8, #257 // =0x101
; CHECK-NEXT: dup.2d v3, x8
; CHECK-NEXT: smlsl.2d v3, v0, v2
; CHECK-NEXT: mvn.8b v0, v2
@@ -685,7 +685,7 @@ define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
; CHECK-LABEL: umlal2d_chain_with_constant:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #257
+; CHECK-NEXT: mov w8, #257 // =0x101
; CHECK-NEXT: dup.2d v3, x8
; CHECK-NEXT: umlal.2d v3, v0, v2
; CHECK-NEXT: mvn.8b v0, v2
@@ -754,7 +754,7 @@ define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
; CHECK-LABEL: umlsl2d_chain_with_constant:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #257
+; CHECK-NEXT: mov w8, #257 // =0x101
; CHECK-NEXT: dup.2d v3, x8
; CHECK-NEXT: umlsl.2d v3, v0, v2
; CHECK-NEXT: mvn.8b v0, v2
@@ -2416,7 +2416,15 @@
; CHECK-LABEL: vmulq_built_dup_fromsmall_test:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mul.8h v0, v0, v1[0]
+; CHECK-NEXT: mov.16b v2, v1
+; CHECK-NEXT: mov.h v2[1], v1[0]
+; CHECK-NEXT: mov.h v2[2], v1[0]
+; CHECK-NEXT: mov.h v2[3], v1[0]
+; CHECK-NEXT: mov.h v2[4], v1[0]
+; CHECK-NEXT: mov.h v2[5], v1[0]
+; CHECK-NEXT: mov.h v2[6], v1[0]
+; CHECK-NEXT: mov.h v2[7], v1[0]
+; CHECK-NEXT: mul.8h v0, v0, v2
; CHECK-NEXT: ret
 %vget_lane = extractelement <4 x i16> %b, i32 0
 %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
--- a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
@@ -152,8 +152,8 @@
; CHECK-NEXT: add x19, x19, :lo12:Pod
; CHECK-NEXT: mov x0, x19
; CHECK-NEXT: bl copy_pod
-; CHECK-NEXT: str d0, [x19]
; CHECK-NEXT: str d1, [x19, #8]
+; CHECK-NEXT: str d0, [x19]
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
@@ -186,8 +186,8 @@
; CHECK-NEXT: mov x0, sp
; CHECK-NEXT: mov x1, x19
; CHECK-NEXT: bl copy_notcxx14aggregate
-; CHECK-NEXT: ldp d0, d1, [sp]
-; CHECK-NEXT: stp d0, d1, [x19]
+; CHECK-NEXT: ldp d1, d0, [sp]
+; CHECK-NEXT: stp d1, d0, [x19]
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 24
diff --git a/llvm/test/CodeGen/AArch64/arm64-xaluo.ll b/llvm/test/CodeGen/AArch64/arm64-xaluo.ll
--- a/llvm/test/CodeGen/AArch64/arm64-xaluo.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-xaluo.ll
@@ -102,7 +102,7 @@ define zeroext i1 @saddo4.i32(i32 %v1, ptr %res) {
; SDAG-LABEL: saddo4.i32:
; SDAG: // %bb.0: // %entry
-; SDAG-NEXT: mov w8, #16777215
+; SDAG-NEXT: mov w8, #16777215 // =0xffffff
; SDAG-NEXT: adds w8, w0, w8
; SDAG-NEXT: cset w0, vs
; SDAG-NEXT: str w8, [x1]
@@ -110,7 +110,7 @@
;
; FAST-LABEL: saddo4.i32:
; FAST: // %bb.0: // %entry
-; FAST-NEXT: mov w8, #16777215
+; FAST-NEXT: mov w8, #16777215 // =0xffffff
; FAST-NEXT: adds w8, w0, w8
; FAST-NEXT: cset w9, vs
; FAST-NEXT: and w0, w9, #0x1
@@ -119,7 +119,7 @@
;
; GISEL-LABEL: saddo4.i32:
; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: mov w8, #16777215
+; GISEL-NEXT: mov w8, #16777215 // =0xffffff
; GISEL-NEXT: adds w8, w0, w8
; GISEL-NEXT: cset w0, vs
; GISEL-NEXT: str w8, [x1]
@@ -1327,25 +1327,27 @@
; SDAG-LABEL: uaddo.selectboth.i8:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: and w8, w0, #0xff
-; SDAG-NEXT: mov w9, #10
; SDAG-NEXT: add w8, w8, w1, uxtb
-; SDAG-NEXT: tst w8, #0x100
+; SDAG-NEXT: lsr w9, w8, #8
+; SDAG-NEXT: cmp w9, #0
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: csel w0, w8, w9, ne
; SDAG-NEXT: ret
;
; FAST-LABEL: uaddo.selectboth.i8:
; FAST: // %bb.0: // %entry
; FAST-NEXT: and w8, w0, #0xff
-; FAST-NEXT: mov w9, #10
; FAST-NEXT: add w8, w8, w1, uxtb
-; FAST-NEXT: tst w8, #0x100
+; FAST-NEXT: lsr w9, w8, #8
+; FAST-NEXT: cmp w9, #0
+; FAST-NEXT: mov w9, #10 // =0xa
; FAST-NEXT: csel w0, w8, w9, ne
; FAST-NEXT: ret
;
; GISEL-LABEL: uaddo.selectboth.i8:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: and w8, w1, #0xff
-; GISEL-NEXT: mov w9, #10
+; GISEL-NEXT: mov w9, #10 // =0xa
; GISEL-NEXT: add w8, w8, w0, uxtb
; GISEL-NEXT: cmp w8, w8, uxtb
; GISEL-NEXT: csel w0, w8, w9, ne
@@ -1362,7 +1364,7 @@
; SDAG-LABEL: saddo.selectboth.i8:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: sxtb w8, w0
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: add w8, w8, w1, sxtb
; SDAG-NEXT: cmp w8, w8, sxtb
; SDAG-NEXT: csel w0, w8, w9, ne
@@ -1371,7 +1373,7 @@
;
; FAST-LABEL: saddo.selectboth.i8:
; FAST: // %bb.0: // %entry
; FAST-NEXT: sxtb w8, w0
-; FAST-NEXT: mov w9, #10
+; FAST-NEXT: mov w9, #10 // =0xa
; FAST-NEXT: add w8, w8, w1, sxtb
; FAST-NEXT: cmp w8, w8, sxtb
; FAST-NEXT: csel w0, w8, w9, ne
@@ -1380,7 +1382,7 @@
;
; GISEL-LABEL: saddo.selectboth.i8:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: sxtb w8, w1
-; GISEL-NEXT: mov w9, #10
+; GISEL-NEXT: mov w9, #10 // =0xa
; GISEL-NEXT: add w8, w8, w0, sxtb
; GISEL-NEXT: cmp w8, w8, sxtb
; GISEL-NEXT: csel w0, w8, w9, ne
@@ -1397,25 +1399,27 @@
; SDAG-LABEL: uaddo.selectboth.i16:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: and w8, w0, #0xffff
-; SDAG-NEXT: mov w9, #10
; SDAG-NEXT: add w8, w8, w1, uxth
-; SDAG-NEXT: tst w8, #0x10000
+; SDAG-NEXT: lsr w9, w8, #16
+; SDAG-NEXT: cmp w9, #0
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: csel w0, w8, w9, ne
; SDAG-NEXT: ret
;
; FAST-LABEL: uaddo.selectboth.i16:
; FAST: // %bb.0: // %entry
; FAST-NEXT: and w8, w0, #0xffff
-; FAST-NEXT: mov w9, #10
; FAST-NEXT: add w8, w8, w1, uxth
-; FAST-NEXT: tst w8, #0x10000
+; FAST-NEXT: lsr w9, w8, #16
+; FAST-NEXT: cmp w9, #0
+; FAST-NEXT: mov w9, #10 // =0xa
; FAST-NEXT: csel w0, w8, w9, ne
; FAST-NEXT: ret
;
; GISEL-LABEL: uaddo.selectboth.i16:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: and w8, w1, #0xffff
-; GISEL-NEXT: mov w9, #10
+; GISEL-NEXT: mov w9, #10 // =0xa
; GISEL-NEXT: add w8, w8, w0, uxth
; GISEL-NEXT: cmp w8, w8, uxth
; GISEL-NEXT: csel w0, w8, w9, ne
@@ -1432,7 +1436,7 @@
; SDAG-LABEL: saddo.selectboth.i16:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: sxth w8, w0
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: add w8, w8, w1, sxth
; SDAG-NEXT: cmp w8, w8, sxth
; SDAG-NEXT: csel w0, w8, w9, ne
@@ -1441,7 +1445,7 @@
;
; FAST-LABEL: saddo.selectboth.i16:
; FAST: // %bb.0: // %entry
; FAST-NEXT: sxth w8, w0
-; FAST-NEXT: mov w9, #10
+; FAST-NEXT: mov w9, #10 // =0xa
; FAST-NEXT: add w8, w8, w1, sxth
; FAST-NEXT: cmp w8, w8, sxth
; FAST-NEXT: csel w0, w8, w9, ne
@@ -1450,7 +1454,7 @@
;
; GISEL-LABEL: saddo.selectboth.i16:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: sxth w8, w1
-; GISEL-NEXT: mov w9, #10
+; GISEL-NEXT: mov w9, #10 // =0xa
; GISEL-NEXT: add w8, w8, w0, sxth
; GISEL-NEXT: cmp w8, w8, sxth
; GISEL-NEXT: csel w0, w8, w9, ne
@@ -1467,21 +1471,21 @@
; SDAG-LABEL: uaddo.selectboth.i32:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: adds w8, w0, w1
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: csel w0, w8, w9, hs
; SDAG-NEXT: ret
;
; FAST-LABEL: uaddo.selectboth.i32:
; FAST: // %bb.0: // %entry
; FAST-NEXT: adds w8, w0, w1
-; FAST-NEXT: mov w9, #10
+; FAST-NEXT: mov w9, #10 // =0xa
; FAST-NEXT: csel w0, w8, w9, hs
; FAST-NEXT: ret
;
; GISEL-LABEL: uaddo.selectboth.i32:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: adds w8, w0, w1
-; GISEL-NEXT: mov w10, #10
+; GISEL-NEXT: mov w10, #10 // =0xa
; GISEL-NEXT: cset w9, hs
; GISEL-NEXT: tst w9, #0x1
; GISEL-NEXT: csel w0, w8, w10, ne
@@ -1498,21 +1502,21 @@
; SDAG-LABEL: saddo.selectboth.i32:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: adds w8, w0, w1
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: csel w0, w8, w9, vs
; SDAG-NEXT: ret
;
; FAST-LABEL: saddo.selectboth.i32:
; FAST: // %bb.0: // %entry
; FAST-NEXT: adds w8, w0, w1
-; FAST-NEXT: mov w9, #10
+; FAST-NEXT: mov w9, #10 // =0xa
; FAST-NEXT: csel w0, w8, w9, vs
; FAST-NEXT: ret
;
; GISEL-LABEL: saddo.selectboth.i32:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: adds w8, w0, w1
-; GISEL-NEXT: mov w10, #10
+; GISEL-NEXT: mov w10, #10 // =0xa
; GISEL-NEXT: cset w9, vs
; GISEL-NEXT: tst w9, #0x1
; GISEL-NEXT: csel w0, w8, w10, ne
@@ -1529,21 +1533,21 @@
; SDAG-LABEL: uaddo.selectboth.i64:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: adds x8, x0, x1
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: csel x0, x8, x9, hs
; SDAG-NEXT: ret
;
; FAST-LABEL: uaddo.selectboth.i64:
; FAST: // %bb.0: // %entry
; FAST-NEXT: adds x8, x0, x1
-; FAST-NEXT: mov x9, #10
+; FAST-NEXT: mov x9, #10 // =0xa
; FAST-NEXT: csel x0, x8, x9, hs
; FAST-NEXT: ret
;
; GISEL-LABEL: uaddo.selectboth.i64:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: adds x8, x0, x1
-; GISEL-NEXT: mov w10, #10
+; GISEL-NEXT: mov w10, #10 // =0xa
; GISEL-NEXT: cset w9, hs
; GISEL-NEXT: tst w9, #0x1
; GISEL-NEXT: csel x0, x8, x10, ne
@@ -1560,21 +1564,21 @@
; SDAG-LABEL: saddo.selectboth.i64:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: adds x8, x0, x1
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: csel x0, x8, x9, vs
; SDAG-NEXT: ret
;
; FAST-LABEL: saddo.selectboth.i64:
; FAST: // %bb.0: // %entry
; FAST-NEXT: adds x8, x0, x1
-; FAST-NEXT: mov x9, #10
+; FAST-NEXT: mov x9, #10 // =0xa
; FAST-NEXT: csel x0, x8, x9, vs
; FAST-NEXT: ret
;
; GISEL-LABEL: saddo.selectboth.i64:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: adds x8, x0, x1
-; GISEL-NEXT: mov w10, #10
+; GISEL-NEXT: mov w10, #10 // =0xa
; GISEL-NEXT: cset w9, vs
; GISEL-NEXT: tst w9, #0x1
; GISEL-NEXT: csel x0, x8, x10, ne
@@ -1591,7 +1595,7 @@
; SDAG-LABEL: usubo.selectboth.i8:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: and w8, w0, #0xff
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: sub w8, w8, w1, uxtb
; SDAG-NEXT: tst w8, #0xffffff00
; SDAG-NEXT: csel w0, w8, w9, ne
@@ -1600,7 +1604,7 @@
;
; FAST-LABEL: usubo.selectboth.i8:
; FAST: // %bb.0: // %entry
; FAST-NEXT: and w8, w0, #0xff
-; FAST-NEXT: mov w9, #10
+; FAST-NEXT: mov w9, #10 // =0xa
; FAST-NEXT: sub w8, w8, w1, uxtb
; FAST-NEXT: tst w8, #0xffffff00
; FAST-NEXT: csel w0, w8, w9, ne
@@ -1609,7 +1613,7 @@
;
; GISEL-LABEL: usubo.selectboth.i8:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: and w8, w0, #0xff
-; GISEL-NEXT: mov w9, #10
+; GISEL-NEXT: mov w9, #10 // =0xa
; GISEL-NEXT: sub w8, w8, w1, uxtb
; GISEL-NEXT: cmp w8, w8, uxtb
; GISEL-NEXT: csel w0, w8, w9, ne
@@ -1626,7 +1630,7 @@
; CHECK-LABEL: ssubo.selectboth.i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: mov w9, #10
+; CHECK-NEXT: mov w9, #10 // =0xa
; CHECK-NEXT: sub w8, w8, w1, sxtb
; CHECK-NEXT: cmp w8, w8, sxtb
; CHECK-NEXT: csel w0, w8, w9, ne
@@ -1643,7 +1647,7 @@
; SDAG-LABEL: usubo.selectboth.i16:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: and w8, w0, #0xffff
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: sub w8, w8, w1, uxth
; SDAG-NEXT: tst w8, #0xffff0000
; SDAG-NEXT: csel w0, w8, w9, ne
@@ -1652,7 +1656,7 @@
;
; FAST-LABEL: usubo.selectboth.i16:
; FAST: // %bb.0: // %entry
; FAST-NEXT: and w8, w0, #0xffff
-; FAST-NEXT: mov w9, #10
+; FAST-NEXT: mov w9, #10 // =0xa
; FAST-NEXT: sub w8, w8, w1, uxth
; FAST-NEXT: tst w8, #0xffff0000
; FAST-NEXT: csel w0, w8, w9, ne
@@ -1661,7 +1665,7 @@
;
; GISEL-LABEL: usubo.selectboth.i16:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: and w8, w0, #0xffff
-; GISEL-NEXT: mov w9, #10
+; GISEL-NEXT: mov w9, #10 // =0xa
; GISEL-NEXT: sub w8, w8, w1, uxth
; GISEL-NEXT: cmp w8, w8, uxth
; GISEL-NEXT: csel w0, w8, w9, ne
@@ -1678,7 +1682,7 @@
; CHECK-LABEL: ssubo.selectboth.i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: mov w9, #10
+; CHECK-NEXT: mov w9, #10 // =0xa
; CHECK-NEXT: sub w8, w8, w1, sxth
; CHECK-NEXT: cmp w8, w8, sxth
; CHECK-NEXT: csel w0, w8, w9, ne
@@ -1695,21 +1699,21 @@
; SDAG-LABEL: usubo.selectboth.i32:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: subs w8, w0, w1
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: csel w0, w8, w9, lo
; SDAG-NEXT: ret
;
; FAST-LABEL: usubo.selectboth.i32:
; FAST: // %bb.0: // %entry
; FAST-NEXT: subs w8, w0, w1
-; FAST-NEXT: mov w9, #10
+; FAST-NEXT: mov w9, #10 // =0xa
; FAST-NEXT: csel w0, w8, w9, lo
; FAST-NEXT: ret
;
; GISEL-LABEL: usubo.selectboth.i32:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: subs w8, w0, w1
-; GISEL-NEXT: mov w10, #10
+; GISEL-NEXT: mov w10, #10 // =0xa
; GISEL-NEXT: cset w9, lo
; GISEL-NEXT: tst w9, #0x1
; GISEL-NEXT: csel w0, w8, w10, ne
@@ -1726,21 +1730,21 @@
; SDAG-LABEL: ssubo.selectboth.i32:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: subs w8, w0, w1
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: csel w0, w8, w9, vs
; SDAG-NEXT: ret
;
; FAST-LABEL: ssubo.selectboth.i32:
; FAST: // %bb.0: // %entry
; FAST-NEXT: subs w8, w0, w1
-; FAST-NEXT: mov w9, #10
+; FAST-NEXT: mov w9, #10 // =0xa
; FAST-NEXT: csel w0, w8, w9, vs
; FAST-NEXT: ret
;
; GISEL-LABEL: ssubo.selectboth.i32:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: subs w8, w0, w1
-; GISEL-NEXT: mov w10, #10
+; GISEL-NEXT: mov w10, #10 // =0xa
; GISEL-NEXT: cset w9, vs
; GISEL-NEXT: tst w9, #0x1
; GISEL-NEXT: csel w0, w8, w10, ne
@@ -1757,21 +1761,21 @@
; SDAG-LABEL: usubo.selectboth.i64:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: subs x8, x0, x1
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: csel x0, x8, x9, lo
; SDAG-NEXT: ret
;
; FAST-LABEL: usubo.selectboth.i64:
; FAST: // %bb.0: // %entry
; FAST-NEXT: subs x8, x0, x1
-; FAST-NEXT: mov x9, #10
+; FAST-NEXT: mov x9, #10 // =0xa
; FAST-NEXT: csel x0, x8, x9, lo
; FAST-NEXT: ret
;
; GISEL-LABEL: usubo.selectboth.i64:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: subs x8, x0, x1
-; GISEL-NEXT: mov w10, #10
+; GISEL-NEXT: mov w10, #10 // =0xa
; GISEL-NEXT: cset w9, lo
; GISEL-NEXT: tst w9, #0x1
; GISEL-NEXT: csel x0, x8, x10, ne
@@ -1788,21 +1792,21 @@
; SDAG-LABEL: ssubo.selectboth.i64:
; SDAG: // %bb.0: // %entry
; SDAG-NEXT: subs x8, x0, x1
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: csel x0, x8, x9, vs
; SDAG-NEXT: ret
;
; FAST-LABEL: ssubo.selectboth.i64:
; FAST: // %bb.0: // %entry
; FAST-NEXT: subs x8, x0, x1
-; FAST-NEXT: mov x9, #10
+; FAST-NEXT: mov x9, #10 // =0xa
; FAST-NEXT: csel x0, x8, x9, vs
; FAST-NEXT: ret
;
; GISEL-LABEL: ssubo.selectboth.i64:
; GISEL: // %bb.0: // %entry
; GISEL-NEXT: subs x8, x0, x1
-; GISEL-NEXT: mov w10, #10
+; GISEL-NEXT: mov w10, #10 // =0xa
; GISEL-NEXT: cset w9, vs
; GISEL-NEXT: tst w9, #0x1
; GISEL-NEXT: csel x0, x8, x10, ne
@@ -1822,7 +1826,7 @@
; SDAG-NEXT: and w8, w1, #0xff
; SDAG-NEXT: and w9, w0, #0xff
; SDAG-NEXT: mul w8, w9, w8
-; SDAG-NEXT: mov w9, #10
+; SDAG-NEXT: mov w9, #10 // =0xa
; SDAG-NEXT: tst w8, #0xff00
; SDAG-NEXT: csel w0, w8, w9, ne
; SDAG-NEXT: ret
@@ -1832,7 +1836,7 @@
; FAST-NEXT: and w8, w1, #0xff
; FAST-NEXT: and w9, w0, #0xff
; FAST-NEXT: mul w8, w9, w8
-; FAST-NEXT: mov w9, #10
+; FAST-NEXT: mov w9, #10 // =0xa
; FAST-NEXT: tst w8, #0xff00
; FAST-NEXT: csel w0, w8, w9, ne
; FAST-NEXT: ret
@@ -1842,7 +1846,7 @@
; GISEL-NEXT: and w8, w0, #0xff
; GISEL-NEXT: and w9, w1, #0xff
; GISEL-NEXT: mul w8, w8, w9
-; GISEL-NEXT: mov w9, #10
+; GISEL-NEXT: mov w9, #10 // =0xa
; GISEL-NEXT: cmp w8, w8, uxtb
; GISEL-NEXT: csel w0, w8, w9, ne
; GISEL-NEXT: ret
@@ -1860,7 +1864,7 @@
; SDAG-NEXT: sxtb w8, w1
; SDAG-NEXT: sxtb w9, w0
; SDAG-NEXT: mul w8, w9, w8
-;
SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: cmp w8, w8, sxtb ; SDAG-NEXT: csel w0, w8, w9, ne ; SDAG-NEXT: ret @@ -1870,7 +1874,7 @@ ; FAST-NEXT: sxtb w8, w1 ; FAST-NEXT: sxtb w9, w0 ; FAST-NEXT: mul w8, w9, w8 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: cmp w8, w8, sxtb ; FAST-NEXT: csel w0, w8, w9, ne ; FAST-NEXT: ret @@ -1880,7 +1884,7 @@ ; GISEL-NEXT: sxtb w8, w0 ; GISEL-NEXT: sxtb w9, w1 ; GISEL-NEXT: mul w8, w8, w9 -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: cmp w8, w8, sxtb ; GISEL-NEXT: csel w0, w8, w9, ne ; GISEL-NEXT: ret @@ -1898,7 +1902,7 @@ ; SDAG-NEXT: and w8, w1, #0xffff ; SDAG-NEXT: and w9, w0, #0xffff ; SDAG-NEXT: mul w8, w9, w8 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: tst w8, #0xffff0000 ; SDAG-NEXT: csel w0, w8, w9, ne ; SDAG-NEXT: ret @@ -1908,7 +1912,7 @@ ; FAST-NEXT: and w8, w1, #0xffff ; FAST-NEXT: and w9, w0, #0xffff ; FAST-NEXT: mul w8, w9, w8 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: tst w8, #0xffff0000 ; FAST-NEXT: csel w0, w8, w9, ne ; FAST-NEXT: ret @@ -1918,7 +1922,7 @@ ; GISEL-NEXT: and w8, w0, #0xffff ; GISEL-NEXT: and w9, w1, #0xffff ; GISEL-NEXT: mul w8, w8, w9 -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: cmp w8, w8, uxth ; GISEL-NEXT: csel w0, w8, w9, ne ; GISEL-NEXT: ret @@ -1936,7 +1940,7 @@ ; SDAG-NEXT: sxth w8, w1 ; SDAG-NEXT: sxth w9, w0 ; SDAG-NEXT: mul w8, w9, w8 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: cmp w8, w8, sxth ; SDAG-NEXT: csel w0, w8, w9, ne ; SDAG-NEXT: ret @@ -1946,7 +1950,7 @@ ; FAST-NEXT: sxth w8, w1 ; FAST-NEXT: sxth w9, w0 ; FAST-NEXT: mul w8, w9, w8 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: cmp w8, w8, sxth ; FAST-NEXT: csel w0, w8, w9, ne ; FAST-NEXT: ret @@ -1956,7 +1960,7 @@ ; GISEL-NEXT: sxth w8, w0 ; GISEL-NEXT: sxth w9, w1 ; GISEL-NEXT: mul w8, w8, w9 -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: cmp w8, w8, sxth ; GISEL-NEXT: csel w0, w8, w9, ne ; GISEL-NEXT: ret @@ -1972,7 +1976,7 @@ ; SDAG-LABEL: umulo.selectboth.i32: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: umull x9, w0, w1 -; SDAG-NEXT: mov w8, #10 +; SDAG-NEXT: mov w8, #10 // =0xa ; SDAG-NEXT: tst x9, #0xffffffff00000000 ; SDAG-NEXT: csel w0, w9, w8, ne ; SDAG-NEXT: ret @@ -1980,7 +1984,7 @@ ; FAST-LABEL: umulo.selectboth.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: umull x9, w0, w1 -; FAST-NEXT: mov w8, #10 +; FAST-NEXT: mov w8, #10 // =0xa ; FAST-NEXT: tst x9, #0xffffffff00000000 ; FAST-NEXT: csel w0, w9, w8, ne ; FAST-NEXT: ret @@ -1988,7 +1992,7 @@ ; GISEL-LABEL: umulo.selectboth.i32: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: umull x9, w0, w1 -; GISEL-NEXT: mov w8, #10 +; GISEL-NEXT: mov w8, #10 // =0xa ; GISEL-NEXT: mul w10, w0, w1 ; GISEL-NEXT: lsr x9, x9, #32 ; GISEL-NEXT: cmp w9, #0 @@ -2006,7 +2010,7 @@ ; SDAG-LABEL: smulo.selectboth.i32: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: smull x9, w0, w1 -; SDAG-NEXT: mov w8, #10 +; SDAG-NEXT: mov w8, #10 // =0xa ; SDAG-NEXT: cmp x9, w9, sxtw ; SDAG-NEXT: csel w0, w9, w8, ne ; SDAG-NEXT: ret @@ -2014,7 +2018,7 @@ ; FAST-LABEL: smulo.selectboth.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: smull x9, w0, w1 -; FAST-NEXT: mov w8, #10 +; FAST-NEXT: mov w8, #10 // =0xa ; FAST-NEXT: cmp x9, w9, sxtw ; FAST-NEXT: csel w0, w9, w8, ne ; FAST-NEXT: ret @@ -2022,7 +2026,7 @@ ; GISEL-LABEL: smulo.selectboth.i32: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: smull 
x9, w0, w1 -; GISEL-NEXT: mov w8, #10 +; GISEL-NEXT: mov w8, #10 // =0xa ; GISEL-NEXT: mul w10, w0, w1 ; GISEL-NEXT: asr x9, x9, #32 ; GISEL-NEXT: cmp w9, w10, asr #31 @@ -2040,7 +2044,7 @@ ; SDAG-LABEL: umulo.selectboth.i64: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: umulh x9, x0, x1 -; SDAG-NEXT: mov w8, #10 +; SDAG-NEXT: mov w8, #10 // =0xa ; SDAG-NEXT: mul x10, x0, x1 ; SDAG-NEXT: cmp xzr, x9 ; SDAG-NEXT: csel x0, x10, x8, ne @@ -2049,7 +2053,7 @@ ; FAST-LABEL: umulo.selectboth.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: umulh x9, x0, x1 -; FAST-NEXT: mov x8, #10 +; FAST-NEXT: mov x8, #10 // =0xa ; FAST-NEXT: mul x10, x0, x1 ; FAST-NEXT: cmp xzr, x9 ; FAST-NEXT: csel x0, x10, x8, ne @@ -2058,7 +2062,7 @@ ; GISEL-LABEL: umulo.selectboth.i64: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: umulh x9, x0, x1 -; GISEL-NEXT: mov w8, #10 +; GISEL-NEXT: mov w8, #10 // =0xa ; GISEL-NEXT: mul x10, x0, x1 ; GISEL-NEXT: cmp x9, #0 ; GISEL-NEXT: csel x0, x10, x8, ne @@ -2075,7 +2079,7 @@ ; SDAG-LABEL: smulo.selectboth.i64: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: mul x9, x0, x1 -; SDAG-NEXT: mov w8, #10 +; SDAG-NEXT: mov w8, #10 // =0xa ; SDAG-NEXT: smulh x10, x0, x1 ; SDAG-NEXT: cmp x10, x9, asr #63 ; SDAG-NEXT: csel x0, x9, x8, ne @@ -2084,7 +2088,7 @@ ; FAST-LABEL: smulo.selectboth.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: mul x9, x0, x1 -; FAST-NEXT: mov x8, #10 +; FAST-NEXT: mov x8, #10 // =0xa ; FAST-NEXT: smulh x10, x0, x1 ; FAST-NEXT: cmp x10, x9, asr #63 ; FAST-NEXT: csel x0, x9, x8, ne @@ -2093,7 +2097,7 @@ ; GISEL-LABEL: smulo.selectboth.i64: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: mul x9, x0, x1 -; GISEL-NEXT: mov w8, #10 +; GISEL-NEXT: mov w8, #10 // =0xa ; GISEL-NEXT: smulh x10, x0, x1 ; GISEL-NEXT: cmp x10, x9, asr #63 ; GISEL-NEXT: csel x0, x9, x8, ne @@ -2120,7 +2124,7 @@ ; FAST-LABEL: saddo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn w0, w1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, vs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2155,7 +2159,7 @@ ; FAST-LABEL: saddo.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn x0, x1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, vs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2190,7 +2194,7 @@ ; FAST-LABEL: uaddo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn w0, w1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, hs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2225,7 +2229,7 @@ ; FAST-LABEL: uaddo.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn x0, x1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, hs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2260,7 +2264,7 @@ ; FAST-LABEL: ssubo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmp w0, w1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, vs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2295,7 +2299,7 @@ ; FAST-LABEL: ssubo.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmp x0, x1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, vs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2330,7 +2334,7 @@ ; FAST-LABEL: usubo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmp w0, w1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, lo ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2365,7 +2369,7 @@ ; FAST-LABEL: usubo.br.i64: ; FAST: // 
%bb.0: // %entry ; FAST-NEXT: cmp x0, x1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, lo ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2401,7 +2405,7 @@ ; FAST-LABEL: smulo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: smull x9, w0, w1 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: cmp x9, w9, sxtw ; FAST-NEXT: cset w9, ne ; FAST-NEXT: bic w8, w8, w9 @@ -2442,7 +2446,7 @@ ; FAST-LABEL: smulo.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: mul x9, x0, x1 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: smulh x10, x0, x1 ; FAST-NEXT: cmp x10, x9, asr #63 ; FAST-NEXT: cset w9, ne @@ -2481,7 +2485,7 @@ ; FAST-LABEL: smulo2.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn x0, x0 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: cset w9, vs ; FAST-NEXT: bic w8, w8, w9 ; FAST-NEXT: and w0, w8, #0x1 @@ -2517,7 +2521,7 @@ ; FAST-LABEL: umulo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: umull x9, w0, w1 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: tst x9, #0xffffffff00000000 ; FAST-NEXT: cset w9, ne ; FAST-NEXT: bic w8, w8, w9 @@ -2556,7 +2560,7 @@ ; FAST-LABEL: umulo.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: umulh x9, x0, x1 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: cmp xzr, x9 ; FAST-NEXT: cset w9, ne ; FAST-NEXT: bic w8, w8, w9 @@ -2593,7 +2597,7 @@ ; FAST-LABEL: umulo2.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn x0, x0 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: cset w9, hs ; FAST-NEXT: bic w8, w8, w9 ; FAST-NEXT: and w0, w8, #0x1 @@ -2621,17 +2625,17 @@ define i8 @pr60530() { ; SDAG-LABEL: pr60530: ; SDAG: // %bb.0: -; SDAG-NEXT: mov w0, #-1 +; SDAG-NEXT: mov w0, #-1 // =0xffffffff ; SDAG-NEXT: ret ; ; FAST-LABEL: pr60530: ; FAST: // %bb.0: -; FAST-NEXT: mov w0, #-1 +; FAST-NEXT: mov w0, #-1 // =0xffffffff ; FAST-NEXT: ret ; ; GISEL-LABEL: pr60530: ; GISEL: // %bb.0: -; GISEL-NEXT: mov w8, #1 +; GISEL-NEXT: mov w8, #1 // =0x1 ; GISEL-NEXT: sbfx w0, w8, #0, #1 ; GISEL-NEXT: ret %1 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 0, i8 1) diff --git a/llvm/test/CodeGen/AArch64/arm64_32-neon.ll b/llvm/test/CodeGen/AArch64/arm64_32-neon.ll --- a/llvm/test/CodeGen/AArch64/arm64_32-neon.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32-neon.ll @@ -1,22 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=arm64_32-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s define <2 x double> @test_insert_elt(<2 x double> %vec, double %val) { ; CHECK-LABEL: test_insert_elt: -; CHECK: mov.d v0[0], v1[0] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov.d v0[0], v1[0] +; CHECK-NEXT: ret %res = insertelement <2 x double> %vec, double %val, i32 0 ret <2 x double> %res } define void @test_split_16B(<4 x float> %val, ptr %addr) { ; CHECK-LABEL: test_split_16B: -; CHECK: str q0, [x0] +; CHECK: ; %bb.0: +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret store <4 x float> %val, ptr %addr, align 8 ret void } define void @test_split_16B_splat(<4 x i32>, ptr %addr) { ; CHECK-LABEL: test_split_16B_splat: -; CHECK: str {{q[0-9]+}} +; CHECK: ; %bb.0: +; CHECK-NEXT: movi.4s v0, #42 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret %vec.tmp0 = insertelement <4 x i32> undef, i32 42, i32 0 %vec.tmp1 = insertelement <4 x i32> %vec.tmp0, i32 42, i32 1 @@ -33,7 +42,9 @@ declare {%vec, %vec} 
@llvm.aarch64.neon.ld2r.v2f64.p0(ptr) define {%vec, %vec} @test_neon_load(ptr %addr) { ; CHECK-LABEL: test_neon_load: -; CHECK: ld2r.2d { v0, v1 }, [x0] +; CHECK: ; %bb.0: +; CHECK-NEXT: ld2r.2d { v0, v1 }, [x0] +; CHECK-NEXT: ret %res = call {%vec, %vec} @llvm.aarch64.neon.ld2r.v2f64.p0(ptr %addr) ret {%vec, %vec} %res } @@ -41,7 +52,11 @@ declare {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0(%vec, %vec, i64, ptr) define {%vec, %vec} @test_neon_load_lane(ptr %addr, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_load_lane: -; CHECK: ld2.d { v0, v1 }[0], [x0] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ld2.d { v0, v1 }[0], [x0] +; CHECK-NEXT: ret %res = call {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0(%vec %in1, %vec %in2, i64 0, ptr %addr) ret {%vec, %vec} %res } @@ -49,7 +64,11 @@ declare void @llvm.aarch64.neon.st2.v2f64.p0(%vec, %vec, ptr) define void @test_neon_store(ptr %addr, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_store: -; CHECK: st2.2d { v0, v1 }, [x0] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: st2.2d { v0, v1 }, [x0] +; CHECK-NEXT: ret call void @llvm.aarch64.neon.st2.v2f64.p0(%vec %in1, %vec %in2, ptr %addr) ret void } @@ -57,7 +76,11 @@ declare void @llvm.aarch64.neon.st2lane.v2f64.p0(%vec, %vec, i64, ptr) define void @test_neon_store_lane(ptr %addr, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_store_lane: -; CHECK: st2.d { v0, v1 }[1], [x0] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: st2.d { v0, v1 }[1], [x0] +; CHECK-NEXT: ret call void @llvm.aarch64.neon.st2lane.v2f64.p0(%vec %in1, %vec %in2, i64 1, ptr %addr) ret void } @@ -65,8 +88,11 @@ declare {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0(ptr) define {{%vec, %vec}, ptr} @test_neon_load_post(ptr %addr, i32 %offset) { ; CHECK-LABEL: test_neon_load_post: -; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 -; CHECK: ld2.2d { v0, v1 }, [x0], [[OFFSET]] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: ld2.2d { v0, v1 }, [x0], x8 +; CHECK-NEXT: ret %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0(ptr %addr) @@ -79,8 +105,13 @@ define {{%vec, %vec}, ptr} @test_neon_load_post_lane(ptr %addr, i32 %offset, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_load_post_lane: -; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 -; CHECK: ld2.d { v0, v1 }[1], [x0], [[OFFSET]] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ld2.d { v0, v1 }[1], [x0], x8 +; CHECK-NEXT: ret %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0(%vec %in1, %vec %in2, i64 1, ptr %addr) @@ -93,8 +124,13 @@ define ptr @test_neon_store_post(ptr %addr, i32 %offset, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_store_post: -; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 -; CHECK: st2.2d { v0, v1 }, [x0], [[OFFSET]] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 
def $q0_q1 +; CHECK-NEXT: st2.2d { v0, v1 }, [x0], x8 +; CHECK-NEXT: ret call void @llvm.aarch64.neon.st2.v2f64.p0(%vec %in1, %vec %in2, ptr %addr) @@ -105,8 +141,13 @@ define ptr @test_neon_store_post_lane(ptr %addr, i32 %offset, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_store_post_lane: -; CHECK: sxtw [[OFFSET:x[0-9]+]], w1 -; CHECK: st2.d { v0, v1 }[0], [x0], [[OFFSET]] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: st2.d { v0, v1 }[0], [x0], x8 +; CHECK-NEXT: ret call void @llvm.aarch64.neon.st2lane.v2f64.p0(%vec %in1, %vec %in2, i64 0, ptr %addr) @@ -119,8 +160,11 @@ ; rather than an intrinsic. define {%vec, ptr} @test_neon_ld1_post_lane(ptr %addr, i32 %offset, %vec %in) { ; CHECK-LABEL: test_neon_ld1_post_lane: -; CHECK: sbfiz [[OFFSET:x[0-9]+]], x1, #3, #32 -; CHECK: ld1.d { v0 }[0], [x0], [[OFFSET]] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sbfiz x8, x1, #3, #32 +; CHECK-NEXT: ld1.d { v0 }[0], [x0], x8 +; CHECK-NEXT: ret %loaded = load double, ptr %addr, align 8 %newvec = insertelement %vec %in, double %loaded, i32 0 @@ -135,7 +179,9 @@ define {{%vec, %vec}, ptr} @test_neon_load_post_exact(ptr %addr) { ; CHECK-LABEL: test_neon_load_post_exact: -; CHECK: ld2.2d { v0, v1 }, [x0], #32 +; CHECK: ; %bb.0: +; CHECK-NEXT: ld2.2d { v0, v1 }, [x0], #32 +; CHECK-NEXT: ret %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0(ptr %addr) @@ -148,7 +194,10 @@ define {%vec, ptr} @test_neon_ld1_post_lane_exact(ptr %addr, %vec %in) { ; CHECK-LABEL: test_neon_ld1_post_lane_exact: -; CHECK: ld1.d { v0 }[0], [x0], #8 +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d1, [x0], #8 +; CHECK-NEXT: mov.d v0[0], v1[0] +; CHECK-NEXT: ret %loaded = load double, ptr %addr, align 8 %newvec = insertelement %vec %in, double %loaded, i32 0 @@ -165,9 +214,10 @@ ; address wraps. We cannot use post-indexed addressing. 
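To make the wrap concern above concrete, a minimal sketch follows (hypothetical function name and address values, not part of the patch). On arm64_32, pointer arithmetic is defined modulo 2^32, so folding the increment into the 64-bit post-index register is only sound when the 32-bit sum provably does not wrap:

; Illustrative only: if %addr is 0xfffffff8, the +8 step wraps.
; IR semantics: updated pointer = (0xfffffff8 + 8) mod 2^32 = 0x0, but a
; post-indexed "ld1.d { v0 }[0], [x0], #8" would leave x0 = 0x100000000,
; so the backend must keep the 32-bit add separate from the load.
define ptr @wrap_sketch(ptr %addr) {
  %int = ptrtoint ptr %addr to i32
  %next.int = add i32 %int, 8          ; no nuw: may wrap
  %next = inttoptr i32 %next.int to ptr
  ret ptr %next
}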
define {%vec, ptr} @test_neon_ld1_notpost_lane_exact(ptr %addr, %vec %in) { ; CHECK-LABEL: test_neon_ld1_notpost_lane_exact: -; CHECK-NOT: ld1.d { {{v[0-9]+}} }[0], [{{x[0-9]+|sp}}], #8 -; CHECK: add w0, w0, #8 -; CHECK: ret +; CHECK: ; %bb.0: +; CHECK-NEXT: ld1.d { v0 }[0], [x0] +; CHECK-NEXT: add w0, w0, #8 +; CHECK-NEXT: ret %loaded = load double, ptr %addr, align 8 %newvec = insertelement %vec %in, double %loaded, i32 0 @@ -182,9 +232,10 @@ define {%vec, ptr} @test_neon_ld1_notpost_lane(ptr %addr, i32 %offset, %vec %in) { ; CHECK-LABEL: test_neon_ld1_notpost_lane: -; CHECK-NOT: ld1.d { {{v[0-9]+}} }[0], [{{x[0-9]+|sp}}], {{x[0-9]+|sp}} -; CHECK: add w0, w0, w1, lsl #3 -; CHECK: ret +; CHECK: ; %bb.0: +; CHECK-NEXT: ld1.d { v0 }[0], [x0] +; CHECK-NEXT: add w0, w0, w1, lsl #3 +; CHECK-NEXT: ret %loaded = load double, ptr %addr, align 8 %newvec = insertelement %vec %in, double %loaded, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64_32.ll b/llvm/test/CodeGen/AArch64/arm64_32.ll --- a/llvm/test/CodeGen/AArch64/arm64_32.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=arm64_32-apple-ios7.0 %s -filetype=obj -o - -disable-post-ra -frame-pointer=non-leaf | \ ; RUN: llvm-objdump --private-headers - | \ ; RUN: FileCheck %s --check-prefix=CHECK-MACHO @@ -13,11 +14,24 @@ @var_got = external global i8 define ptr @test_global_addr() { -; CHECK-LABEL: test_global_addr: -; CHECK: adrp [[PAGE:x[0-9]+]], _var32@PAGE -; CHECK-OPT: add x0, [[PAGE]], _var32@PAGEOFF -; CHECK-FAST: add [[TMP:x[0-9]+]], [[PAGE]], _var32@PAGEOFF -; CHECK-FAST: and x0, [[TMP]], #0xffffffff +; CHECK-OPT-LABEL: test_global_addr: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: Lloh0: +; CHECK-OPT-NEXT: adrp x0, _var32@PAGE +; CHECK-OPT-NEXT: Lloh1: +; CHECK-OPT-NEXT: add x0, x0, _var32@PAGEOFF +; CHECK-OPT-NEXT: ret +; CHECK-OPT-NEXT: .loh AdrpAdd Lloh0, Lloh1 +; +; CHECK-FAST-LABEL: test_global_addr: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: Lloh0: +; CHECK-FAST-NEXT: adrp x8, _var32@PAGE +; CHECK-FAST-NEXT: Lloh1: +; CHECK-FAST-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-FAST-NEXT: and x0, x8, #0xffffffff +; CHECK-FAST-NEXT: ret +; CHECK-FAST-NEXT: .loh AdrpAdd Lloh0, Lloh1 ret ptr @var32 } @@ -25,19 +39,36 @@ ; gets truncated to 32-bits, it's free. No need to zero out higher bits of that ; register. 
define i64 @test_global_addr_extension() { -; CHECK-LABEL: test_global_addr_extension: -; CHECK: adrp [[PAGE:x[0-9]+]], _var32@PAGE -; CHECK: add x0, [[PAGE]], _var32@PAGEOFF -; CHECK-NOT: and -; CHECK: ret +; CHECK-OPT-LABEL: test_global_addr_extension: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: Lloh2: +; CHECK-OPT-NEXT: adrp x0, _var32@PAGE +; CHECK-OPT-NEXT: Lloh3: +; CHECK-OPT-NEXT: add x0, x0, _var32@PAGEOFF +; CHECK-OPT-NEXT: ret +; CHECK-OPT-NEXT: .loh AdrpAdd Lloh2, Lloh3 +; +; CHECK-FAST-LABEL: test_global_addr_extension: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: Lloh2: +; CHECK-FAST-NEXT: adrp x8, _var32@PAGE +; CHECK-FAST-NEXT: Lloh3: +; CHECK-FAST-NEXT: add x0, x8, _var32@PAGEOFF +; CHECK-FAST-NEXT: ret +; CHECK-FAST-NEXT: .loh AdrpAdd Lloh2, Lloh3 ret i64 ptrtoint(ptr @var32 to i64) } define i32 @test_global_value() { ; CHECK-LABEL: test_global_value: -; CHECK: adrp x[[PAGE:[0-9]+]], _var32@PAGE -; CHECK: ldr w0, [x[[PAGE]], _var32@PAGEOFF] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh4: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh5: +; CHECK-NEXT: ldr w0, [x8, _var32@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5 %val = load i32, ptr @var32, align 4 ret i32 %val } @@ -45,9 +76,15 @@ ; Because the addition may wrap, it is not safe to use "ldr w0, [xN, #32]" here. define i32 @test_unsafe_indexed_add() { ; CHECK-LABEL: test_unsafe_indexed_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh6: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #32 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh6, Lloh7 %addr_int = ptrtoint ptr @var32 to i32 %addr_plus_32 = add i32 %addr_int, 32 %addr = inttoptr i32 %addr_plus_32 to ptr @@ -59,9 +96,15 @@ ; 32-bytes below 2^32, and we can use the load this time. define i32 @test_safe_indexed_add() { ; CHECK-LABEL: test_safe_indexed_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh8: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh9: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #32 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh8, Lloh9 %addr_int = ptrtoint ptr @var32 to i64 %addr_plus_32 = add nuw i64 %addr_int, 32 %addr = inttoptr i64 %addr_plus_32 to ptr @@ -71,9 +114,11 @@ define i32 @test_safe_indexed_or(i32 %in) { ; CHECK-LABEL: test_safe_indexed_or: -; CHECK: and [[TMP:w[0-9]+]], {{w[0-9]+}}, #0xfffffff0 -; CHECK: orr w[[ADDR:[0-9]+]], [[TMP]], #0x4 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: and w8, w0, #0xfffffff0 +; CHECK-NEXT: orr w8, w8, #0x4 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret %addr_int = and i32 %in, -16 %addr_plus_4 = or i32 %addr_int, 4 %addr = inttoptr i32 %addr_plus_4 to ptr @@ -87,10 +132,15 @@ ; "sext(base) + sext(offset) == base + offset". 
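A worked instance of the distinction (illustrative address, not from the test): let the base be 0xfffffff0. As a signed i32 that is -16, so "add nsw i32 %base, 32" is well-defined (-16 + 32 = 16, no signed overflow) and the quoted sext identity holds. But the folded form "ldr w0, [xN, #32]" performs the add in 64 bits on the zero-extended base: 0xfffffff0 + 32 = 0x100000010, whereas the IR result wraps to 0x10. The fold needs the unsigned identity "zext(base) + offset == zext(base + offset)", i.e. nuw, which nsw does not provide.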
define i32 @test_unsafe_nsw_indexed_add() { ; CHECK-LABEL: test_unsafe_nsw_indexed_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 -; CHECK-NOT: ubfx -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh10: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh11: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #32 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh10, Lloh11 %addr_int = ptrtoint ptr @var32 to i32 %addr_plus_32 = add nsw i32 %addr_int, 32 %addr = inttoptr i32 %addr_plus_32 to ptr @@ -101,9 +151,15 @@ ; Because the addition may wrap, it is not safe to use "ldr w0, [xN, #32]" here. define i32 @test_unsafe_unscaled_add() { ; CHECK-LABEL: test_unsafe_unscaled_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh12: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh13: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #3 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh12, Lloh13 %addr_int = ptrtoint ptr @var32 to i32 %addr_plus_3 = add i32 %addr_int, 3 %addr = inttoptr i32 %addr_plus_3 to ptr @@ -115,9 +171,15 @@ ; 32-bytes below 2^32, and we can use the load this time. define i32 @test_safe_unscaled_add() { ; CHECK-LABEL: test_safe_unscaled_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh14: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh15: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #3 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh14, Lloh15 %addr_int = ptrtoint ptr @var32 to i32 %addr_plus_3 = add nuw i32 %addr_int, 3 %addr = inttoptr i32 %addr_plus_3 to ptr @@ -130,10 +192,15 @@ ; "sext(base) + sext(offset) == base + offset". define i32 @test_unsafe_nsw_unscaled_add() { ; CHECK-LABEL: test_unsafe_nsw_unscaled_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 -; CHECK-NOT: ubfx -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh16: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh17: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #3 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh16, Lloh17 %addr_int = ptrtoint ptr @var32 to i32 %addr_plus_3 = add nsw i32 %addr_int, 3 %addr = inttoptr i32 %addr_plus_3 to ptr @@ -145,9 +212,15 @@ ; here. 
define i32 @test_unsafe_negative_unscaled_add() { ; CHECK-LABEL: test_unsafe_negative_unscaled_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: sub w[[ADDR:[0-9]+]], w[[VAR32]], #3 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh18: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh19: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: sub w8, w8, #3 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh18, Lloh19 %addr_int = ptrtoint ptr @var32 to i32 %addr_minus_3 = add i32 %addr_int, -3 %addr = inttoptr i32 %addr_minus_3 to ptr @@ -156,24 +229,39 @@ } define ptr @test_got_addr() { -; CHECK-LABEL: test_got_addr: -; CHECK: adrp x[[PAGE:[0-9]+]], _var_got@GOTPAGE -; CHECK-OPT: ldr w0, [x[[PAGE]], _var_got@GOTPAGEOFF] -; CHECK-FAST: ldr w[[TMP:[0-9]+]], [x[[PAGE]], _var_got@GOTPAGEOFF] -; CHECK-FAST: and x0, x[[TMP]], #0xffffffff +; CHECK-OPT-LABEL: test_got_addr: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: Lloh20: +; CHECK-OPT-NEXT: adrp x0, _var_got@GOTPAGE +; CHECK-OPT-NEXT: Lloh21: +; CHECK-OPT-NEXT: ldr w0, [x0, _var_got@GOTPAGEOFF] +; CHECK-OPT-NEXT: ret +; CHECK-OPT-NEXT: .loh AdrpLdrGot Lloh20, Lloh21 +; +; CHECK-FAST-LABEL: test_got_addr: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: Lloh20: +; CHECK-FAST-NEXT: adrp x8, _var_got@GOTPAGE +; CHECK-FAST-NEXT: Lloh21: +; CHECK-FAST-NEXT: ldr w8, [x8, _var_got@GOTPAGEOFF] +; CHECK-FAST-NEXT: and x0, x8, #0xffffffff +; CHECK-FAST-NEXT: ret +; CHECK-FAST-NEXT: .loh AdrpLdrGot Lloh20, Lloh21 ret ptr @var_got } define float @test_va_arg_f32(ptr %list) { ; CHECK-LABEL: test_va_arg_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: add w9, w8, #8 +; CHECK-NEXT: str w9, [x0] +; CHECK-NEXT: ldr d0, [x8] +; CHECK-NEXT: fcvt s0, d0 +; CHECK-NEXT: ret -; CHECK: ldr w[[START:[0-9]+]], [x0] -; CHECK: add [[AFTER:w[0-9]+]], w[[START]], #8 -; CHECK: str [[AFTER]], [x0] ; Floating point arguments get promoted to double as per C99. -; CHECK: ldr [[DBL:d[0-9]+]], [x[[START]]] -; CHECK: fcvt s0, [[DBL]] %res = va_arg ptr %list, float ret float %res } @@ -181,13 +269,15 @@ ; Interesting point is that the slot is 4 bytes. define i8 @test_va_arg_i8(ptr %list) { ; CHECK-LABEL: test_va_arg_i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: add w9, w8, #4 +; CHECK-NEXT: str w9, [x0] +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret -; CHECK: ldr w[[START:[0-9]+]], [x0] -; CHECK: add [[AFTER:w[0-9]+]], w[[START]], #4 -; CHECK: str [[AFTER]], [x0] ; i8 gets promoted to int (again, as per C99). -; CHECK: ldr w0, [x[[START]]] %res = va_arg ptr %list, i8 ret i8 %res @@ -197,16 +287,18 @@ ; bytes). define i64 @test_va_arg_i64(ptr %list) { ; CHECK-LABEL: test_va_arg_i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: add x8, x8, #7 +; CHECK-NEXT: and x8, x8, #0x1fffffff8 +; CHECK-NEXT: add w9, w8, #8 +; CHECK-NEXT: str w9, [x0] +; CHECK-NEXT: ldr x0, [x8] +; CHECK-NEXT: ret ; Update the list for the next user (minimum slot size is 4, but the actual ; argument is 8 which had better be reflected!) -; CHECK: ldr w[[UNALIGNED_START:[0-9]+]], [x0] -; CHECK: add [[ALIGN_TMP:x[0-9]+]], x[[UNALIGNED_START]], #7 -; CHECK: and x[[START:[0-9]+]], [[ALIGN_TMP]], #0x1fffffff8 -; CHECK: add w[[AFTER:[0-9]+]], w[[START]], #8 -; CHECK: str w[[AFTER]], [x0] -; CHECK: ldr x0, [x[[START]]] %res = va_arg ptr %list, i64 ret i64 %res @@ -214,14 +306,47 @@ declare void @bar(...) 
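Reading the i64 checks just above: the "add x8, x8, #7" / "and x8, x8, #0x1fffffff8" pair rounds the 32-bit slot pointer up to the next 8-byte boundary. With an illustrative list pointer of 0x1004: 0x1004 + 7 = 0x100b, and clearing the low three bits gives 0x1008, so the 8-byte value is loaded from an aligned slot and the saved list pointer advances to 0x1010.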
define void @test_va_call(i8 %l, i8 %r, float %in, ptr %ptr) { -; CHECK-LABEL: test_va_call: -; CHECK: add [[SUM:w[0-9]+]], {{w[0-9]+}}, w1 +; CHECK-OPT-LABEL: test_va_call: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: sub sp, sp, #64 +; CHECK-OPT-NEXT: .cfi_def_cfa_offset 64 +; CHECK-OPT-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-OPT-NEXT: add x29, sp, #48 +; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16 +; CHECK-OPT-NEXT: .cfi_offset w30, -8 +; CHECK-OPT-NEXT: .cfi_offset w29, -16 +; CHECK-OPT-NEXT: add w8, w0, w1 +; CHECK-OPT-NEXT: str w2, [sp, #32] +; CHECK-OPT-NEXT: str xzr, [sp, #24] +; CHECK-OPT-NEXT: str s0, [sp, #16] +; CHECK-OPT-NEXT: str xzr, [sp, #8] +; CHECK-OPT-NEXT: str w8, [sp] +; CHECK-OPT-NEXT: bl _bar +; CHECK-OPT-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-OPT-NEXT: add sp, sp, #64 +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_va_call: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: sub sp, sp, #64 +; CHECK-FAST-NEXT: .cfi_def_cfa_offset 64 +; CHECK-FAST-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-FAST-NEXT: add x29, sp, #48 +; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16 +; CHECK-FAST-NEXT: .cfi_offset w30, -8 +; CHECK-FAST-NEXT: .cfi_offset w29, -16 +; CHECK-FAST-NEXT: sxtb w8, w0 +; CHECK-FAST-NEXT: add w8, w8, w1, sxtb +; CHECK-FAST-NEXT: str w2, [sp, #32] +; CHECK-FAST-NEXT: str xzr, [sp, #24] +; CHECK-FAST-NEXT: str s0, [sp, #16] +; CHECK-FAST-NEXT: str xzr, [sp, #8] +; CHECK-FAST-NEXT: str w8, [sp] +; CHECK-FAST-NEXT: bl _bar +; CHECK-FAST-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-FAST-NEXT: add sp, sp, #64 +; CHECK-FAST-NEXT: ret -; CHECK-DAG: str w2, [sp, #32] -; CHECK-DAG: str xzr, [sp, #24] -; CHECK-DAG: str s0, [sp, #16] -; CHECK-DAG: str xzr, [sp, #8] -; CHECK-DAG: str [[SUM]], [sp] ; Add them to ensure real promotion occurs. %sum = add i8 %l, %r @@ -232,10 +357,30 @@ declare ptr @llvm.frameaddress(i32) define ptr @test_frameaddr() { -; CHECK-LABEL: test_frameaddr: -; CHECK-OPT: ldr x0, [x29] -; CHECK-FAST: ldr [[TMP:x[0-9]+]], [x29] -; CHECK-FAST: and x0, [[TMP]], #0xffffffff +; CHECK-OPT-LABEL: test_frameaddr: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OPT-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OPT-NEXT: mov x29, sp +; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16 +; CHECK-OPT-NEXT: .cfi_offset w30, -8 +; CHECK-OPT-NEXT: .cfi_offset w29, -16 +; CHECK-OPT-NEXT: ldr x0, [x29] +; CHECK-OPT-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_frameaddr: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-FAST-NEXT: .cfi_def_cfa_offset 16 +; CHECK-FAST-NEXT: mov x29, sp +; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16 +; CHECK-FAST-NEXT: .cfi_offset w30, -8 +; CHECK-FAST-NEXT: .cfi_offset w29, -16 +; CHECK-FAST-NEXT: ldr x8, [x29] +; CHECK-FAST-NEXT: and x0, x8, #0xffffffff +; CHECK-FAST-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-FAST-NEXT: ret %val = call ptr @llvm.frameaddress(i32 1) ret ptr %val } @@ -243,28 +388,77 @@ declare ptr @llvm.returnaddress(i32) define ptr @test_toplevel_returnaddr() { -; CHECK-LABEL: test_toplevel_returnaddr: -; CHECK-OPT: mov x0, x30 -; CHECK-FAST: and x0, x30, #0xffffffff +; CHECK-OPT-LABEL: test_toplevel_returnaddr: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OPT-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OPT-NEXT: .cfi_offset w30, -8 +; CHECK-OPT-NEXT: .cfi_offset w29, -16 +; CHECK-OPT-NEXT: hint #7 +; CHECK-OPT-NEXT: mov x0, x30 +; CHECK-OPT-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_toplevel_returnaddr: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-FAST-NEXT: .cfi_def_cfa_offset 16 +; CHECK-FAST-NEXT: .cfi_offset w30, -8 +; CHECK-FAST-NEXT: .cfi_offset w29, -16 +; CHECK-FAST-NEXT: hint #7 +; CHECK-FAST-NEXT: and x0, x30, #0xffffffff +; CHECK-FAST-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-FAST-NEXT: ret %val = call ptr @llvm.returnaddress(i32 0) ret ptr %val } define ptr @test_deep_returnaddr() { -; CHECK-LABEL: test_deep_returnaddr: -; CHECK: ldr x[[FRAME_REC:[0-9]+]], [x29] -; CHECK-OPT: ldr x30, [x[[FRAME_REC]], #8] -; CHECK-OPT: hint #7 -; CHECK-OPT: mov x0, x30 -; CHECK-FAST: ldr [[TMP:x[0-9]+]], [x[[FRAME_REC]], #8] -; CHECK-FAST: and x0, [[TMP]], #0xffffffff +; CHECK-OPT-LABEL: test_deep_returnaddr: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OPT-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OPT-NEXT: mov x29, sp +; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16 +; CHECK-OPT-NEXT: .cfi_offset w30, -8 +; CHECK-OPT-NEXT: .cfi_offset w29, -16 +; CHECK-OPT-NEXT: ldr x8, [x29] +; CHECK-OPT-NEXT: ldr x30, [x8, #8] +; CHECK-OPT-NEXT: hint #7 +; CHECK-OPT-NEXT: mov x0, x30 +; CHECK-OPT-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_deep_returnaddr: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-FAST-NEXT: .cfi_def_cfa_offset 16 +; CHECK-FAST-NEXT: mov x29, sp +; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16 +; CHECK-FAST-NEXT: .cfi_offset w30, -8 +; CHECK-FAST-NEXT: .cfi_offset w29, -16 +; CHECK-FAST-NEXT: ldr x8, [x29] +; CHECK-FAST-NEXT: ldr x30, [x8, #8] +; CHECK-FAST-NEXT: hint #7 +; CHECK-FAST-NEXT: and x0, x30, #0xffffffff +; CHECK-FAST-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-FAST-NEXT: ret %val = call ptr @llvm.returnaddress(i32 1) ret ptr %val } define void @test_indirect_call(ptr %func) { ; CHECK-LABEL: test_indirect_call: -; CHECK: blr x0 +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: blr x0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret call void() %func() ret void } @@ -272,9 +466,17 @@ ; Safe to use the unextended address here define void @test_indirect_safe_call(ptr %weird_funcs) { ; CHECK-LABEL: test_indirect_safe_call: -; CHECK: add w[[ADDR32:[0-9]+]], w0, #4 -; CHECK-OPT-NOT: ubfx -; CHECK: blr x[[ADDR32]] +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add w8, w0, #4 +; CHECK-NEXT: blr x8 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret %addr = getelementptr i32, ptr %weird_funcs, i32 1 call void() %addr() ret void @@ -283,14 +485,16 @@ declare void @simple() define void @test_simple_tail_call() { ; CHECK-LABEL: test_simple_tail_call: -; CHECK: b _simple +; CHECK: ; %bb.0: +; CHECK-NEXT: b _simple tail call void @simple() ret void } define void @test_indirect_tail_call(ptr %func) { ; CHECK-LABEL: test_indirect_tail_call: -; CHECK: br x0 +; CHECK: ; %bb.0: +; CHECK-NEXT: br x0 tail call void() %func() ret void } @@ -298,9 +502,9 @@ ; Safe to use the unextended address here define void @test_indirect_safe_tail_call(ptr %weird_funcs) { ; CHECK-LABEL: test_indirect_safe_tail_call: -; CHECK: add w[[ADDR32:[0-9]+]], w0, #4 -; CHECK-OPT-NOT: ubfx -; CHECK-OPT: br x[[ADDR32]] +; CHECK: ; %bb.0: +; CHECK-NEXT: add w0, w0, #4 +; CHECK-NEXT: br x0 %addr = getelementptr i32, ptr %weird_funcs, i32 1 tail call void() %addr() ret void @@ -312,14 +516,20 @@ define i32 @test_in_smallstruct_low([3 x i32] %in) { ; CHECK-LABEL: test_in_smallstruct_low: -; CHECK: mov x0, x1 +; CHECK: ; %bb.0: +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret %val = extractvalue [3 x i32] %in, 2 ret i32 %val } define i32 @test_in_smallstruct_high([3 x i32] %in) { ; CHECK-LABEL: test_in_smallstruct_high: -; CHECK: lsr x0, x0, #32 +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr x0, x0, #32 +; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret %val = extractvalue [3 x i32] %in, 1 ret i32 %val } @@ -329,15 +539,19 @@ ; be incompatible with the armv7k ABI. define i32 @test_in_smallstruct_stack([8 x i64], i32, [3 x i32] %in) { ; CHECK-LABEL: test_in_smallstruct_stack: -; CHECK: ldr w0, [sp, #4] +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr w0, [sp, #4] +; CHECK-NEXT: ret %val = extractvalue [3 x i32] %in, 0 ret i32 %val } define [2 x i32] @test_ret_smallstruct([3 x i32] %in) { ; CHECK-LABEL: test_ret_smallstruct: -; CHECK: mov x0, #1 -; CHECK: movk x0, #2, lsl #32 +; CHECK: ; %bb.0: +; CHECK-NEXT: mov x0, #1 ; =0x1 +; CHECK-NEXT: movk x0, #2, lsl #32 +; CHECK-NEXT: ret ret [2 x i32] [i32 1, i32 2] } @@ -345,11 +559,20 @@ declare void @smallstruct_callee([4 x i32]) define void @test_call_smallstruct() { ; CHECK-LABEL: test_call_smallstruct: -; CHECK: mov x0, #1 -; CHECK: movk x0, #2, lsl #32 -; CHECK: mov x1, #3 -; CHECK: movk x1, #4, lsl #32 -; CHECK: bl _smallstruct_callee +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x0, #1 ; =0x1 +; CHECK-NEXT: movk x0, #2, lsl #32 +; CHECK-NEXT: mov x1, #3 ; =0x3 +; CHECK-NEXT: movk x1, #4, lsl #32 +; CHECK-NEXT: bl _smallstruct_callee +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret call void @smallstruct_callee([4 x i32] [i32 1, i32 2, i32 3, i32 4]) ret void @@ -358,9 +581,21 @@ declare void @smallstruct_callee_stack([8 x i64], i32, [2 x i32]) define void @test_call_smallstruct_stack() { ; CHECK-LABEL: test_call_smallstruct_stack: -; CHECK: mov [[VAL:x[0-9]+]], #1 -; CHECK: movk [[VAL]], #2, lsl #32 -; CHECK: stur [[VAL]], [sp, #4] +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #16 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, #1 ; =0x1 +; CHECK-NEXT: movk x8, #2, lsl #32 +; CHECK-NEXT: stur x8, [sp, #4] +; CHECK-NEXT: bl _smallstruct_callee_stack +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret call void @smallstruct_callee_stack([8 x i64] undef, i32 undef, [2 x i32] [i32 1, i32 2]) ret void @@ -369,8 +604,18 @@ declare [3 x i32] @returns_smallstruct() define i32 @test_use_smallstruct_low() { ; CHECK-LABEL: test_use_smallstruct_low: -; CHECK: bl _returns_smallstruct -; CHECK: mov x0, x1 +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: bl _returns_smallstruct +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret %struct = call [3 x i32] @returns_smallstruct() %val = extractvalue [3 x i32] %struct, 2 @@ -379,8 +624,18 @@ define i32 @test_use_smallstruct_high() { ; CHECK-LABEL: test_use_smallstruct_high: -; CHECK: bl _returns_smallstruct -; CHECK: lsr x0, x0, #32 +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: bl _returns_smallstruct +; CHECK-NEXT: lsr x0, x0, #32 +; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret %struct = call [3 x i32] @returns_smallstruct() %val = extractvalue [3 x i32] %struct, 1 @@ -391,10 +646,19 @@ ; be marked as unavailable and subsequent GPR arguments should also be on the ; stack. Obviously the struct itself should be passed entirely on the stack. 
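For reference, the layout the checks below verify, as inferred from the comment above and the loads in the checks: [7 x i64] consumes x0-x6, and the 16-byte [4 x i32] needs two 64-bit registers but only x7 remains, so the whole struct goes on the stack at [sp] through [sp, #12], x7 is left unused, and the trailing i32 %in follows at [sp, #16]. The two loads fetch %struct[0] from [sp] and %in from [sp, #16]; SDAG and FAST differ only in operand order.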
define i32 @test_smallstruct_padding([7 x i64], [4 x i32] %struct, i32 %in) { -; CHECK-LABEL: test_smallstruct_padding: -; CHECK-DAG: ldr [[IN:w[0-9]+]], [sp, #16] -; CHECK-DAG: ldr [[LHS:w[0-9]+]], [sp] -; CHECK: add w0, [[LHS]], [[IN]] +; CHECK-OPT-LABEL: test_smallstruct_padding: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: ldr w8, [sp, #16] +; CHECK-OPT-NEXT: ldr w9, [sp] +; CHECK-OPT-NEXT: add w0, w9, w8 +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_smallstruct_padding: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: ldr w8, [sp] +; CHECK-FAST-NEXT: ldr w9, [sp, #16] +; CHECK-FAST-NEXT: add w0, w8, w9 +; CHECK-FAST-NEXT: ret %lhs = extractvalue [4 x i32] %struct, 0 %sum = add i32 %lhs, %in ret i32 %sum @@ -403,17 +667,31 @@ declare void @take_small_smallstruct(i64, [1 x i32]) define void @test_small_smallstruct() { ; CHECK-LABEL: test_small_smallstruct: -; CHECK-DAG: mov w0, #1 -; CHECK-DAG: mov w1, #2 -; CHECK: bl _take_small_smallstruct +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov w0, #1 ; =0x1 +; CHECK-NEXT: mov w1, #2 ; =0x2 +; CHECK-NEXT: bl _take_small_smallstruct +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret call void @take_small_smallstruct(i64 1, [1 x i32] [i32 2]) ret void } define void @test_bare_frameaddr(ptr %addr) { ; CHECK-LABEL: test_bare_frameaddr: -; CHECK: add x[[LOCAL:[0-9]+]], sp, #{{[0-9]+}} -; CHECK: str w[[LOCAL]], +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #15 +; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %ptr = alloca i8 store ptr %ptr, ptr %addr, align 4 @@ -422,15 +700,29 @@ define void @test_sret_use(ptr sret([8 x i64]) %out) { ; CHECK-LABEL: test_sret_use: -; CHECK: str xzr, [x8] +; CHECK: ; %bb.0: +; CHECK-NEXT: str xzr, [x8] +; CHECK-NEXT: ret store i64 0, ptr %out ret void } define i64 @test_sret_call() { ; CHECK-LABEL: test_sret_call: -; CHECK: mov x8, sp -; CHECK: bl _test_sret_use +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: bl _test_sret_use +; CHECK-NEXT: ldr x0, [sp] +; CHECK-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret %arr = alloca [8 x i64] call void @test_sret_use(ptr sret([8 x i64]) %arr) @@ -440,16 +732,27 @@ define double @test_constpool() { ; CHECK-LABEL: test_constpool: -; CHECK: adrp x[[PAGE:[0-9]+]], [[POOL:lCPI[0-9]+_[0-9]+]]@PAGE -; CHECK: ldr d0, [x[[PAGE]], [[POOL]]@PAGEOFF] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh22: +; CHECK-NEXT: adrp x8, lCPI37_0@PAGE +; CHECK-NEXT: Lloh23: +; CHECK-NEXT: ldr d0, [x8, lCPI37_0@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh22, Lloh23 ret double 1.0e-6 } define ptr @test_blockaddress() { ; CHECK-LABEL: test_blockaddress: -; CHECK: [[BLOCK:Ltmp[0-9]+]]: -; CHECK: adrp x[[PAGE:[0-9]+]], lCPI{{[0-9]+_[0-9]+}}@PAGE -; CHECK: ldr x0, [x[[PAGE]], lCPI{{[0-9]+_[0-9]+}}@PAGEOFF] +; CHECK: ; %bb.0: +; CHECK-NEXT: Ltmp7: ; Block address taken +; CHECK-NEXT: ; %bb.1: ; %dest +; CHECK-NEXT: Lloh24: 
+; CHECK-NEXT: adrp x0, lCPI38_0@PAGE +; CHECK-NEXT: Lloh25: +; CHECK-NEXT: ldr x0, [x0, lCPI38_0@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh24, Lloh25 br label %dest dest: ret ptr blockaddress(@test_blockaddress, %dest) @@ -457,7 +760,24 @@ define ptr @test_indirectbr(ptr %dest) { ; CHECK-LABEL: test_indirectbr: -; CHECK: br x0 +; CHECK: ; %bb.0: +; CHECK-NEXT: br x0 +; CHECK-NEXT: Ltmp8: ; Block address taken +; CHECK-NEXT: LBB39_1: ; %true +; CHECK-NEXT: Lloh26: +; CHECK-NEXT: adrp x0, lCPI39_0@PAGE +; CHECK-NEXT: Lloh27: +; CHECK-NEXT: ldr x0, [x0, lCPI39_0@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: Ltmp9: ; Block address taken +; CHECK-NEXT: LBB39_2: ; %false +; CHECK-NEXT: Lloh28: +; CHECK-NEXT: adrp x0, lCPI39_1@PAGE +; CHECK-NEXT: Lloh29: +; CHECK-NEXT: ldr x0, [x0, lCPI39_1@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh26, Lloh27 +; CHECK-NEXT: .loh AdrpLdr Lloh28, Lloh29 indirectbr ptr %dest, [label %true, label %false] true: @@ -471,7 +791,12 @@ ; claim the FI in the process -- it doesn't need extending. define float @test_frameindex_offset_load() { ; CHECK-LABEL: test_frameindex_offset_load: -; CHECK: ldr s0, [sp, #4] +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr s0, [sp, #4] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %arr = alloca float, i32 4, align 8 %addr = getelementptr inbounds float, ptr %arr, i32 1 @@ -481,10 +806,15 @@ define void @test_unaligned_frameindex_offset_store() { ; CHECK-LABEL: test_unaligned_frameindex_offset_store: -; CHECK: mov x[[TMP:[0-9]+]], sp -; CHECK: orr w[[ADDR:[0-9]+]], w[[TMP]], #0x2 -; CHECK: mov [[VAL:w[0-9]+]], #42 -; CHECK: str [[VAL]], [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: orr w8, w8, #0x2 +; CHECK-NEXT: mov w9, #42 ; =0x2a +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %arr = alloca [4 x i32] %addr.int = ptrtoint ptr %arr to i32 @@ -497,9 +827,11 @@ define {i64, ptr} @test_pre_idx(ptr %addr) { ; CHECK-LABEL: test_pre_idx: +; CHECK: ; %bb.0: +; CHECK-NEXT: add w1, w0, #8 +; CHECK-NEXT: ldr x0, [x1] +; CHECK-NEXT: ret -; CHECK: add w[[ADDR:[0-9]+]], w0, #8 -; CHECK: ldr x0, [x[[ADDR]]] %addr.int = ptrtoint ptr %addr to i32 %addr.next.int = add nuw i32 %addr.int, 8 %addr.next = inttoptr i32 %addr.next.int to ptr @@ -515,8 +847,10 @@ ; %addr wraps round to 0. 
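The checks that follow show the consequence: rather than a pre-indexed "ldr x0, [x1, #8]!", the backend emits a separate 32-bit "add w1, w0, #8" and loads through that register. The writeback form would compute the updated address with a 64-bit add, so for an illustrative %addr of 0xfffffff8 it would return 0x100000000 in x1 where the IR requires the wrapped 32-bit pointer 0.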
 define {i64, ptr} @test_invalid_pre_idx(ptr %addr) {
 ; CHECK-LABEL: test_invalid_pre_idx:
-; CHECK: add w1, w0, #8
-; CHECK: ldr x0, [x1]
+; CHECK: ; %bb.0:
+; CHECK-NEXT: add w1, w0, #8
+; CHECK-NEXT: ldr x0, [x1]
+; CHECK-NEXT: ret
   %addr.next = getelementptr i64, ptr %addr, i32 1
   %val = load i64, ptr %addr.next
@@ -528,24 +862,81 @@
 declare void @callee(ptr)

 define void @test_stack_guard() ssp {
-; CHECK-LABEL: test_stack_guard:
-; CHECK: adrp x[[GUARD_GOTPAGE:[0-9]+]], ___stack_chk_guard@GOTPAGE
-; CHECK: ldr w[[GUARD_ADDR:[0-9]+]], [x[[GUARD_GOTPAGE]], ___stack_chk_guard@GOTPAGEOFF]
-; CHECK: ldr [[GUARD_VAL:w[0-9]+]], [x[[GUARD_ADDR]]]
-; CHECK: stur [[GUARD_VAL]], [x29, #[[GUARD_OFFSET:-[0-9]+]]]
-
-; CHECK: add x0, sp, #{{[0-9]+}}
-; CHECK: bl _callee
-
-; CHECK-OPT: adrp x[[GUARD_GOTPAGE:[0-9]+]], ___stack_chk_guard@GOTPAGE
-; CHECK-OPT: ldr w[[GUARD_ADDR:[0-9]+]], [x[[GUARD_GOTPAGE]], ___stack_chk_guard@GOTPAGEOFF]
-; CHECK-OPT: ldr [[GUARD_VAL:w[0-9]+]], [x[[GUARD_ADDR]]]
-; CHECK-OPT: ldur [[NEW_VAL:w[0-9]+]], [x29, #[[GUARD_OFFSET]]]
-; CHECK-OPT: cmp [[GUARD_VAL]], [[NEW_VAL]]
-; CHECK-OPT: b.ne [[FAIL:LBB[0-9]+_[0-9]+]]
-
-; CHECK-OPT: [[FAIL]]:
-; CHECK-OPT-NEXT: bl ___stack_chk_fail
+; CHECK-OPT-LABEL: test_stack_guard:
+; CHECK-OPT: ; %bb.0:
+; CHECK-OPT-NEXT: sub sp, sp, #64
+; CHECK-OPT-NEXT: .cfi_def_cfa_offset 64
+; CHECK-OPT-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-OPT-NEXT: add x29, sp, #48
+; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16
+; CHECK-OPT-NEXT: .cfi_offset w30, -8
+; CHECK-OPT-NEXT: .cfi_offset w29, -16
+; CHECK-OPT-NEXT: Lloh30:
+; CHECK-OPT-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE
+; CHECK-OPT-NEXT: Lloh31:
+; CHECK-OPT-NEXT: ldr w8, [x8, ___stack_chk_guard@GOTPAGEOFF]
+; CHECK-OPT-NEXT: Lloh32:
+; CHECK-OPT-NEXT: ldr w8, [x8]
+; CHECK-OPT-NEXT: stur w8, [x29, #-4]
+; CHECK-OPT-NEXT: add x0, sp, #12
+; CHECK-OPT-NEXT: bl _callee
+; CHECK-OPT-NEXT: Lloh33:
+; CHECK-OPT-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE
+; CHECK-OPT-NEXT: Lloh34:
+; CHECK-OPT-NEXT: ldr w8, [x8, ___stack_chk_guard@GOTPAGEOFF]
+; CHECK-OPT-NEXT: Lloh35:
+; CHECK-OPT-NEXT: ldr w8, [x8]
+; CHECK-OPT-NEXT: ldur w9, [x29, #-4]
+; CHECK-OPT-NEXT: cmp w8, w9
+; CHECK-OPT-NEXT: b.ne LBB44_2
+; CHECK-OPT-NEXT: ; %bb.1:
+; CHECK-OPT-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
+; CHECK-OPT-NEXT: add sp, sp, #64
+; CHECK-OPT-NEXT: ret
+; CHECK-OPT-NEXT: LBB44_2:
+; CHECK-OPT-NEXT: bl ___stack_chk_fail
+; CHECK-OPT-NEXT: .loh AdrpLdrGotLdr Lloh33, Lloh34, Lloh35
+; CHECK-OPT-NEXT: .loh AdrpLdrGotLdr Lloh30, Lloh31, Lloh32
+;
+; CHECK-FAST-LABEL: test_stack_guard:
+; CHECK-FAST: ; %bb.0:
+; CHECK-FAST-NEXT: sub sp, sp, #64
+; CHECK-FAST-NEXT: .cfi_def_cfa_offset 64
+; CHECK-FAST-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-FAST-NEXT: add x29, sp, #48
+; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16
+; CHECK-FAST-NEXT: .cfi_offset w30, -8
+; CHECK-FAST-NEXT: .cfi_offset w29, -16
+; CHECK-FAST-NEXT: Lloh30:
+; CHECK-FAST-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE
+; CHECK-FAST-NEXT: Lloh31:
+; CHECK-FAST-NEXT: ldr w8, [x8, ___stack_chk_guard@GOTPAGEOFF]
+; CHECK-FAST-NEXT: Lloh32:
+; CHECK-FAST-NEXT: ldr w8, [x8]
+; CHECK-FAST-NEXT: stur w8, [x29, #-4]
+; CHECK-FAST-NEXT: add x0, sp, #12
+; CHECK-FAST-NEXT: bl _callee
+; CHECK-FAST-NEXT: Lloh33:
+; CHECK-FAST-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE
+; CHECK-FAST-NEXT: Lloh34:
+; CHECK-FAST-NEXT: ldr w8, [x8, ___stack_chk_guard@GOTPAGEOFF]
+; CHECK-FAST-NEXT: Lloh35:
+; CHECK-FAST-NEXT: ldr w8, [x8]
+; CHECK-FAST-NEXT: ldur w9, [x29, #-4]
+; CHECK-FAST-NEXT: and x8, x8, #0xffffffff
+; CHECK-FAST-NEXT: cmp x8, x9
+; CHECK-FAST-NEXT: b.ne LBB44_2
+; CHECK-FAST-NEXT: ; %bb.1: ; %SP_return
+; CHECK-FAST-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
+; CHECK-FAST-NEXT: add sp, sp, #64
+; CHECK-FAST-NEXT: ret
+; CHECK-FAST-NEXT: LBB44_2: ; %CallStackCheckFailBlk
+; CHECK-FAST-NEXT: bl ___stack_chk_fail
+; CHECK-FAST-NEXT: .loh AdrpLdrGotLdr Lloh33, Lloh34, Lloh35
+; CHECK-FAST-NEXT: .loh AdrpLdrGotLdr Lloh30, Lloh31, Lloh32
+
+
+
   %arr = alloca [8 x i32]
   call void @callee(ptr %arr)
   ret void
@@ -556,9 +947,62 @@
 @_ZTI8Whatever = external global i8

 define void @test_landingpad_marshalling() personality ptr @__gxx_personality_v0 {
 ; CHECK-LABEL: test_landingpad_marshalling:
-; CHECK-OPT: mov x2, x1
-; CHECK-OPT: mov x1, x0
-; CHECK: bl _eat_landingpad_args
+; CHECK: Lfunc_begin0:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .cfi_personality 155, ___gxx_personality_v0
+; CHECK-NEXT: .cfi_lsda 16, Lexception0
+; CHECK-NEXT: ; %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: Ltmp3:
+; CHECK-NEXT: bl _callee
+; CHECK-NEXT: Ltmp4:
+; CHECK-NEXT: ; %bb.1: ; %done
+; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-NEXT: ret
+; CHECK-NEXT: LBB45_2: ; %lpad
+; CHECK-NEXT: Ltmp5:
+; CHECK-NEXT: mov x2, x1
+; CHECK-NEXT: mov x1, x0
+; CHECK-NEXT: ; kill: def $w2 killed $w2 killed $x2
+; CHECK-NEXT: bl _eat_landingpad_args
+; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-NEXT: ret
+; CHECK-NEXT: Lfunc_end0:
+; CHECK-NEXT: .cfi_endproc
+; CHECK-NEXT: .section __TEXT,__gcc_except_tab
+; CHECK-NEXT: .p2align 2, 0x0
+; CHECK-NEXT: GCC_except_table45:
+; CHECK-NEXT: Lexception0:
+; CHECK-NEXT: .byte 255 ; @LPStart Encoding = omit
+; CHECK-NEXT: .byte 155 ; @TType Encoding = indirect pcrel sdata4
+; CHECK-NEXT: .uleb128 Lttbase0-Lttbaseref0
+; CHECK-NEXT: Lttbaseref0:
+; CHECK-NEXT: .byte 1 ; Call site Encoding = uleb128
+; CHECK-NEXT: .uleb128 Lcst_end0-Lcst_begin0
+; CHECK-NEXT: Lcst_begin0:
+; CHECK-NEXT: .uleb128 Ltmp3-Lfunc_begin0 ; >> Call Site 1 <<
+; CHECK-NEXT: .uleb128 Ltmp4-Ltmp3 ; Call between Ltmp3 and Ltmp4
+; CHECK-NEXT: .uleb128 Ltmp5-Lfunc_begin0 ; jumps to Ltmp5
+; CHECK-NEXT: .byte 1 ; On action: 1
+; CHECK-NEXT: .uleb128 Ltmp4-Lfunc_begin0 ; >> Call Site 2 <<
+; CHECK-NEXT: .uleb128 Lfunc_end0-Ltmp4 ; Call between Ltmp4 and Lfunc_end0
+; CHECK-NEXT: .byte 0 ; has no landing pad
+; CHECK-NEXT: .byte 0 ; On action: cleanup
+; CHECK-NEXT: Lcst_end0:
+; CHECK-NEXT: .byte 1 ; >> Action Record 1 <<
+; CHECK-NEXT: ; Catch TypeInfo 1
+; CHECK-NEXT: .byte 0 ; No further actions
+; CHECK-NEXT: .p2align 2, 0x0
+; CHECK-NEXT: ; >> Catch TypeInfos <<
+; CHECK-NEXT: Ltmp10: ; TypeInfo 1
+; CHECK-NEXT: .long __ZTI8Whatever@GOT-Ltmp10
+; CHECK-NEXT: Lttbase0:
+; CHECK-NEXT: .p2align 2, 0x0
   invoke void @callee(ptr undef) to label %done unwind label %lpad

 lpad: ; preds = %entry
@@ -575,10 +1019,19 @@
 define void @test_dynamic_stackalloc() {
 ; CHECK-LABEL: test_dynamic_stackalloc:
-; CHECK: sub [[REG:x[0-9]+]], sp, #32
-; CHECK: mov sp, [[REG]]
-; CHECK-OPT-NOT: ubfx
-; CHECK: bl _callee
+; CHECK: ; %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x0, sp, #32
+; CHECK-NEXT: mov sp, x0
+; CHECK-NEXT: bl _callee
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-NEXT: ret
   br label %next

 next:
@@ -589,8 +1042,12 @@
 define void @test_asm_memory(ptr %base.addr) {
 ; CHECK-LABEL: test_asm_memory:
-; CHECK: add w[[ADDR:[0-9]+]], w0, #4
-; CHECK: str wzr, [x[[ADDR]]
+; CHECK: ; %bb.0:
+; CHECK-NEXT: add w8, w0, #4
+; CHECK-NEXT: ; InlineAsm Start
+; CHECK-NEXT: str wzr, [x8]
+; CHECK-NEXT: ; InlineAsm End
+; CHECK-NEXT: ret
   %addr = getelementptr i32, ptr %base.addr, i32 1
   call void asm sideeffect "str wzr, $0", "*m"(ptr elementtype(i32) %addr)
   ret void
@@ -598,8 +1055,12 @@
 define void @test_unsafe_asm_memory(i64 %val) {
 ; CHECK-LABEL: test_unsafe_asm_memory:
-; CHECK: and x[[ADDR:[0-9]+]], x0, #0xffffffff
-; CHECK: str wzr, [x[[ADDR]]]
+; CHECK: ; %bb.0:
+; CHECK-NEXT: and x8, x0, #0xffffffff
+; CHECK-NEXT: ; InlineAsm Start
+; CHECK-NEXT: str wzr, [x8]
+; CHECK-NEXT: ; InlineAsm End
+; CHECK-NEXT: ret
   %addr_int = trunc i64 %val to i32
   %addr = inttoptr i32 %addr_int to ptr
   call void asm sideeffect "str wzr, $0", "*m"(ptr elementtype(i32) %addr)
@@ -608,14 +1069,22 @@
 define [9 x ptr] @test_demoted_return(ptr %in) {
 ; CHECK-LABEL: test_demoted_return:
-; CHECK: str w0, [x8, #32]
+; CHECK: ; %bb.0:
+; CHECK-NEXT: stp w8, w0, [x8, #28]
+; CHECK-NEXT: stp w8, w8, [x8, #20]
+; CHECK-NEXT: stp w8, w8, [x8, #12]
+; CHECK-NEXT: stp w8, w8, [x8, #4]
+; CHECK-NEXT: str w8, [x8]
+; CHECK-NEXT: ret
   %res = insertvalue [9 x ptr] undef, ptr %in, 8
   ret [9 x ptr] %res
 }

 define ptr @test_inttoptr(i64 %in) {
 ; CHECK-LABEL: test_inttoptr:
-; CHECK: and x0, x0, #0xffffffff
+; CHECK: ; %bb.0:
+; CHECK-NEXT: and x0, x0, #0xffffffff
+; CHECK-NEXT: ret
   %res = inttoptr i64 %in to ptr
   ret ptr %res
 }
@@ -623,16 +1092,18 @@
 declare i32 @llvm.get.dynamic.area.offset.i32()
 define i32 @test_dynamic_area() {
 ; CHECK-LABEL: test_dynamic_area:
-; CHECK: mov w0, wzr
+; CHECK: ; %bb.0:
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
   %res = call i32 @llvm.get.dynamic.area.offset.i32()
   ret i32 %res
 }

 define void @test_pointer_vec_store(ptr %addr) {
 ; CHECK-LABEL: test_pointer_vec_store:
-; CHECK: str xzr, [x0]
-; CHECK-NOT: str
-; CHECK-NOT: stp
+; CHECK: ; %bb.0:
+; CHECK-NEXT: str xzr, [x0]
+; CHECK-NEXT: ret
   store <2 x ptr> zeroinitializer, ptr %addr, align 16
   ret void
@@ -640,28 +1111,58 @@
 define <2 x ptr> @test_pointer_vec_load(ptr %addr) {
 ; CHECK-LABEL: test_pointer_vec_load:
-; CHECK: ldr d[[TMP:[0-9]+]], [x0]
-; CHECK: ushll.2d v0, v[[TMP]], #0
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ret
   %val = load <2 x ptr>, ptr %addr, align 16
   ret <2 x ptr> %val
 }

 define void @test_inline_asm_mem_pointer(ptr %in) {
 ; CHECK-LABEL: test_inline_asm_mem_pointer:
-; CHECK: str w0,
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: add x8, sp, #12
+; CHECK-NEXT: str w0, [sp, #12]
+; CHECK-NEXT: ; InlineAsm Start
+; CHECK-NEXT: ldr x0, [x8]
+; CHECK-NEXT: ; InlineAsm End
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
   tail call void asm sideeffect "ldr x0, $0", "rm"(ptr %in)
   ret void
 }

 define void @test_struct_hi(i32 %hi) nounwind {
-; CHECK-LABEL: test_struct_hi:
-; CHECK: mov w[[IN:[0-9]+]], w0
-; CHECK: bl _get_int
-; CHECK-FAST-NEXT: mov w[[DST:[0-9]+]], w0
-; CHECK-FAST-NEXT: orr x0, x[[DST]], x[[IN]], lsl #32
-; CHECK-OPT-NEXT: bfi x0, x[[IN]], #32, #32
-; CHECK-NEXT: bl _take_pair
+; CHECK-OPT-LABEL: test_struct_hi:
+; CHECK-OPT: ; %bb.0:
+; CHECK-OPT-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; CHECK-OPT-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-OPT-NEXT: add x29, sp, #16
+; CHECK-OPT-NEXT: mov w19, w0
+; CHECK-OPT-NEXT: bl _get_int
+; CHECK-OPT-NEXT: bfi x0, x19, #32, #32
+; CHECK-OPT-NEXT: bl _take_pair
+; CHECK-OPT-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-OPT-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; CHECK-OPT-NEXT: ret
+;
+; CHECK-FAST-LABEL: test_struct_hi:
+; CHECK-FAST: ; %bb.0:
+; CHECK-FAST-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
+; CHECK-FAST-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-FAST-NEXT: add x29, sp, #16
+; CHECK-FAST-NEXT: mov w19, w0
+; CHECK-FAST-NEXT: bl _get_int
+; CHECK-FAST-NEXT: mov w8, w0
+; CHECK-FAST-NEXT: orr x0, x8, x19, lsl #32
+; CHECK-FAST-NEXT: bl _take_pair
+; CHECK-FAST-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-FAST-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; CHECK-FAST-NEXT: ret
   %val.64 = call i64 @get_int()
   %val.32 = trunc i64 %val.64 to i32
@@ -675,16 +1176,55 @@
 declare i64 @get_int()

 define i1 @test_icmp_ptr(ptr %in) {
-; CHECK-LABEL: test_icmp_ptr
-; CHECK: ubfx x0, x0, #31, #1
+; CHECK-LABEL: test_icmp_ptr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ubfx x0, x0, #31, #1
+; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
   %res = icmp slt ptr %in, null
   ret i1 %res
 }

 define void @test_multiple_icmp_ptr(ptr %l, ptr %r) {
-; CHECK-LABEL: test_multiple_icmp_ptr:
-; CHECK: tbnz w0, #31, [[FALSEBB:LBB[0-9]+_[0-9]+]]
-; CHECK: tbnz w1, #31, [[FALSEBB]]
+; CHECK-OPT-LABEL: test_multiple_icmp_ptr:
+; CHECK-OPT: ; %bb.0:
+; CHECK-OPT-NEXT: tbnz w0, #31, LBB57_3
+; CHECK-OPT-NEXT: ; %bb.1:
+; CHECK-OPT-NEXT: tbnz w1, #31, LBB57_3
+; CHECK-OPT-NEXT: ; %bb.2: ; %true
+; CHECK-OPT-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OPT-NEXT: .cfi_def_cfa_offset 16
+; CHECK-OPT-NEXT: mov x29, sp
+; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16
+; CHECK-OPT-NEXT: .cfi_offset w30, -8
+; CHECK-OPT-NEXT: .cfi_offset w29, -16
+; CHECK-OPT-NEXT: bl _bar
+; CHECK-OPT-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OPT-NEXT: LBB57_3: ; %false
+; CHECK-OPT-NEXT: .cfi_def_cfa wsp, 0
+; CHECK-OPT-NEXT: .cfi_same_value w30
+; CHECK-OPT-NEXT: .cfi_same_value w29
+; CHECK-OPT-NEXT: ret
+;
+; CHECK-FAST-LABEL: test_multiple_icmp_ptr:
+; CHECK-FAST: ; %bb.0:
+; CHECK-FAST-NEXT: tbnz w0, #31, LBB57_3
+; CHECK-FAST-NEXT: ; %bb.1: ; %.cond.split
+; CHECK-FAST-NEXT: tbnz w1, #31, LBB57_3
+; CHECK-FAST-NEXT: ; %bb.2: ; %true
+; CHECK-FAST-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-FAST-NEXT: .cfi_def_cfa_offset 16
+; CHECK-FAST-NEXT: mov x29, sp
+; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16
+; CHECK-FAST-NEXT: .cfi_offset w30, -8
+; CHECK-FAST-NEXT: .cfi_offset w29, -16
+; CHECK-FAST-NEXT: bl _bar
+; CHECK-FAST-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-FAST-NEXT: LBB57_3: ; %false
+; CHECK-FAST-NEXT: .cfi_def_cfa wsp, 0
+; CHECK-FAST-NEXT: .cfi_same_value w30
+; CHECK-FAST-NEXT: .cfi_same_value w29
+; CHECK-FAST-NEXT: ret
   %tst1 = icmp sgt ptr %l, inttoptr (i32 -1 to ptr)
   %tst2 = icmp sgt ptr %r, inttoptr (i32 -1 to ptr)
   %tst = and i1 %tst1, %tst2
@@ -699,9 +1239,45 @@
 }

 define void @test_multiple_icmp_ptr_select(ptr %l, ptr %r) {
-; CHECK-LABEL: test_multiple_icmp_ptr_select:
-; CHECK: tbnz w0, #31, [[FALSEBB:LBB[0-9]+_[0-9]+]]
-; CHECK: tbnz w1, #31, [[FALSEBB]]
+; CHECK-OPT-LABEL: test_multiple_icmp_ptr_select:
+; CHECK-OPT: ; %bb.0:
+; CHECK-OPT-NEXT: tbnz w0, #31, LBB58_3
+; CHECK-OPT-NEXT: ; %bb.1:
+; CHECK-OPT-NEXT: tbnz w1, #31, LBB58_3
+; CHECK-OPT-NEXT: ; %bb.2: ; %true
+; CHECK-OPT-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-OPT-NEXT: .cfi_def_cfa_offset 16
+; CHECK-OPT-NEXT: mov x29, sp
+; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16
+; CHECK-OPT-NEXT: .cfi_offset w30, -8
+; CHECK-OPT-NEXT: .cfi_offset w29, -16
+; CHECK-OPT-NEXT: bl _bar
+; CHECK-OPT-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-OPT-NEXT: LBB58_3: ; %false
+; CHECK-OPT-NEXT: .cfi_def_cfa wsp, 0
+; CHECK-OPT-NEXT: .cfi_same_value w30
+; CHECK-OPT-NEXT: .cfi_same_value w29
+; CHECK-OPT-NEXT: ret
+;
+; CHECK-FAST-LABEL: test_multiple_icmp_ptr_select:
+; CHECK-FAST: ; %bb.0:
+; CHECK-FAST-NEXT: tbnz w0, #31, LBB58_3
+; CHECK-FAST-NEXT: ; %bb.1: ; %.cond.split
+; CHECK-FAST-NEXT: tbnz w1, #31, LBB58_3
+; CHECK-FAST-NEXT: ; %bb.2: ; %true
+; CHECK-FAST-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-FAST-NEXT: .cfi_def_cfa_offset 16
+; CHECK-FAST-NEXT: mov x29, sp
+; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16
+; CHECK-FAST-NEXT: .cfi_offset w30, -8
+; CHECK-FAST-NEXT: .cfi_offset w29, -16
+; CHECK-FAST-NEXT: bl _bar
+; CHECK-FAST-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; CHECK-FAST-NEXT: LBB58_3: ; %false
+; CHECK-FAST-NEXT: .cfi_def_cfa wsp, 0
+; CHECK-FAST-NEXT: .cfi_same_value w30
+; CHECK-FAST-NEXT: .cfi_same_value w29
+; CHECK-FAST-NEXT: ret
   %tst1 = icmp sgt ptr %l, inttoptr (i32 -1 to ptr)
   %tst2 = icmp sgt ptr %r, inttoptr (i32 -1 to ptr)
   %tst = select i1 %tst1, i1 %tst2, i1 false
@@ -716,25 +1292,31 @@
 }

 define ptr @test_gep_nonpow2(ptr %a0, i32 %a1) {
-; CHECK-LABEL: test_gep_nonpow2:
-; CHECK-OPT: mov w[[SIZE:[0-9]+]], #18
-; CHECK-OPT-NEXT: smaddl x0, w1, w[[SIZE]], x0
-; CHECK-OPT-NEXT: ret
-
-; CHECK-FAST: mov w[[SIZE:[0-9]+]], #18
-; CHECK-FAST-NEXT: smaddl [[TMP:x[0-9]+]], w1, w[[SIZE]], x0
-; CHECK-FAST-NEXT: and x0, [[TMP]], #0xffffffff
-; CHECK-FAST-NEXT: ret
+; CHECK-OPT-LABEL: test_gep_nonpow2:
+; CHECK-OPT: ; %bb.0:
+; CHECK-OPT-NEXT: mov w8, #18 ; =0x12
+; CHECK-OPT-NEXT: smaddl x0, w1, w8, x0
+; CHECK-OPT-NEXT: ret
+;
+; CHECK-FAST-LABEL: test_gep_nonpow2:
+; CHECK-FAST: ; %bb.0:
+; CHECK-FAST-NEXT: mov w8, #18 ; =0x12
+; CHECK-FAST-NEXT: smaddl x8, w1, w8, x0
+; CHECK-FAST-NEXT: and x0, x8, #0xffffffff
+; CHECK-FAST-NEXT: ret
+
   %tmp0 = getelementptr inbounds { [18 x i8] }, ptr %a0, i32 %a1
   ret ptr %tmp0
 }

 define void @test_memset(i64 %in, i8 %value) {
 ; CHECK-LABEL: test_memset:
-; CHECK-DAG: and x8, x0, #0xffffffff
-; CHECK-DAG: lsr x2, x0, #32
-; CHECK-DAG: mov x0, x8
-; CHECK: b _memset
+; CHECK: ; %bb.0:
+; CHECK-NEXT: and x8, x0, #0xffffffff
+; CHECK-NEXT: lsr x2, x0, #32
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ; kill: def $w2 killed $w2 killed $x2
+; CHECK-NEXT: b _memset
   %ptr.i32 = trunc i64 %in to i32
   %size.64 = lshr i64 %in, 32
@@ -746,9 +1328,11 @@
 define void @test_bzero(i64 %in) {
 ; CHECK-LABEL: test_bzero:
-; CHECK-DAG: lsr x1, x0, #32
-; CHECK-DAG: and x0, x0, #0xffffffff
-; CHECK: b _bzero
+; CHECK: ; %bb.0:
+; CHECK-NEXT: lsr x1, x0, #32
+; CHECK-NEXT: and x0, x0, #0xffffffff
+; CHECK-NEXT: ; kill: def $w1 killed $w1 killed $x1
+; CHECK-NEXT: b _bzero
   %ptr.i32 = trunc i64 %in to i32
   %size.64 = lshr i64 %in, 32
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
--- a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
@@ -6,8 +6,8 @@
 ; CHECK-LABEL: varargs_callee:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: sub sp, sp, #48
-; CHECK-NEXT: stp x1, x2, [x4, #-24]!
-; CHECK-NEXT: str x3, [x4, #16]
+; CHECK-NEXT: str x1, [x4, #-24]!
+; CHECK-NEXT: stp x2, x3, [x4, #8]
 ; CHECK-NEXT: str x4, [sp, #8]
 ; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
@@ -35,12 +35,12 @@
 ; CHECK-NEXT: sub sp, sp, #48
 ; CHECK-NEXT: mov x4, sp
 ; CHECK-NEXT: add x8, sp, #16
-; CHECK-NEXT: mov x9, #4617315517961601024
-; CHECK-NEXT: mov x0, #4607182418800017408
-; CHECK-NEXT: mov w1, #2
-; CHECK-NEXT: mov x2, #4613937818241073152
-; CHECK-NEXT: mov w3, #4
-; CHECK-NEXT: mov w5, #16
+; CHECK-NEXT: mov x9, #4617315517961601024 // =0x4014000000000000
+; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000
+; CHECK-NEXT: mov w1, #2 // =0x2
+; CHECK-NEXT: mov x2, #4613937818241073152 // =0x4008000000000000
+; CHECK-NEXT: mov w3, #4 // =0x4
+; CHECK-NEXT: mov w5, #16 // =0x10
 ; CHECK-NEXT: stp xzr, x30, [sp, #24] // 8-byte Folded Spill
 ; CHECK-NEXT: stp x8, xzr, [sp, #8]
 ; CHECK-NEXT: str x9, [sp]
@@ -71,13 +71,13 @@
 ; CHECK-NEXT: sub sp, sp, #64
 ; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: mov x4, sp
-; CHECK-NEXT: mov x8, #4618441417868443648
+; CHECK-NEXT: mov x8, #4618441417868443648 // =0x4018000000000000
 ; CHECK-NEXT: add x9, sp, #16
 ; CHECK-NEXT: add x3, sp, #32
-; CHECK-NEXT: mov x0, #4607182418800017408
-; CHECK-NEXT: mov x1, #4611686018427387904
-; CHECK-NEXT: mov x2, #4613937818241073152
-; CHECK-NEXT: mov w5, #16
+; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000
+; CHECK-NEXT: mov x1, #4611686018427387904 // =0x4000000000000000
+; CHECK-NEXT: mov x2, #4613937818241073152 // =0x4008000000000000
+; CHECK-NEXT: mov w5, #16 // =0x10
 ; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
 ; CHECK-NEXT: stp q0, q0, [sp, #16]
 ; CHECK-NEXT: stp x9, x8, [sp]
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
--- a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -45,7 +45,7 @@
 ;
 ; LSE-LABEL: test_rmw_add_8:
 ; LSE: // %bb.0: // %entry
-; LSE-NEXT: mov w8, #1
+; LSE-NEXT: mov w8, #1 // =0x1
 ; LSE-NEXT: ldaddalb w8, w0, [x0]
 ; LSE-NEXT: ret
 entry:
@@ -94,7 +94,7 @@
 ;
 ; LSE-LABEL: test_rmw_add_16:
 ; LSE: // %bb.0: // %entry
-; LSE-NEXT: mov w8, #1
+; LSE-NEXT: mov w8, #1 // =0x1
 ; LSE-NEXT: ldaddalh w8, w0, [x0]
 ; LSE-NEXT: ret
 entry:
@@ -143,7 +143,7 @@
 ;
 ; LSE-LABEL: test_rmw_add_32:
 ; LSE: // %bb.0: // %entry
-; LSE-NEXT: mov w8, #1
+; LSE-NEXT: mov w8, #1 // =0x1
 ; LSE-NEXT: ldaddal w8, w0, [x0]
 ; LSE-NEXT: ret
 entry:
@@ -192,7 +192,7 @@
 ;
 ; LSE-LABEL: test_rmw_add_64:
 ; LSE: // %bb.0: // %entry
-; LSE-NEXT: mov w8, #1
+; LSE-NEXT: mov w8, #1 // =0x1
 ; LSE-NEXT: // kill: def $x8 killed $w8
 ; LSE-NEXT: ldaddal x8, x0, [x0]
 ; LSE-NEXT: ret
 entry:
@@ -215,37 +215,35 @@
 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start
 ; NOLSE-NEXT: // =>This Loop Header: Depth=1
 ; NOLSE-NEXT: // Child Loop BB4_2 Depth 2
-; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
-; NOLSE-NEXT: adds x14, x11, #1
-; NOLSE-NEXT: cinc x15, x13, hs
+; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT: adds x14, x10, #1
+; NOLSE-NEXT: cinc x15, x11, hs
 ; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start
 ; NOLSE-NEXT: // Parent Loop BB4_1 Depth=1
 ; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxp x10, x12, [x9]
-; NOLSE-NEXT: cmp x10, x11
-; NOLSE-NEXT: cset w8, ne
-; NOLSE-NEXT: cmp x12, x13
-; NOLSE-NEXT: cinc w8, w8, ne
-; NOLSE-NEXT: cbnz w8, .LBB4_4
+; NOLSE-NEXT: ldaxp x9, x8, [x13]
+; NOLSE-NEXT: cmp x9, x10
+; NOLSE-NEXT: cset w12, ne
+; NOLSE-NEXT: cmp x8, x11
+; NOLSE-NEXT: cinc w12, w12, ne
+; NOLSE-NEXT: cbnz w12, .LBB4_4
 ; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
-; NOLSE-NEXT: stlxp w8, x14, x15, [x9]
-; NOLSE-NEXT: cbnz w8, .LBB4_2
+; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
+; NOLSE-NEXT: cbnz w12, .LBB4_2
 ; NOLSE-NEXT: b .LBB4_5
 ; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
-; NOLSE-NEXT: stlxp w8, x10, x12, [x9]
-; NOLSE-NEXT: cbnz w8, .LBB4_2
+; NOLSE-NEXT: stlxp w12, x9, x8, [x13]
+; NOLSE-NEXT: cbnz w12, .LBB4_2
 ; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1
-; NOLSE-NEXT: mov x8, x12
+; NOLSE-NEXT: subs x11, x8, x11
+; NOLSE-NEXT: ccmp x9, x10, #0, eq
 ; NOLSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
-; NOLSE-NEXT: mov x9, x10
 ; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
-; NOLSE-NEXT: subs x12, x12, x13
-; NOLSE-NEXT: ccmp x10, x11, #0, eq
 ; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
 ; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
 ; NOLSE-NEXT: b.ne .LBB4_1
@@ -605,41 +603,39 @@
 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start
 ; NOLSE-NEXT: // =>This Loop Header: Depth=1
 ; NOLSE-NEXT: // Child Loop BB9_2 Depth 2
-; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
-; NOLSE-NEXT: mov w8, w11
-; NOLSE-NEXT: mvn w10, w8
-; NOLSE-NEXT: // implicit-def: $x8
+; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload
 ; NOLSE-NEXT: mov w8, w10
+; NOLSE-NEXT: mvn w9, w8
+; NOLSE-NEXT: // implicit-def: $x8
+; NOLSE-NEXT: mov w8, w9
 ; NOLSE-NEXT: orr x14, x8, #0xfffffffffffffffe
-; NOLSE-NEXT: mov x15, #-1
+; NOLSE-NEXT: mov x15, #-1 // =0xffffffffffffffff
 ; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start
 ; NOLSE-NEXT: // Parent Loop BB9_1 Depth=1
 ; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxp x10, x12, [x9]
-; NOLSE-NEXT: cmp x10, x11
-; NOLSE-NEXT: cset w8, ne
-; NOLSE-NEXT: cmp x12, x13
-; NOLSE-NEXT: cinc w8, w8, ne
-; NOLSE-NEXT: cbnz w8, .LBB9_4
+; NOLSE-NEXT: ldaxp x9, x8, [x13]
+; NOLSE-NEXT: cmp x9, x10
+; NOLSE-NEXT: cset w12, ne
+; NOLSE-NEXT: cmp x8, x11
+; NOLSE-NEXT: cinc w12, w12, ne
+; NOLSE-NEXT: cbnz w12, .LBB9_4
 ; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
-; NOLSE-NEXT: stlxp w8, x14, x15, [x9]
-; NOLSE-NEXT: cbnz w8, .LBB9_2
+; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
+; NOLSE-NEXT: cbnz w12, .LBB9_2
 ; NOLSE-NEXT: b .LBB9_5
 ; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
-; NOLSE-NEXT: stlxp w8, x10, x12, [x9]
-; NOLSE-NEXT: cbnz w8, .LBB9_2
+; NOLSE-NEXT: stlxp w12, x9, x8, [x13]
+; NOLSE-NEXT: cbnz w12, .LBB9_2
 ; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start
 ; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1
-; NOLSE-NEXT: mov x8, x12
+; NOLSE-NEXT: subs x11, x8, x11
+; NOLSE-NEXT: ccmp x9, x10, #0, eq
 ; NOLSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
-; NOLSE-NEXT: mov x9, x10
 ; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
-; NOLSE-NEXT: subs x12, x12, x13
-; NOLSE-NEXT: ccmp x10, x11, #0, eq
 ; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
 ; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
 ; NOLSE-NEXT: b.ne .LBB9_1
@@ -672,7 +668,7 @@
 ; LSE-NEXT: // implicit-def: $x9
 ; LSE-NEXT: mov w9, w12
 ; LSE-NEXT: orr x2, x9, #0xfffffffffffffffe
-; LSE-NEXT: mov x9, #-1
+; LSE-NEXT: mov x9, #-1 // =0xffffffffffffffff
 ; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
 ; LSE-NEXT: mov x3, x9
 ; LSE-NEXT: caspal x0, x1, x2, x3, [x8]
diff --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll
--- a/llvm/test/CodeGen/AArch64/bcmp.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp.ll
@@ -6,7 +6,7 @@
 define i1 @bcmp0(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp0:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w0, #1
+; CHECK-NEXT: mov w0, #1 // =0x1
 ; CHECK-NEXT: ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 0)
   %r = icmp eq i32 %cr, 0
@@ -418,7 +418,7 @@
 ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w2, #89
+; CHECK-NEXT: mov w2, #89 // =0x59
 ; CHECK-NEXT: bl bcmp
 ; CHECK-NEXT: cmp w0, #0
 ; CHECK-NEXT: cset w0, eq
@@ -432,10 +432,11 @@
 define i1 @bcmp_zext(i32 %0, i32 %1, i8 %2, i8 %3) {
 ; CHECK-LABEL: bcmp_zext:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w2, #0xff
-; CHECK-NEXT: and w9, w3, #0xff
-; CHECK-NEXT: cmp w1, w0
-; CHECK-NEXT: ccmp w9, w8, #0, eq
+; CHECK-NEXT: eor w8, w3, w2
+; CHECK-NEXT: eor w9, w1, w0
+; CHECK-NEXT: and w8, w8, #0xff
+; CHECK-NEXT: orr w8, w9, w8
+; CHECK-NEXT: cmp w8, #0
 ; CHECK-NEXT: cset w0, eq
 ; CHECK-NEXT: ret
   %5 = xor i32 %1, %0
@@ -449,14 +450,12 @@
 define i1 @bcmp_i8(i8 %a0, i8 %b0, i8 %a1, i8 %b1, i8 %a2, i8 %b2) {
 ; CHECK-LABEL: bcmp_i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w1, #0xff
-; CHECK-NEXT: and w8, w2, #0xff
-; CHECK-NEXT: and w10, w3, #0xff
-; CHECK-NEXT: cmp w9, w0, uxtb
-; CHECK-NEXT: ccmp w10, w8, #0, eq
-; CHECK-NEXT: and w8, w4, #0xff
-; CHECK-NEXT: and w9, w5, #0xff
-; CHECK-NEXT: ccmp w9, w8, #0, eq
+; CHECK-NEXT: eor w8, w1, w0
+; CHECK-NEXT: eor w9, w3, w2
+; CHECK-NEXT: eor w10, w5, w4
+; CHECK-NEXT: orr w8, w8, w9
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: tst w8, #0xff
 ; CHECK-NEXT: cset w0, eq
 ; CHECK-NEXT: ret
   %xor0 = xor i8 %b0, %a0
@@ -471,14 +470,12 @@
 define i1 @bcmp_i16(i16 %a0, i16 %b0, i16 %a1, i16 %b1, i16 %a2, i16 %b2) {
 ; CHECK-LABEL: bcmp_i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w1, #0xffff
-; CHECK-NEXT: and w8, w2, #0xffff
-; CHECK-NEXT: and w10, w3, #0xffff
-; CHECK-NEXT: cmp w9, w0, uxth
-; CHECK-NEXT: ccmp w10, w8, #0, eq
-; CHECK-NEXT: and w8, w4, #0xffff
-; CHECK-NEXT: and w9, w5, #0xffff
-; CHECK-NEXT: ccmp w9, w8, #0, eq
+; CHECK-NEXT: eor w8, w1, w0
+; CHECK-NEXT: eor w9, w3, w2
+; CHECK-NEXT: eor w10, w5, w4
+; CHECK-NEXT: orr w8, w8, w9
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: tst w8, #0xffff
 ; CHECK-NEXT: cset w0, eq
 ; CHECK-NEXT: ret
   %xor0 = xor i16 %b0, %a0
@@ -496,13 +493,14 @@
 ; CHECK-NEXT: cmp x2, x0
 ; CHECK-NEXT: ccmp x3, x1, #0, eq
 ; CHECK-NEXT: ldp x9, x8, [sp]
-; CHECK-NEXT: ccmp x6, x4, #0, eq
-; CHECK-NEXT: ldp x10, x11, [sp, #16]
+; CHECK-NEXT: cset w10, ne
+; CHECK-NEXT: cmp x6, x4
+; CHECK-NEXT: ldp x11, x12, [sp, #16]
 ; CHECK-NEXT: ccmp x7, x5, #0, eq
-; CHECK-NEXT: cset w12, ne
-; CHECK-NEXT: cmp x10, x9
-; CHECK-NEXT: ccmp x11, x8, #0, eq
-; CHECK-NEXT: csinc w0, w12, wzr, eq
+; CHECK-NEXT: csinc w10, w10, wzr, eq
+; CHECK-NEXT: cmp x11, x9
+; CHECK-NEXT: ccmp x12, x8, #0, eq
+; CHECK-NEXT: csinc w0, w10, wzr, eq
 ; CHECK-NEXT: ret
   %xor0 = xor i128 %b0, %a0
   %xor1 = xor i128 %b1, %a1
@@ -516,15 +514,12 @@
 define i1 @bcmp_i42(i42 %a0, i42 %b0, i42 %a1, i42 %b1, i42 %a2, i42 %b2) {
 ; CHECK-LABEL: bcmp_i42:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and x9, x0, #0x3ffffffffff
-; CHECK-NEXT: and x10, x1, #0x3ffffffffff
-; CHECK-NEXT: and x8, x2, #0x3ffffffffff
-; CHECK-NEXT: and x11, x3, #0x3ffffffffff
-; CHECK-NEXT: cmp x10, x9
-; CHECK-NEXT: and x9, x5, #0x3ffffffffff
-; CHECK-NEXT: ccmp x11, x8, #0, eq
-; CHECK-NEXT: and x8, x4, #0x3ffffffffff
-; CHECK-NEXT: ccmp x9, x8, #0, eq
+; CHECK-NEXT: eor x8, x1, x0
+; CHECK-NEXT: eor x9, x3, x2
+; CHECK-NEXT: eor x10, x5, x4
+; CHECK-NEXT: orr x8, x8, x9
+; CHECK-NEXT: orr x8, x8, x10
+; CHECK-NEXT: tst x8, #0x3ffffffffff
 ; CHECK-NEXT: cset w0, ne
 ; CHECK-NEXT: ret
   %xor0 = xor i42 %b0, %a0
diff --git a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
--- a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
+++ b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll
@@ -22,7 +22,7 @@
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ldrh w10, [x9, #72]
 ; CHECK-NEXT: cmp w10, #0
-; CHECK-NEXT: ubfx x11, x10, #8, #24
+; CHECK-NEXT: lsr w11, w10, #8
 ; CHECK-NEXT: cset w12, ne
 ; CHECK-NEXT: csel w8, w8, w11, eq
 ; CHECK-NEXT: ldr x11, [x9, #8]
@@ -90,7 +90,7 @@
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ldrh w10, [x9, #72]
 ; CHECK-NEXT: cmp w10, #0
-; CHECK-NEXT: ubfx x11, x10, #8, #24
+; CHECK-NEXT: lsr w11, w10, #8
 ; CHECK-NEXT: cset w12, ne
 ; CHECK-NEXT: csel w8, w8, w11, eq
 ; CHECK-NEXT: ldr x11, [x9, #8]
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -267,9 +267,9 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: and w8, w0, #0xff
 ; CHECK-NEXT: lsl w8, w8, #8
-; CHECK-NEXT: mov w9, w8
-; CHECK-NEXT: bfxil w9, w0, #0, #8
-; CHECK-NEXT: orr w0, w8, w9, lsl #16
+; CHECK-NEXT: orr w8, w8, w0, lsl #16
+; CHECK-NEXT: bfxil w8, w0, #0, #8
+; CHECK-NEXT: lsl w0, w8, #8
 ; CHECK-NEXT: ret
   %conv = zext i8 %a to i32 ; 0 0 0 A
   %shl = shl i32 %b, 8 ; B2 B1 B0 0
diff --git a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
--- a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll
@@ -9,16 +9,18 @@
 ; CHECK: // %bb.0: // %bb
 ; CHECK-NEXT: sub sp, sp, #16
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: umov w9, v0.h[0]
-; CHECK-NEXT: mov x10, sp
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: bfi x10, x0, #1, #3
+; CHECK-NEXT: umov w10, v0.h[0]
 ; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: dup v0.4h, v0.h[0]
 ; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: dup v1.8h, w9
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: ld1 { v1.h }[1], [x10]
-; CHECK-NEXT: str q1, [x8]
+; CHECK-NEXT: bfi x9, x8, #1, #3
+; CHECK-NEXT: dup v2.4h, w10
+; CHECK-NEXT: str q1, [sp]
+; CHECK-NEXT: ld1 { v2.h }[1], [x9]
+; CHECK-NEXT: str d0, [x8, #8]
+; CHECK-NEXT: str d2, [x8]
 ; CHECK-NEXT: add sp, sp, #16
 ; CHECK-NEXT: ret
 bb:
diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
--- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll
@@ -22,11 +22,11 @@
 define <16 x i8> @test2(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test2:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ld1r { v1.8b }, [x1]
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: dup v0.8b, w8
-; CHECK-NEXT: mov v1.b[7], w8
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: ld1r { v1.16b }, [x1]
+; CHECK-NEXT: ld1r { v0.16b }, [x0]
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-NEXT: ret
 entry:
   %0 = load i8, ptr %a, align 1
@@ -42,9 +42,9 @@
 define <16 x i8> @test3(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test3:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ld1r { v0.8b }, [x0]
-; CHECK-NEXT: ld1r { v1.8b }, [x1]
-; CHECK-NEXT: zip1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ld1r { v0.16b }, [x1]
+; CHECK-NEXT: ld1r { v1.16b }, [x0]
+; CHECK-NEXT: zip1 v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
 entry:
   %0 = load i8, ptr %a, align 1
@@ -209,12 +209,12 @@
 define <4 x i32> @test12(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
 ; CHECK-LABEL: test12:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ld1r { v0.2s }, [x0]
 ; CHECK-NEXT: ldr w8, [x1]
-; CHECK-NEXT: mov v1.16b, v0.16b
-; CHECK-NEXT: mov v1.s[0], w8
+; CHECK-NEXT: ld1r { v0.4s }, [x0]
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: zip1 v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: mov v0.s[1], w8
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = load i32, ptr %a, align 1
diff --git a/llvm/test/CodeGen/AArch64/cmp-bool.ll b/llvm/test/CodeGen/AArch64/cmp-bool.ll
--- a/llvm/test/CodeGen/AArch64/cmp-bool.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-bool.ll
@@ -25,8 +25,9 @@
 define void @bool_ne(i1 zeroext %a, i1 zeroext %b, ptr nocapture %c) nounwind {
 ; CHECK-LABEL: bool_ne:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w0, w1
-; CHECK-NEXT: b.eq .LBB1_2
+; CHECK-NEXT: eor w8, w0, w1
+; CHECK-NEXT: cmp w8, #1
+; CHECK-NEXT: b.ne .LBB1_2
 ; CHECK-NEXT: // %bb.1: // %if.then
 ; CHECK-NEXT: br x2
 ; CHECK-NEXT: .LBB1_2: // %if.end
diff --git a/llvm/test/CodeGen/AArch64/cmp-const-max.ll b/llvm/test/CodeGen/AArch64/cmp-const-max.ll
--- a/llvm/test/CodeGen/AArch64/cmp-const-max.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-const-max.ll
@@ -1,11 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -verify-machineinstrs -aarch64-enable-atomic-cfg-tidy=0 < %s -mtriple=aarch64-none-eabihf -fast-isel=false | FileCheck %s

 define i32 @ule_64_max(i64 %p) {
-entry:
 ; CHECK-LABEL: ule_64_max:
-; CHECK: cmn x0, #1
-; CHECK: b.hi [[RET_ZERO:.LBB[0-9]+_[0-9]+]]
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cbnz wzr, .LBB0_2
+; CHECK-NEXT: // %bb.1: // %ret_one
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_2: // %ret_zero
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+entry:
   %cmp = icmp ule i64 %p, 18446744073709551615 ; 0xffffffffffffffff
   br i1 %cmp, label %ret_one, label %ret_zero
@@ -13,16 +20,21 @@
   ret i32 1

 ret_zero:
-; CHECK: [[RET_ZERO]]:
-; CHECK-NEXT: mov w0, wzr
   ret i32 0
 }

 define i32 @ugt_64_max(i64 %p) {
-entry:
 ; CHECK-LABEL: ugt_64_max:
-; CHECK: cmn x0, #1
-; CHECK: b.ls [[RET_ZERO:.LBB[0-9]+_[0-9]+]]
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: cbnz w8, .LBB1_2
+; CHECK-NEXT: // %bb.1: // %ret_one
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_2: // %ret_zero
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ret
+entry:
   %cmp = icmp ugt i64 %p, 18446744073709551615 ; 0xffffffffffffffff
   br i1 %cmp, label %ret_one, label %ret_zero
@@ -30,7 +42,5 @@
   ret i32 1

 ret_zero:
-; CHECK: [[RET_ZERO]]:
-; CHECK-NEXT: mov w0, wzr
   ret i32 0
 }
diff --git a/llvm/test/CodeGen/AArch64/combine-andintoload.ll b/llvm/test/CodeGen/AArch64/combine-andintoload.ll
--- a/llvm/test/CodeGen/AArch64/combine-andintoload.ll
+++ b/llvm/test/CodeGen/AArch64/combine-andintoload.ll
@@ -232,15 +232,15 @@
 ; CHECK-LABEL: load8_and16_zext:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: and w8, w1, w8
-; CHECK-NEXT: and x0, x8, #0xff
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: and x0, x1, x8
 ; CHECK-NEXT: ret
 ;
 ; CHECKBE-LABEL: load8_and16_zext:
 ; CHECKBE: // %bb.0:
 ; CHECKBE-NEXT: ldrb w8, [x0]
-; CHECKBE-NEXT: and w8, w1, w8
-; CHECKBE-NEXT: and x0, x8, #0xff
+; CHECKBE-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECKBE-NEXT: and x0, x1, x8
 ; CHECKBE-NEXT: ret
   %x = load i8, ptr %p, align 4
   %xz = zext i8 %x to i64
@@ -415,10 +415,10 @@
 ; CHECK-NEXT: ldrb w8, [x0, x2]
 ; CHECK-NEXT: and w10, w3, #0x7
 ; CHECK-NEXT: ldrb w9, [x1, x2]
-; CHECK-NEXT: mov w11, #8
+; CHECK-NEXT: mov w11, #8 // =0x8
 ; CHECK-NEXT: sub w10, w11, w10
 ; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: mov w9, #5
+; CHECK-NEXT: mov w9, #5 // =0x5
 ; CHECK-NEXT: lsr w8, w8, w10
 ; CHECK-NEXT: tst w8, w9
 ; CHECK-NEXT: cset w0, eq
@@ -429,10 +429,10 @@
 ; CHECKBE-NEXT: ldrb w8, [x0, x2]
 ; CHECKBE-NEXT: and w10, w3, #0x7
 ; CHECKBE-NEXT: ldrb w9, [x1, x2]
-; CHECKBE-NEXT: mov w11, #8
+; CHECKBE-NEXT: mov w11, #8 // =0x8
 ; CHECKBE-NEXT: sub w10, w11, w10
 ; CHECKBE-NEXT: eor w8, w9, w8
-; CHECKBE-NEXT: mov w9, #5
+; CHECKBE-NEXT: mov w9, #5 // =0x5
 ; CHECKBE-NEXT: lsr w8, w8, w10
 ; CHECKBE-NEXT: tst w8, w9
 ; CHECKBE-NEXT: cset w0, eq
diff --git a/llvm/test/CodeGen/AArch64/combine-mul.ll b/llvm/test/CodeGen/AArch64/combine-mul.ll
--- a/llvm/test/CodeGen/AArch64/combine-mul.ll
+++ b/llvm/test/CodeGen/AArch64/combine-mul.ll
@@ -66,7 +66,7 @@
 define i8 @one_demanded_bit(i8 %x) {
 ; CHECK-LABEL: one_demanded_bit:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: lsl w8, w0, #6
+; CHECK-NEXT: neg w8, w0, lsl #6
 ; CHECK-NEXT: orr w0, w8, #0xffffffbf
 ; CHECK-NEXT: ret
   %m = mul i8 %x, 192 ; 0b1100_0000
@@ -77,7 +77,7 @@
 define <2 x i64> @one_demanded_bit_splat(<2 x i64> %x) {
 ; CHECK-LABEL: one_demanded_bit_splat:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: mov w8, #32 // =0x20
 ; CHECK-NEXT: shl v0.2d, v0.2d, #5
 ; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -131,7 +131,7 @@
 define <2 x i64> @squared_demanded_2_low_bits_splat(<2 x i64> %x) {
 ; CHECK-LABEL: squared_demanded_2_low_bits_splat:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-2
+; CHECK-NEXT: mov x8, #-2 // =0xfffffffffffffffe
 ; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
@@ -201,93 +201,85 @@
 define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) {
 ; CHECK-LABEL: abp90c12:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr s21, [sp, #32]
-; CHECK-NEXT: add x9, sp, #48
-; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: ldr s23, [sp, #40]
-; CHECK-NEXT: add x11, sp, #56
+; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4
+; CHECK-NEXT: add x10, sp, #48
+; CHECK-NEXT: add x9, sp, #16
 ; CHECK-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-NEXT: ldr s2, [sp]
-; CHECK-NEXT: add x10, sp, #16
-; CHECK-NEXT: ld1 { v21.s }[1], [x9]
-; CHECK-NEXT: add x9, sp, #64
-; CHECK-NEXT: ld1 { v23.s }[1], [x11]
+; CHECK-NEXT: ldr s2, [sp, #32]
+; CHECK-NEXT: ldr s23, [sp, #8]
+; CHECK-NEXT: add x11, sp, #24
+; CHECK-NEXT: ldr s21, [sp, #40]
 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
 ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3
-; CHECK-NEXT: ldr s22, [sp, #96]
-; CHECK-NEXT: add x11, sp, #24
-; CHECK-NEXT: ld1 { v2.s }[1], [x10]
-; CHECK-NEXT: add x10, sp, #72
-; CHECK-NEXT: mov v1.s[1], v3.s[0]
-; CHECK-NEXT: ld1 { v21.s }[2], [x9]
-; CHECK-NEXT: ldr s24, [sp, #8]
-; CHECK-NEXT: add x9, sp, #112
-; CHECK-NEXT: ld1 { v23.s }[2], [x10]
-; CHECK-NEXT: add x10, sp, #80
 ; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5
-; CHECK-NEXT: ldr s18, [sp, #128]
 ; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7
-; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4
 ; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6
-; CHECK-NEXT: mov v1.s[2], v5.s[0]
+; CHECK-NEXT: ld1 { v2.s }[1], [x10]
+; CHECK-NEXT: add x10, sp, #56
+; CHECK-NEXT: mov v0.s[2], v4.s[0]
+; CHECK-NEXT: ldr s4, [sp]
+; CHECK-NEXT: ldr s18, [sp, #128]
+; CHECK-NEXT: ld1 { v23.s }[1], [x11]
+; CHECK-NEXT: add x11, sp, #144
+; CHECK-NEXT: ld1 { v4.s }[1], [x9]
+; CHECK-NEXT: add x9, sp, #64
+; CHECK-NEXT: ldr s22, [sp, #96]
+; CHECK-NEXT: mov v1.s[1], v3.s[0]
+; CHECK-NEXT: ld1 { v21.s }[1], [x10]
+; CHECK-NEXT: ld1 { v2.s }[2], [x9]
+; CHECK-NEXT: add x9, sp, #112
+; CHECK-NEXT: ld1 { v18.s }[1], [x11]
+; CHECK-NEXT: add x11, sp, #72
 ; CHECK-NEXT: ldr s20, [sp, #104]
-; CHECK-NEXT: ld1 { v24.s }[1], [x11]
-; CHECK-NEXT: add x11, sp, #88
+; CHECK-NEXT: add x10, sp, #80
 ; CHECK-NEXT: ld1 { v22.s }[1], [x9]
-; CHECK-NEXT: add x9, sp, #144
-; CHECK-NEXT: ld1 { v21.s }[3], [x10]
-; CHECK-NEXT: add x10, sp, #120
-; CHECK-NEXT: mov v0.s[2], v4.s[0]
-; CHECK-NEXT: ld1 { v23.s }[3], [x11]
-; CHECK-NEXT: ld1 { v18.s }[1], [x9]
-; CHECK-NEXT: add x11, sp, #152
-; CHECK-NEXT: ld1 { v20.s }[1], [x10]
-; CHECK-NEXT: add x10, sp, #160
+; CHECK-NEXT: add x9, sp, #160
+; CHECK-NEXT: mov v1.s[2], v5.s[0]
+; CHECK-NEXT: ld1 { v21.s }[2], [x11]
+; CHECK-NEXT: add x11, sp, #120
+; CHECK-NEXT: ldr s16, [sp, #136]
+; CHECK-NEXT: ld1 { v18.s }[2], [x9]
+; CHECK-NEXT: add x9, sp, #88
+; CHECK-NEXT: ld1 { v2.s }[3], [x10]
+; CHECK-NEXT: add x10, sp, #152
+; CHECK-NEXT: ld1 { v20.s }[1], [x11]
+; CHECK-NEXT: add x11, sp, #168
 ; CHECK-NEXT: mov v1.s[3], v7.s[0]
-; CHECK-NEXT: ldr s17, [sp, #136]
+; CHECK-NEXT: ld1 { v21.s }[3], [x9]
+; CHECK-NEXT: ld1 { v16.s }[1], [x10]
+; CHECK-NEXT: add x10, sp, #176
+; CHECK-NEXT: mov v0.s[3], v6.s[0]
+; CHECK-NEXT: ldr s17, [sp, #200]
+; CHECK-NEXT: fmul v3.4s, v20.4s, v23.4s
 ; CHECK-NEXT: ldr s19, [sp, #192]
+; CHECK-NEXT: fmul v5.4s, v21.4s, v1.4s
 ; CHECK-NEXT: add x9, sp, #208
-; CHECK-NEXT: mov v0.s[3], v6.s[0]
-; CHECK-NEXT: ld1 { v18.s }[2], [x10]
-; CHECK-NEXT: ld1 { v17.s }[1], [x11]
-; CHECK-NEXT: add x10, sp, #176
-; CHECK-NEXT: fmul v3.4s, v23.4s, v1.4s
-; CHECK-NEXT: ld1 { v19.s }[1], [x9]
-; CHECK-NEXT: fmul v4.4s, v20.4s, v24.4s
-; CHECK-NEXT: add x9, sp, #168
-; CHECK-NEXT: fmul v1.4s, v21.4s, v1.4s
+; CHECK-NEXT: ld1 { v16.s }[2], [x11]
+; CHECK-NEXT: add x11, sp, #184
+; CHECK-NEXT: fmul v1.4s, v2.4s, v1.4s
 ; CHECK-NEXT: ld1 { v18.s }[3], [x10]
-; CHECK-NEXT: fmul v5.4s, v22.4s, v24.4s
-; CHECK-NEXT: ldr s16, [sp, #200]
-; CHECK-NEXT: ld1 { v17.s }[2], [x9]
-; CHECK-NEXT: add x11, sp, #216
+; CHECK-NEXT: fmul v6.4s, v22.4s, v23.4s
+; CHECK-NEXT: add x10, sp, #216
 ; CHECK-NEXT: fneg v3.4s, v3.4s
-; CHECK-NEXT: add x9, sp, #184
-; CHECK-NEXT: fneg v4.4s, v4.4s
-; CHECK-NEXT: fmla v1.4s, v0.4s, v23.4s
-; CHECK-NEXT: fmla v5.4s, v2.4s, v20.4s
-; CHECK-NEXT: ld1 { v16.s }[1], [x11]
-; CHECK-NEXT: ld1 { v17.s }[3], [x9]
-; CHECK-NEXT: fmla v3.4s, v0.4s, v21.4s
-; CHECK-NEXT: fmla v4.4s, v2.4s, v22.4s
+; CHECK-NEXT: ld1 { v19.s }[1], [x9]
+; CHECK-NEXT: fneg v5.4s, v5.4s
+; CHECK-NEXT: ld1 { v16.s }[3], [x11]
+; CHECK-NEXT: fmla v1.4s, v0.4s, v21.4s
+; CHECK-NEXT: ld1 { v17.s }[1], [x10]
+; CHECK-NEXT: fmla v6.4s, v4.4s, v20.4s
+; CHECK-NEXT: fmla v3.4s, v4.4s, v22.4s
+; CHECK-NEXT: fmla v5.4s, v0.4s, v2.4s
 ; CHECK-NEXT: fsub v0.4s, v18.4s, v1.4s
-; CHECK-NEXT: fsub v1.4s, v19.4s, v5.4s
+; CHECK-NEXT: fsub v1.4s, v19.4s, v6.4s
 ; CHECK-NEXT: fadd v2.4s, v17.4s, v3.4s
-; CHECK-NEXT: fadd v3.4s, v16.4s, v4.4s
-; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12
-; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12
-; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12
-; CHECK-NEXT: zip2 v3.4s, v0.4s, v2.4s
-; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8
-; CHECK-NEXT: zip1 v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: rev64 v4.4s, v4.4s
+; CHECK-NEXT: fadd v3.4s, v16.4s, v5.4s
+; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: zip2 v2.4s, v0.4s, v3.4s
+; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: stp q2, q1, [x8, #16]
 ; CHECK-NEXT: str q0, [x8]
-; CHECK-NEXT: trn2 v4.4s, v4.4s, v5.4s
-; CHECK-NEXT: ext v1.16b, v4.16b, v1.16b, #8
-; CHECK-NEXT: mov v3.d[1], v4.d[0]
-; CHECK-NEXT: stp q3, q1, [x8, #16]
 ; CHECK-NEXT: ret
 entry:
   %ar = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32>
diff --git a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
--- a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
+++ b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
@@ -27,10 +27,7 @@
 define i64 @g(ptr %p) {
 ; CHECK-LABEL: g:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0, #8]
-; CHECK-NEXT: add x9, x8, x8
-; CHECK-NEXT: add x8, x9, x8
-; CHECK-NEXT: sub x0, x8, x8
+; CHECK-NEXT: mov x0, xzr
 ; CHECK-NEXT: ret
   %vec = load <2 x i64>, ptr %p, align 1
   %elt = extractelement <2 x i64> %vec, i32 1
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-mul-shl.ll b/llvm/test/CodeGen/AArch64/dag-combine-mul-shl.ll
--- a/llvm/test/CodeGen/AArch64/dag-combine-mul-shl.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-mul-shl.ll
@@ -17,9 +17,9 @@
 define <16 x i8> @fn2_vector(<16 x i8> %arg) {
 ; CHECK-LABEL: fn2_vector:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: adrp x8, .LCPI1_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: mov v1.b[1], v0.b[1]
+; CHECK-NEXT: shl v0.16b, v1.16b, #7
 ; CHECK-NEXT: ret
 entry:
   %mul = mul <16 x i8> %arg,
@@ -43,9 +43,9 @@
 define <16 x i8> @fn2_vector_undef(<16 x i8> %arg) {
 ; CHECK-LABEL: fn2_vector_undef:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: adrp x8, .LCPI3_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: mov v1.b[1], v0.b[1]
+; CHECK-NEXT: shl v0.16b, v1.16b, #7
 ; CHECK-NEXT: ret
 entry:
   %mul = mul <16 x i8> %arg,
@@ -56,7 +56,7 @@
 define i32 @fn1_scalar(i32 %arg) {
 ; CHECK-LABEL: fn1_scalar:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #1664
+; CHECK-NEXT: mov w8, #1664 // =0x680
 ; CHECK-NEXT: mul w0, w0, w8
 ; CHECK-NEXT: ret
 entry:
@@ -68,7 +68,7 @@
 define i32 @fn2_scalar(i32 %arg) {
 ; CHECK-LABEL: fn2_scalar:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #1664
+; CHECK-NEXT: mov w8, #1664 // =0x680
 ; CHECK-NEXT: mul w0, w0, w8
 ; CHECK-NEXT: ret
 entry:
@@ -102,7 +102,7 @@
 define i32 @fn1_scalar_opaque(i32 %arg) {
 ; CHECK-LABEL: fn1_scalar_opaque:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #13
+; CHECK-NEXT: mov w8, #13 // =0xd
 ; CHECK-NEXT: mul w8, w0, w8
 ; CHECK-NEXT: lsl w0, w8, #7
 ; CHECK-NEXT: ret
@@ -116,7 +116,7 @@
 define i32 @fn2_scalar_opaque(i32 %arg) {
 ; CHECK-LABEL: fn2_scalar_opaque:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #13
+; CHECK-NEXT: mov w8, #13 // =0xd
 ; CHECK-NEXT: mul w8, w0, w8
 ; CHECK-NEXT: lsl w0, w8, #7
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-select.ll b/llvm/test/CodeGen/AArch64/dag-combine-select.ll
--- a/llvm/test/CodeGen/AArch64/dag-combine-select.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-select.ll
@@ -7,20 +7,13 @@
 ; Ensure that we transform select(C0, x, select(C1, x, y)) towards
 ; select(C0 | C1, x, y) so we can use CMP;CCMP for the implementation.
 define i32 @test0(i32 %v0, i32 %v1, i32 %v2) {
-; SDISEL-LABEL: test0:
-; SDISEL: // %bb.0:
-; SDISEL-NEXT: cmp w0, #7
-; SDISEL-NEXT: ccmp w1, #0, #0, ne
-; SDISEL-NEXT: csel w0, w1, w2, gt
-; SDISEL-NEXT: ret
-;
-; GISEL-LABEL: test0:
-; GISEL: // %bb.0:
-; GISEL-NEXT: cmp w0, #7
-; GISEL-NEXT: csel w8, w1, w2, eq
-; GISEL-NEXT: cmp w1, #0
-; GISEL-NEXT: csel w0, w1, w8, gt
-; GISEL-NEXT: ret
+; CHECK-LABEL: test0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp w0, #7
+; CHECK-NEXT: csel w8, w1, w2, eq
+; CHECK-NEXT: cmp w1, #0
+; CHECK-NEXT: csel w0, w1, w8, gt
+; CHECK-NEXT: ret
   %cmp1 = icmp eq i32 %v0, 7
   %cmp2 = icmp sgt i32 %v1, 0
   %sel0 = select i1 %cmp1, i32 %v1, i32 %v2
@@ -35,12 +28,13 @@
 ; SDISEL-LABEL: test1:
 ; SDISEL: // %bb.0:
 ; SDISEL-NEXT: cmp w0, #7
-; SDISEL-NEXT: adrp x8, out
+; SDISEL-NEXT: mov w8, #42 // =0x2a
 ; SDISEL-NEXT: csel w9, w1, w2, eq
 ; SDISEL-NEXT: cmp w9, #13
 ; SDISEL-NEXT: csel w9, w1, w2, lo
-; SDISEL-NEXT: cmp w0, #42
-; SDISEL-NEXT: csel w10, w1, w9, eq
+; SDISEL-NEXT: ccmp w0, w8, #4, hs
+; SDISEL-NEXT: adrp x8, out
+; SDISEL-NEXT: csel w10, w1, w2, eq
 ; SDISEL-NEXT: str w9, [x8, :lo12:out]
 ; SDISEL-NEXT: str w10, [x8, :lo12:out]
 ; SDISEL-NEXT: ret
@@ -73,5 +67,3 @@
   store volatile i32 %cond17, ptr @out, align 4
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -33,24 +33,24 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: and w8, w0, #0x1
 ; CHECK-NEXT: fmov s0, wzr
-; CHECK-NEXT: ldp x10, x9, [sp]
+; CHECK-NEXT: ldp x11, x10, [sp]
 ; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: ldr x11, [sp, #16]
 ; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: dup v1.4s, v0.s[0]
+; CHECK-NEXT: fmov x9, d0
 ; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: lsr x8, x8, #32
+; CHECK-NEXT: extr x8, x9, x8, #32
 ; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: csel x9, x5, x9, ne
-; CHECK-NEXT: csel x10, x4, x10, ne
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: csel x8, x2, x6, ne
+; CHECK-NEXT: ldr x8, [sp, #16]
+; CHECK-NEXT: csel x10, x5, x10, ne
+; CHECK-NEXT: csel x11, x4, x11, ne
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: csel x9, x2, x6, ne
 ; CHECK-NEXT: csel x12, x3, x7, ne
-; CHECK-NEXT: stur x10, [x11, #12]
-; CHECK-NEXT: str w9, [x11, #20]
-; CHECK-NEXT: str x8, [x11]
-; CHECK-NEXT: str w12, [x11, #8]
+; CHECK-NEXT: stur x11, [x8, #12]
+; CHECK-NEXT: str w10, [x8, #20]
+; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: str w12, [x8, #8]
 ; CHECK-NEXT: ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AArch64/fadd-combines.ll b/llvm/test/CodeGen/AArch64/fadd-combines.ll
--- a/llvm/test/CodeGen/AArch64/fadd-combines.ll
+++ b/llvm/test/CodeGen/AArch64/fadd-combines.ll
@@ -132,8 +132,8 @@
 define float @fadd_const_multiuse_fmf(float %x) {
 ; CHECK-LABEL: fadd_const_multiuse_fmf:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1109917696
-; CHECK-NEXT: mov w9, #1114374144
+; CHECK-NEXT: mov w8, #1109917696 // =0x42280000
+; CHECK-NEXT: mov w9, #1114374144 // =0x426c0000
 ; CHECK-NEXT: fmov s1, w8
 ; CHECK-NEXT: fmov s2, w9
 ; CHECK-NEXT: fadd s1, s0, s1
@@ -150,8 +150,8 @@
 define float @fadd_const_multiuse_attr(float %x) {
 ; CHECK-LABEL: fadd_const_multiuse_attr:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1109917696
-; CHECK-NEXT: mov w9, #1114374144
+; CHECK-NEXT: mov w8, #1109917696 // =0x42280000
+; CHECK-NEXT: mov w9, #1114374144 // =0x426c0000
 ; CHECK-NEXT: fmov s1, w8
 ; CHECK-NEXT: fmov s2, w9
 ; CHECK-NEXT: fadd s1, s0, s1
@@ -245,11 +245,11 @@
 define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x double> %x3, <2 x double> %x4, <2 x double> %x5, <2 x double> %x6, <2 x double> %x7, <2 x double> %x8) nounwind {
 ; CHECK-LABEL: fadd_fma_fmul_3:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d
-; CHECK-NEXT: fmla v2.2d, v1.2d, v0.2d
-; CHECK-NEXT: fmla v2.2d, v7.2d, v6.2d
-; CHECK-NEXT: fmla v2.2d, v5.2d, v4.2d
-; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: fmul v6.2d, v6.2d, v7.2d
+; CHECK-NEXT: fmla v6.2d, v5.2d, v4.2d
+; CHECK-NEXT: fmla v6.2d, v3.2d, v2.2d
+; CHECK-NEXT: fmla v6.2d, v1.2d, v0.2d
+; CHECK-NEXT: mov v0.16b, v6.16b
 ; CHECK-NEXT: ret
   %m1 = fmul fast <2 x double> %x1, %x2
   %m2 = fmul fast <2 x double> %x3, %x4
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -4,13 +4,9 @@
 define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec) {
 ; CHECK-LABEL: vector_deinterleave_v2f16_v4f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: dup v2.2s, v0.s[1]
-; CHECK-NEXT: mov v1.16b, v2.16b
-; CHECK-NEXT: mov v1.h[0], v0.h[1]
-; CHECK-NEXT: mov v0.h[1], v2.h[0]
-; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
+; CHECK-NEXT: uzp2 v1.4h, v0.4h, v0.4h
+; CHECK-NEXT: fmov d0, d2
 ; CHECK-NEXT: ret
   %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec)
   ret {<2 x half>, <2 x half>} %retval
diff --git a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll
--- a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll
+++ b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll
@@ -25,9 +25,9 @@
 define i64 @f2() {
 ; CHECK-LABEL: f2:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, x1
-; CHECK-NEXT: add x8, x8, :lo12:x1
-; CHECK-NEXT: ldr x0, [x8, #24]
+; CHECK-NEXT: adrp x8, x1+16
+; CHECK-NEXT: add x8, x8, :lo12:x1+16
+; CHECK-NEXT: ldr x0, [x8, #8]
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: f2:
@@ -100,7 +100,7 @@
 define i64 @f6() {
 ; CHECK-LABEL: f6:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1048576
+; CHECK-NEXT: mov w8, #1048576 // =0x100000
 ; CHECK-NEXT: adrp x9, x2
 ; CHECK-NEXT: add x9, x9, :lo12:x2
 ; CHECK-NEXT: ldr x0, [x9, x8]
@@ -108,7 +108,7 @@
 ;
 ; GISEL-LABEL: f6:
 ; GISEL: // %bb.0:
-; GISEL-NEXT: mov w8, #1048576
+; GISEL-NEXT: mov w8, #1048576 // =0x100000
 ; GISEL-NEXT: adrp x9, x2
 ; GISEL-NEXT: add x9, x9, :lo12:x2
 ; GISEL-NEXT: ldr x0, [x9, x8]
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -400,8 +400,8 @@
 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: lsl x9, x8, #5
 ; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: add x10, x0, x9
-; CHECK-NEXT: add x11, x1, x9
+; CHECK-NEXT: add x10, x1, x9
+; CHECK-NEXT: add x11, x0, x9
 ; CHECK-NEXT: add x9, x2, x9
 ; CHECK-NEXT: cmp x8, #1000
 ; CHECK-NEXT: ldp q0, q1, [x10]
@@ -412,7 +412,7 @@
 ; CHECK-NEXT: uzp1.8h v0, v0, v1
 ; CHECK-NEXT: fcvtzu.4s v3, v3
 ; CHECK-NEXT: uzp1.8h v1, v2, v3
-; CHECK-NEXT: stp q0, q1, [x9]
+; CHECK-NEXT: stp q1, q0, [x9]
 ; CHECK-NEXT: b.eq LBB7_1
 ; CHECK-NEXT: ; %bb.2: ; %exit
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat.ll b/llvm/test/CodeGen/AArch64/fpclamptosat.ll
--- a/llvm/test/CodeGen/AArch64/fpclamptosat.ll
+++ b/llvm/test/CodeGen/AArch64/fpclamptosat.ll
@@ -35,7 +35,12 @@
 define i32 @ustest_f64i32(double %x) {
 ; CHECK-LABEL: ustest_f64i32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu w0, d0
+; CHECK-NEXT: fcvtzs x8, d0
+; CHECK-NEXT: mov w9, #-1 // =0xffffffff
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: csel x8, x8, x9, lt
+; CHECK-NEXT: asr x9, x8, #63
+; CHECK-NEXT: bic w0, w8, w9
 ; CHECK-NEXT: ret
 entry:
   %conv = fptosi double %x to i64
@@ -78,7 +83,12 @@
 define i32 @ustest_f32i32(float %x) {
 ; CHECK-LABEL: ustest_f32i32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fcvtzu w0, s0
+; CHECK-NEXT: fcvtzs x8, s0
+; CHECK-NEXT: mov w9, #-1 // =0xffffffff
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: csel x8, x8, x9, lt
+; CHECK-NEXT: asr x9, x8, #63
+; CHECK-NEXT: bic w0, w8, w9
 ; CHECK-NEXT: ret
 entry:
   %conv = fptosi float %x to i64
@@ -134,12 +144,22 @@
 ; CHECK-CVT-LABEL: ustest_f16i32:
 ; CHECK-CVT: // %bb.0: // %entry
 ; CHECK-CVT-NEXT: fcvt s0, h0
-; CHECK-CVT-NEXT: fcvtzu w0, s0
+; CHECK-CVT-NEXT: mov w9, #-1 // =0xffffffff
+; CHECK-CVT-NEXT: fcvtzs x8, s0
+; CHECK-CVT-NEXT: cmp x8, x9
+; CHECK-CVT-NEXT: csel x8, x8, x9, lt
+; CHECK-CVT-NEXT: asr x9, x8, #63
+; CHECK-CVT-NEXT: bic w0, w8, w9
 ; CHECK-CVT-NEXT: ret
 ;
 ; CHECK-FP16-LABEL: ustest_f16i32:
 ; CHECK-FP16: // %bb.0: // %entry
-; CHECK-FP16-NEXT: fcvtzu w0, h0
+; CHECK-FP16-NEXT: fcvtzs x8, h0
+; CHECK-FP16-NEXT: mov w9, #-1 // =0xffffffff
+; CHECK-FP16-NEXT: cmp x8, x9
+; CHECK-FP16-NEXT: csel x8, x8, x9, lt
+; CHECK-FP16-NEXT: asr x9, x8, #63
+; CHECK-FP16-NEXT: bic w0, w8, w9
 ; CHECK-FP16-NEXT: ret
 entry:
   %conv = fptosi half %x to i64
@@ -396,11 +416,9 @@
 ; CHECK-NEXT: .cfi_offset w30, -16
 ; CHECK-NEXT: bl __fixdfti
 ; CHECK-NEXT: cmp x1, #1
-; CHECK-NEXT: csel x8, x0, xzr, lt
-; CHECK-NEXT: csinc x9, x1, xzr, lt
-; CHECK-NEXT: cmp xzr, x8
-; CHECK-NEXT: ngcs xzr, x9
-; CHECK-NEXT: csel x0, x8, xzr, lt
+; CHECK-NEXT: csinc x8, x1, xzr, lt
+; CHECK-NEXT: csel x9, x0, xzr, lt
+; CHECK-NEXT: bic x0, x9, x8, asr #63
 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
 entry:
@@ -455,11 +473,9 @@
 ; CHECK-NEXT: .cfi_offset w30, -16
 ; CHECK-NEXT: bl __fixsfti
 ; CHECK-NEXT: cmp x1, #1
-; CHECK-NEXT: csel x8, x0, xzr, lt
-; CHECK-NEXT: csinc x9, x1, xzr, lt
-; CHECK-NEXT: cmp xzr, x8
-; CHECK-NEXT: ngcs xzr, x9
-; CHECK-NEXT: csel x0, x8, xzr, lt
+; CHECK-NEXT: csinc x8, x1, xzr, lt
+; CHECK-NEXT: csel x9, x0, xzr, lt
+; CHECK-NEXT: bic x0, x9, x8, asr #63
 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
 entry:
@@ -520,11 +536,9 @@
 ; CHECK-NEXT: .cfi_offset w30, -16
 ; CHECK-NEXT: bl __fixhfti
 ; CHECK-NEXT: cmp x1, #1
-; CHECK-NEXT: csel x8, x0, xzr, lt
-; CHECK-NEXT: csinc x9, x1, xzr, lt
-; CHECK-NEXT: cmp xzr, x8
-; CHECK-NEXT: ngcs xzr, x9
-; CHECK-NEXT: csel x0, x8, xzr, lt
+; CHECK-NEXT: csinc x8, x1, xzr, lt
+; CHECK-NEXT: csel x9, x0, xzr, lt
+; CHECK-NEXT: bic x0, x9, x8, asr #63
 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -211,11 +211,18 @@
 define <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) {
 ; CHECK-LABEL: test_signed_v5f64_v5i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs w0, d0
-; CHECK-NEXT: fcvtzs w1, d1
-; CHECK-NEXT: fcvtzs w2, d2
+; CHECK-NEXT: fcvtzs w8, d0
+; CHECK-NEXT: fcvtzs w9, d1
 ; CHECK-NEXT: fcvtzs w3, d3
 ; CHECK-NEXT: fcvtzs w4, d4
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzs w8, d2
+; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: mov v0.s[2], w8
+; CHECK-NEXT: mov v0.s[3], w3
+; CHECK-NEXT: mov w1, v0.s[1]
+; CHECK-NEXT: mov w2, v0.s[2]
+; CHECK-NEXT: fmov w0, s0
 ; CHECK-NEXT: ret
   %x = call <5 x i32> @llvm.fptosi.sat.v5f64.v5i32(<5 x double> %f)
   ret <5 x i32> %x
@@ -224,12 +231,22 @@
 define <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) {
 ; CHECK-LABEL: test_signed_v6f64_v6i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs w0, d0
-; CHECK-NEXT: fcvtzs w1, d1
-; CHECK-NEXT: fcvtzs w2, d2
+; CHECK-NEXT: fcvtzs w9, d0
+; CHECK-NEXT: fcvtzs w10, d1
+; CHECK-NEXT: fcvtzs w8, d4
 ; CHECK-NEXT: fcvtzs w3, d3
-; CHECK-NEXT: fcvtzs w4, d4
 ; CHECK-NEXT: fcvtzs w5, d5
+; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: fcvtzs w9, d2
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov v0.s[1], w10
+; CHECK-NEXT: mov v1.s[1], w5
+; CHECK-NEXT: mov v0.s[2], w9
+; CHECK-NEXT: fmov w4, s1
+; CHECK-NEXT: mov v0.s[3], w3
+; CHECK-NEXT: mov w1, v0.s[1]
+; CHECK-NEXT: mov w2, v0.s[2]
+; CHECK-NEXT: fmov w0, s0
 ; CHECK-NEXT: ret
   %x = call <6 x i32> @llvm.fptosi.sat.v6f64.v6i32(<6 x double> %f)
   ret <6 x i32> %x
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -211,11 +211,18 @@
 define <5 x i32> @test_unsigned_v5f64_v5i32(<5 x double> %f) {
 ; CHECK-LABEL: test_unsigned_v5f64_v5i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzu w0, d0
-; CHECK-NEXT: fcvtzu w1, d1
-; CHECK-NEXT: fcvtzu w2, d2
+; CHECK-NEXT: fcvtzu w8, d0
+; CHECK-NEXT: fcvtzu w9, d1
 ; CHECK-NEXT: fcvtzu w3, d3
 ; CHECK-NEXT: fcvtzu w4, d4
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzu w8, d2
+; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: mov v0.s[2], w8
+; CHECK-NEXT: mov v0.s[3], w3
+; CHECK-NEXT: mov w1, v0.s[1]
+; CHECK-NEXT: mov w2, v0.s[2]
+; CHECK-NEXT: fmov w0, s0
 ; CHECK-NEXT: ret
   %x = call <5 x i32> @llvm.fptoui.sat.v5f64.v5i32(<5 x double> %f)
   ret <5 x i32> %x
@@ -224,12 +231,22 @@
 define <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) {
 ; CHECK-LABEL: test_unsigned_v6f64_v6i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzu w0, d0
-; CHECK-NEXT: fcvtzu w1, d1
-; CHECK-NEXT: fcvtzu w2, d2
+; CHECK-NEXT: fcvtzu w9, d0
+; CHECK-NEXT: fcvtzu w10, d1
+; CHECK-NEXT: fcvtzu w8, d4
 ; CHECK-NEXT: fcvtzu w3, d3
-; CHECK-NEXT: fcvtzu w4, d4
 ; CHECK-NEXT: fcvtzu w5, d5
+; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: fcvtzu w9, d2
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov v0.s[1], w10
+; CHECK-NEXT: mov v1.s[1], w5
+; CHECK-NEXT: mov v0.s[2], w9
+; CHECK-NEXT: fmov w4, s1
+; CHECK-NEXT: mov v0.s[3], w3
+; CHECK-NEXT: mov w1, v0.s[1]
+; CHECK-NEXT: mov w2, v0.s[2]
+; CHECK-NEXT: fmov w0, s0
 ; CHECK-NEXT: ret
   %x = call <6 x i32> @llvm.fptoui.sat.v6f64.v6i32(<6 x double> %f)
   ret <6 x i32> %x
@@ -691,7 +708,7 @@
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: mov s1, v0.s[1]
 ; CHECK-NEXT: fcvtzu x9, s0
-; CHECK-NEXT: mov x10, #1125899906842623
+; CHECK-NEXT: mov x10, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-NEXT: fcvtzu x8, s1
 ; CHECK-NEXT: cmp x8, x10
 ; CHECK-NEXT: csel x8, x8, x10, lo
@@ -737,9 +754,9 @@
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: fmov s0, s8
 ; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: mov w8, #1904214015
+; CHECK-NEXT: mov w8, #1904214015 // =0x717fffff
 ; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: mov x21, #68719476735
+; CHECK-NEXT: mov x21, #68719476735 // =0xfffffffff
 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT: fmov s9, w8
@@ -788,7 +805,7 @@
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: fmov s0, s8
 ; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: mov w8, #2139095039
+; CHECK-NEXT: mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
@@ -904,7 +921,7 @@
 ; CHECK-LABEL: test_unsigned_v4f32_v4i50:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: mov x8, #1125899906842623
+; CHECK-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-NEXT: mov s3, v0.s[1]
 ; CHECK-NEXT: fcvtzu x11, s0
 ; CHECK-NEXT: mov s2, v1.s[1]
@@ -967,10 +984,10 @@
 ; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT: fmov s0, s8
 ; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: mov w8, #1904214015
+; CHECK-NEXT: mov w8, #1904214015 // =0x717fffff
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: mov x25, #68719476735
+; CHECK-NEXT: mov x25, #68719476735 // =0xfffffffff
 ; CHECK-NEXT: fmov s9, w8
 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT: csel x8, xzr, x0, lt
@@ -1050,7 +1067,7 @@
 ; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT: fmov s0, s8
 ; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: mov w8, #2139095039
+; CHECK-NEXT: mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: fmov s9, w8
@@ -1146,7 +1163,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov d1, v0.d[1]
 ; CHECK-NEXT: fcvtzu w10, d0
-; CHECK-NEXT: mov w8, #255
+; CHECK-NEXT: mov w8, #255 // =0xff
 ; CHECK-NEXT: fcvtzu w9, d1
 ; CHECK-NEXT: cmp w9, #255
 ; CHECK-NEXT: csel w9, w9, w8, lo
@@ -1165,7 +1182,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov d1, v0.d[1]
 ; CHECK-NEXT: fcvtzu w9, d0
-; CHECK-NEXT: mov w10, #8191
+; CHECK-NEXT: mov w10, #8191 // =0x1fff
 ; CHECK-NEXT: fcvtzu w8, d1
 ; CHECK-NEXT: cmp w8, w10
 ; CHECK-NEXT: csel w8, w8, w10, lo
@@ -1184,7 +1201,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov d1, v0.d[1]
 ; CHECK-NEXT: fcvtzu w9, d0
-; CHECK-NEXT: mov w10, #65535
+; CHECK-NEXT: mov w10, #65535 // =0xffff
 ; CHECK-NEXT: fcvtzu w8, d1
 ; CHECK-NEXT: cmp w8, w10
 ; CHECK-NEXT: csel w8, w8, w10, lo
@@ -1203,7 +1220,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov d1, v0.d[1]
 ; CHECK-NEXT: fcvtzu w9, d0
-; CHECK-NEXT: mov w10, #524287
+; CHECK-NEXT: mov w10, #524287 // =0x7ffff
 ; CHECK-NEXT: fcvtzu w8, d1
 ; CHECK-NEXT: cmp w8, w10
 ; CHECK-NEXT: csel w8, w8, w10, lo
@@ -1236,7 +1253,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov d1, v0.d[1]
 ; CHECK-NEXT: fcvtzu x9, d0
-; CHECK-NEXT: mov x10, #1125899906842623
+; CHECK-NEXT: mov x10, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-NEXT: fcvtzu x8, d1
 ; CHECK-NEXT: cmp x8, x10
 ; CHECK-NEXT: csel x8, x8, x10, lo
@@ -1276,9 +1293,9 @@
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: fmov d0, d8
 ; CHECK-NEXT: bl __fixunsdfti
-; CHECK-NEXT: mov x8, #5057542381537067007
+; CHECK-NEXT: mov x8, #5057542381537067007 // =0x462fffffffffffff
 ; CHECK-NEXT: fcmp d8, #0.0
-; CHECK-NEXT: mov x21, #68719476735
+; CHECK-NEXT: mov x21, #68719476735 // =0xfffffffff
 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: fmov d9, x8
@@ -1326,7 +1343,7 @@
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: fmov d0, d8
 ; CHECK-NEXT: bl __fixunsdfti
-; CHECK-NEXT: mov x8, #5183643171103440895
+; CHECK-NEXT: mov x8, #5183643171103440895 // =0x47efffffffffffff
 ; CHECK-NEXT: fcmp d8, #0.0
 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -1478,7 +1495,7 @@
 ; CHECK-CVT-NEXT: mov h2, v0.h[2]
 ; CHECK-CVT-NEXT: mov h3, v0.h[3]
 ; CHECK-CVT-NEXT: fcvt s0, h0
-; CHECK-CVT-NEXT: mov x8, #1125899906842623
+; CHECK-CVT-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-CVT-NEXT: fcvt s1, h1
 ; CHECK-CVT-NEXT: fcvt s2, h2
 ; CHECK-CVT-NEXT: fcvt s3, h3
@@ -1503,7 +1520,7 @@
 ; CHECK-FP16-NEXT: mov h2, v0.h[2]
 ; CHECK-FP16-NEXT: mov h3, v0.h[3]
 ; CHECK-FP16-NEXT: fcvtzu x9, h0
-; CHECK-FP16-NEXT: mov x8, #1125899906842623
+; CHECK-FP16-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-FP16-NEXT: fcvtzu x10, h1
 ; CHECK-FP16-NEXT: fcvtzu x11, h2
 ; CHECK-FP16-NEXT: cmp x9, x8
@@ -1587,9 +1604,9 @@
 ; CHECK-NEXT: fmov s0, s8
 ; CHECK-NEXT: bl __fixunssfti
 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov w8, #1904214015
+; CHECK-NEXT: mov w8, #1904214015 // =0x717fffff
 ; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: mov x25, #68719476735
+; CHECK-NEXT: mov x25, #68719476735 // =0xfffffffff
 ; CHECK-NEXT: mov h0, v0.h[1]
 ; CHECK-NEXT: fmov s9, w8
 ; CHECK-NEXT: csel x8, xzr, x0, lt
@@ -1673,7 +1690,7 @@
 ; CHECK-NEXT: fmov s0, s8
 ; CHECK-NEXT: bl __fixunssfti
 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov w8, #2139095039
+; CHECK-NEXT: mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: mov h0, v0.h[2]
 ; CHECK-NEXT: fmov s9, w8
@@ -1809,7 +1826,7 @@
 ; CHECK-CVT: // %bb.0:
 ; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
 ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov w8, #255
+; CHECK-CVT-NEXT: mov w8, #255 // =0xff
 ; CHECK-CVT-NEXT: mov s2, v1.s[1]
 ; CHECK-CVT-NEXT: mov s3, v1.s[2]
 ; CHECK-CVT-NEXT: mov s4, v1.s[3]
@@ -1866,7 +1883,7 @@
 ; CHECK-CVT: // %bb.0:
 ; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
 ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov w8, #8191
+; CHECK-CVT-NEXT: mov w8, #8191 // =0x1fff
 ; CHECK-CVT-NEXT: mov s2, v1.s[1]
 ; CHECK-CVT-NEXT: mov s3, v1.s[2]
 ; CHECK-CVT-NEXT: mov s4, v1.s[3]
@@ -1923,7 +1940,7 @@
 ; CHECK-CVT: // %bb.0:
 ; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
 ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: mov w8, #65535
+; CHECK-CVT-NEXT: mov w8, #65535 // =0xffff
 ; CHECK-CVT-NEXT: mov s2, v1.s[1]
 ; CHECK-CVT-NEXT: mov s3, v1.s[2]
 ; CHECK-CVT-NEXT: mov s4, v1.s[3]
@@ -2012,7 +2029,7 @@
 ; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i50:
 ; CHECK-CVT: // %bb.0:
 ; CHECK-CVT-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-CVT-NEXT: mov x8, #1125899906842623
+; CHECK-CVT-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-CVT-NEXT: mov h2, v0.h[1]
 ; CHECK-CVT-NEXT: mov h3, v0.h[2]
 ; CHECK-CVT-NEXT: mov h5, v0.h[3]
@@ -2056,7 +2073,7 @@
 ; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i50:
 ; CHECK-FP16: // %bb.0:
 ; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-FP16-NEXT: mov x8, #1125899906842623
+; CHECK-FP16-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff
 ; CHECK-FP16-NEXT: mov h2, v0.h[1]
 ; CHECK-FP16-NEXT: mov h3, v0.h[2]
 ; CHECK-FP16-NEXT: mov h5, v0.h[3]
@@ -2193,9 +2210,9 @@
 ; CHECK-NEXT: fmov s0, s8
 ; CHECK-NEXT: bl __fixunssfti
 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: mov w8, #1904214015
+; CHECK-NEXT: mov w8, #1904214015 // =0x717fffff
 ; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: mov x23, #68719476735
+; CHECK-NEXT: mov x23, #68719476735 // =0xfffffffff
 ; CHECK-NEXT: mov h0, v0.h[3]
 ; CHECK-NEXT: fmov s9, w8
 ; CHECK-NEXT: csel x8, xzr, x0, lt
@@ -2357,7 +2374,7 @@
 ; CHECK-NEXT: fmov s0, s8
 ; CHECK-NEXT: bl __fixunssfti
 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: mov w8, #2139095039
+; CHECK-NEXT: mov w8, #2139095039 // =0x7f7fffff
 ; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: mov h0, v0.h[1]
 ; CHECK-NEXT: fmov s9, w8
@@ -2559,7 +2576,7 @@
 ; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
 ; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
 ; CHECK-CVT-NEXT: fcvtl2 v5.4s, v0.8h
-; CHECK-CVT-NEXT: mov w8, #255
+; CHECK-CVT-NEXT: mov w8, #255 // =0xff
 ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
 ; CHECK-CVT-NEXT: mov s3, v2.s[1]
 ; CHECK-CVT-NEXT: mov s4, v2.s[2]
@@ -2661,7 +2678,7 @@
 ; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
 ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
 ; CHECK-CVT-NEXT: fcvtl2 v5.4s, v1.8h
-; CHECK-CVT-NEXT: mov w8, #65535
+; CHECK-CVT-NEXT: mov w8, #65535 // =0xffff
 ; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
 ; CHECK-CVT-NEXT: mov
s3, v2.s[1] ; CHECK-CVT-NEXT: mov s4, v2.s[2] @@ -2758,7 +2775,7 @@ ; CHECK-NEXT: mov d4, v3.d[1] ; CHECK-NEXT: fcvtzu w10, d3 ; CHECK-NEXT: mov d3, v2.d[1] -; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov w8, #255 // =0xff ; CHECK-NEXT: fcvtzu w12, d2 ; CHECK-NEXT: fcvtzu w13, d1 ; CHECK-NEXT: fcvtzu w9, d4 @@ -2806,7 +2823,7 @@ ; CHECK-NEXT: mov d16, v0.d[1] ; CHECK-NEXT: fcvtzu w10, d0 ; CHECK-NEXT: mov d0, v1.d[1] -; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov w8, #255 // =0xff ; CHECK-NEXT: fcvtzu w12, d1 ; CHECK-NEXT: mov d1, v2.d[1] ; CHECK-NEXT: fcvtzu w9, d16 @@ -2910,7 +2927,7 @@ ; CHECK-NEXT: mov d4, v3.d[1] ; CHECK-NEXT: fcvtzu w10, d3 ; CHECK-NEXT: mov d3, v2.d[1] -; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov w8, #65535 // =0xffff ; CHECK-NEXT: fcvtzu w12, d2 ; CHECK-NEXT: fcvtzu w13, d1 ; CHECK-NEXT: fcvtzu w9, d4 @@ -2958,7 +2975,7 @@ ; CHECK-NEXT: mov d16, v3.d[1] ; CHECK-NEXT: fcvtzu w9, d3 ; CHECK-NEXT: mov d3, v2.d[1] -; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov w8, #65535 // =0xffff ; CHECK-NEXT: fcvtzu w10, d2 ; CHECK-NEXT: mov d2, v1.d[1] ; CHECK-NEXT: fcvtzu w11, d1 diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll --- a/llvm/test/CodeGen/AArch64/funnel-shift.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll @@ -19,12 +19,12 @@ define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: fshl_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsr w9, w1, #1 -; CHECK-NEXT: lsl w10, w0, w2 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsr w10, w1, #1 +; CHECK-NEXT: lsr w9, w10, w9 +; CHECK-NEXT: lsl w8, w0, w8 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) ret i32 %f @@ -46,7 +46,8 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; CHECK-LABEL: fshl_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: tst x4, #0x40 +; CHECK-NEXT: ubfx x8, x4, #6, #1 +; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: mvn w8, w4 ; CHECK-NEXT: csel x9, x2, x3, ne ; CHECK-NEXT: csel x10, x3, x0, ne @@ -69,14 +70,14 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) { ; CHECK-LABEL: fshl_i37: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x9, #46053 +; CHECK-NEXT: mov x9, #46053 // =0xb3e5 ; CHECK-NEXT: and x8, x2, #0x1fffffffff ; CHECK-NEXT: movk x9, #12398, lsl #16 ; CHECK-NEXT: ubfiz x10, x1, #26, #37 ; CHECK-NEXT: movk x9, #15941, lsl #32 ; CHECK-NEXT: movk x9, #1771, lsl #48 ; CHECK-NEXT: umulh x8, x8, x9 -; CHECK-NEXT: mov w9, #37 +; CHECK-NEXT: mov w9, #37 // =0x25 ; CHECK-NEXT: msub w8, w8, w9, w2 ; CHECK-NEXT: mvn w9, w8 ; CHECK-NEXT: lsl x8, x0, x8 @@ -93,7 +94,7 @@ define i7 @fshl_i7_const_fold() { ; CHECK-LABEL: fshl_i7_const_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #67 +; CHECK-NEXT: mov w0, #67 // =0x43 ; CHECK-NEXT: ret %f = call i7 @llvm.fshl.i7(i7 112, i7 127, i7 2) ret i7 %f @@ -102,7 +103,7 @@ define i8 @fshl_i8_const_fold_overshift_1() { ; CHECK-LABEL: fshl_i8_const_fold_overshift_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #128 +; CHECK-NEXT: mov w0, #128 // =0x80 ; CHECK-NEXT: ret %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 15) ret i8 %f @@ -111,7 +112,7 @@ define i8 @fshl_i8_const_fold_overshift_2() { ; CHECK-LABEL: fshl_i8_const_fold_overshift_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #120 +; CHECK-NEXT: mov w0, #120 // =0x78 ; CHECK-NEXT: ret %f = call i8 @llvm.fshl.i8(i8 15, i8 15, i8 11) ret i8 %f @@ -164,7 +165,7 @@ 
define i8 @fshl_i8_const_fold() { ; CHECK-LABEL: fshl_i8_const_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #128 +; CHECK-NEXT: mov w0, #128 // =0x80 ; CHECK-NEXT: ret %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7) ret i8 %f @@ -177,12 +178,12 @@ define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: fshr_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsl w9, w0, #1 -; CHECK-NEXT: lsr w10, w1, w2 -; CHECK-NEXT: lsl w8, w9, w8 -; CHECK-NEXT: orr w0, w8, w10 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsl w10, w0, #1 +; CHECK-NEXT: lsr w8, w1, w8 +; CHECK-NEXT: lsl w9, w10, w9 +; CHECK-NEXT: orr w0, w9, w8 ; CHECK-NEXT: ret %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) ret i32 %f @@ -206,7 +207,7 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) { ; CHECK-LABEL: fshr_i37: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x9, #46053 +; CHECK-NEXT: mov x9, #46053 // =0xb3e5 ; CHECK-NEXT: and x8, x2, #0x1fffffffff ; CHECK-NEXT: movk x9, #12398, lsl #16 ; CHECK-NEXT: lsl x10, x1, #27 @@ -214,7 +215,7 @@ ; CHECK-NEXT: lsl x11, x0, #1 ; CHECK-NEXT: movk x9, #1771, lsl #48 ; CHECK-NEXT: umulh x8, x8, x9 -; CHECK-NEXT: mov w9, #37 +; CHECK-NEXT: mov w9, #37 // =0x25 ; CHECK-NEXT: msub w8, w8, w9, w2 ; CHECK-NEXT: add w8, w8, #27 ; CHECK-NEXT: mvn w9, w8 @@ -232,7 +233,7 @@ define i7 @fshr_i7_const_fold() { ; CHECK-LABEL: fshr_i7_const_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #31 +; CHECK-NEXT: mov w0, #31 // =0x1f ; CHECK-NEXT: ret %f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2) ret i7 %f @@ -241,7 +242,7 @@ define i8 @fshr_i8_const_fold_overshift_1() { ; CHECK-LABEL: fshr_i8_const_fold_overshift_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #254 +; CHECK-NEXT: mov w0, #254 // =0xfe ; CHECK-NEXT: ret %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 15) ret i8 %f @@ -250,7 +251,7 @@ define i8 @fshr_i8_const_fold_overshift_2() { ; CHECK-LABEL: fshr_i8_const_fold_overshift_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #225 +; CHECK-NEXT: mov w0, #225 // =0xe1 ; CHECK-NEXT: ret %f = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11) ret i8 %f @@ -259,7 +260,7 @@ define i8 @fshr_i8_const_fold_overshift_3() { ; CHECK-LABEL: fshr_i8_const_fold_overshift_3: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #255 +; CHECK-NEXT: mov w0, #255 // =0xff ; CHECK-NEXT: ret %f = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8) ret i8 %f @@ -303,7 +304,7 @@ define i8 @fshr_i8_const_fold() { ; CHECK-LABEL: fshr_i8_const_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #254 +; CHECK-NEXT: mov w0, #254 // =0xfe ; CHECK-NEXT: ret %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7) ret i8 %f @@ -472,12 +473,12 @@ define i32 @or_shl_fshl_simplify(i32 %x, i32 %y, i32 %s) { ; CHECK-LABEL: or_shl_fshl_simplify: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsr w9, w0, #1 -; CHECK-NEXT: lsl w10, w1, w2 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsr w10, w0, #1 +; CHECK-NEXT: lsr w9, w10, w9 +; CHECK-NEXT: lsl w8, w1, w8 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret %shy = shl i32 %y, %s %fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s) @@ -488,12 +489,12 @@ define i32 @or_lshr_fshr_simplify(i32 %x, i32 %y, i32 %s) { ; CHECK-LABEL: or_lshr_fshr_simplify: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsl w9, w0, #1 -; CHECK-NEXT: lsr w10, w1, w2 
-; CHECK-NEXT: lsl w8, w9, w8 -; CHECK-NEXT: orr w0, w8, w10 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsl w10, w0, #1 +; CHECK-NEXT: lsr w8, w1, w8 +; CHECK-NEXT: lsl w9, w10, w9 +; CHECK-NEXT: orr w0, w9, w8 ; CHECK-NEXT: ret %shy = lshr i32 %y, %s %fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s) diff --git a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll --- a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll @@ -12,7 +12,8 @@ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds x0, x0, #1 ; CHECK-NEXT: cinc x1, x1, hs -; CHECK-NEXT: orr x8, x1, x0, lsr #60 +; CHECK-NEXT: extr x8, x1, x0, #60 +; CHECK-NEXT: orr x8, x8, x1, lsr #60 ; CHECK-NEXT: cbnz x8, .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret @@ -31,7 +32,8 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_eq_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x1, x0, lsr #17 +; CHECK-NEXT: extr x8, x1, x0, #17 +; CHECK-NEXT: orr x8, x8, x1, lsr #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -43,7 +45,8 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_ne_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x1, x0, lsr #17 +; CHECK-NEXT: extr x8, x1, x0, #17 +; CHECK-NEXT: orr x8, x8, x1, lsr #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -55,7 +58,8 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_eq_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1, lsl #17 +; CHECK-NEXT: extr x8, x1, x0, #47 +; CHECK-NEXT: orr x8, x8, x0, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -67,7 +71,8 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_ne_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1, lsl #17 +; CHECK-NEXT: extr x8, x1, x0, #47 +; CHECK-NEXT: orr x8, x8, x0, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -101,7 +106,8 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: opt_setcc_expanded_shl_correct_shifts: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x1, x0, lsl #17 +; CHECK-NEXT: extr x8, x0, x1, #47 +; CHECK-NEXT: orr x8, x8, x1, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -136,12 +142,12 @@ define i1 @opt_setcc_shl_ne_zero_i256(i256 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_ne_zero_i256: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x2, x0 -; CHECK-NEXT: extr x9, x3, x2, #47 +; CHECK-NEXT: extr x8, x3, x2, #47 +; CHECK-NEXT: extr x9, x2, x1, #47 ; CHECK-NEXT: extr x10, x1, x0, #47 -; CHECK-NEXT: extr x8, x8, x1, #47 -; CHECK-NEXT: orr x9, x10, x9 -; CHECK-NEXT: orr x8, x8, x9 +; CHECK-NEXT: orr x9, x9, x0, lsl #17 +; CHECK-NEXT: orr x8, x10, x8 +; CHECK-NEXT: orr x8, x9, x8 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll --- a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll +++ b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll @@ -65,8 +65,6 @@ ; CHECK-NEXT: uzp1 v1.16b, v3.16b, v1.16b ; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: orn v0.16b, v0.16b, v1.16b -; CHECK-NEXT: shl v0.16b, v0.16b, #7 -; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; 
CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: bic w0, w9, w8 diff --git a/llvm/test/CodeGen/AArch64/insertshuffleload.ll b/llvm/test/CodeGen/AArch64/insertshuffleload.ll --- a/llvm/test/CodeGen/AArch64/insertshuffleload.ll +++ b/llvm/test/CodeGen/AArch64/insertshuffleload.ll @@ -30,8 +30,11 @@ define <8 x i16> @inserti8_first_sext(ptr %p) { ; CHECK-LABEL: inserti8_first_sext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ldrsb w8, [x0] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14 +; CHECK-NEXT: mov v0.h[0], w8 ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q @@ -46,8 +49,11 @@ define <8 x i16> @inserti8_last_sext(ptr %p) { ; CHECK-LABEL: inserti8_last_sext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldrsb w8, [x0, #8] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #2 +; CHECK-NEXT: mov v0.h[7], w8 ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 8 %l1 = load <8 x i8>, ptr %p @@ -62,8 +68,11 @@ define <8 x i16> @inserti8_first_zext(ptr %p) { ; CHECK-LABEL: inserti8_first_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ldrb w8, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14 +; CHECK-NEXT: mov v0.h[0], w8 ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q @@ -78,8 +87,11 @@ define <8 x i16> @inserti8_last_zext(ptr %p) { ; CHECK-LABEL: inserti8_last_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldrb w8, [x0, #8] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #2 +; CHECK-NEXT: mov v0.h[7], w8 ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 8 %l1 = load <8 x i8>, ptr %p diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll --- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll +++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -350,8 +350,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-8 -; CHECK-NEXT: mov x8, #-1 -; CHECK-NEXT: mov w9, #16 +; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl] ; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl] @@ -362,10 +362,10 @@ ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1w { z7.s }, p0, [sp, #7, mul vl] -; CHECK-NEXT: st1w { z4.s }, p0, [sp, #4, mul vl] +; CHECK-NEXT: st1w { z6.s }, p0, [sp, #6, mul vl] ; CHECK-NEXT: add x10, x9, x8, lsl #2 ; CHECK-NEXT: st1w { z5.s }, p0, [sp, #5, mul vl] -; CHECK-NEXT: st1w { z6.s }, p0, [sp, #6, mul vl] +; CHECK-NEXT: st1w { z4.s }, p0, [sp, #4, mul vl] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9, x8, lsl #2] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x10, #1, mul vl] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x10, #2, mul vl] @@ -455,7 +455,7 @@ ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #17 -; CHECK-NEXT: mov w10, #17 +; CHECK-NEXT: mov w10, #17 // =0x11 ; CHECK-NEXT: csel x9, x9, x10, lo ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: addvl x8, x8, #1 @@ -500,7 +500,7 @@ ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #18 -; CHECK-NEXT: mov w10, #18 +; CHECK-NEXT: mov w10, #18 // =0x12 ; CHECK-NEXT: csel x9, x9, x10, lo ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: addvl x8, x8, #1 @@ -611,7 +611,7 @@ ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #18 -; CHECK-NEXT: mov w10, #18 +; CHECK-NEXT: mov w10, #18 // =0x12 ; CHECK-NEXT: csel x9, x9, x10, lo ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: addvl x8, x8, #1 @@ -779,7 +779,7 @@ ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-8 +; CHECK-NEXT: mov x9, #-8 // =0xfffffffffffffff8 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] @@ -805,7 +805,7 @@ ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: rdvl x8, #4 ; CHECK-NEXT: cmp x8, #68 -; CHECK-NEXT: mov w9, #68 +; CHECK-NEXT: mov w9, #68 // =0x44 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: addvl x9, x10, #4 @@ -815,9 +815,9 @@ ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: st1w { z7.s }, p0, [sp, #7, mul vl] -; CHECK-NEXT: st1w { z4.s }, p0, [sp, #4, mul vl] -; CHECK-NEXT: st1w { z5.s }, p0, [sp, #5, mul vl] ; CHECK-NEXT: st1w { z6.s }, p0, [sp, #6, mul vl] +; CHECK-NEXT: st1w { z5.s }, p0, [sp, #5, mul vl] +; CHECK-NEXT: st1w { z4.s }, p0, [sp, #4, mul vl] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8, #2, mul vl] diff --git a/llvm/test/CodeGen/AArch64/neon-abd.ll b/llvm/test/CodeGen/AArch64/neon-abd.ll --- a/llvm/test/CodeGen/AArch64/neon-abd.ll +++ b/llvm/test/CodeGen/AArch64/neon-abd.ll @@ -53,7 +53,8 @@ ; CHECK-NEXT: shl v1.4h, v1.4h, #8 ; CHECK-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-NEXT: sshr v1.4h, v1.4h, #8 -; CHECK-NEXT: sabd v0.4h, v0.4h, v1.4h +; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: abs v0.4h, v0.4h ; CHECK-NEXT: ret %a.sext = sext <4 x i8> %a to <4 x i16> %b.sext = sext <4 x i8> %b to <4 x i16> @@ -107,7 +108,8 @@ ; CHECK-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-NEXT: sshr v0.2s, v0.2s, #16 ; CHECK-NEXT: sshr v1.2s, v1.2s, #16 -; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: abs v0.2s, v0.2s ; 
CHECK-NEXT: ret %a.sext = sext <2 x i16> %a to <2 x i32> %b.sext = sext <2 x i16> %b to <2 x i32> @@ -234,7 +236,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-NEXT: uabd v0.4h, v0.4h, v1.4h +; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: abs v0.4h, v0.4h ; CHECK-NEXT: ret %a.zext = zext <4 x i8> %a to <4 x i16> %b.zext = zext <4 x i8> %b to <4 x i16> @@ -287,7 +290,8 @@ ; CHECK-NEXT: movi d2, #0x00ffff0000ffff ; CHECK-NEXT: and v0.8b, v0.8b, v2.8b ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b -; CHECK-NEXT: uabd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: abs v0.2s, v0.2s ; CHECK-NEXT: ret %a.zext = zext <2 x i16> %a to <2 x i32> %b.zext = zext <2 x i16> %b to <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -661,8 +661,10 @@ define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: bsl2xi32_const: ; CHECK: // %bb.0: -; CHECK-NEXT: movi d2, #0x000000ffffffff -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.s[1], v1.s[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp1 = and <2 x i32> %a, < i32 -1, i32 0 > %tmp2 = and <2 x i32> %b, < i32 0, i32 -1 > @@ -686,8 +688,10 @@ define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: bsl1xi64_const: ; CHECK: // %bb.0: -; CHECK-NEXT: movi d2, #0xffffffffffffff00 -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.b[0], v1.b[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp1 = and <1 x i64> %a, < i64 -256 > %tmp2 = and <1 x i64> %b, < i64 255 > @@ -722,9 +726,7 @@ define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: bsl2xi64_const: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI75_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI75_0] -; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: mov v0.d[1], v1.d[1] ; CHECK-NEXT: ret %tmp1 = and <2 x i64> %a, < i64 -1, i64 0 > %tmp2 = and <2 x i64> %b, < i64 0, i64 -1 > diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -130,15 +130,17 @@ define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v5i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_0] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: umull2 v3.4s, v1.8h, v2.8h -; CHECK-NEXT: mov v0.s[0], v3.s[0] -; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h -; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: and v2.16b, v2.16b, v3.16b +; CHECK-NEXT: mov v2.s[3], wzr +; CHECK-NEXT: umlal v2.4s, v0.4h, v1.4h +; CHECK-NEXT: addv s0, v2.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 ; 
CHECK-NEXT: ret @@ -156,12 +158,14 @@ define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) { ; CHECK-LABEL: test_udot_v5i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 -; CHECK-NEXT: mov v0.s[0], v2.s[0] -; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: mov v1.s[3], wzr +; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -174,15 +178,17 @@ define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v5i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_0] ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: smull2 v3.4s, v1.8h, v2.8h -; CHECK-NEXT: mov v0.s[0], v3.s[0] -; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h -; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: and v2.16b, v2.16b, v3.16b +; CHECK-NEXT: mov v2.s[3], wzr +; CHECK-NEXT: smlal v2.4s, v0.4h, v1.4h +; CHECK-NEXT: addv s0, v2.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 ; CHECK-NEXT: ret @@ -200,19 +206,21 @@ define i32 @test_sdot_v5i8_double(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) { ; CHECK-LABEL: test_sdot_v5i8_double: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, .LCPI8_0 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI8_0] ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: smull2 v5.4s, v0.8h, v1.8h -; CHECK-NEXT: movi v6.2d, #0000000000000000 -; CHECK-NEXT: smull2 v7.4s, v2.8h, v3.8h -; CHECK-NEXT: mov v6.s[0], v5.s[0] -; CHECK-NEXT: mov v4.s[0], v7.s[0] -; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h +; CHECK-NEXT: smull2 v6.4s, v2.8h, v3.8h +; CHECK-NEXT: and v5.16b, v5.16b, v4.16b +; CHECK-NEXT: and v4.16b, v6.16b, v4.16b +; CHECK-NEXT: mov v5.s[3], wzr +; CHECK-NEXT: mov v4.s[3], wzr +; CHECK-NEXT: smlal v5.4s, v0.4h, v1.4h ; CHECK-NEXT: smlal v4.4s, v2.4h, v3.4h -; CHECK-NEXT: add v0.4s, v6.4s, v4.4s +; CHECK-NEXT: add v0.4s, v5.4s, v4.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -232,14 +240,16 @@ define i32 @test_sdot_v5i8_double_nomla(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) { ; CHECK-LABEL: test_sdot_v5i8_double_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: adrp x8, .LCPI9_0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0 -; CHECK-NEXT: sshll2 v5.4s, v2.8h, #0 -; CHECK-NEXT: mov v3.s[0], v4.s[0] -; CHECK-NEXT: mov v1.s[0], v5.s[0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: sshll2 v4.4s, v2.8h, #0 +; CHECK-NEXT: and v3.16b, v3.16b, v1.16b +; CHECK-NEXT: and v1.16b, v4.16b, v1.16b +; 
CHECK-NEXT: mov v3.s[3], wzr +; CHECK-NEXT: mov v1.s[3], wzr ; CHECK-NEXT: saddw v0.4s, v3.4s, v0.4h ; CHECK-NEXT: saddw v1.4s, v1.4s, v2.4h ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s @@ -998,27 +1008,29 @@ define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q1, q4, [x1] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ushll2 v7.8h, v4.16b, #0 -; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: adrp x8, .LCPI31_0 ; CHECK-NEXT: ushll2 v6.8h, v3.16b, #0 ; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ushll2 v7.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: ushll v5.8h, v2.8b, #0 ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: umull2 v16.4s, v3.8h, v1.8h ; CHECK-NEXT: umull v2.4s, v7.4h, v2.4h -; CHECK-NEXT: ushll v7.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-NEXT: umull2 v16.4s, v7.8h, v3.8h -; CHECK-NEXT: mov v0.s[0], v2.s[0] -; CHECK-NEXT: umull v2.4s, v7.4h, v3.4h -; CHECK-NEXT: umlal2 v16.4s, v4.8h, v5.8h -; CHECK-NEXT: umlal v0.4s, v1.4h, v6.4h -; CHECK-NEXT: umlal v2.4s, v4.4h, v5.4h -; CHECK-NEXT: umlal2 v16.4s, v1.8h, v6.8h -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v0.4s, v0.4s, v16.4s +; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI31_0] +; CHECK-NEXT: umull v1.4s, v3.4h, v1.4h +; CHECK-NEXT: umlal2 v16.4s, v0.8h, v5.8h +; CHECK-NEXT: and v2.16b, v2.16b, v7.16b +; CHECK-NEXT: mov v2.s[3], wzr +; CHECK-NEXT: umlal v1.4s, v0.4h, v5.4h +; CHECK-NEXT: umlal2 v16.4s, v6.8h, v4.8h +; CHECK-NEXT: umlal v2.4s, v6.4h, v4.4h +; CHECK-NEXT: add v0.4s, v1.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -1037,20 +1049,22 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { ; CHECK-LABEL: test_udot_v25i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: adrp x8, .LCPI32_0 +; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI32_0] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v4.8h, v2.16b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-NEXT: uaddl2 v5.4s, v1.8h, v2.8h -; CHECK-NEXT: mov v0.s[0], v4.s[0] -; CHECK-NEXT: uaddl v1.4s, v1.4h, v2.4h -; CHECK-NEXT: uaddw2 v2.4s, v5.4s, v3.8h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v3.4h -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: and v2.16b, v4.16b, v2.16b +; CHECK-NEXT: uaddl2 v4.4s, v0.8h, v1.8h +; CHECK-NEXT: mov v2.s[3], wzr +; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: uaddw2 v1.4s, v4.4s, v3.8h +; CHECK-NEXT: uaddw v2.4s, v2.4s, v3.4h +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -1063,27 +1077,29 @@ define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q1, q4, [x1] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; 
CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: sshll2 v7.8h, v4.16b, #0 -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: adrp x8, .LCPI33_0 ; CHECK-NEXT: sshll2 v6.8h, v3.16b, #0 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: sshll2 v7.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v5.8h, v2.8b, #0 ; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: smull2 v16.4s, v3.8h, v1.8h ; CHECK-NEXT: smull v2.4s, v7.4h, v2.4h -; CHECK-NEXT: sshll v7.8h, v1.8b, #0 -; CHECK-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-NEXT: smull2 v16.4s, v7.8h, v3.8h -; CHECK-NEXT: mov v0.s[0], v2.s[0] -; CHECK-NEXT: smull v2.4s, v7.4h, v3.4h -; CHECK-NEXT: smlal2 v16.4s, v4.8h, v5.8h -; CHECK-NEXT: smlal v0.4s, v1.4h, v6.4h -; CHECK-NEXT: smlal v2.4s, v4.4h, v5.4h -; CHECK-NEXT: smlal2 v16.4s, v1.8h, v6.8h -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v0.4s, v0.4s, v16.4s +; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI33_0] +; CHECK-NEXT: smull v1.4s, v3.4h, v1.4h +; CHECK-NEXT: smlal2 v16.4s, v0.8h, v5.8h +; CHECK-NEXT: and v2.16b, v2.16b, v7.16b +; CHECK-NEXT: mov v2.s[3], wzr +; CHECK-NEXT: smlal v1.4s, v0.4h, v5.4h +; CHECK-NEXT: smlal2 v16.4s, v6.8h, v4.8h +; CHECK-NEXT: smlal v2.4s, v6.4h, v4.4h +; CHECK-NEXT: add v0.4s, v1.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -1109,216 +1125,218 @@ ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b0, [sp, #16] ; CHECK-NEXT: add x9, sp, #24 -; CHECK-NEXT: add x10, sp, #40 -; CHECK-NEXT: add x11, sp, #128 +; CHECK-NEXT: ldr b6, [sp, #280] +; CHECK-NEXT: add x10, sp, #288 ; CHECK-NEXT: ld1 { v2.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #96 ; CHECK-NEXT: ld1 { v0.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: ldr b17, [sp, #152] -; CHECK-NEXT: fmov s4, w0 -; CHECK-NEXT: ldr b6, [sp, #280] +; CHECK-NEXT: ld1 { v6.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ldr b3, [sp, #152] ; CHECK-NEXT: add x12, sp, #224 ; CHECK-NEXT: ld1 { v2.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #104 ; CHECK-NEXT: ld1 { v0.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: ld1 { v3.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #120 ; CHECK-NEXT: ldr b1, [sp, #216] -; CHECK-NEXT: mov v4.b[1], w1 -; CHECK-NEXT: ldr b3, [sp, #480] +; CHECK-NEXT: fmov s7, w0 ; CHECK-NEXT: ld1 { v2.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #120 -; CHECK-NEXT: ld1 { v0.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #48 +; CHECK-NEXT: add x8, sp, #296 +; CHECK-NEXT: ldr b4, [sp, #480] +; CHECK-NEXT: add x11, sp, #136 ; CHECK-NEXT: ld1 { v1.b }[1], [x12] -; CHECK-NEXT: mov v4.b[2], w2 -; CHECK-NEXT: ldr b18, [sp, #352] +; CHECK-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #40 ; CHECK-NEXT: ld1 { v2.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #304 +; CHECK-NEXT: mov v7.b[1], w1 +; CHECK-NEXT: ldr b16, [sp, #352] +; CHECK-NEXT: ld1 { v0.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #128 +; CHECK-NEXT: ld1 { v6.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #48 +; CHECK-NEXT: ld1 { v2.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #312 +; CHECK-NEXT: ldr b17, [sp, #552] +; CHECK-NEXT: ld1 { v0.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #56 -; CHECK-NEXT: ld1 { v0.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #288 +; CHECK-NEXT: ld1 { v6.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #64 +; CHECK-NEXT: ld1 { v2.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #320 +; CHECK-NEXT: 
mov v7.b[2], w2 ; CHECK-NEXT: ldr b20, [sp, #680] -; CHECK-NEXT: mov v4.b[3], w3 -; CHECK-NEXT: ldr b5, [sp, #144] -; CHECK-NEXT: ld1 { v2.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #160 ; CHECK-NEXT: ld1 { v0.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ld1 { v6.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #296 -; CHECK-NEXT: ld1 { v17.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #136 -; CHECK-NEXT: ld1 { v2.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #320 -; CHECK-NEXT: ld1 { v0.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #72 -; CHECK-NEXT: ld1 { v6.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #304 -; CHECK-NEXT: mov v4.b[4], w4 -; CHECK-NEXT: ld1 { v2.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #168 -; CHECK-NEXT: ld1 { v0.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #232 -; CHECK-NEXT: ld1 { v6.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #312 -; CHECK-NEXT: ld1 { v17.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #488 -; CHECK-NEXT: mov v4.b[5], w5 -; CHECK-NEXT: ld1 { v6.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #240 -; CHECK-NEXT: ld1 { v3.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #496 -; CHECK-NEXT: ld1 { v17.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #184 -; CHECK-NEXT: ld1 { v1.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #248 -; CHECK-NEXT: mov v4.b[6], w6 -; CHECK-NEXT: ld1 { v6.b }[5], [x11] +; CHECK-NEXT: add x9, sp, #168 +; CHECK-NEXT: ld1 { v6.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #232 +; CHECK-NEXT: ld1 { v2.b }[7], [x11] +; CHECK-NEXT: add x11, sp, #72 ; CHECK-NEXT: ld1 { v3.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #504 -; CHECK-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #192 -; CHECK-NEXT: ld1 { v1.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #256 -; CHECK-NEXT: add x11, sp, #328 +; CHECK-NEXT: add x9, sp, #176 +; CHECK-NEXT: ld1 { v0.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #328 +; CHECK-NEXT: ld1 { v1.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #240 +; CHECK-NEXT: mov v7.b[3], w3 +; CHECK-NEXT: ldr b19, [sp, #344] ; CHECK-NEXT: ld1 { v3.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #512 -; CHECK-NEXT: ld1 { v17.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #200 -; CHECK-NEXT: ld1 { v1.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #264 -; CHECK-NEXT: mov v4.b[7], w7 -; CHECK-NEXT: ld1 { v6.b }[6], [x11] +; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: ld1 { v6.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #336 +; CHECK-NEXT: ld1 { v1.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #248 +; CHECK-NEXT: mov v7.b[4], w4 +; CHECK-NEXT: ld1 { v0.b }[7], [x11] ; CHECK-NEXT: ld1 { v3.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #520 -; CHECK-NEXT: ld1 { v17.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #208 -; CHECK-NEXT: ld1 { v1.b }[6], [x10] -; CHECK-NEXT: add x11, sp, #336 -; CHECK-NEXT: add x10, sp, #272 +; CHECK-NEXT: add x9, sp, #192 +; CHECK-NEXT: ld1 { v6.b }[7], [x10] +; CHECK-NEXT: add x10, sp, #488 +; CHECK-NEXT: ld1 { v1.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #256 +; CHECK-NEXT: mov v7.b[5], w5 +; CHECK-NEXT: add x11, sp, #264 ; CHECK-NEXT: ld1 { v3.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #536 -; CHECK-NEXT: ld1 { v17.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #528 -; CHECK-NEXT: sshll v19.8h, v4.8b, #0 -; CHECK-NEXT: ldr b4, [sp, #416] -; CHECK-NEXT: ld1 { v6.b }[7], [x11] -; CHECK-NEXT: add x11, sp, #688 -; CHECK-NEXT: ld1 { v3.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #424 -; CHECK-NEXT: ld1 { v1.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #360 -; CHECK-NEXT: sshll v7.8h, v2.8b, #0 -; CHECK-NEXT: ldr b2, [sp, #344] -; CHECK-NEXT: ld1 { v4.b }[1], [x8] -; 
CHECK-NEXT: add x8, sp, #432 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: ld1 { v18.b }[1], [x10] -; CHECK-NEXT: sshll v16.8h, v6.8b, #0 -; CHECK-NEXT: ld1 { v3.b }[7], [x9] -; CHECK-NEXT: sshll v6.8h, v2.8b, #0 -; CHECK-NEXT: add x9, sp, #560 -; CHECK-NEXT: smull v2.4s, v19.4h, v17.4h -; CHECK-NEXT: ld1 { v4.b }[2], [x8] -; CHECK-NEXT: smull2 v17.4s, v19.8h, v17.8h -; CHECK-NEXT: ldr b19, [sp, #552] -; CHECK-NEXT: add x8, sp, #368 -; CHECK-NEXT: add x10, sp, #440 -; CHECK-NEXT: ld1 { v20.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #696 -; CHECK-NEXT: ld1 { v19.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #376 -; CHECK-NEXT: ld1 { v18.b }[2], [x8] +; CHECK-NEXT: add x9, sp, #200 +; CHECK-NEXT: ld1 { v4.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #496 +; CHECK-NEXT: ld1 { v1.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #208 +; CHECK-NEXT: mov v7.b[6], w6 +; CHECK-NEXT: ldr b5, [sp, #144] +; CHECK-NEXT: ld1 { v3.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #360 +; CHECK-NEXT: ld1 { v4.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #560 +; CHECK-NEXT: ld1 { v1.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #528 +; CHECK-NEXT: ld1 { v16.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #368 +; CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #504 +; CHECK-NEXT: ld1 { v17.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #512 +; CHECK-NEXT: mov v7.b[7], w7 +; CHECK-NEXT: ld1 { v4.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #568 -; CHECK-NEXT: ld1 { v4.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #448 -; CHECK-NEXT: ld1 { v20.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #704 -; CHECK-NEXT: ld1 { v19.b }[2], [x8] +; CHECK-NEXT: ld1 { v16.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #376 +; CHECK-NEXT: sshll v18.8h, v3.8b, #0 +; CHECK-NEXT: ld1 { v17.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #576 -; CHECK-NEXT: ld1 { v18.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #384 -; CHECK-NEXT: smlal v2.4s, v7.4h, v16.4h ; CHECK-NEXT: ld1 { v4.b }[4], [x10] -; CHECK-NEXT: smlal2 v17.4s, v7.8h, v16.8h -; CHECK-NEXT: ldr b7, [sp, #616] -; CHECK-NEXT: ld1 { v19.b }[3], [x8] +; CHECK-NEXT: add x10, sp, #520 +; CHECK-NEXT: ld1 { v16.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #384 +; CHECK-NEXT: sshll v7.8h, v7.8b, #0 +; CHECK-NEXT: ld1 { v17.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #584 -; CHECK-NEXT: ld1 { v18.b }[4], [x9] +; CHECK-NEXT: ld1 { v4.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #688 +; CHECK-NEXT: ld1 { v16.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #392 -; CHECK-NEXT: add x10, sp, #456 -; CHECK-NEXT: ld1 { v20.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #400 -; CHECK-NEXT: ld1 { v19.b }[4], [x8] +; CHECK-NEXT: smull2 v3.4s, v7.8h, v18.8h +; CHECK-NEXT: ld1 { v17.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #592 -; CHECK-NEXT: ld1 { v18.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #624 -; CHECK-NEXT: ld1 { v4.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #632 -; CHECK-NEXT: ld1 { v19.b }[5], [x8] +; CHECK-NEXT: ld1 { v20.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #696 +; CHECK-NEXT: ld1 { v16.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #400 +; CHECK-NEXT: smull v7.4s, v7.4h, v18.4h +; CHECK-NEXT: ld1 { v4.b }[6], [x11] +; CHECK-NEXT: ld1 { v17.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #600 +; CHECK-NEXT: ld1 { v20.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #704 +; CHECK-NEXT: ld1 { v16.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #408 +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEXT: ldr b18, [sp, #416] +; CHECK-NEXT: ld1 { v17.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #608 +; 
CHECK-NEXT: ld1 { v20.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #712 +; CHECK-NEXT: ld1 { v16.b }[7], [x9] +; CHECK-NEXT: add x11, sp, #424 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: add x9, sp, #720 +; CHECK-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #432 ; CHECK-NEXT: ld1 { v20.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #720 -; CHECK-NEXT: ld1 { v18.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #408 -; CHECK-NEXT: ld1 { v7.b }[2], [x9] +; CHECK-NEXT: add x10, sp, #624 +; CHECK-NEXT: smlal v7.4s, v2.4h, v6.4h +; CHECK-NEXT: ld1 { v18.b }[1], [x11] +; CHECK-NEXT: smlal2 v3.4s, v2.8h, v6.8h +; CHECK-NEXT: add x11, sp, #536 +; CHECK-NEXT: sshll v2.8h, v16.8b, #0 +; CHECK-NEXT: sshll v6.8h, v17.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #728 +; CHECK-NEXT: ld1 { v18.b }[2], [x8] +; CHECK-NEXT: smull2 v16.4s, v2.8h, v6.8h +; CHECK-NEXT: add x8, sp, #440 +; CHECK-NEXT: smull v2.4s, v2.4h, v6.4h +; CHECK-NEXT: ldr b6, [sp, #616] +; CHECK-NEXT: ld1 { v20.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #736 +; CHECK-NEXT: ld1 { v18.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #448 +; CHECK-NEXT: ld1 { v6.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #632 +; CHECK-NEXT: ld1 { v4.b }[7], [x11] +; CHECK-NEXT: add x11, sp, #272 +; CHECK-NEXT: ld1 { v20.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #640 -; CHECK-NEXT: ld1 { v19.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #608 -; CHECK-NEXT: ld1 { v20.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #728 -; CHECK-NEXT: ld1 { v18.b }[7], [x11] -; CHECK-NEXT: add x11, sp, #464 -; CHECK-NEXT: ld1 { v7.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #664 -; CHECK-NEXT: ld1 { v19.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #648 -; CHECK-NEXT: ld1 { v20.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #736 -; CHECK-NEXT: sshll v16.8h, v18.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[6], [x11] -; CHECK-NEXT: ld1 { v7.b }[4], [x8] +; CHECK-NEXT: ld1 { v18.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #456 +; CHECK-NEXT: ld1 { v6.b }[2], [x10] +; CHECK-NEXT: adrp x10, .LCPI34_0 +; CHECK-NEXT: sshll v17.8h, v19.8b, #0 +; CHECK-NEXT: ld1 { v1.b }[7], [x11] +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v18.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #656 -; CHECK-NEXT: sshll v18.8h, v19.8b, #0 -; CHECK-NEXT: ld1 { v20.b }[7], [x10] -; CHECK-NEXT: smull v19.4s, v16.4h, v18.4h -; CHECK-NEXT: ld1 { v7.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #472 -; CHECK-NEXT: smull2 v16.4s, v16.8h, v18.8h -; CHECK-NEXT: ldr b18, [sp, #544] -; CHECK-NEXT: smull v5.4s, v5.4h, v6.4h -; CHECK-NEXT: ldr b6, [sp, #744] -; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[7], [x8] -; CHECK-NEXT: sshll v20.8h, v20.8b, #0 -; CHECK-NEXT: ld1 { v7.b }[6], [x9] -; CHECK-NEXT: sshll v18.8h, v18.8b, #0 -; CHECK-NEXT: add x9, sp, #672 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: smlal v19.4s, v3.4h, v20.4h -; CHECK-NEXT: smlal2 v16.4s, v3.8h, v20.8h -; CHECK-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: smull v6.4s, v18.4h, v6.4h -; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v6.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #648 +; CHECK-NEXT: sshll v19.8h, v20.8b, #0 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: smlal v2.4s, v4.4h, v19.4h +; CHECK-NEXT: ld1 { v6.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #464 +; CHECK-NEXT: smlal2 v16.4s, v4.8h, v19.8h +; CHECK-NEXT: ldr b4, [sp, #544] +; CHECK-NEXT: ldr b19, [sp, #744] +; CHECK-NEXT: ld1 { v18.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #472 +; CHECK-NEXT: ld1 { v6.b }[5], [x8] +; 
CHECK-NEXT: add x8, sp, #664 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: sshll v19.8h, v19.8b, #0 +; CHECK-NEXT: smull v5.4s, v5.4h, v17.4h +; CHECK-NEXT: ldr q17, [x10, :lo12:.LCPI34_0] +; CHECK-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #672 +; CHECK-NEXT: smull v4.4s, v4.4h, v19.4h +; CHECK-NEXT: ld1 { v18.b }[7], [x9] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: mov v3.s[0], v5.s[0] +; CHECK-NEXT: and v5.16b, v5.16b, v17.16b +; CHECK-NEXT: ld1 { v6.b }[7], [x8] +; CHECK-NEXT: and v4.16b, v4.16b, v17.16b +; CHECK-NEXT: mov v5.s[3], wzr +; CHECK-NEXT: mov v4.s[3], wzr ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: mov v18.s[0], v6.s[0] -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-NEXT: sshll v5.8h, v7.8b, #0 -; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h -; CHECK-NEXT: smlal v18.4s, v4.4h, v5.4h -; CHECK-NEXT: smlal2 v17.4s, v0.8h, v1.8h -; CHECK-NEXT: smlal2 v16.4s, v4.8h, v5.8h -; CHECK-NEXT: add v0.4s, v2.4s, v3.4s -; CHECK-NEXT: add v1.4s, v19.4s, v18.4s -; CHECK-NEXT: add v0.4s, v0.4s, v17.4s +; CHECK-NEXT: sshll v17.8h, v18.8b, #0 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: smlal v5.4s, v0.4h, v1.4h +; CHECK-NEXT: smlal v4.4s, v17.4h, v6.4h +; CHECK-NEXT: smlal2 v3.4s, v0.8h, v1.8h +; CHECK-NEXT: smlal2 v16.4s, v17.8h, v6.8h +; CHECK-NEXT: add v0.4s, v7.4s, v5.4s +; CHECK-NEXT: add v1.4s, v2.4s, v4.4s +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s ; CHECK-NEXT: add v1.4s, v1.4s, v16.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s @@ -1348,112 +1366,114 @@ ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b2, [sp, #16] ; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: add x10, sp, #112 ; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: ldr b3, [sp, #480] ; CHECK-NEXT: ld1 { v0.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #96 ; CHECK-NEXT: ld1 { v2.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: mov v1.b[1], w1 -; CHECK-NEXT: add x10, sp, #488 -; CHECK-NEXT: add x11, sp, #496 -; CHECK-NEXT: ldr b4, [sp, #352] +; CHECK-NEXT: add x11, sp, #120 +; CHECK-NEXT: ldr b3, [sp, #480] +; CHECK-NEXT: add x12, sp, #488 +; CHECK-NEXT: ldr b5, [sp, #352] ; CHECK-NEXT: ld1 { v0.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #104 ; CHECK-NEXT: ld1 { v2.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #40 -; CHECK-NEXT: ld1 { v3.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #48 -; CHECK-NEXT: mov v1.b[2], w2 +; CHECK-NEXT: mov v1.b[1], w1 +; CHECK-NEXT: ld1 { v3.b }[1], [x12] +; CHECK-NEXT: add x12, sp, #496 ; CHECK-NEXT: ldr b6, [sp, #416] ; CHECK-NEXT: ld1 { v0.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #112 +; CHECK-NEXT: add x8, sp, #48 ; CHECK-NEXT: ld1 { v2.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: ld1 { v3.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #56 +; CHECK-NEXT: mov v1.b[2], w2 +; CHECK-NEXT: ld1 { v3.b }[2], [x12] +; CHECK-NEXT: add x12, sp, #424 +; CHECK-NEXT: ldr b4, [sp, #144] +; CHECK-NEXT: ld1 { v0.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #56 +; CHECK-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #136 ; CHECK-NEXT: mov v1.b[3], w3 -; CHECK-NEXT: add x12, sp, #504 -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #120 -; CHECK-NEXT: ld1 { v2.b }[4], [x10] +; CHECK-NEXT: ld1 { v6.b }[1], [x12] +; CHECK-NEXT: ldr b17, [sp, #544] +; CHECK-NEXT: ld1 { v0.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #72 +; CHECK-NEXT: ld1 { v2.b }[5], [x10] ; CHECK-NEXT: add x10, sp, #64 -; CHECK-NEXT: ldr b5, [sp, #144] ; CHECK-NEXT: mov v1.b[4], w4 -; CHECK-NEXT: ld1 { v3.b }[3], [x12] -; CHECK-NEXT: ld1 { v0.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #136 
-; CHECK-NEXT: ld1 { v2.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #360 -; CHECK-NEXT: add x12, sp, #72 -; CHECK-NEXT: mov v1.b[5], w5 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 ; CHECK-NEXT: ld1 { v0.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #424 -; CHECK-NEXT: ld1 { v4.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #512 +; CHECK-NEXT: add x9, sp, #504 ; CHECK-NEXT: ld1 { v2.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #368 -; CHECK-NEXT: ld1 { v6.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #376 +; CHECK-NEXT: add x10, sp, #360 +; CHECK-NEXT: mov v1.b[5], w5 +; CHECK-NEXT: ld1 { v3.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #432 +; CHECK-NEXT: ld1 { v5.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #512 ; CHECK-NEXT: ld1 { v0.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #432 -; CHECK-NEXT: ld1 { v4.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #520 +; CHECK-NEXT: add x8, sp, #368 +; CHECK-NEXT: ld1 { v6.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #440 ; CHECK-NEXT: mov v1.b[6], w6 -; CHECK-NEXT: ld1 { v2.b }[7], [x12] -; CHECK-NEXT: ld1 { v6.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #440 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: ld1 { v3.b }[4], [x11] -; CHECK-NEXT: ld1 { v4.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #384 -; CHECK-NEXT: movi v7.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v6.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #448 +; CHECK-NEXT: ld1 { v3.b }[4], [x10] +; CHECK-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #376 +; CHECK-NEXT: add x10, sp, #520 +; CHECK-NEXT: ld1 { v2.b }[7], [x11] +; CHECK-NEXT: ld1 { v6.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #448 ; CHECK-NEXT: mov v1.b[7], w7 +; CHECK-NEXT: adrp x11, .LCPI35_0 +; CHECK-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #384 ; CHECK-NEXT: ld1 { v3.b }[5], [x10] -; CHECK-NEXT: sshll v5.4s, v5.4h, #0 -; CHECK-NEXT: ld1 { v4.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #392 ; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: ld1 { v6.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #456 -; CHECK-NEXT: mov v7.s[0], v5.s[0] +; CHECK-NEXT: ld1 { v6.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #456 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[5], [x9] -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: add x9, sp, #400 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: ld1 { v6.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #464 +; CHECK-NEXT: ldr q7, [x11, :lo12:.LCPI35_0] +; CHECK-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #392 ; CHECK-NEXT: ld1 { v3.b }[6], [x10] -; CHECK-NEXT: saddw v5.4s, v7.4s, v2.4h -; CHECK-NEXT: ld1 { v4.b }[6], [x9] -; CHECK-NEXT: saddl v7.4s, v1.4h, v0.4h ; CHECK-NEXT: add x10, sp, #536 -; CHECK-NEXT: add x9, sp, #408 -; CHECK-NEXT: ld1 { v6.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #472 -; CHECK-NEXT: add v5.4s, v7.4s, v5.4s -; CHECK-NEXT: ldr b7, [sp, #544] -; CHECK-NEXT: saddl2 v0.4s, v1.8h, v0.8h +; CHECK-NEXT: ld1 { v6.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #464 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #400 +; CHECK-NEXT: saddl v16.4s, v1.4h, v0.4h ; CHECK-NEXT: ld1 { v3.b }[7], [x10] -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v4.b }[7], [x9] -; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v6.b }[7], [x8] -; CHECK-NEXT: saddw2 v0.4s, v0.4s, v2.8h -; CHECK-NEXT: sshll v7.4s, v7.4h, #0 +; CHECK-NEXT: ld1 { v6.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #472 +; CHECK-NEXT: saddl2 v0.4s, v1.8h, v0.8h +; CHECK-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #408 +; CHECK-NEXT: sshll v1.8h, v17.8b, #0 +; CHECK-NEXT: sshll 
v4.4s, v4.4h, #0 +; CHECK-NEXT: ld1 { v6.b }[7], [x9] +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-NEXT: and v4.16b, v4.16b, v7.16b +; CHECK-NEXT: and v1.16b, v1.16b, v7.16b +; CHECK-NEXT: mov v4.s[3], wzr +; CHECK-NEXT: mov v1.s[3], wzr ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: mov v1.s[0], v7.s[0] -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: saddl v7.4s, v4.4h, v3.4h -; CHECK-NEXT: saddl2 v3.4s, v4.8h, v3.8h +; CHECK-NEXT: saddl v7.4s, v5.4h, v3.4h +; CHECK-NEXT: saddl2 v3.4s, v5.8h, v3.8h +; CHECK-NEXT: saddw v4.4s, v4.4s, v2.4h ; CHECK-NEXT: saddw v1.4s, v1.4s, v6.4h -; CHECK-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-NEXT: saddw2 v0.4s, v0.4s, v2.8h ; CHECK-NEXT: saddw2 v2.4s, v3.4s, v6.8h +; CHECK-NEXT: add v4.4s, v16.4s, v4.4s ; CHECK-NEXT: add v1.4s, v7.4s, v1.4s +; CHECK-NEXT: add v0.4s, v4.4s, v0.4s ; CHECK-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s @@ -1587,33 +1607,35 @@ define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b1, [x1, #32] -; CHECK-NEXT: ldr b2, [x0, #32] -; CHECK-NEXT: ldp q3, q4, [x0] +; CHECK-NEXT: ldr b0, [x1, #32] +; CHECK-NEXT: adrp x8, .LCPI41_0 +; CHECK-NEXT: ldr b1, [x0, #32] +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h -; CHECK-NEXT: ushll v16.8h, v3.8b, #0 -; CHECK-NEXT: ldp q5, q6, [x1] -; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0 -; CHECK-NEXT: mov v0.s[0], v1.s[0] -; CHECK-NEXT: ushll2 v7.8h, v4.16b, #0 -; CHECK-NEXT: ushll2 v2.8h, v5.16b, #0 -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 -; CHECK-NEXT: umull2 v18.4s, v2.8h, v3.8h -; CHECK-NEXT: umull2 v1.4s, v5.8h, v16.8h +; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ushll v7.8h, v2.8b, #0 +; CHECK-NEXT: ldp q4, q5, [x1] +; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: ushll2 v6.8h, v3.16b, #0 +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: ushll2 v1.8h, v4.16b, #0 ; CHECK-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-NEXT: ushll2 v17.8h, v6.16b, #0 -; CHECK-NEXT: ushll v6.8h, v6.8b, #0 -; CHECK-NEXT: umull v2.4s, v2.4h, v3.4h -; CHECK-NEXT: umlal2 v18.4s, v17.8h, v7.8h -; CHECK-NEXT: umlal2 v1.4s, v6.8h, v4.8h -; CHECK-NEXT: umlal v0.4s, v5.4h, v16.4h -; CHECK-NEXT: umlal v2.4s, v17.4h, v7.4h -; CHECK-NEXT: add v1.4s, v1.4s, v18.4s -; CHECK-NEXT: umlal v0.4s, v6.4h, v4.4h -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI41_0] +; CHECK-NEXT: umull2 v18.4s, v1.8h, v2.8h +; CHECK-NEXT: ushll2 v16.8h, v5.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v17.16b +; CHECK-NEXT: umull2 v17.4s, v4.8h, v7.8h +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: umlal2 v18.4s, v16.8h, v6.8h +; CHECK-NEXT: umlal2 v17.4s, v5.8h, v3.8h +; CHECK-NEXT: umlal v0.4s, v4.4h, v7.4h +; CHECK-NEXT: umlal v1.4s, v16.4h, v6.4h +; CHECK-NEXT: add v2.4s, v17.4s, v18.4s +; CHECK-NEXT: umlal v0.4s, v5.4h, v3.4h +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1633,16 +1655,18 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture 
readonly %a1) { ; CHECK-LABEL: test_udot_v33i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b1, [x0, #32] -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v5.8h, v3.16b, #0 -; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: ldr b0, [x0, #32] +; CHECK-NEXT: adrp x8, .LCPI42_0 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v5.8h, v1.16b, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI42_0] ; CHECK-NEXT: ushll v4.8h, v2.8b, #0 ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-NEXT: ushll v1.8h, v3.8b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: uaddl2 v3.4s, v5.8h, v2.8h ; CHECK-NEXT: uaddl2 v6.4s, v1.8h, v4.8h ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h @@ -1663,33 +1687,35 @@ define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b1, [x1, #32] -; CHECK-NEXT: ldr b2, [x0, #32] -; CHECK-NEXT: ldp q3, q4, [x0] +; CHECK-NEXT: ldr b0, [x1, #32] +; CHECK-NEXT: adrp x8, .LCPI43_0 +; CHECK-NEXT: ldr b1, [x0, #32] +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h -; CHECK-NEXT: sshll v16.8h, v3.8b, #0 -; CHECK-NEXT: ldp q5, q6, [x1] -; CHECK-NEXT: sshll2 v3.8h, v3.16b, #0 -; CHECK-NEXT: mov v0.s[0], v1.s[0] -; CHECK-NEXT: sshll2 v7.8h, v4.16b, #0 -; CHECK-NEXT: sshll2 v2.8h, v5.16b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: smull2 v18.4s, v2.8h, v3.8h -; CHECK-NEXT: smull2 v1.4s, v5.8h, v16.8h +; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-NEXT: sshll v7.8h, v2.8b, #0 +; CHECK-NEXT: ldp q4, q5, [x1] +; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: sshll2 v6.8h, v3.16b, #0 +; CHECK-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-NEXT: sshll2 v1.8h, v4.16b, #0 ; CHECK-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-NEXT: sshll2 v17.8h, v6.16b, #0 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: smull v2.4s, v2.4h, v3.4h -; CHECK-NEXT: smlal2 v18.4s, v17.8h, v7.8h -; CHECK-NEXT: smlal2 v1.4s, v6.8h, v4.8h -; CHECK-NEXT: smlal v0.4s, v5.4h, v16.4h -; CHECK-NEXT: smlal v2.4s, v17.4h, v7.4h -; CHECK-NEXT: add v1.4s, v1.4s, v18.4s -; CHECK-NEXT: smlal v0.4s, v6.4h, v4.4h -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI43_0] +; CHECK-NEXT: smull2 v18.4s, v1.8h, v2.8h +; CHECK-NEXT: sshll2 v16.8h, v5.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v17.16b +; CHECK-NEXT: smull2 v17.4s, v4.8h, v7.8h +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: smlal2 v18.4s, v16.8h, v6.8h +; CHECK-NEXT: smlal2 v17.4s, v5.8h, v3.8h +; CHECK-NEXT: smlal v0.4s, v4.4h, v7.4h +; CHECK-NEXT: smlal v1.4s, v16.4h, v6.4h +; CHECK-NEXT: add v2.4s, v17.4s, v18.4s +; CHECK-NEXT: smlal v0.4s, v5.4h, v3.4h +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1712,291 +1738,293 @@ ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldr b0, [sp, #80] +; CHECK-NEXT: ldr b3, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 -; CHECK-NEXT: ldr b1, [sp, #144] -; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: ldr b3, [sp, #16] -; CHECK-NEXT: add x10, sp, #104 -; CHECK-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #152 -; CHECK-NEXT: ldr b4, [sp, #344] -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: ldr b6, [sp, #216] -; CHECK-NEXT: add x11, sp, #136 -; CHECK-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #160 -; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: ldr b0, [sp, #144] +; CHECK-NEXT: add x9, sp, #152 +; CHECK-NEXT: ldr b1, [sp, #16] +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ld1 { v3.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #96 +; CHECK-NEXT: ld1 { v0.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #24 -; CHECK-NEXT: mov v2.b[1], w1 -; CHECK-NEXT: ldr b17, [sp, #280] -; CHECK-NEXT: ldr b7, [sp, #408] -; CHECK-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #168 -; CHECK-NEXT: ld1 { v3.b }[1], [x9] +; CHECK-NEXT: ldr b2, [sp, #344] +; CHECK-NEXT: fmov s6, w0 +; CHECK-NEXT: ldr b16, [sp, #280] +; CHECK-NEXT: add x11, sp, #200 +; CHECK-NEXT: ld1 { v3.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #104 +; CHECK-NEXT: ld1 { v1.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: ld1 { v0.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #112 -; CHECK-NEXT: mov v2.b[2], w2 -; CHECK-NEXT: ldr b5, [sp, #208] -; CHECK-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: ld1 { v3.b }[2], [x9] +; CHECK-NEXT: ld1 { v0.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #168 +; CHECK-NEXT: ldr b7, [sp, #408] +; CHECK-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #112 +; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #40 +; CHECK-NEXT: ld1 { v0.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #176 +; CHECK-NEXT: mov v6.b[1], w1 +; CHECK-NEXT: ldr b4, [sp, #208] +; CHECK-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #120 +; CHECK-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #48 ; CHECK-NEXT: ld1 { v0.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: mov v2.b[3], w3 -; CHECK-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #184 -; CHECK-NEXT: ld1 { v3.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: ld1 { v0.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #48 -; CHECK-NEXT: mov v2.b[4], w4 -; CHECK-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #192 -; CHECK-NEXT: ld1 { v3.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #360 -; CHECK-NEXT: ld1 { v0.b }[6], [x9] +; CHECK-NEXT: add x10, sp, #184 +; CHECK-NEXT: mov v6.b[2], w2 +; CHECK-NEXT: ld1 { v3.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #128 +; CHECK-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #56 -; CHECK-NEXT: mov v2.b[5], w5 -; CHECK-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #200 -; CHECK-NEXT: ld1 { v3.b }[5], [x9] +; CHECK-NEXT: ld1 { v0.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #192 +; CHECK-NEXT: mov v6.b[3], w3 +; CHECK-NEXT: ld1 { v3.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #136 +; CHECK-NEXT: ld1 { v1.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #64 +; CHECK-NEXT: ld1 { v0.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: mov v6.b[4], w4 +; CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #72 +; CHECK-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #360 +; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #288 ; CHECK-NEXT: ld1 { v0.b }[7], 
[x11] -; CHECK-NEXT: add x11, sp, #232 -; CHECK-NEXT: mov v2.b[6], w6 +; CHECK-NEXT: add x11, sp, #368 +; CHECK-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-NEXT: ldr b3, [sp, #216] ; CHECK-NEXT: ld1 { v1.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #352 -; CHECK-NEXT: ld1 { v3.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #72 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #224 -; CHECK-NEXT: mov v2.b[7], w7 -; CHECK-NEXT: ld1 { v3.b }[7], [x9] +; CHECK-NEXT: ld1 { v2.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #416 -; CHECK-NEXT: ld1 { v6.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #288 -; CHECK-NEXT: ld1 { v4.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #368 +; CHECK-NEXT: ld1 { v16.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #296 +; CHECK-NEXT: ld1 { v3.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #232 ; CHECK-NEXT: ld1 { v7.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #424 -; CHECK-NEXT: ld1 { v17.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #296 -; CHECK-NEXT: ld1 { v6.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #240 -; CHECK-NEXT: ld1 { v4.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #376 +; CHECK-NEXT: ld1 { v2.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #376 +; CHECK-NEXT: ld1 { v16.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #304 +; CHECK-NEXT: ld1 { v3.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #240 ; CHECK-NEXT: ld1 { v7.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #432 -; CHECK-NEXT: ld1 { v17.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #304 -; CHECK-NEXT: ld1 { v6.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #248 -; CHECK-NEXT: ld1 { v4.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #384 +; CHECK-NEXT: ld1 { v2.b }[4], [x11] +; CHECK-NEXT: add x11, sp, #384 +; CHECK-NEXT: ld1 { v16.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #312 +; CHECK-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #248 ; CHECK-NEXT: ld1 { v7.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #440 -; CHECK-NEXT: ld1 { v17.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ld1 { v6.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #256 -; CHECK-NEXT: ld1 { v4.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #392 +; CHECK-NEXT: mov v6.b[5], w5 +; CHECK-NEXT: ld1 { v2.b }[5], [x11] +; CHECK-NEXT: ld1 { v16.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #320 +; CHECK-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #256 ; CHECK-NEXT: ld1 { v7.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #448 -; CHECK-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #320 -; CHECK-NEXT: ld1 { v6.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #264 -; CHECK-NEXT: sshll v19.8h, v2.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[6], [x10] +; CHECK-NEXT: sshll v17.8h, v0.8b, #0 +; CHECK-NEXT: ldr b0, [sp, #472] +; CHECK-NEXT: ld1 { v16.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #328 +; CHECK-NEXT: ld1 { v3.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #264 ; CHECK-NEXT: ld1 { v7.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #456 -; CHECK-NEXT: ld1 { v17.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #328 -; CHECK-NEXT: ld1 { v6.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #272 -; CHECK-NEXT: sshll v2.8h, v1.8b, #0 -; CHECK-NEXT: ldr b1, [sp, #608] +; CHECK-NEXT: add x11, sp, #392 +; CHECK-NEXT: ld1 { v16.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #680 +; CHECK-NEXT: ld1 { v3.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #272 ; CHECK-NEXT: ld1 { v7.b }[6], [x9] ; CHECK-NEXT: add x9, sp, #464 -; CHECK-NEXT: ld1 { v17.b }[6], [x8] +; CHECK-NEXT: sshll v20.8h, v1.8b, #0 +; CHECK-NEXT: ld1 { v2.b }[6], [x11] +; CHECK-NEXT: sshll v1.8h, v4.8b, #0 +; CHECK-NEXT: add x11, sp, #400 +; CHECK-NEXT: ld1 { v3.b }[7], [x8] ; 
CHECK-NEXT: add x8, sp, #336 -; CHECK-NEXT: ld1 { v6.b }[7], [x11] -; CHECK-NEXT: add x10, sp, #400 -; CHECK-NEXT: sshll v16.8h, v3.8b, #0 -; CHECK-NEXT: add x11, sp, #648 ; CHECK-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #624 -; CHECK-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-NEXT: adrp x9, .LCPI44_0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ld1 { v2.b }[7], [x11] +; CHECK-NEXT: ld1 { v16.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #616 -; CHECK-NEXT: sshll v21.8h, v6.8b, #0 -; CHECK-NEXT: ldr b6, [sp, #472] -; CHECK-NEXT: ld1 { v4.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #552 -; CHECK-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #488 -; CHECK-NEXT: sshll v18.8h, v17.8b, #0 -; CHECK-NEXT: ldr b17, [sp, #480] +; CHECK-NEXT: mov v6.b[6], w6 +; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI44_0] +; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h +; CHECK-NEXT: add x9, sp, #488 +; CHECK-NEXT: sshll v21.8h, v3.8b, #0 +; CHECK-NEXT: add x11, sp, #632 +; CHECK-NEXT: sshll v3.8h, v16.8b, #0 +; CHECK-NEXT: ldr b16, [sp, #544] +; CHECK-NEXT: mov v6.b[7], w7 +; CHECK-NEXT: smull v1.4s, v20.4h, v3.4h +; CHECK-NEXT: smull2 v3.4s, v20.8h, v3.8h +; CHECK-NEXT: ldr b20, [sp, #608] +; CHECK-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-NEXT: sshll v19.8h, v7.8b, #0 +; CHECK-NEXT: ldr b7, [sp, #672] +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: ld1 { v20.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #624 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: sshll v3.8h, v4.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #496 -; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #632 -; CHECK-NEXT: sshll v4.8h, v7.8b, #0 -; CHECK-NEXT: smull v20.4s, v5.4h, v6.4h -; CHECK-NEXT: movi v7.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v17.b }[2], [x8] -; CHECK-NEXT: smull v5.4s, v16.4h, v18.4h -; CHECK-NEXT: ld1 { v1.b }[3], [x9] -; CHECK-NEXT: smull2 v16.4s, v16.8h, v18.8h -; CHECK-NEXT: ldr b18, [sp, #544] -; CHECK-NEXT: add x8, sp, #504 -; CHECK-NEXT: add x9, sp, #640 -; CHECK-NEXT: mov v7.s[0], v20.s[0] -; CHECK-NEXT: ldr b20, [sp, #672] -; CHECK-NEXT: ld1 { v18.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #680 -; CHECK-NEXT: ld1 { v17.b }[3], [x8] +; CHECK-NEXT: ld1 { v7.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #552 +; CHECK-NEXT: sshll v18.8h, v2.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[2], [x8] +; CHECK-NEXT: smull2 v2.4s, v6.8h, v21.8h +; CHECK-NEXT: add x8, sp, #688 +; CHECK-NEXT: smlal v0.4s, v6.4h, v21.4h +; CHECK-NEXT: ldr b6, [sp, #480] +; CHECK-NEXT: ld1 { v16.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #696 +; CHECK-NEXT: ld1 { v7.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #560 -; CHECK-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-NEXT: ld1 { v6.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #496 +; CHECK-NEXT: ld1 { v20.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #640 +; CHECK-NEXT: ld1 { v16.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #568 +; CHECK-NEXT: ld1 { v7.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #704 +; CHECK-NEXT: ld1 { v6.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #504 +; CHECK-NEXT: ld1 { v20.b }[4], [x11] +; CHECK-NEXT: add x11, sp, #648 +; CHECK-NEXT: ld1 { v16.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #576 +; CHECK-NEXT: ld1 { v7.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #712 +; CHECK-NEXT: ld1 { v6.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #512 -; CHECK-NEXT: ld1 { v20.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #520 -; CHECK-NEXT: ld1 { v18.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #688 -; CHECK-NEXT: ld1 { v17.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #568 -; CHECK-NEXT: smull2 v6.4s, v19.8h, v21.8h 
-; CHECK-NEXT: ld1 { v1.b }[5], [x11] -; CHECK-NEXT: ld1 { v20.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #696 -; CHECK-NEXT: ld1 { v18.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #576 -; CHECK-NEXT: ld1 { v17.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: smlal v7.4s, v19.4h, v21.4h -; CHECK-NEXT: ldr b19, [sp, #872] -; CHECK-NEXT: ld1 { v20.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #704 -; CHECK-NEXT: ld1 { v18.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #584 -; CHECK-NEXT: ld1 { v17.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #536 -; CHECK-NEXT: ldr b21, [sp, #936] +; CHECK-NEXT: ld1 { v20.b }[5], [x11] ; CHECK-NEXT: add x11, sp, #656 -; CHECK-NEXT: ld1 { v20.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #712 -; CHECK-NEXT: ld1 { v18.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #592 -; CHECK-NEXT: ld1 { v17.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #880 -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v1.b }[6], [x11] -; CHECK-NEXT: ld1 { v20.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #600 -; CHECK-NEXT: ld1 { v18.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #720 -; CHECK-NEXT: ld1 { v19.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #944 -; CHECK-NEXT: smlal2 v6.4s, v0.8h, v3.8h +; CHECK-NEXT: ld1 { v16.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #584 +; CHECK-NEXT: ld1 { v7.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #720 +; CHECK-NEXT: ld1 { v6.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #520 +; CHECK-NEXT: ld1 { v20.b }[6], [x11] ; CHECK-NEXT: add x11, sp, #664 -; CHECK-NEXT: ld1 { v20.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #888 -; CHECK-NEXT: ld1 { v18.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #728 -; CHECK-NEXT: ld1 { v21.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #752 -; CHECK-NEXT: ld1 { v19.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #952 -; CHECK-NEXT: ld1 { v20.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #896 -; CHECK-NEXT: smlal v7.4s, v0.4h, v3.4h -; CHECK-NEXT: ldr b0, [sp, #744] -; CHECK-NEXT: ld1 { v21.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #904 +; CHECK-NEXT: ld1 { v16.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #592 +; CHECK-NEXT: smlal2 v3.4s, v17.8h, v19.8h +; CHECK-NEXT: ld1 { v7.b }[6], [x10] +; CHECK-NEXT: ld1 { v6.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #528 +; CHECK-NEXT: smlal2 v2.4s, v5.8h, v18.8h +; CHECK-NEXT: ld1 { v20.b }[7], [x11] +; CHECK-NEXT: smlal v1.4s, v17.4h, v19.4h +; CHECK-NEXT: ldr b17, [sp, #736] +; CHECK-NEXT: smlal v0.4s, v5.4h, v18.4h +; CHECK-NEXT: ldr b18, [sp, #1000] +; CHECK-NEXT: ld1 { v16.b }[6], [x8] +; CHECK-NEXT: add x10, sp, #728 +; CHECK-NEXT: ld1 { v6.b }[6], [x9] +; CHECK-NEXT: add x8, sp, #600 +; CHECK-NEXT: add x9, sp, #536 +; CHECK-NEXT: ldr b19, [sp, #744] +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 +; CHECK-NEXT: ld1 { v7.b }[7], [x10] +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: ld1 { v16.b }[7], [x8] +; CHECK-NEXT: sshll v5.8h, v20.8b, #0 +; CHECK-NEXT: add x8, sp, #752 +; CHECK-NEXT: smull v17.4s, v17.4h, v18.4h +; CHECK-NEXT: ldr b20, [sp, #808] +; CHECK-NEXT: add x10, sp, #816 +; CHECK-NEXT: ld1 { v6.b }[7], [x9] +; CHECK-NEXT: ldr b18, [sp, #872] +; CHECK-NEXT: add x9, sp, #880 +; CHECK-NEXT: ld1 { v19.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #760 +; CHECK-NEXT: ld1 { v20.b }[1], [x10] +; CHECK-NEXT: add x11, sp, #944 +; CHECK-NEXT: and v4.16b, v17.16b, v4.16b +; CHECK-NEXT: ldr b17, [sp, #936] +; CHECK-NEXT: ld1 { v18.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #824 +; CHECK-NEXT: ld1 { v19.b }[2], [x8] +; CHECK-NEXT: add x10, sp, #888 +; CHECK-NEXT: ld1 { v17.b }[1], [x11] +; CHECK-NEXT: add x8, sp, #768 +; CHECK-NEXT: ld1 { v20.b }[2], [x9] +; 
CHECK-NEXT: add x11, sp, #952 +; CHECK-NEXT: add x9, sp, #832 +; CHECK-NEXT: ld1 { v18.b }[2], [x10] ; CHECK-NEXT: ld1 { v19.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #960 -; CHECK-NEXT: ld1 { v0.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #760 -; CHECK-NEXT: ld1 { v1.b }[7], [x11] -; CHECK-NEXT: add x11, sp, #816 -; CHECK-NEXT: ld1 { v21.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #968 -; CHECK-NEXT: ldr b3, [sp, #808] -; CHECK-NEXT: ld1 { v19.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #912 -; CHECK-NEXT: ld1 { v0.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #768 -; CHECK-NEXT: ld1 { v3.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #824 -; CHECK-NEXT: ld1 { v21.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #976 -; CHECK-NEXT: ld1 { v19.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #920 -; CHECK-NEXT: ld1 { v0.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #776 -; CHECK-NEXT: ld1 { v3.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #832 -; CHECK-NEXT: ld1 { v21.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #984 -; CHECK-NEXT: ld1 { v19.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #928 -; CHECK-NEXT: ld1 { v0.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #848 -; CHECK-NEXT: ld1 { v3.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #840 -; CHECK-NEXT: ld1 { v21.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #992 -; CHECK-NEXT: ld1 { v19.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #784 -; CHECK-NEXT: smlal2 v16.4s, v2.8h, v4.8h -; CHECK-NEXT: ld1 { v3.b }[4], [x11] -; CHECK-NEXT: ld1 { v21.b }[7], [x8] +; CHECK-NEXT: add x10, sp, #896 +; CHECK-NEXT: ld1 { v17.b }[2], [x11] +; CHECK-NEXT: add x8, sp, #776 +; CHECK-NEXT: ld1 { v20.b }[3], [x9] +; CHECK-NEXT: add x11, sp, #960 +; CHECK-NEXT: add x9, sp, #840 +; CHECK-NEXT: ld1 { v18.b }[3], [x10] +; CHECK-NEXT: ld1 { v19.b }[4], [x8] +; CHECK-NEXT: add x10, sp, #904 +; CHECK-NEXT: ld1 { v17.b }[3], [x11] +; CHECK-NEXT: add x8, sp, #784 +; CHECK-NEXT: ld1 { v20.b }[4], [x9] +; CHECK-NEXT: add x11, sp, #968 +; CHECK-NEXT: add x9, sp, #848 +; CHECK-NEXT: ld1 { v18.b }[4], [x10] +; CHECK-NEXT: ld1 { v19.b }[5], [x8] +; CHECK-NEXT: add x10, sp, #912 +; CHECK-NEXT: ld1 { v17.b }[4], [x11] ; CHECK-NEXT: add x8, sp, #792 -; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: ld1 { v20.b }[5], [x9] +; CHECK-NEXT: add x11, sp, #976 ; CHECK-NEXT: add x9, sp, #856 -; CHECK-NEXT: smlal v5.4s, v2.4h, v4.4h -; CHECK-NEXT: ldr b2, [sp, #736] -; CHECK-NEXT: sshll v4.8h, v20.8b, #0 -; CHECK-NEXT: ldr b20, [sp, #1000] -; CHECK-NEXT: ld1 { v3.b }[5], [x10] -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: ld1 { v0.b }[6], [x8] -; CHECK-NEXT: sshll v20.8h, v20.8b, #0 +; CHECK-NEXT: ld1 { v18.b }[5], [x10] +; CHECK-NEXT: ld1 { v19.b }[6], [x8] +; CHECK-NEXT: add x10, sp, #920 +; CHECK-NEXT: ld1 { v17.b }[5], [x11] ; CHECK-NEXT: add x8, sp, #800 -; CHECK-NEXT: sshll v21.8h, v21.8b, #0 -; CHECK-NEXT: smull v2.4s, v2.4h, v20.4h -; CHECK-NEXT: ld1 { v3.b }[6], [x9] -; CHECK-NEXT: smull v20.4s, v4.4h, v21.4h -; CHECK-NEXT: ld1 { v0.b }[7], [x8] -; CHECK-NEXT: smull2 v4.4s, v4.8h, v21.8h +; CHECK-NEXT: ld1 { v20.b }[6], [x9] +; CHECK-NEXT: add x11, sp, #984 ; CHECK-NEXT: add x9, sp, #864 -; CHECK-NEXT: movi v21.2d, #0000000000000000 -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: ld1 { v3.b }[7], [x9] +; CHECK-NEXT: ld1 { v18.b }[6], [x10] +; CHECK-NEXT: ld1 { v19.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #928 +; CHECK-NEXT: ld1 { v17.b }[6], [x11] +; CHECK-NEXT: add x10, sp, #992 +; CHECK-NEXT: ld1 { v20.b }[7], [x9] +; CHECK-NEXT: mov v4.s[3], wzr +; CHECK-NEXT: ld1 { v18.b }[7], [x8] +; CHECK-NEXT: 
sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: sshll v16.8h, v16.8b, #0 +; CHECK-NEXT: ld1 { v17.b }[7], [x10] ; CHECK-NEXT: sshll v19.8h, v19.8b, #0 -; CHECK-NEXT: mov v21.s[0], v2.s[0] -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: smull2 v2.4s, v1.8h, v19.8h +; CHECK-NEXT: sshll v20.8h, v20.8b, #0 +; CHECK-NEXT: smlal v4.4s, v6.4h, v19.4h +; CHECK-NEXT: smull2 v6.4s, v6.8h, v19.8h +; CHECK-NEXT: smull2 v19.4s, v16.8h, v20.8h +; CHECK-NEXT: smull v16.4s, v16.4h, v20.4h +; CHECK-NEXT: sshll v7.8h, v7.8b, #0 +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 ; CHECK-NEXT: sshll v18.8h, v18.8b, #0 -; CHECK-NEXT: smlal v21.4s, v17.4h, v0.4h -; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: smlal2 v2.4s, v17.8h, v0.8h -; CHECK-NEXT: smlal2 v4.4s, v18.8h, v3.8h -; CHECK-NEXT: smlal v20.4s, v18.4h, v3.4h -; CHECK-NEXT: smlal v21.4s, v1.4h, v19.4h -; CHECK-NEXT: add v0.4s, v6.4s, v16.4s -; CHECK-NEXT: add v1.4s, v7.4s, v5.4s -; CHECK-NEXT: add v2.4s, v2.4s, v4.4s -; CHECK-NEXT: add v3.4s, v21.4s, v20.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: add v1.4s, v3.4s, v2.4s +; CHECK-NEXT: smlal2 v19.4s, v7.8h, v17.8h +; CHECK-NEXT: smlal2 v6.4s, v5.8h, v18.8h +; CHECK-NEXT: smlal v16.4s, v7.4h, v17.4h +; CHECK-NEXT: smlal v4.4s, v5.4h, v18.4h +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v1.4s, v6.4s, v19.4s +; CHECK-NEXT: add v3.4s, v4.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -2024,151 +2052,153 @@ ; CHECK-NEXT: ldr b0, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b2, [sp, #144] -; CHECK-NEXT: add x9, sp, #152 +; CHECK-NEXT: add x9, sp, #96 ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: ldr b4, [sp, #16] ; CHECK-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #96 -; CHECK-NEXT: add x10, sp, #104 -; CHECK-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-NEXT: mov v3.b[1], w1 -; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: add x11, sp, #128 +; CHECK-NEXT: add x8, sp, #152 +; CHECK-NEXT: add x10, sp, #24 ; CHECK-NEXT: ldr b1, [sp, #208] -; CHECK-NEXT: ld1 { v0.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #24 -; CHECK-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #168 -; CHECK-NEXT: mov v3.b[2], w2 -; CHECK-NEXT: ld1 { v4.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #112 -; CHECK-NEXT: ld1 { v0.b }[3], [x10] +; CHECK-NEXT: mov v3.b[1], w1 +; CHECK-NEXT: ldr b17, [sp, #544] +; CHECK-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #160 +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #104 +; CHECK-NEXT: ld1 { v4.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #32 -; CHECK-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #176 -; CHECK-NEXT: mov v3.b[3], w3 +; CHECK-NEXT: mov v3.b[2], w2 +; CHECK-NEXT: add x11, sp, #632 +; CHECK-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #168 +; CHECK-NEXT: ld1 { v0.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #112 ; CHECK-NEXT: ld1 { v4.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #40 -; CHECK-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: add x10, sp, #40 +; CHECK-NEXT: mov v3.b[3], w3 +; CHECK-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #176 +; CHECK-NEXT: ld1 { v0.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #120 +; CHECK-NEXT: ld1 { v4.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #48 ; CHECK-NEXT: mov v3.b[4], w4 -; CHECK-NEXT: ld1 { v4.b }[3], [x8] -; 
CHECK-NEXT: add x8, sp, #48 -; CHECK-NEXT: ld1 { v0.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #192 +; CHECK-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #184 +; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #128 +; CHECK-NEXT: ld1 { v4.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: mov v3.b[5], w5 -; CHECK-NEXT: ld1 { v4.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #56 -; CHECK-NEXT: ld1 { v0.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #200 +; CHECK-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #192 +; CHECK-NEXT: ld1 { v0.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #136 +; CHECK-NEXT: ld1 { v4.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #64 ; CHECK-NEXT: mov v3.b[6], w6 -; CHECK-NEXT: ld1 { v4.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #64 -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: ld1 { v0.b }[7], [x10] -; CHECK-NEXT: ld1 { v2.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #552 -; CHECK-NEXT: mov v3.b[7], w7 -; CHECK-NEXT: add x10, sp, #680 -; CHECK-NEXT: ld1 { v4.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #72 -; CHECK-NEXT: movi v6.2d, #0000000000000000 -; CHECK-NEXT: sshll v5.4s, v1.4h, #0 -; CHECK-NEXT: ldr b1, [sp, #608] -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: ld1 { v2.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #200 +; CHECK-NEXT: ld1 { v0.b }[7], [x9] +; CHECK-NEXT: add x9, sp, #72 +; CHECK-NEXT: ld1 { v4.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #552 +; CHECK-NEXT: sshll v5.8h, v1.8b, #0 +; CHECK-NEXT: ld1 { v2.b }[7], [x8] +; CHECK-NEXT: adrp x8, .LCPI45_0 +; CHECK-NEXT: sshll v1.8h, v0.8b, #0 +; CHECK-NEXT: ld1 { v17.b }[1], [x10] +; CHECK-NEXT: sshll v5.4s, v5.4h, #0 +; CHECK-NEXT: ld1 { v4.b }[7], [x9] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI45_0] ; CHECK-NEXT: add x8, sp, #616 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: sshll v7.8h, v3.8b, #0 -; CHECK-NEXT: ld1 { v1.b }[1], [x8] +; CHECK-NEXT: sshll v6.8h, v2.8b, #0 +; CHECK-NEXT: ldr b2, [sp, #608] +; CHECK-NEXT: mov v3.b[7], w7 +; CHECK-NEXT: add x9, sp, #680 +; CHECK-NEXT: and v16.16b, v5.16b, v0.16b +; CHECK-NEXT: add x10, sp, #560 +; CHECK-NEXT: mov v16.s[3], wzr +; CHECK-NEXT: ld1 { v2.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #624 -; CHECK-NEXT: sshll v3.8h, v4.8b, #0 -; CHECK-NEXT: mov v6.s[0], v5.s[0] -; CHECK-NEXT: saddl2 v5.4s, v3.8h, v2.8h -; CHECK-NEXT: saddl2 v16.4s, v7.8h, v0.8h -; CHECK-NEXT: ld1 { v1.b }[2], [x8] +; CHECK-NEXT: sshll v7.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v17.b }[2], [x10] +; CHECK-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-NEXT: add x10, sp, #568 +; CHECK-NEXT: saddl2 v4.4s, v7.8h, v6.8h +; CHECK-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-NEXT: saddl v3.4s, v7.4h, v6.4h +; CHECK-NEXT: ldr b7, [sp, #480] +; CHECK-NEXT: saddw v6.4s, v16.4s, v5.4h ; CHECK-NEXT: add x8, sp, #488 -; CHECK-NEXT: saddw v4.4s, v6.4s, v7.4h -; CHECK-NEXT: ldr b6, [sp, #480] -; CHECK-NEXT: add v5.4s, v16.4s, v5.4s -; CHECK-NEXT: ldr b7, [sp, #544] ; CHECK-NEXT: ldr b16, [sp, #672] -; CHECK-NEXT: ld1 { v6.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #496 -; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #560 -; CHECK-NEXT: ld1 { v16.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #688 -; CHECK-NEXT: ld1 { v1.b }[3], [x11] +; CHECK-NEXT: ld1 { v2.b }[3], [x11] ; CHECK-NEXT: add x11, sp, #640 -; CHECK-NEXT: ld1 { v6.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #504 -; CHECK-NEXT: ld1 { v7.b }[2], [x9] -; CHECK-NEXT: 
add x9, sp, #568 -; CHECK-NEXT: ld1 { v16.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #696 -; CHECK-NEXT: ld1 { v1.b }[4], [x11] +; CHECK-NEXT: ld1 { v7.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #496 +; CHECK-NEXT: ld1 { v16.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #688 +; CHECK-NEXT: ld1 { v17.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #576 +; CHECK-NEXT: ld1 { v2.b }[4], [x11] ; CHECK-NEXT: add x11, sp, #648 -; CHECK-NEXT: ld1 { v6.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #512 -; CHECK-NEXT: ld1 { v7.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #576 -; CHECK-NEXT: ld1 { v16.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #704 -; CHECK-NEXT: ld1 { v1.b }[5], [x11] +; CHECK-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #504 +; CHECK-NEXT: ld1 { v16.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #696 +; CHECK-NEXT: ld1 { v17.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #584 +; CHECK-NEXT: ld1 { v2.b }[5], [x11] ; CHECK-NEXT: add x11, sp, #656 -; CHECK-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-NEXT: ld1 { v7.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #512 +; CHECK-NEXT: ld1 { v16.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #704 +; CHECK-NEXT: ld1 { v17.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #592 +; CHECK-NEXT: saddl2 v5.4s, v5.8h, v1.8h +; CHECK-NEXT: ld1 { v2.b }[6], [x11] +; CHECK-NEXT: ld1 { v7.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #520 -; CHECK-NEXT: ld1 { v7.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #584 -; CHECK-NEXT: ld1 { v16.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: ld1 { v1.b }[6], [x11] +; CHECK-NEXT: ld1 { v16.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #712 +; CHECK-NEXT: saddw v1.4s, v6.4s, v1.4h +; CHECK-NEXT: ldr b6, [sp, #736] +; CHECK-NEXT: ld1 { v17.b }[6], [x10] ; CHECK-NEXT: add x11, sp, #664 -; CHECK-NEXT: ld1 { v6.b }[5], [x8] +; CHECK-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #528 -; CHECK-NEXT: ld1 { v7.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #592 -; CHECK-NEXT: ld1 { v16.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #720 -; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h -; CHECK-NEXT: ldr b3, [sp, #736] -; CHECK-NEXT: ld1 { v6.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #600 -; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h -; CHECK-NEXT: ld1 { v7.b }[6], [x9] -; CHECK-NEXT: ld1 { v16.b }[6], [x10] -; CHECK-NEXT: add x9, sp, #728 +; CHECK-NEXT: ld1 { v16.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #720 ; CHECK-NEXT: add x10, sp, #536 -; CHECK-NEXT: ld1 { v1.b }[7], [x11] -; CHECK-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ld1 { v7.b }[7], [x8] -; CHECK-NEXT: sshll v2.8h, v3.8b, #0 -; CHECK-NEXT: ld1 { v16.b }[7], [x9] -; CHECK-NEXT: ld1 { v6.b }[7], [x10] -; CHECK-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: mov v4.s[0], v2.s[0] +; CHECK-NEXT: ld1 { v2.b }[7], [x11] +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ld1 { v7.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #728 +; CHECK-NEXT: ld1 { v16.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #600 +; CHECK-NEXT: sshll v3.8h, v6.8b, #0 +; CHECK-NEXT: add v4.4s, v5.4s, v4.4s +; CHECK-NEXT: ld1 { v17.b }[7], [x9] +; CHECK-NEXT: ld1 { v16.b }[7], [x8] +; CHECK-NEXT: ld1 { v7.b }[7], [x10] +; CHECK-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEXT: and v0.16b, v3.16b, v0.16b +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: sshll v5.8h, v16.8b, #0 +; CHECK-NEXT: sshll v6.8h, v17.8b, #0 ; CHECK-NEXT: sshll v3.8h, v7.8b, #0 -; CHECK-NEXT: sshll v7.8h, v16.8b, #0 -; CHECK-NEXT: sshll v2.8h, v6.8b, #0 -; CHECK-NEXT: saddl2 v6.4s, v7.8h, v3.8h -; 
CHECK-NEXT: saddl2 v16.4s, v1.8h, v2.8h -; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h -; CHECK-NEXT: saddl v3.4s, v7.4h, v3.4h -; CHECK-NEXT: add v4.4s, v16.4s, v6.4s -; CHECK-NEXT: saddw v1.4s, v2.4s, v1.4h -; CHECK-NEXT: add v2.4s, v3.4s, v4.4s -; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: saddl2 v7.4s, v6.8h, v5.8h +; CHECK-NEXT: saddl2 v16.4s, v3.8h, v2.8h +; CHECK-NEXT: saddw v0.4s, v0.4s, v3.4h +; CHECK-NEXT: saddl v3.4s, v6.4h, v5.4h +; CHECK-NEXT: add v5.4s, v16.4s, v7.4s +; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h +; CHECK-NEXT: add v2.4s, v3.4s, v5.4s +; CHECK-NEXT: add v1.4s, v1.4s, v4.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/neon-sad.ll b/llvm/test/CodeGen/AArch64/neon-sad.ll --- a/llvm/test/CodeGen/AArch64/neon-sad.ll +++ b/llvm/test/CodeGen/AArch64/neon-sad.ll @@ -9,9 +9,20 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: uabdl v2.8h, v1.8b, v0.8b -; CHECK-NEXT: uabal2 v2.8h, v1.16b, v0.16b -; CHECK-NEXT: uaddlv s0, v2.8h +; CHECK-NEXT: usubl v2.8h, v1.8b, v0.8b +; CHECK-NEXT: usubl2 v0.8h, v1.16b, v0.16b +; CHECK-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: abs v0.4s, v0.4s +; CHECK-NEXT: abs v3.4s, v3.4s +; CHECK-NEXT: abs v1.4s, v1.4s +; CHECK-NEXT: abs v2.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: @@ -30,9 +41,20 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: sabdl v2.8h, v1.8b, v0.8b -; CHECK-NEXT: sabal2 v2.8h, v1.16b, v0.16b -; CHECK-NEXT: uaddlv s0, v2.8h +; CHECK-NEXT: ssubl v2.8h, v1.8b, v0.8b +; CHECK-NEXT: ssubl2 v0.8h, v1.16b, v0.16b +; CHECK-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: abs v0.4s, v0.4s +; CHECK-NEXT: abs v3.4s, v3.4s +; CHECK-NEXT: abs v1.4s, v1.4s +; CHECK-NEXT: abs v2.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll --- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll @@ -478,47 +478,50 @@ ; CHECK-NEXT: ldr x10, [x0, #24] ; CHECK-NEXT: and x1, x8, #0x1 ; CHECK-NEXT: ldrb w11, [x0, #32] +; CHECK-NEXT: extr x12, x10, x9, #1 ; CHECK-NEXT: extr x2, x9, x8, #1 +; CHECK-NEXT: extr x8, x11, x10, #2 ; CHECK-NEXT: extr x4, x10, x9, #2 -; CHECK-NEXT: extr x6, x11, x10, #3 -; CHECK-NEXT: ubfx x3, x9, #1, #1 ; CHECK-NEXT: mov.d v0[1], x1 -; CHECK-NEXT: ubfx x5, x10, #2, #1 +; CHECK-NEXT: extr x6, x11, x10, #3 ; CHECK-NEXT: ubfx x7, x11, #3, #1 +; CHECK-NEXT: and x3, x12, #0x1 +; CHECK-NEXT: and x5, x8, #0x1 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_ldnp_v4i65: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldp x10, x9, [x0, #16] -; CHECK-BE-NEXT: ldp x12, x11, [x0] -; 
CHECK-BE-NEXT: ldrb w8, [x0, #32] -; CHECK-BE-NEXT: lsr x13, x10, #56 -; CHECK-BE-NEXT: lsr x14, x12, #56 -; CHECK-BE-NEXT: extr x15, x11, x10, #56 -; CHECK-BE-NEXT: orr x7, x8, x9, lsl #8 -; CHECK-BE-NEXT: extr x8, x10, x9, #56 -; CHECK-BE-NEXT: extr x9, x12, x11, #56 -; CHECK-BE-NEXT: lsr x12, x12, #59 -; CHECK-BE-NEXT: ubfx x10, x10, #57, #1 -; CHECK-BE-NEXT: extr x5, x13, x8, #1 +; CHECK-BE-NEXT: ldp x9, x8, [x0, #8] +; CHECK-BE-NEXT: ldr x10, [x0] +; CHECK-BE-NEXT: ldr x11, [x0, #24] +; CHECK-BE-NEXT: ldrb w13, [x0, #32] +; CHECK-BE-NEXT: extr x12, x9, x8, #56 +; CHECK-BE-NEXT: extr x9, x10, x9, #56 +; CHECK-BE-NEXT: lsr x14, x10, #56 +; CHECK-BE-NEXT: extr x8, x8, x11, #56 +; CHECK-BE-NEXT: lsr x10, x10, #59 +; CHECK-BE-NEXT: orr x7, x13, x11, lsl #8 +; CHECK-BE-NEXT: extr x15, x9, x12, #1 +; CHECK-BE-NEXT: extr x16, x14, x9, #2 +; CHECK-BE-NEXT: and x11, x8, #0x1 ; CHECK-BE-NEXT: extr x1, x14, x9, #3 -; CHECK-BE-NEXT: ubfx x9, x11, #58, #1 -; CHECK-BE-NEXT: fmov d0, x12 -; CHECK-BE-NEXT: and x12, x8, #0x1 -; CHECK-BE-NEXT: lsr x11, x11, #56 -; CHECK-BE-NEXT: fmov d2, x10 -; CHECK-BE-NEXT: fmov d1, x9 -; CHECK-BE-NEXT: extr x3, x11, x15, #2 -; CHECK-BE-NEXT: fmov d3, x12 +; CHECK-BE-NEXT: fmov d0, x10 +; CHECK-BE-NEXT: extr x3, x9, x12, #2 +; CHECK-BE-NEXT: and x13, x15, #0x1 +; CHECK-BE-NEXT: and x10, x16, #0x1 +; CHECK-BE-NEXT: fmov d1, x11 +; CHECK-BE-NEXT: extr x5, x12, x8, #1 ; CHECK-BE-NEXT: mov v0.d[1], x1 +; CHECK-BE-NEXT: fmov d2, x13 +; CHECK-BE-NEXT: fmov d3, x10 +; CHECK-BE-NEXT: mov v1.d[1], x7 ; CHECK-BE-NEXT: mov v2.d[1], x5 -; CHECK-BE-NEXT: mov v1.d[1], x3 -; CHECK-BE-NEXT: mov v3.d[1], x7 +; CHECK-BE-NEXT: mov v3.d[1], x3 ; CHECK-BE-NEXT: fmov x0, d0 +; CHECK-BE-NEXT: fmov x6, d1 ; CHECK-BE-NEXT: fmov x4, d2 -; CHECK-BE-NEXT: fmov x2, d1 -; CHECK-BE-NEXT: fmov x6, d3 +; CHECK-BE-NEXT: fmov x2, d3 ; CHECK-BE-NEXT: ret %lv = load <4 x i65>, ptr %A, align 8, !nontemporal !0 ret <4 x i65> %lv diff --git a/llvm/test/CodeGen/AArch64/nzcv-save.ll b/llvm/test/CodeGen/AArch64/nzcv-save.ll --- a/llvm/test/CodeGen/AArch64/nzcv-save.ll +++ b/llvm/test/CodeGen/AArch64/nzcv-save.ll @@ -12,13 +12,13 @@ ; CHECK-NEXT: ldp x14, x15, [x3, #16] ; CHECK-NEXT: adds x9, x9, x11 ; CHECK-NEXT: adcs x8, x8, x10 -; CHECK-NEXT: adcs x10, x13, x14 -; CHECK-NEXT: adc x11, x12, x15 -; CHECK-NEXT: orr x12, x12, #0x100 +; CHECK-NEXT: orr x10, x12, #0x100 +; CHECK-NEXT: adcs x11, x13, x14 ; CHECK-NEXT: adc x12, x12, x15 +; CHECK-NEXT: adc x10, x10, x15 ; CHECK-NEXT: stp x9, x8, [x0] -; CHECK-NEXT: stp x10, x11, [x0, #16] -; CHECK-NEXT: stp x10, x12, [x1, #16] +; CHECK-NEXT: stp x11, x12, [x0, #16] +; CHECK-NEXT: stp x11, x10, [x1, #16] ; CHECK-NEXT: stp x9, x8, [x1] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/pr61111.ll b/llvm/test/CodeGen/AArch64/pr61111.ll --- a/llvm/test/CodeGen/AArch64/pr61111.ll +++ b/llvm/test/CodeGen/AArch64/pr61111.ll @@ -4,10 +4,11 @@ define i62 @f(i1 %0) { ; CHECK-LABEL: f: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: and x8, x0, #0x1 -; CHECK-NEXT: sub x8, x8, #1 -; CHECK-NEXT: tst x8, #0x3fffffffffffffff +; CHECK-NEXT: and w9, w0, #0x1 +; CHECK-NEXT: mov x8, #4611686018427387903 // =0x3fffffffffffffff +; CHECK-NEXT: neg w9, w9 +; CHECK-NEXT: sxtw x9, w9 +; CHECK-NEXT: bics xzr, x8, x9 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %2 = zext i1 %0 to i59 diff --git a/llvm/test/CodeGen/AArch64/pre-indexed-addrmode-with-constant-offset.ll b/llvm/test/CodeGen/AArch64/pre-indexed-addrmode-with-constant-offset.ll --- 
a/llvm/test/CodeGen/AArch64/pre-indexed-addrmode-with-constant-offset.ll +++ b/llvm/test/CodeGen/AArch64/pre-indexed-addrmode-with-constant-offset.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s ; Reduced test from https://github.com/llvm/llvm-project/issues/60645. @@ -7,8 +8,8 @@ ; CHECK-LABEL: pr60645: ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x0, x1, lsl #2 -; CHECK-NEXT: str wzr, [x8, #-32]! -; CHECK-NEXT: stur wzr, [x8, #-8] +; CHECK-NEXT: stur wzr, [x8, #-32] +; CHECK-NEXT: stur wzr, [x8, #-40] ; CHECK-NEXT: ret %t1 = add nuw nsw i64 %t0, 8 %t2 = mul i64 %t1, -4 diff --git a/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll b/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll --- a/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll +++ b/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll @@ -4,8 +4,11 @@ define <8 x i16> @not_not_trunc_concat(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: not_not_trunc_concat: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: mvn v1.8b, v1.8b +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret %notx = xor <4 x i32> %x, %trnx = trunc <4 x i32> %notx to <4 x i16> @@ -19,10 +22,17 @@ define <16 x i8> @not_not_trunc_concat_chain(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: not_not_trunc_concat_chain: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: xtn v2.4h, v2.4s +; CHECK-NEXT: xtn v3.4h, v3.4s +; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: mvn v1.8b, v1.8b +; CHECK-NEXT: mvn v2.8b, v2.8b +; CHECK-NEXT: mvn v3.8b, v3.8b +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v2.d[1], v3.d[0] ; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: ret %nota = xor <4 x i32> %a, %trna = trunc <4 x i32> %nota to <4 x i16> diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -22,23 +22,24 @@ ; CHECK-NEXT: .cfi_offset b13, -48 ; CHECK-NEXT: .cfi_offset b14, -56 ; CHECK-NEXT: .cfi_offset b15, -64 -; CHECK-NEXT: movi v14.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: adrp x10, B+48 ; CHECK-NEXT: add x10, x10, :lo12:B+48 -; CHECK-NEXT: adrp x11, A -; CHECK-NEXT: add x11, x11, :lo12:A +; CHECK-NEXT: adrp x11, A+128 +; CHECK-NEXT: add x11, x11, :lo12:A+128 +; CHECK-NEXT: // implicit-def: $q0 ; CHECK-NEXT: // implicit-def: $q2 -; CHECK-NEXT: // implicit-def: $q3 -; CHECK-NEXT: // implicit-def: $q15 ; CHECK-NEXT: // implicit-def: $q4 ; CHECK-NEXT: // implicit-def: $q5 ; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: // implicit-def: $q14 ; CHECK-NEXT: // implicit-def: $q7 ; CHECK-NEXT: // implicit-def: $q16 -; CHECK-NEXT: // implicit-def: $q17 ; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: // kill: killed $q3 ; CHECK-NEXT: // implicit-def: $q19 ; CHECK-NEXT: // implicit-def: $q20 ; CHECK-NEXT: // 
implicit-def: $q21 @@ -61,114 +62,128 @@ ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov x12, xzr -; CHECK-NEXT: stp q15, q14, [sp] // 32-byte Folded Spill +; CHECK-NEXT: str q14, [sp] // 16-byte Folded Spill ; CHECK-NEXT: ldr q14, [x8] -; CHECK-NEXT: add x15, x11, x8 -; CHECK-NEXT: ldr q15, [x10], #64 -; CHECK-NEXT: ldr q0, [x12] ; CHECK-NEXT: add x9, x9, #1 +; CHECK-NEXT: mov v3.16b, v0.16b +; CHECK-NEXT: ldr x15, [x11, x8] +; CHECK-NEXT: ldr q0, [x12] ; CHECK-NEXT: ldr x12, [x12] ; CHECK-NEXT: fmov x13, d14 -; CHECK-NEXT: mov x14, v14.d[1] -; CHECK-NEXT: fmov x0, d15 +; CHECK-NEXT: ldr q15, [x10], #64 ; CHECK-NEXT: fmov x16, d0 -; CHECK-NEXT: ldr x15, [x15, #128] ; CHECK-NEXT: mul x17, x13, x12 +; CHECK-NEXT: mov x14, v14.d[1] +; CHECK-NEXT: fmov x2, d15 ; CHECK-NEXT: mov x18, v0.d[1] -; CHECK-NEXT: mul x4, x0, x12 ; CHECK-NEXT: mul x1, x16, x12 -; CHECK-NEXT: mul x3, x14, x12 +; CHECK-NEXT: mul x4, x13, x15 ; CHECK-NEXT: fmov d0, x17 -; CHECK-NEXT: mul x5, x13, x15 +; CHECK-NEXT: mul x5, x2, x12 ; CHECK-NEXT: mov x17, v15.d[1] -; CHECK-NEXT: fmov d15, x4 +; CHECK-NEXT: mul x3, x14, x12 ; CHECK-NEXT: fmov d14, x1 ; CHECK-NEXT: mul x1, x18, x12 -; CHECK-NEXT: mov v0.d[1], x3 -; CHECK-NEXT: mul x3, x16, x15 -; CHECK-NEXT: ldr x2, [x8], #8 +; CHECK-NEXT: mov v17.16b, v16.16b +; CHECK-NEXT: fmov d15, x4 +; CHECK-NEXT: mov v16.16b, v7.16b +; CHECK-NEXT: mul x4, x14, x15 ; CHECK-NEXT: mul x12, x17, x12 +; CHECK-NEXT: mov v0.d[1], x3 +; CHECK-NEXT: mul x3, x2, x15 +; CHECK-NEXT: mov v7.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v5.16b +; CHECK-NEXT: mov v5.16b, v4.16b +; CHECK-NEXT: mov v4.16b, v2.16b +; CHECK-NEXT: mov v2.16b, v1.16b ; CHECK-NEXT: fmov d1, x5 ; CHECK-NEXT: mov v14.d[1], x1 -; CHECK-NEXT: mul x1, x14, x15 +; CHECK-NEXT: mul x1, x16, x15 +; CHECK-NEXT: ldr x0, [x8], #8 +; CHECK-NEXT: mov v1.d[1], x12 +; CHECK-NEXT: mul x12, x18, x15 +; CHECK-NEXT: mul x15, x17, x15 +; CHECK-NEXT: cmp x8, #64 ; CHECK-NEXT: add v12.2d, v12.2d, v0.2d -; CHECK-NEXT: mul x13, x13, x2 ; CHECK-NEXT: fmov d0, x3 -; CHECK-NEXT: mul x3, x0, x15 -; CHECK-NEXT: mov v15.d[1], x12 -; CHECK-NEXT: mul x12, x18, x2 -; CHECK-NEXT: mov v1.d[1], x1 -; CHECK-NEXT: mul x18, x18, x15 -; CHECK-NEXT: mul x16, x16, x2 -; CHECK-NEXT: cmp x8, #64 -; CHECK-NEXT: mul x15, x17, x15 ; CHECK-NEXT: add v13.2d, v13.2d, v14.2d -; CHECK-NEXT: mul x14, x14, x2 +; CHECK-NEXT: mul x13, x13, x0 ; CHECK-NEXT: add v11.2d, v11.2d, v14.2d -; CHECK-NEXT: fmov d14, x3 -; CHECK-NEXT: add v10.2d, v10.2d, v15.2d -; CHECK-NEXT: fmov d15, x13 -; CHECK-NEXT: mov v0.d[1], x18 -; CHECK-NEXT: mul x13, x0, x2 -; CHECK-NEXT: add v29.2d, v29.2d, v1.2d -; CHECK-NEXT: fmov d1, x16 -; CHECK-NEXT: mov v14.d[1], x15 -; CHECK-NEXT: mov v15.d[1], x14 -; CHECK-NEXT: mov v1.d[1], x12 -; CHECK-NEXT: mul x12, x17, x2 -; CHECK-NEXT: add v28.2d, v28.2d, v0.2d +; CHECK-NEXT: mul x14, x14, x0 +; CHECK-NEXT: fmov d14, x1 +; CHECK-NEXT: mul x16, x16, x0 +; CHECK-NEXT: mov v0.d[1], x15 +; CHECK-NEXT: mul x3, x18, x0 +; CHECK-NEXT: add v10.2d, v10.2d, v1.2d +; CHECK-NEXT: mov v14.d[1], x12 +; CHECK-NEXT: mul x12, x17, x0 +; CHECK-NEXT: fmov d1, x13 +; CHECK-NEXT: mul x13, x2, x0 +; CHECK-NEXT: add v27.2d, v27.2d, v0.2d +; CHECK-NEXT: mov v15.d[1], x4 +; CHECK-NEXT: add v28.2d, v28.2d, v14.2d +; CHECK-NEXT: ldp q14, q0, [sp] // 32-byte Folded Reload +; CHECK-NEXT: mov v1.d[1], x14 +; CHECK-NEXT: add v29.2d, v29.2d, v15.2d +; CHECK-NEXT: fmov d15, x16 +; CHECK-NEXT: add v8.2d, v8.2d, v1.2d +; CHECK-NEXT: add 
v0.2d, v0.2d, v1.2d +; CHECK-NEXT: mov v15.d[1], x3 +; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: fmov d0, x13 -; CHECK-NEXT: add v27.2d, v27.2d, v14.2d -; CHECK-NEXT: ldr q14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add v8.2d, v8.2d, v15.2d +; CHECK-NEXT: add v25.2d, v25.2d, v1.2d +; CHECK-NEXT: add v22.2d, v22.2d, v1.2d ; CHECK-NEXT: mov v0.d[1], x12 -; CHECK-NEXT: add v25.2d, v25.2d, v15.2d -; CHECK-NEXT: add v22.2d, v22.2d, v15.2d +; CHECK-NEXT: add v14.2d, v14.2d, v1.2d +; CHECK-NEXT: add v1.2d, v2.2d, v1.2d +; CHECK-NEXT: mov v2.16b, v4.16b +; CHECK-NEXT: mov v4.16b, v5.16b +; CHECK-NEXT: mov v5.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v7.16b +; CHECK-NEXT: mov v7.16b, v16.16b +; CHECK-NEXT: mov v16.16b, v17.16b +; CHECK-NEXT: add v9.2d, v9.2d, v15.2d +; CHECK-NEXT: add v31.2d, v31.2d, v15.2d +; CHECK-NEXT: add v26.2d, v26.2d, v15.2d +; CHECK-NEXT: add v23.2d, v23.2d, v15.2d +; CHECK-NEXT: add v21.2d, v21.2d, v15.2d +; CHECK-NEXT: add v19.2d, v19.2d, v15.2d ; CHECK-NEXT: add v18.2d, v18.2d, v15.2d +; CHECK-NEXT: add v7.2d, v7.2d, v15.2d ; CHECK-NEXT: add v6.2d, v6.2d, v15.2d -; CHECK-NEXT: add v14.2d, v14.2d, v15.2d -; CHECK-NEXT: ldr q15, [sp] // 16-byte Folded Reload -; CHECK-NEXT: add v9.2d, v9.2d, v1.2d -; CHECK-NEXT: add v31.2d, v31.2d, v1.2d -; CHECK-NEXT: add v26.2d, v26.2d, v1.2d -; CHECK-NEXT: add v23.2d, v23.2d, v1.2d -; CHECK-NEXT: add v21.2d, v21.2d, v1.2d -; CHECK-NEXT: add v19.2d, v19.2d, v1.2d -; CHECK-NEXT: add v17.2d, v17.2d, v1.2d -; CHECK-NEXT: add v7.2d, v7.2d, v1.2d -; CHECK-NEXT: add v5.2d, v5.2d, v1.2d -; CHECK-NEXT: add v15.2d, v15.2d, v1.2d -; CHECK-NEXT: add v3.2d, v3.2d, v1.2d +; CHECK-NEXT: add v4.2d, v4.2d, v15.2d +; CHECK-NEXT: add v2.2d, v2.2d, v15.2d ; CHECK-NEXT: add v30.2d, v30.2d, v0.2d ; CHECK-NEXT: add v24.2d, v24.2d, v0.2d ; CHECK-NEXT: add v20.2d, v20.2d, v0.2d -; CHECK-NEXT: add v16.2d, v16.2d, v0.2d -; CHECK-NEXT: add v4.2d, v4.2d, v0.2d -; CHECK-NEXT: add v2.2d, v2.2d, v0.2d +; CHECK-NEXT: add v16.2d, v17.2d, v0.2d +; CHECK-NEXT: add v5.2d, v5.2d, v0.2d +; CHECK-NEXT: add v0.2d, v3.2d, v0.2d ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C +; CHECK-NEXT: ldr q3, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: stp q13, q12, [x8] ; CHECK-NEXT: stp q11, q10, [x8, #32] ; CHECK-NEXT: stp q9, q8, [x8, #64] -; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: stp q14, q6, [x8, #400] +; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: stp q31, q30, [x8, #96] -; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: stp q29, q28, [x8, #144] +; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: stp q27, q26, [x8, #176] +; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: str q25, [x8, #208] ; CHECK-NEXT: stp q24, q23, [x8, #240] ; CHECK-NEXT: stp q22, q21, [x8, #272] ; CHECK-NEXT: stp q20, q19, [x8, #304] -; CHECK-NEXT: stp q18, q17, [x8, #336] +; CHECK-NEXT: stp q3, q18, [x8, #336] ; CHECK-NEXT: stp q16, q7, [x8, #368] -; CHECK-NEXT: stp q6, q5, [x8, #400] -; CHECK-NEXT: stp q4, q15, [x8, #432] -; CHECK-NEXT: stp q14, q3, [x8, #464] -; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: str q2, [x8, #496] -; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: stp q5, q4, [x8, #432] +; CHECK-NEXT: stp q1, q2, [x8, 
#464] +; CHECK-NEXT: str q0, [x8, #496] ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore b8 diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -91,41 +91,41 @@ ; CHECK-NEXT: ext v6.16b, v6.16b, v6.16b, #4 ; CHECK-NEXT: ext v16.16b, v7.16b, v16.16b, #12 ; CHECK-NEXT: ext v17.16b, v3.16b, v17.16b, #12 -; CHECK-NEXT: mov v3.s[2], v5.s[3] -; CHECK-NEXT: mov v7.s[2], v2.s[3] -; CHECK-NEXT: mov v0.s[2], v2.s[1] -; CHECK-NEXT: uzp2 v4.4s, v4.4s, v18.4s -; CHECK-NEXT: sub v20.4s, v3.4s, v17.4s -; CHECK-NEXT: sub v21.4s, v7.4s, v16.4s ; CHECK-NEXT: mov v3.s[1], v5.s[2] ; CHECK-NEXT: mov v7.s[1], v2.s[2] +; CHECK-NEXT: mov v0.s[1], v2.s[0] +; CHECK-NEXT: uzp2 v4.4s, v4.4s, v18.4s +; CHECK-NEXT: add v20.4s, v3.4s, v17.4s +; CHECK-NEXT: add v21.4s, v7.4s, v16.4s +; CHECK-NEXT: mov v3.s[2], v5.s[3] +; CHECK-NEXT: mov v7.s[2], v2.s[3] ; CHECK-NEXT: sub v18.4s, v1.4s, v6.4s ; CHECK-NEXT: mov v6.s[0], v5.s[1] -; CHECK-NEXT: sub v19.4s, v0.4s, v4.4s -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: add v2.4s, v3.4s, v17.4s -; CHECK-NEXT: add v3.4s, v7.4s, v16.4s +; CHECK-NEXT: add v19.4s, v0.4s, v4.4s +; CHECK-NEXT: mov v0.s[2], v2.s[1] +; CHECK-NEXT: sub v2.4s, v3.4s, v17.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v16.4s ; CHECK-NEXT: add v1.4s, v1.4s, v6.4s -; CHECK-NEXT: mov v3.d[1], v21.d[1] -; CHECK-NEXT: mov v2.d[1], v20.d[1] -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: mov v21.d[1], v3.d[1] +; CHECK-NEXT: mov v20.d[1], v2.d[1] +; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s ; CHECK-NEXT: mov v1.d[1], v18.d[1] -; CHECK-NEXT: mov v0.d[1], v19.d[1] -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 -; CHECK-NEXT: cmlt v4.8h, v1.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v2.4s, v7.4s, v2.4s -; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: eor v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: mov v19.d[1], v0.d[1] +; CHECK-NEXT: cmlt v3.8h, v21.8h, #0 +; CHECK-NEXT: cmlt v4.8h, v20.8h, #0 +; CHECK-NEXT: cmlt v0.8h, v1.8h, #0 +; CHECK-NEXT: add v5.4s, v3.4s, v21.4s +; CHECK-NEXT: add v6.4s, v4.4s, v20.4s +; CHECK-NEXT: cmlt v2.8h, v19.8h, #0 +; CHECK-NEXT: add v1.4s, v0.4s, v1.4s +; CHECK-NEXT: eor v4.16b, v6.16b, v4.16b +; CHECK-NEXT: eor v3.16b, v5.16b, v3.16b +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: add v7.4s, v2.4s, v19.4s +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-NEXT: eor v1.16b, v7.16b, v2.16b +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 diff --git a/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll b/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll --- a/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll +++ b/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll @@ -25,7 +25,7 @@ ; CHECK-NEXT: sub x0, x29, #1 ; CHECK-NEXT: bl _bar ; CHECK-NEXT: ldurb w8, [x29, #-1] -; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: add w8, w8, #1 ; CHECK-NEXT: and x0, x8, #0xff ; CHECK-NEXT: sturb w8, [x29, #-1] ; CHECK-NEXT: ldp x29, x30, 
[sp, #16] ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/rotate-extract.ll b/llvm/test/CodeGen/AArch64/rotate-extract.ll --- a/llvm/test/CodeGen/AArch64/rotate-extract.ll +++ b/llvm/test/CodeGen/AArch64/rotate-extract.ll @@ -24,8 +24,8 @@ define i32 @ror_extract_shrl(i32 %i) nounwind { ; CHECK-LABEL: ror_extract_shrl: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #3 -; CHECK-NEXT: ror w0, w8, #4 +; CHECK-NEXT: ror w8, w0, #7 +; CHECK-NEXT: and w0, w8, #0xf1ffffff ; CHECK-NEXT: ret %lhs_div = lshr i32 %i, 7 %rhs_div = lshr i32 %i, 3 @@ -50,11 +50,11 @@ define i64 @ror_extract_udiv(i64 %i) nounwind { ; CHECK-LABEL: ror_extract_udiv: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-6148914691236517206 +; CHECK-NEXT: mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa ; CHECK-NEXT: movk x8, #43691 ; CHECK-NEXT: umulh x8, x0, x8 -; CHECK-NEXT: lsr x8, x8, #1 -; CHECK-NEXT: ror x0, x8, #4 +; CHECK-NEXT: ror x8, x8, #5 +; CHECK-NEXT: and x0, x8, #0xf7ffffffffffffff ; CHECK-NEXT: ret %lhs_div = udiv i64 %i, 3 %rhs_div = udiv i64 %i, 48 @@ -66,9 +66,10 @@ define i64 @ror_extract_mul_with_mask(i64 %i) nounwind { ; CHECK-LABEL: ror_extract_mul_with_mask: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, x0, lsl #3 -; CHECK-NEXT: ror x8, x8, #57 -; CHECK-NEXT: and x0, x8, #0xff +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: add x9, x0, x0, lsl #3 +; CHECK-NEXT: lsr x0, x9, #57 +; CHECK-NEXT: bfi x0, x8, #7, #1 ; CHECK-NEXT: ret %lhs_mul = mul i64 %i, 1152 %rhs_mul = mul i64 %i, 9 @@ -127,15 +128,15 @@ define i32 @no_extract_udiv(i32 %i) nounwind { ; CHECK-LABEL: no_extract_udiv: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #33437 -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: movk w8, #21399, lsl #16 -; CHECK-NEXT: movk w9, #43690, lsl #16 +; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: mov w9, #33437 // =0x829d +; CHECK-NEXT: movk w8, #43690, lsl #16 +; CHECK-NEXT: movk w9, #21399, lsl #16 ; CHECK-NEXT: umull x8, w0, w8 ; CHECK-NEXT: umull x9, w0, w9 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: lsr x9, x9, #33 -; CHECK-NEXT: extr w0, w9, w8, #4 +; CHECK-NEXT: lsr x8, x8, #33 +; CHECK-NEXT: lsr x9, x9, #36 +; CHECK-NEXT: orr w0, w9, w8, lsl #28 ; CHECK-NEXT: ret %lhs_div = udiv i32 %i, 3 %rhs_div = udiv i32 %i, 49 diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -133,15 +133,17 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-LABEL: v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1 { v0.b }[0], [x1] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: ld1 { v1.b }[0], [x0] -; CHECK-NEXT: add x9, x0, #1 -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: ldrsb w8, [x1] +; CHECK-NEXT: ldrsb w9, [x0] +; CHECK-NEXT: ldrsb w10, [x1, #1] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: ldrsb w9, [x0, #1] +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: shl v1.2s, v1.2s, #24 -; CHECK-NEXT: sqadd v0.2s, v1.2s, v0.2s +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 @@ -173,15 +175,17 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-LABEL: v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1 { v0.h }[0], [x1] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: ld1 { v1.h }[0], [x0] -; CHECK-NEXT: add x9, x0, #2 
-; CHECK-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: ldrsh w8, [x1] +; CHECK-NEXT: ldrsh w9, [x0] +; CHECK-NEXT: ldrsh w10, [x1, #2] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: ldrsh w9, [x0, #2] +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: sqadd v0.2s, v1.2s, v0.2s +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: unsigned_sat_constant_i8_using_min: ; CHECK: // %bb.0: ; CHECK-NEXT: and w9, w0, #0xff -; CHECK-NEXT: mov w8, #-43 +; CHECK-NEXT: mov w8, #-43 // =0xffffffd5 ; CHECK-NEXT: cmp w9, #213 ; CHECK-NEXT: csel w8, w0, w8, lo ; CHECK-NEXT: add w0, w8, #42 @@ -26,7 +26,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xff ; CHECK-NEXT: add w8, w8, #42 -; CHECK-NEXT: tst w8, #0x100 +; CHECK-NEXT: lsr w9, w8, #8 +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csinv w0, w8, wzr, eq ; CHECK-NEXT: ret %a = add i8 %x, 42 @@ -52,9 +53,9 @@ define i16 @unsigned_sat_constant_i16_using_min(i16 %x) { ; CHECK-LABEL: unsigned_sat_constant_i16_using_min: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65493 +; CHECK-NEXT: mov w8, #65493 // =0xffd5 ; CHECK-NEXT: cmp w8, w0, uxth -; CHECK-NEXT: mov w8, #-43 +; CHECK-NEXT: mov w8, #-43 // =0xffffffd5 ; CHECK-NEXT: csel w8, w0, w8, hi ; CHECK-NEXT: add w0, w8, #42 ; CHECK-NEXT: ret @@ -69,7 +70,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xffff ; CHECK-NEXT: add w8, w8, #42 -; CHECK-NEXT: tst w8, #0x10000 +; CHECK-NEXT: lsr w9, w8, #16 +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csinv w0, w8, wzr, eq ; CHECK-NEXT: ret %a = add i16 %x, 42 @@ -81,7 +83,7 @@ define i16 @unsigned_sat_constant_i16_using_cmp_notval(i16 %x) { ; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65493 +; CHECK-NEXT: mov w8, #65493 // =0xffd5 ; CHECK-NEXT: add w9, w0, #42 ; CHECK-NEXT: cmp w8, w0, uxth ; CHECK-NEXT: csinv w0, w9, wzr, hs @@ -95,7 +97,7 @@ define i32 @unsigned_sat_constant_i32_using_min(i32 %x) { ; CHECK-LABEL: unsigned_sat_constant_i32_using_min: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-43 +; CHECK-NEXT: mov w8, #-43 // =0xffffffd5 ; CHECK-NEXT: cmn w0, #43 ; CHECK-NEXT: csel w8, w0, w8, lo ; CHECK-NEXT: add w0, w8, #42 @@ -133,7 +135,7 @@ define i64 @unsigned_sat_constant_i64_using_min(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_using_min: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-43 +; CHECK-NEXT: mov x8, #-43 // =0xffffffffffffffd5 ; CHECK-NEXT: cmn x0, #43 ; CHECK-NEXT: csel x8, x0, x8, lo ; CHECK-NEXT: add x0, x8, #42 @@ -189,7 +191,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xff ; CHECK-NEXT: add w8, w8, w1, uxtb -; CHECK-NEXT: tst w8, #0x100 +; CHECK-NEXT: lsr w9, w8, #8 +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csinv w0, w8, wzr, eq ; CHECK-NEXT: ret %a = add i8 %x, %y @@ -204,7 +207,8 @@ ; CHECK-NEXT: and w8, w1, #0xff ; CHECK-NEXT: add w9, w0, w1 ; CHECK-NEXT: add w8, w8, w0, uxtb -; CHECK-NEXT: tst w8, #0x100 +; CHECK-NEXT: lsr w8, w8, #8 +; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: csinv w0, w9, wzr, eq ; CHECK-NEXT: ret %noty = xor i8 %y, -1 @@ -235,7 +239,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, 
#0xffff ; CHECK-NEXT: add w8, w8, w1, uxth -; CHECK-NEXT: tst w8, #0x10000 +; CHECK-NEXT: lsr w9, w8, #16 +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csinv w0, w8, wzr, eq ; CHECK-NEXT: ret %a = add i16 %x, %y @@ -250,7 +255,8 @@ ; CHECK-NEXT: and w8, w1, #0xffff ; CHECK-NEXT: add w9, w0, w1 ; CHECK-NEXT: add w8, w8, w0, uxth -; CHECK-NEXT: tst w8, #0x10000 +; CHECK-NEXT: lsr w8, w8, #16 +; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: csinv w0, w9, wzr, eq ; CHECK-NEXT: ret %noty = xor i16 %y, -1 @@ -459,9 +465,9 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) { ; CHECK-LABEL: unsigned_sat_constant_v2i64_using_min: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-43 +; CHECK-NEXT: mov x8, #-43 // =0xffffffffffffffd5 ; CHECK-NEXT: dup v1.2d, x8 -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: dup v1.2d, x8 @@ -476,7 +482,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) { ; CHECK-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: uqadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret @@ -489,7 +495,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) { ; CHECK-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: uqadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/select_fmf.ll b/llvm/test/CodeGen/AArch64/select_fmf.ll --- a/llvm/test/CodeGen/AArch64/select_fmf.ll +++ b/llvm/test/CodeGen/AArch64/select_fmf.ll @@ -7,11 +7,12 @@ define float @select_select_fold_select_and(float %w, float %x, float %y, float %z) { ; CHECK-LABEL: select_select_fold_select_and: ; CHECK: // %bb.0: -; CHECK-NEXT: fminnm s5, s1, s2 +; CHECK-NEXT: fmaxnm s5, s0, s3 +; CHECK-NEXT: fminnm s6, s1, s2 ; CHECK-NEXT: fcmp s1, s2 -; CHECK-NEXT: fmaxnm s1, s0, s3 ; CHECK-NEXT: fmov s4, #0.50000000 -; CHECK-NEXT: fccmp s5, s0, #4, lt +; CHECK-NEXT: fcsel s1, s5, s0, lt +; CHECK-NEXT: fcmp s6, s0 ; CHECK-NEXT: fcsel s2, s1, s0, gt ; CHECK-NEXT: fadd s1, s0, s4 ; CHECK-NEXT: fadd s4, s1, s2 @@ -22,8 +23,8 @@ ; CHECK-NEXT: fadd s0, s2, s0 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_2: // %if.end.i159.i.i -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #13107 +; CHECK-NEXT: mov w8, #52429 // =0xcccd +; CHECK-NEXT: mov w9, #13107 // =0x3333 ; CHECK-NEXT: movk w8, #48844, lsl #16 ; CHECK-NEXT: movk w9, #48819, lsl #16 ; CHECK-NEXT: fcmp s1, #0.0 @@ -65,11 +66,12 @@ define float @select_select_fold_select_or(float %w, float %x, float %y, float %z) { ; CHECK-LABEL: select_select_fold_select_or: ; CHECK: // %bb.0: -; CHECK-NEXT: fminnm s5, s1, s2 +; CHECK-NEXT: fmaxnm s5, s0, s3 +; CHECK-NEXT: fminnm s6, s1, s2 ; CHECK-NEXT: fcmp s1, s2 -; CHECK-NEXT: fmaxnm s1, s0, s3 ; CHECK-NEXT: fmov s4, #0.50000000 -; CHECK-NEXT: fccmp s5, s0, #0, ge +; CHECK-NEXT: fcsel s1, s0, s5, lt +; CHECK-NEXT: fcmp s6, s0 ; CHECK-NEXT: fcsel s2, s0, s1, gt ; CHECK-NEXT: fadd s1, s0, s4 ; CHECK-NEXT: fadd s4, s1, s2 @@ -80,8 +82,8 @@ ; CHECK-NEXT: fadd s0, s2, s0 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_2: // %if.end.i159.i.i -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #13107 +; CHECK-NEXT: mov w8, #52429 // =0xcccd +; CHECK-NEXT: mov w9, #13107 // =0x3333 ; CHECK-NEXT: movk w8, #48844, lsl #16 ; CHECK-NEXT: movk w9, 
#48819, lsl #16 ; CHECK-NEXT: fcmp s1, #0.0 diff --git a/llvm/test/CodeGen/AArch64/setcc-fsh.ll b/llvm/test/CodeGen/AArch64/setcc-fsh.ll --- a/llvm/test/CodeGen/AArch64/setcc-fsh.ll +++ b/llvm/test/CodeGen/AArch64/setcc-fsh.ll @@ -63,7 +63,9 @@ define i1 @fshr_or_eq_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or_eq_0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, w0, w1, lsl #8 +; CHECK-NEXT: lsl w8, w0, #16 +; CHECK-NEXT: orr w9, w0, w1 +; CHECK-NEXT: extr w8, w9, w8, #24 ; CHECK-NEXT: tst w8, #0xffff ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -76,7 +78,9 @@ define i1 @fshr_or_commute_eq_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or_commute_eq_0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, w0, w1, lsl #8 +; CHECK-NEXT: lsl w8, w0, #16 +; CHECK-NEXT: orr w9, w1, w0 +; CHECK-NEXT: extr w8, w9, w8, #24 ; CHECK-NEXT: tst w8, #0xffff ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -156,7 +160,8 @@ define i1 @fshr_or_ne_0(i64 %x, i64 %y) { ; CHECK-LABEL: fshr_or_ne_0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1, lsl #63 +; CHECK-NEXT: orr w8, w0, w1 +; CHECK-NEXT: extr x8, x8, x0, #1 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -169,7 +174,8 @@ define i1 @fshr_or_commute_ne_0(i64 %x, i64 %y) { ; CHECK-LABEL: fshr_or_commute_ne_0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1, lsl #63 +; CHECK-NEXT: orr w8, w1, w0 +; CHECK-NEXT: extr x8, x8, x0, #1 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -182,8 +188,9 @@ define i1 @fshr_or2_ne_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or2_ne_0: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w1, #0xfffc -; CHECK-NEXT: orr w8, w0, w8, lsr #2 +; CHECK-NEXT: orr w8, w0, w1 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: extr w8, w0, w8, #18 ; CHECK-NEXT: tst w8, #0xffff ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -196,8 +203,9 @@ define i1 @fshr_or2_commute_ne_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or2_commute_ne_0: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w1, #0xfffc -; CHECK-NEXT: orr w8, w0, w8, lsr #2 +; CHECK-NEXT: orr w8, w1, w0 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: extr w8, w0, w8, #18 ; CHECK-NEXT: tst w8, #0xffff ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/shift-accumulate.ll b/llvm/test/CodeGen/AArch64/shift-accumulate.ll --- a/llvm/test/CodeGen/AArch64/shift-accumulate.ll +++ b/llvm/test/CodeGen/AArch64/shift-accumulate.ll @@ -92,8 +92,8 @@ define <1 x i64> @ssra_v1i64(<2 x i32> %0) { ; CHECK-LABEL: ssra_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr d1, d0, #63 ; CHECK-NEXT: bic v0.2s, #64, lsl #24 +; CHECK-NEXT: ushr d1, d0, #63 ; CHECK-NEXT: ssra d1, d0, #62 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret @@ -108,8 +108,8 @@ define <2 x i64> @ssra_v2i64(<4 x i32> %0) { ; CHECK-LABEL: ssra_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr v1.2d, v0.2d, #63 ; CHECK-NEXT: bic v0.4s, #64, lsl #24 +; CHECK-NEXT: ushr v1.2d, v0.2d, #63 ; CHECK-NEXT: ssra v1.2d, v0.2d, #62 ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll --- a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll +++ b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll @@ -61,7 +61,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #32 +; CHECK-NEXT: mov w10, #32 // =0x20 ; CHECK-NEXT: lsl w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -128,7 +128,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: 
mov w10, #64 +; CHECK-NEXT: mov w10, #64 // =0x40 ; CHECK-NEXT: lsl x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -198,7 +198,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #32 +; CHECK-NEXT: mov w10, #32 // =0x20 ; CHECK-NEXT: lsr w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -265,7 +265,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w10, #64 +; CHECK-NEXT: mov w10, #64 // =0x40 ; CHECK-NEXT: lsr x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -335,7 +335,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #32 +; CHECK-NEXT: mov w10, #32 // =0x20 ; CHECK-NEXT: asr w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -402,7 +402,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w10, #64 +; CHECK-NEXT: mov w10, #64 // =0x40 ; CHECK-NEXT: asr x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -476,7 +476,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #31 +; CHECK-NEXT: mov w10, #31 // =0x1f ; CHECK-NEXT: lsl w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -543,7 +543,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w10, #63 +; CHECK-NEXT: mov w10, #63 // =0x3f ; CHECK-NEXT: lsl x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -613,7 +613,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #31 +; CHECK-NEXT: mov w10, #31 // =0x1f ; CHECK-NEXT: lsr w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -680,7 +680,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w10, #63 +; CHECK-NEXT: mov w10, #63 // =0x3f ; CHECK-NEXT: lsr x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -750,7 +750,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #31 +; CHECK-NEXT: mov w10, #31 // =0x1f ; CHECK-NEXT: asr w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -817,7 +817,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w10, #63 +; CHECK-NEXT: mov w10, #63 // =0x3f ; CHECK-NEXT: asr x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -1030,7 +1030,7 @@ define i32 @reg32_lshr_by_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b) nounwind { ; CHECK-LABEL: reg32_lshr_by_b_sub_negated_unfolded: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w2, w1 +; CHECK-NEXT: add w8, w1, w2 ; CHECK-NEXT: lsr w0, w0, w8 ; CHECK-NEXT: ret %nega = sub i32 0, %a @@ -1042,7 +1042,7 @@ define i64 @reg64_lshr_by_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b) nounwind { ; CHECK-LABEL: reg64_lshr_by_b_sub_negated_unfolded: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x2, x1 +; CHECK-NEXT: add x8, x1, x2 ; CHECK-NEXT: lsr x0, x0, x8 ; CHECK-NEXT: ret %nega = sub i64 0, %a diff --git a/llvm/test/CodeGen/AArch64/shift-by-signext.ll b/llvm/test/CodeGen/AArch64/shift-by-signext.ll --- a/llvm/test/CodeGen/AArch64/shift-by-signext.ll +++ b/llvm/test/CodeGen/AArch64/shift-by-signext.ll @@ -80,12 +80,12 @@ define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind { ; CHECK-LABEL: n6_fshl: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def 
$x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsr w9, w1, #1 -; CHECK-NEXT: lsl w10, w0, w2 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsr w10, w1, #1 +; CHECK-NEXT: lsr w9, w10, w9 +; CHECK-NEXT: lsl w8, w0, w8 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret %shamt_wide = sext i8 %shamt to i32 %r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %shamt_wide) @@ -94,12 +94,12 @@ define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind { ; CHECK-LABEL: n7_fshr: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsl w9, w0, #1 -; CHECK-NEXT: lsr w10, w1, w2 -; CHECK-NEXT: lsl w8, w9, w8 -; CHECK-NEXT: orr w0, w8, w10 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsl w10, w0, #1 +; CHECK-NEXT: lsr w8, w1, w8 +; CHECK-NEXT: lsl w9, w10, w9 +; CHECK-NEXT: orr w0, w9, w8 ; CHECK-NEXT: ret %shamt_wide = sext i8 %shamt to i32 %r = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt_wide) diff --git a/llvm/test/CodeGen/AArch64/shiftregister-from-and.ll b/llvm/test/CodeGen/AArch64/shiftregister-from-and.ll --- a/llvm/test/CodeGen/AArch64/shiftregister-from-and.ll +++ b/llvm/test/CodeGen/AArch64/shiftregister-from-and.ll @@ -21,7 +21,7 @@ define i64 @bic_shiftedreg_from_and(i64 %a, i64 %b) { ; CHECK-LABEL: bic_shiftedreg_from_and: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #16777215 +; CHECK-NEXT: mov w8, #16777215 // =0xffffff ; CHECK-NEXT: orn x8, x8, x0, asr #23 ; CHECK-NEXT: and x0, x1, x8 ; CHECK-NEXT: ret @@ -37,8 +37,9 @@ define i64 @eon_shiftedreg_from_and(i64 %a, i64 %b) { ; CHECK-LABEL: eon_shiftedreg_from_and: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #17 -; CHECK-NEXT: eon x0, x1, x8, lsl #53 +; CHECK-NEXT: mov x8, #9007199254740991 // =0x1fffffffffffff +; CHECK-NEXT: orn x8, x8, x0, lsl #36 +; CHECK-NEXT: eor x0, x1, x8 ; CHECK-NEXT: ret %shl = shl i64 %a, 36 %and = and i64 %shl, -9007199254740992 @@ -67,7 +68,7 @@ define i64 @mvn_shiftedreg_from_and(i64 %a) { ; CHECK-LABEL: mvn_shiftedreg_from_and: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #9007199254740991 +; CHECK-NEXT: mov x8, #9007199254740991 // =0x1fffffffffffff ; CHECK-NEXT: orn x0, x8, x0, lsl #36 ; CHECK-NEXT: ret %shl = shl i64 %a, 36 @@ -205,7 +206,7 @@ define i32 @shiftedreg_from_and_negative_andc1(i32 %a, i32 %b) { ; CHECK-LABEL: shiftedreg_from_and_negative_andc1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #26215 +; CHECK-NEXT: mov w8, #26215 // =0x6667 ; CHECK-NEXT: movk w8, #65510, lsl #16 ; CHECK-NEXT: and w8, w8, w0, asr #23 ; CHECK-NEXT: add w0, w8, w1 @@ -221,7 +222,7 @@ define i32 @shiftedreg_from_and_negative_andc2(i32 %a, i32 %b) { ; CHECK-LABEL: shiftedreg_from_and_negative_andc2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-285212672 +; CHECK-NEXT: mov w8, #-285212672 // =0xef000000 ; CHECK-NEXT: and w8, w8, w0, asr #23 ; CHECK-NEXT: add w0, w8, w1 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll --- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll +++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll @@ -182,10 +182,10 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] ; CHECK-NEXT: adrp x8, .LCPI3_2 ; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI3_1] -; CHECK-NEXT: tbl v1.16b, { v0.16b }, v1.16b -; CHECK-NEXT: tbl v0.16b, { v2.16b }, v3.16b -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_2] -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: tbl v2.16b, { v2.16b }, v1.16b +; CHECK-NEXT: tbl 
v1.16b, { v0.16b }, v3.16b +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b ; CHECK-NEXT: ret %x = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %y = shufflevector <16 x i8> %c, <16 x i8> %d, <8 x i32> @@ -429,13 +429,13 @@ define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %ce, <4 x i32> %de) { ; CHECK-LABEL: shuffle4_v4i32_trunc: ; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: xtn v4.4h, v0.4s -; CHECK-NEXT: xtn v5.4h, v1.4s -; CHECK-NEXT: xtn v6.4h, v2.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: xtn v7.4h, v3.4s -; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: xtn v3.8b, v0.8h +; CHECK-NEXT: xtn v4.8b, v2.8h +; CHECK-NEXT: tbl v0.16b, { v3.16b, v4.16b }, v1.16b ; CHECK-NEXT: ret %a = trunc <4 x i32> %ae to <4 x i8> %b = trunc <4 x i32> %be to <4 x i8> @@ -559,19 +559,17 @@ define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) { ; CHECK-LABEL: insert4_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: adrp x9, .LCPI14_1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v4.16b, v3.16b +; CHECK-NEXT: dup v4.8b, v0.b[4] ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v0.d[1], v2.d[0] -; CHECK-NEXT: mov v3.16b, v1.16b -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_1] -; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b -; CHECK-NEXT: tbl v1.16b, { v3.16b, v4.16b }, v2.16b -; CHECK-NEXT: trn1 v0.4h, v1.4h, v0.4h -; CHECK-NEXT: trn2 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: mov v4.b[1], v2.b[0] +; CHECK-NEXT: mov v4.b[2], v1.b[15] +; CHECK-NEXT: mov v4.b[3], v3.b[11] +; CHECK-NEXT: mov v4.b[4], v2.b[6] +; CHECK-NEXT: mov v4.b[5], v0.b[3] +; CHECK-NEXT: mov v4.b[6], v3.b[8] +; CHECK-NEXT: mov v4.b[7], v1.b[12] +; CHECK-NEXT: fmov d0, d4 ; CHECK-NEXT: ret %e1 = extractelement <8 x i8> %a, i32 4 %e2 = extractelement <8 x i8> %c, i32 0 @@ -629,17 +627,25 @@ define <16 x i8> @insert4_v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) { ; CHECK-LABEL: insert4_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q31_q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v4.8b, v0.b[4] ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v4.16b, v3.16b -; CHECK-NEXT: mov v3.16b, v1.16b -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: adrp x8, .LCPI15_1 -; CHECK-NEXT: mov v0.d[1], v2.d[0] -; CHECK-NEXT: tbl v31.16b, { v3.16b, v4.16b }, v5.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1] -; CHECK-NEXT: tbl v0.16b, { v31.16b, v0.16b }, v1.16b +; CHECK-NEXT: mov v4.b[1], v2.b[0] +; CHECK-NEXT: mov v4.b[2], v1.b[15] +; CHECK-NEXT: mov v4.b[3], v3.b[11] +; CHECK-NEXT: mov v4.b[4], v2.b[6] +; CHECK-NEXT: mov v4.b[5], v0.b[3] +; CHECK-NEXT: mov v4.b[6], v3.b[8] +; CHECK-NEXT: mov v4.b[7], v1.b[12] +; CHECK-NEXT: mov v4.b[8], v0.b[4] +; CHECK-NEXT: mov v4.b[9], v2.b[0] +; CHECK-NEXT: mov v4.b[10], v1.b[15] +; CHECK-NEXT: mov v4.b[11], v3.b[11] +; CHECK-NEXT: mov v4.b[12], v2.b[6] +; CHECK-NEXT: mov v4.b[13], v0.b[3] +; CHECK-NEXT: mov v4.b[14], v3.b[8] +; CHECK-NEXT: mov v4.b[15], v1.b[12] +; CHECK-NEXT: mov v0.16b, v4.16b ; CHECK-NEXT: ret %e1 = extractelement <8 x i8> %a, i32 4 %e2 = extractelement <8 x i8> 
%c, i32 0 @@ -698,35 +704,52 @@ ; CHECK-LABEL: test: ; CHECK: // %bb.0: ; CHECK-NEXT: frintm v0.2d, v0.2d -; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: frintm v4.2d, v4.2d ; CHECK-NEXT: frintm v1.2d, v1.2d -; CHECK-NEXT: frintm v5.2d, v5.2d ; CHECK-NEXT: frintm v2.2d, v2.2d -; CHECK-NEXT: frintm v6.2d, v6.2d ; CHECK-NEXT: frintm v3.2d, v3.2d -; CHECK-NEXT: frintm v7.2d, v7.2d ; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: fcvtzs v4.2d, v4.2d ; CHECK-NEXT: fcvtzs v1.2d, v1.2d +; CHECK-NEXT: frintm v4.2d, v4.2d +; CHECK-NEXT: frintm v5.2d, v5.2d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: xtn v16.2s, v1.2d +; CHECK-NEXT: fcvtzs v1.2d, v2.2d +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fcvtzs v2.2d, v3.2d +; CHECK-NEXT: mov w9, v16.s[1] +; CHECK-NEXT: fcvtzs v4.2d, v4.2d +; CHECK-NEXT: frintm v3.2d, v7.2d ; CHECK-NEXT: fcvtzs v5.2d, v5.2d -; CHECK-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-NEXT: fcvtzs v6.2d, v6.2d +; CHECK-NEXT: xtn v7.2s, v1.2d +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: xtn v2.2s, v2.2d +; CHECK-NEXT: mov w8, v7.s[1] +; CHECK-NEXT: xtn v4.2s, v4.2d +; CHECK-NEXT: mov v0.h[1], v16.h[0] +; CHECK-NEXT: xtn v5.2s, v5.2d +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: mov w9, v2.s[1] +; CHECK-NEXT: frintm v6.2d, v6.2d ; CHECK-NEXT: fcvtzs v3.2d, v3.2d -; CHECK-NEXT: xtn v16.2s, v0.2d -; CHECK-NEXT: fcvtzs v0.2d, v7.2d -; CHECK-NEXT: xtn v20.2s, v4.2d -; CHECK-NEXT: xtn v17.2s, v1.2d -; CHECK-NEXT: xtn v21.2s, v5.2d -; CHECK-NEXT: xtn v18.2s, v2.2d -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: xtn v22.2s, v6.2d -; CHECK-NEXT: xtn v19.2s, v3.2d -; CHECK-NEXT: xtn v23.2s, v0.2d -; CHECK-NEXT: tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b -; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b -; CHECK-NEXT: uzp1 v0.8h, v2.8h, v1.8h -; CHECK-NEXT: uzp2 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: mov v0.h[2], v7.h[0] +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: mov w8, v4.s[1] +; CHECK-NEXT: fcvtzs v6.2d, v6.2d +; CHECK-NEXT: xtn v3.2s, v3.2d +; CHECK-NEXT: mov v0.h[3], v2.h[0] +; CHECK-NEXT: mov v1.h[3], w9 +; CHECK-NEXT: mov w9, v5.s[1] +; CHECK-NEXT: xtn v6.2s, v6.2d +; CHECK-NEXT: mov v0.h[4], v4.h[0] +; CHECK-NEXT: mov v1.h[4], w8 +; CHECK-NEXT: mov w8, v6.s[1] +; CHECK-NEXT: mov v0.h[5], v5.h[0] +; CHECK-NEXT: mov v1.h[5], w9 +; CHECK-NEXT: mov w9, v3.s[1] +; CHECK-NEXT: mov v0.h[6], v6.h[0] +; CHECK-NEXT: mov v1.h[6], w8 +; CHECK-NEXT: mov v0.h[7], v3.h[0] +; CHECK-NEXT: mov v1.h[7], w9 ; CHECK-NEXT: ret %l214 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l213) %l215 = fptosi <2 x double> %l214 to <2 x i16> diff --git a/llvm/test/CodeGen/AArch64/signbit-shift.ll b/llvm/test/CodeGen/AArch64/signbit-shift.ll --- a/llvm/test/CodeGen/AArch64/signbit-shift.ll +++ b/llvm/test/CodeGen/AArch64/signbit-shift.ll @@ -43,7 +43,7 @@ define i32 @sel_ifpos_tval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifpos_tval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: mov w8, #41 // =0x29 ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, ge ; CHECK-NEXT: ret @@ -66,8 +66,9 @@ define i32 @add_sext_ifpos(i32 %x) { ; CHECK-LABEL: add_sext_ifpos: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #31 -; CHECK-NEXT: add w0, w8, #41 +; CHECK-NEXT: mvn w8, w0 +; CHECK-NEXT: asr w8, w8, #31 +; CHECK-NEXT: add w0, w8, #42 ; CHECK-NEXT: ret %c = icmp sgt i32 %x, -1 %e = sext i1 %c to i32 @@ -92,7 +93,7 @@ define i32 @sel_ifpos_fval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifpos_fval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: mov 
w8, #41 // =0x29 ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, lt ; CHECK-NEXT: ret @@ -128,7 +129,7 @@ define i32 @sel_ifneg_tval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifneg_tval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: mov w8, #41 // =0x29 ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, lt ; CHECK-NEXT: ret @@ -162,7 +163,7 @@ define i32 @sel_ifneg_fval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifneg_fval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: mov w8, #41 // =0x29 ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, ge ; CHECK-NEXT: ret @@ -199,7 +200,7 @@ define i32 @sub_lshr_not(i32 %x) { ; CHECK-LABEL: sub_lshr_not: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: bfxil w8, w0, #31, #1 ; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/signbit-test.ll b/llvm/test/CodeGen/AArch64/signbit-test.ll --- a/llvm/test/CodeGen/AArch64/signbit-test.ll +++ b/llvm/test/CodeGen/AArch64/signbit-test.ll @@ -4,7 +4,7 @@ define i64 @test_clear_mask_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i64_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: csel x0, x8, x0, ge ; CHECK-NEXT: ret @@ -22,9 +22,9 @@ define i64 @test_set_mask_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: test_set_mask_i64_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 -; CHECK-NEXT: tst x0, #0x80000000 -; CHECK-NEXT: csel x0, x8, x0, ne +; CHECK-NEXT: mov w8, #42 // =0x2a +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: csel x0, x8, x0, lt ; CHECK-NEXT: ret entry: %a = and i64 %x, 2147483648 @@ -40,7 +40,7 @@ define i64 @test_clear_mask_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i64_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst x0, #0x8000 ; CHECK-NEXT: csel x0, x8, x0, eq ; CHECK-NEXT: ret @@ -58,7 +58,7 @@ define i64 @test_set_mask_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: test_set_mask_i64_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst x0, #0x8000 ; CHECK-NEXT: csel x0, x8, x0, ne ; CHECK-NEXT: ret @@ -76,7 +76,7 @@ define i64 @test_clear_mask_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i64_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst x0, #0x80 ; CHECK-NEXT: csel x0, x8, x0, eq ; CHECK-NEXT: ret @@ -94,7 +94,7 @@ define i64 @test_set_mask_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: test_set_mask_i64_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst x0, #0x80 ; CHECK-NEXT: csel x0, x8, x0, ne ; CHECK-NEXT: ret @@ -112,7 +112,7 @@ define i32 @test_clear_mask_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i32_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst w0, #0x8000 ; CHECK-NEXT: csel w0, w8, w0, eq ; CHECK-NEXT: ret @@ -130,7 +130,7 @@ define i32 @test_set_mask_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: test_set_mask_i32_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst w0, #0x8000 ; CHECK-NEXT: csel w0, w8, w0, ne ; CHECK-NEXT: ret @@ -148,7 +148,7 @@ define i32 @test_clear_mask_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i32_i8: ; CHECK: // %bb.0: // 
%entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst w0, #0x80 ; CHECK-NEXT: csel w0, w8, w0, eq ; CHECK-NEXT: ret @@ -166,7 +166,7 @@ define i32 @test_set_mask_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: test_set_mask_i32_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst w0, #0x80 ; CHECK-NEXT: csel w0, w8, w0, ne ; CHECK-NEXT: ret @@ -184,7 +184,7 @@ define i16 @test_clear_mask_i16_i8(i16 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i16_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst w0, #0x80 ; CHECK-NEXT: csel w0, w8, w0, eq ; CHECK-NEXT: ret @@ -202,8 +202,9 @@ define i16 @test_set_mask_i16_i8(i16 %x) nounwind { ; CHECK-LABEL: test_set_mask_i16_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 -; CHECK-NEXT: tst w0, #0x80 +; CHECK-NEXT: ubfx w9, w0, #7, #1 +; CHECK-NEXT: mov w8, #42 // =0x2a +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csel w0, w8, w0, ne ; CHECK-NEXT: ret entry: @@ -220,8 +221,9 @@ define i16 @test_set_mask_i16_i7(i16 %x) nounwind { ; CHECK-LABEL: test_set_mask_i16_i7: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 -; CHECK-NEXT: tst w0, #0x40 +; CHECK-NEXT: ubfx w9, w0, #6, #1 +; CHECK-NEXT: mov w8, #42 // =0x2a +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csel w0, w8, w0, ne ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll --- a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll +++ b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll @@ -1,41 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s define i128 @ldp_single_csdb(ptr %p) speculative_load_hardening { +; CHECK-LABEL: ldp_single_csdb: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x1, [x0] +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and x8, x8, x16 +; CHECK-NEXT: and x1, x1, x16 +; CHECK-NEXT: csdb +; CHECK-NEXT: mov x2, sp +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: and x2, x2, x16 +; CHECK-NEXT: mov sp, x2 +; CHECK-NEXT: ret entry: %0 = load i128, ptr %p, align 16 ret i128 %0 -; CHECK-LABEL: ldp_single_csdb -; CHECK: ldp x8, x1, [x0] -; CHECK-NEXT: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and x8, x8, x16 -; CHECK-NEXT: and x1, x1, x16 -; CHECK-NEXT: csdb -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: mov x0, x8 -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret } define double @ld_double(ptr %p) speculative_load_hardening { +; CHECK-LABEL: ld_double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and x0, x0, x16 +; CHECK-NEXT: csdb +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: and x0, x0, x16 +; CHECK-NEXT: mov sp, x0 +; CHECK-NEXT: ret entry: %0 = load double, ptr %p, align 8 ret double %0 ; Checking that the address loaded from is masked for a floating point load. 
-; CHECK-LABEL: ld_double -; CHECK: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and x0, x0, x16 -; CHECK-NEXT: csdb -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret } define i32 @csdb_emitted_for_subreg_use(ptr %p, i32 %b) speculative_load_hardening { +; CHECK-LABEL: csdb_emitted_for_subreg_use: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and x8, x8, x16 +; CHECK-NEXT: csdb +; CHECK-NEXT: add w9, w1, w8 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: csel w0, w1, w9, eq +; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: and x1, x1, x16 +; CHECK-NEXT: mov sp, x1 +; CHECK-NEXT: ret entry: %X = load i64, ptr %p, align 8 %X_trunc = trunc i64 %X to i32 @@ -44,23 +61,24 @@ %ret = select i1 %iszero, i32 %b, i32 %add ret i32 %ret ; Checking that the loaded value is masked for an integer load. -; CHECK-LABEL: csdb_emitted_for_subreg_use -; CHECK: ldr x8, [x0] -; CHECK-NEXT: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and x8, x8, x16 ; csdb instruction must occur before the add instruction with w8 as operand. -; CHECK-NEXT: csdb -; CHECK-NEXT: add w9, w1, w8 -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: csel w0, w1, w9, eq -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret } define i64 @csdb_emitted_for_superreg_use(ptr %p, i64 %b) speculative_load_hardening { +; CHECK-LABEL: csdb_emitted_for_superreg_use: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and w8, w8, w16 +; CHECK-NEXT: csdb +; CHECK-NEXT: add x9, x1, x8 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: csel x0, x1, x9, eq +; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: and x1, x1, x16 +; CHECK-NEXT: mov sp, x1 +; CHECK-NEXT: ret entry: %X = load i32, ptr %p, align 4 %X_ext = zext i32 %X to i64 @@ -69,88 +87,84 @@ %ret = select i1 %iszero, i64 %b, i64 %add ret i64 %ret ; Checking that the loaded value is masked for an integer load. -; CHECK-LABEL: csdb_emitted_for_superreg_use -; CHECK: ldr w8, [x0] -; CHECK-NEXT: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and w8, w8, w16 ; csdb instruction must occur before the add instruction with x8 as operand. 
-; CHECK-NEXT: csdb -; CHECK-NEXT: add x9, x1, x8 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel x0, x1, x9, eq -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret } define i64 @no_masking_with_full_control_flow_barriers(i64 %a, i64 %b, ptr %p) speculative_load_hardening { -; CHECK-LABEL: no_masking_with_full_control_flow_barriers -; CHECK: dsb sy -; CHECK: isb +; CHECK-LABEL: no_masking_with_full_control_flow_barriers: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dsb sy +; CHECK-NEXT: isb +; CHECK-NEXT: ldr x8, [x2] +; CHECK-NEXT: mov x17, x0 +; CHECK-NEXT: mov x16, x1 +; CHECK-NEXT: //APP +; CHECK-NEXT: hint #12 +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: add x0, x8, x17 +; CHECK-NEXT: ret entry: %0 = tail call i64 asm "hint #12", "={x17},{x16},0"(i64 %b, i64 %a) %X = load i64, ptr %p, align 8 %ret = add i64 %X, %0 -; CHECK-NOT: csdb -; CHECK-NOT: and -; CHECK: ret ret i64 %ret } define void @f_implicitdef_vector_load(ptr %dst, ptr %src) speculative_load_hardening +; CHECK-LABEL: f_implicitdef_vector_load: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and x1, x1, x16 +; CHECK-NEXT: csdb +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: mov v0.d[1], v0.d[0] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: and x0, x0, x16 +; CHECK-NEXT: mov sp, x0 +; CHECK-NEXT: ret { entry: %0 = load <2 x i32>, ptr %src, align 8 %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> store <4 x i32> %shuffle, ptr %dst, align 4 ret void -; CHECK-LABEL: f_implicitdef_vector_load -; CHECK: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and x1, x1, x16 -; CHECK-NEXT: csdb -; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: mov v0.d[1], v0.d[0] -; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret } define <2 x double> @f_usedefvectorload(ptr %a, ptr %b) speculative_load_hardening { +; CHECK-LABEL: f_usedefvectorload: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and x1, x1, x16 +; CHECK-NEXT: csdb +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: and x0, x0, x16 +; CHECK-NEXT: mov sp, x0 +; CHECK-NEXT: ret entry: -; CHECK-LABEL: f_usedefvectorload -; CHECK: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and x1, x1, x16 -; CHECK-NEXT: csdb -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret %0 = load double, ptr %b, align 16 %vld1_lane = insertelement <2 x double> , double %0, i32 0 ret <2 x double> %vld1_lane } define i32 @deadload() speculative_load_hardening uwtable { +; CHECK-LABEL: deadload: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr w8, [sp, #12] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: and x0, x0, x16 +; CHECK-NEXT: mov sp, x0 +; CHECK-NEXT: ret entry: -; CHECK-LABEL: deadload -; CHECK: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr w8, [sp, #12] -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; 
CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret %a = alloca i32, align 4 %val = load volatile i32, ptr %a, align 4 ret i32 undef diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll --- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -265,12 +265,17 @@ ; CHECK-LABEL: frsqrt: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte s1, s0 +; CHECK-NEXT: fcmp s0, #0.0 ; CHECK-NEXT: fmul s2, s1, s1 ; CHECK-NEXT: frsqrts s2, s0, s2 ; CHECK-NEXT: fmul s1, s1, s2 ; CHECK-NEXT: fmul s2, s1, s1 -; CHECK-NEXT: frsqrts s0, s0, s2 -; CHECK-NEXT: fmul s0, s1, s0 +; CHECK-NEXT: fmul s1, s0, s1 +; CHECK-NEXT: frsqrts s2, s0, s2 +; CHECK-NEXT: fmul s1, s1, s2 +; CHECK-NEXT: fcsel s0, s0, s1, eq +; CHECK-NEXT: fmov s1, #1.00000000 +; CHECK-NEXT: fdiv s0, s1, s0 ; CHECK-NEXT: ret %1 = tail call fast float @llvm.sqrt.f32(float %a) %2 = fdiv fast float 1.000000e+00, %1 @@ -287,13 +292,18 @@ ; ; CHECK-LABEL: f2rsqrt: ; CHECK: // %bb.0: -; CHECK-NEXT: frsqrte v1.2s, v0.2s -; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s -; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s -; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s -; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s -; CHECK-NEXT: frsqrts v0.2s, v0.2s, v2.2s -; CHECK-NEXT: fmul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: frsqrte v2.2s, v0.2s +; CHECK-NEXT: fmov v1.2s, #1.00000000 +; CHECK-NEXT: fmul v3.2s, v2.2s, v2.2s +; CHECK-NEXT: frsqrts v3.2s, v0.2s, v3.2s +; CHECK-NEXT: fmul v2.2s, v2.2s, v3.2s +; CHECK-NEXT: fmul v3.2s, v2.2s, v2.2s +; CHECK-NEXT: fmul v2.2s, v0.2s, v2.2s +; CHECK-NEXT: frsqrts v3.2s, v0.2s, v3.2s +; CHECK-NEXT: fmul v2.2s, v2.2s, v3.2s +; CHECK-NEXT: fcmeq v3.2s, v0.2s, #0.0 +; CHECK-NEXT: bif v0.8b, v2.8b, v3.8b +; CHECK-NEXT: fdiv v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ret %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) %2 = fdiv fast <2 x float> , %1 @@ -310,13 +320,18 @@ ; ; CHECK-LABEL: f4rsqrt: ; CHECK: // %bb.0: -; CHECK-NEXT: frsqrte v1.4s, v0.4s -; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s -; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s -; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s -; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s -; CHECK-NEXT: frsqrts v0.4s, v0.4s, v2.4s -; CHECK-NEXT: fmul v0.4s, v1.4s, v0.4s +; CHECK-NEXT: frsqrte v2.4s, v0.4s +; CHECK-NEXT: fmov v1.4s, #1.00000000 +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s +; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s +; CHECK-NEXT: fmul v2.4s, v0.4s, v2.4s +; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fcmeq v3.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: fdiv v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) %2 = fdiv fast <4 x float> , %1 @@ -335,20 +350,29 @@ ; ; CHECK-LABEL: f8rsqrt: ; CHECK: // %bb.0: -; CHECK-NEXT: frsqrte v2.4s, v0.4s -; CHECK-NEXT: frsqrte v3.4s, v1.4s -; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s -; CHECK-NEXT: frsqrts v4.4s, v0.4s, v4.4s +; CHECK-NEXT: frsqrte v3.4s, v0.4s +; CHECK-NEXT: fmov v2.4s, #1.00000000 +; CHECK-NEXT: frsqrte v4.4s, v1.4s ; CHECK-NEXT: fmul v5.4s, v3.4s, v3.4s -; CHECK-NEXT: frsqrts v5.4s, v1.4s, v5.4s -; CHECK-NEXT: fmul v2.4s, v2.4s, v4.4s -; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s -; CHECK-NEXT: frsqrts v0.4s, v0.4s, v4.4s +; CHECK-NEXT: frsqrts v5.4s, v0.4s, v5.4s +; CHECK-NEXT: fmul v6.4s, v4.4s, v4.4s +; CHECK-NEXT: frsqrts v6.4s, v1.4s, v6.4s +; 
CHECK-NEXT: fmul v3.4s, v3.4s, v5.4s +; CHECK-NEXT: fmul v5.4s, v3.4s, v3.4s +; CHECK-NEXT: fmul v3.4s, v0.4s, v3.4s +; CHECK-NEXT: frsqrts v5.4s, v0.4s, v5.4s +; CHECK-NEXT: fmul v4.4s, v4.4s, v6.4s +; CHECK-NEXT: fmul v6.4s, v4.4s, v4.4s +; CHECK-NEXT: frsqrts v6.4s, v1.4s, v6.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v5.4s -; CHECK-NEXT: fmul v4.4s, v3.4s, v3.4s -; CHECK-NEXT: frsqrts v1.4s, v1.4s, v4.4s -; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s +; CHECK-NEXT: fcmeq v5.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v3.16b, v5.16b +; CHECK-NEXT: fmul v3.4s, v1.4s, v4.4s +; CHECK-NEXT: fcmeq v4.4s, v1.4s, #0.0 +; CHECK-NEXT: fdiv v0.4s, v2.4s, v0.4s +; CHECK-NEXT: fmul v3.4s, v3.4s, v6.4s +; CHECK-NEXT: bif v1.16b, v3.16b, v4.16b +; CHECK-NEXT: fdiv v1.4s, v2.4s, v1.4s ; CHECK-NEXT: ret %1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a) %2 = fdiv fast <8 x float> , %1 @@ -366,6 +390,7 @@ ; CHECK-LABEL: drsqrt: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte d1, d0 +; CHECK-NEXT: fcmp d0, #0.0 ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 @@ -373,8 +398,12 @@ ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 ; CHECK-NEXT: fmul d2, d1, d1 -; CHECK-NEXT: frsqrts d0, d0, d2 -; CHECK-NEXT: fmul d0, d1, d0 +; CHECK-NEXT: fmul d1, d0, d1 +; CHECK-NEXT: frsqrts d2, d0, d2 +; CHECK-NEXT: fmul d1, d1, d2 +; CHECK-NEXT: fcsel d0, d0, d1, eq +; CHECK-NEXT: fmov d1, #1.00000000 +; CHECK-NEXT: fdiv d0, d1, d0 ; CHECK-NEXT: ret %1 = tail call fast double @llvm.sqrt.f64(double %a) %2 = fdiv fast double 1.000000e+00, %1 @@ -391,16 +420,21 @@ ; ; CHECK-LABEL: d2rsqrt: ; CHECK: // %bb.0: -; CHECK-NEXT: frsqrte v1.2d, v0.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d -; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d -; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d -; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d -; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d -; CHECK-NEXT: frsqrts v0.2d, v0.2d, v2.2d -; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d +; CHECK-NEXT: frsqrte v2.2d, v0.2d +; CHECK-NEXT: fmov v1.2d, #1.00000000 +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d +; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d +; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d +; CHECK-NEXT: fmul v2.2d, v0.2d, v2.2d +; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fcmeq v3.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: fdiv v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) %2 = fdiv fast <2 x double> , %1 @@ -419,26 +453,35 @@ ; ; CHECK-LABEL: d4rsqrt: ; CHECK: // %bb.0: -; CHECK-NEXT: frsqrte v2.2d, v0.2d -; CHECK-NEXT: frsqrte v3.2d, v1.2d -; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d -; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d +; CHECK-NEXT: frsqrte v3.2d, v0.2d +; CHECK-NEXT: fmov v2.2d, #1.00000000 +; CHECK-NEXT: frsqrte v4.2d, v1.2d ; CHECK-NEXT: fmul v5.2d, v3.2d, v3.2d -; CHECK-NEXT: frsqrts v5.2d, v1.2d, v5.2d -; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d -; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d -; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d +; CHECK-NEXT: frsqrts v5.2d, v0.2d, v5.2d +; CHECK-NEXT: fmul v6.2d, v4.2d, v4.2d +; CHECK-NEXT: frsqrts v6.2d, v1.2d, v6.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d ; CHECK-NEXT: fmul v5.2d, 
v3.2d, v3.2d -; CHECK-NEXT: frsqrts v5.2d, v1.2d, v5.2d -; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d -; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d -; CHECK-NEXT: frsqrts v0.2d, v0.2d, v4.2d +; CHECK-NEXT: frsqrts v5.2d, v0.2d, v5.2d +; CHECK-NEXT: fmul v4.2d, v4.2d, v6.2d +; CHECK-NEXT: fmul v6.2d, v4.2d, v4.2d +; CHECK-NEXT: frsqrts v6.2d, v1.2d, v6.2d +; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d +; CHECK-NEXT: fmul v5.2d, v3.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v0.2d, v3.2d +; CHECK-NEXT: frsqrts v5.2d, v0.2d, v5.2d +; CHECK-NEXT: fmul v4.2d, v4.2d, v6.2d +; CHECK-NEXT: fmul v6.2d, v4.2d, v4.2d +; CHECK-NEXT: frsqrts v6.2d, v1.2d, v6.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d -; CHECK-NEXT: fmul v4.2d, v3.2d, v3.2d -; CHECK-NEXT: frsqrts v1.2d, v1.2d, v4.2d -; CHECK-NEXT: fmul v0.2d, v2.2d, v0.2d -; CHECK-NEXT: fmul v1.2d, v3.2d, v1.2d +; CHECK-NEXT: fcmeq v5.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v3.16b, v5.16b +; CHECK-NEXT: fmul v3.2d, v1.2d, v4.2d +; CHECK-NEXT: fcmeq v4.2d, v1.2d, #0.0 +; CHECK-NEXT: fdiv v0.2d, v2.2d, v0.2d +; CHECK-NEXT: fmul v3.2d, v3.2d, v6.2d +; CHECK-NEXT: bif v1.16b, v3.16b, v4.16b +; CHECK-NEXT: fdiv v1.2d, v2.2d, v1.2d ; CHECK-NEXT: ret %1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) %2 = fdiv fast <4 x double> , %1 @@ -454,6 +497,7 @@ ; CHECK-LABEL: sqrt_fdiv_common_operand: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte d1, d0 +; CHECK-NEXT: fcmp d0, #0.0 ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 @@ -463,7 +507,9 @@ ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 -; CHECK-NEXT: fmul d0, d0, d1 +; CHECK-NEXT: fmul d1, d0, d1 +; CHECK-NEXT: fcsel d1, d0, d1, eq +; CHECK-NEXT: fdiv d0, d0, d1 ; CHECK-NEXT: ret %sqrt = call fast double @llvm.sqrt.f64(double %x) %r = fdiv fast double %x, %sqrt @@ -516,9 +562,9 @@ ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 ; CHECK-NEXT: fmul d1, d0, d1 -; CHECK-NEXT: fcsel d2, d0, d1, eq -; CHECK-NEXT: fmov d0, d1 -; CHECK-NEXT: str d2, [x0] +; CHECK-NEXT: fcsel d1, d0, d1, eq +; CHECK-NEXT: fdiv d0, d0, d1 +; CHECK-NEXT: str d1, [x0] ; CHECK-NEXT: ret %sqrt = call fast double @llvm.sqrt.f64(double %x) store double %sqrt, ptr %p @@ -530,7 +576,7 @@ ; FAULT-LABEL: sqrt_simplify_before_recip_3_uses: ; FAULT: // %bb.0: ; FAULT-NEXT: fsqrt d0, d0 -; FAULT-NEXT: mov x8, #4631107791820423168 +; FAULT-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; FAULT-NEXT: fmov d1, #1.00000000 ; FAULT-NEXT: fmov d2, x8 ; FAULT-NEXT: fdiv d1, d1, d0 @@ -542,17 +588,22 @@ ; CHECK-LABEL: sqrt_simplify_before_recip_3_uses: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte d1, d0 -; CHECK-NEXT: mov x8, #4631107791820423168 +; CHECK-NEXT: fcmp d0, #0.0 +; CHECK-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 +; CHECK-NEXT: fmul d2, d1, d1 +; CHECK-NEXT: frsqrts d2, d0, d2 +; CHECK-NEXT: fmul d1, d1, d2 +; CHECK-NEXT: fmul d2, d1, d1 +; CHECK-NEXT: frsqrts d2, d0, d2 +; CHECK-NEXT: fmul d1, d1, d2 +; CHECK-NEXT: fmul d2, d1, d1 +; CHECK-NEXT: frsqrts d2, d0, d2 +; CHECK-NEXT: fmul d1, d1, d2 +; CHECK-NEXT: fmov d2, #1.00000000 +; CHECK-NEXT: fmul d1, d0, d1 +; CHECK-NEXT: fcsel d1, d0, d1, eq +; CHECK-NEXT: fdiv d1, d2, d1 ; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: fmul d3, d1, d1 -; CHECK-NEXT: frsqrts d3, d0, d3 -; CHECK-NEXT: fmul d1, d1, d3 -; CHECK-NEXT: fmul d3, d1, d1 -; CHECK-NEXT: frsqrts d3, d0, d3 -; CHECK-NEXT: fmul d1, d1, d3 -; CHECK-NEXT: fmul d3, d1, d1 -; CHECK-NEXT: frsqrts d3, d0, d3 -; CHECK-NEXT: fmul d1, d1, 
d3 ; CHECK-NEXT: fmul d0, d0, d1 ; CHECK-NEXT: fmul d2, d1, d2 ; CHECK-NEXT: str d1, [x0] @@ -571,9 +622,9 @@ ; FAULT-LABEL: sqrt_simplify_before_recip_3_uses_order: ; FAULT: // %bb.0: ; FAULT-NEXT: fsqrt d0, d0 -; FAULT-NEXT: mov x8, #4631107791820423168 +; FAULT-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; FAULT-NEXT: fmov d1, x8 -; FAULT-NEXT: mov x8, #140737488355328 +; FAULT-NEXT: mov x8, #140737488355328 // =0x800000000000 ; FAULT-NEXT: movk x8, #16453, lsl #48 ; FAULT-NEXT: fmov d2, x8 ; FAULT-NEXT: fdiv d1, d1, d0 @@ -585,8 +636,9 @@ ; CHECK-LABEL: sqrt_simplify_before_recip_3_uses_order: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte d1, d0 -; CHECK-NEXT: mov x9, #140737488355328 -; CHECK-NEXT: mov x8, #4631107791820423168 +; CHECK-NEXT: fcmp d0, #0.0 +; CHECK-NEXT: mov x9, #140737488355328 // =0x800000000000 +; CHECK-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-NEXT: movk x9, #16453, lsl #48 ; CHECK-NEXT: fmov d3, x9 ; CHECK-NEXT: fmul d2, d1, d1 @@ -598,6 +650,10 @@ ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 +; CHECK-NEXT: fmov d2, #1.00000000 +; CHECK-NEXT: fmul d1, d0, d1 +; CHECK-NEXT: fcsel d1, d0, d1, eq +; CHECK-NEXT: fdiv d1, d2, d1 ; CHECK-NEXT: fmov d2, x8 ; CHECK-NEXT: fmul d0, d0, d1 ; CHECK-NEXT: fmul d2, d1, d2 @@ -620,8 +676,8 @@ ; FAULT: // %bb.0: ; FAULT-NEXT: fsqrt d0, d0 ; FAULT-NEXT: fmov d1, #1.00000000 -; FAULT-NEXT: mov x9, #140737488355328 -; FAULT-NEXT: mov x8, #4631107791820423168 +; FAULT-NEXT: mov x9, #140737488355328 // =0x800000000000 +; FAULT-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; FAULT-NEXT: movk x9, #16453, lsl #48 ; FAULT-NEXT: fmov d2, x8 ; FAULT-NEXT: fmov d3, x9 @@ -637,8 +693,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte d1, d0 ; CHECK-NEXT: fcmp d0, #0.0 -; CHECK-NEXT: mov x9, #140737488355328 -; CHECK-NEXT: mov x8, #4631107791820423168 +; CHECK-NEXT: mov x9, #140737488355328 // =0x800000000000 +; CHECK-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-NEXT: movk x9, #16453, lsl #48 ; CHECK-NEXT: fmov d3, x9 ; CHECK-NEXT: fmul d2, d1, d1 @@ -650,13 +706,15 @@ ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 -; CHECK-NEXT: fmul d2, d0, d1 -; CHECK-NEXT: fmul d3, d1, d3 -; CHECK-NEXT: str d1, [x0] -; CHECK-NEXT: fcsel d2, d0, d2, eq -; CHECK-NEXT: fdiv d0, d0, d2 +; CHECK-NEXT: fmov d2, #1.00000000 +; CHECK-NEXT: fmul d1, d0, d1 +; CHECK-NEXT: fcsel d1, d0, d1, eq +; CHECK-NEXT: fdiv d1, d2, d1 ; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: fmul d0, d0, d1 ; CHECK-NEXT: fmul d2, d1, d2 +; CHECK-NEXT: fmul d3, d1, d3 +; CHECK-NEXT: str d1, [x0] ; CHECK-NEXT: str d2, [x1] ; CHECK-NEXT: str d3, [x2] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/srem-lkk.ll b/llvm/test/CodeGen/AArch64/srem-lkk.ll --- a/llvm/test/CodeGen/AArch64/srem-lkk.ll +++ b/llvm/test/CodeGen/AArch64/srem-lkk.ll @@ -4,14 +4,14 @@ define i32 @fold_srem_positive_odd(i32 %x) { ; CHECK-LABEL: fold_srem_positive_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: mov w8, #37253 // =0x9185 ; CHECK-NEXT: movk w8, #44150, lsl #16 ; CHECK-NEXT: smull x8, w0, w8 ; CHECK-NEXT: lsr x8, x8, #32 ; CHECK-NEXT: add w8, w8, w0 ; CHECK-NEXT: asr w9, w8, #6 ; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: mov w9, #95 // =0x5f ; CHECK-NEXT: msub w0, w8, w9, w0 ; CHECK-NEXT: ret %1 = srem i32 %x, 95 @@ -22,13 +22,13 @@ define i32 @fold_srem_positive_even(i32 %x) { ; CHECK-LABEL: 
fold_srem_positive_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #36849 +; CHECK-NEXT: mov w8, #36849 // =0x8ff1 ; CHECK-NEXT: movk w8, #15827, lsl #16 ; CHECK-NEXT: smull x8, w0, w8 ; CHECK-NEXT: lsr x9, x8, #63 ; CHECK-NEXT: asr x8, x8, #40 ; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #1060 +; CHECK-NEXT: mov w9, #1060 // =0x424 ; CHECK-NEXT: msub w0, w8, w9, w0 ; CHECK-NEXT: ret %1 = srem i32 %x, 1060 @@ -39,13 +39,13 @@ define i32 @fold_srem_negative_odd(i32 %x) { ; CHECK-LABEL: fold_srem_negative_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65445 +; CHECK-NEXT: mov w8, #65445 // =0xffa5 ; CHECK-NEXT: movk w8, #42330, lsl #16 ; CHECK-NEXT: smull x8, w0, w8 ; CHECK-NEXT: lsr x9, x8, #63 ; CHECK-NEXT: asr x8, x8, #40 ; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #-723 +; CHECK-NEXT: mov w9, #-723 // =0xfffffd2d ; CHECK-NEXT: msub w0, w8, w9, w0 ; CHECK-NEXT: ret %1 = srem i32 %x, -723 @@ -56,13 +56,13 @@ define i32 @fold_srem_negative_even(i32 %x) { ; CHECK-LABEL: fold_srem_negative_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #62439 +; CHECK-NEXT: mov w8, #62439 // =0xf3e7 ; CHECK-NEXT: movk w8, #64805, lsl #16 ; CHECK-NEXT: smull x8, w0, w8 ; CHECK-NEXT: lsr x9, x8, #63 ; CHECK-NEXT: asr x8, x8, #40 ; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #-22981 +; CHECK-NEXT: mov w9, #-22981 // =0xffffa63b ; CHECK-NEXT: msub w0, w8, w9, w0 ; CHECK-NEXT: ret %1 = srem i32 %x, -22981 @@ -74,14 +74,14 @@ define i32 @combine_srem_sdiv(i32 %x) { ; CHECK-LABEL: combine_srem_sdiv: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: mov w8, #37253 // =0x9185 ; CHECK-NEXT: movk w8, #44150, lsl #16 ; CHECK-NEXT: smull x8, w0, w8 ; CHECK-NEXT: lsr x8, x8, #32 ; CHECK-NEXT: add w8, w8, w0 ; CHECK-NEXT: asr w9, w8, #6 ; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: mov w9, #95 // =0x5f ; CHECK-NEXT: msub w9, w8, w9, w0 ; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret @@ -95,14 +95,14 @@ define i64 @dont_fold_srem_i64(i64 %x) { ; CHECK-LABEL: dont_fold_srem_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #58849 +; CHECK-NEXT: mov x8, #58849 // =0xe5e1 ; CHECK-NEXT: movk x8, #48148, lsl #16 ; CHECK-NEXT: movk x8, #33436, lsl #32 ; CHECK-NEXT: movk x8, #21399, lsl #48 ; CHECK-NEXT: smulh x8, x0, x8 ; CHECK-NEXT: asr x9, x8, #5 ; CHECK-NEXT: add x8, x9, x8, lsr #63 -; CHECK-NEXT: mov w9, #98 +; CHECK-NEXT: mov w9, #98 // =0x62 ; CHECK-NEXT: msub x0, x8, x9, x0 ; CHECK-NEXT: ret %1 = srem i64 %x, 98 diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll @@ -4,12 +4,12 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; CHECK-LABEL: test_srem_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #33099 -; CHECK-NEXT: mov w9, #24493 +; CHECK-NEXT: mov w8, #33099 // =0x814b +; CHECK-NEXT: mov w9, #24493 // =0x5fad ; CHECK-NEXT: movk w8, #8026, lsl #16 ; CHECK-NEXT: movk w9, #41, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #48987 +; CHECK-NEXT: mov w9, #48987 // =0xbf5b ; CHECK-NEXT: movk w9, #82, lsl #16 ; CHECK-NEXT: and w8, w8, #0x1fffffff ; CHECK-NEXT: cmp w8, w9 @@ -24,7 +24,7 @@ ; CHECK-LABEL: test_srem_even: ; CHECK: // %bb.0: ; CHECK-NEXT: sbfx w9, w0, #0, #4 -; CHECK-NEXT: mov w8, #6 +; CHECK-NEXT: mov w8, #6 // =0x6 ; CHECK-NEXT: add w9, w9, w9, lsl #1 ; CHECK-NEXT: ubfx w10, w9, #7, #1 ; CHECK-NEXT: add w9, w10, w9, lsr #4 @@ -57,10 +57,10 @@ 
define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; CHECK-LABEL: test_srem_vec: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #7282 +; CHECK-NEXT: mov x8, #7282 // =0x1c72 ; CHECK-NEXT: sbfx x9, x0, #0, #33 ; CHECK-NEXT: movk x8, #29127, lsl #16 -; CHECK-NEXT: mov x11, #7281 +; CHECK-NEXT: mov x11, #7281 // =0x1c71 ; CHECK-NEXT: movk x8, #50972, lsl #32 ; CHECK-NEXT: movk x11, #29127, lsl #16 ; CHECK-NEXT: movk x8, #7281, lsl #48 @@ -83,7 +83,7 @@ ; CHECK-NEXT: add x11, x11, x11, lsl #3 ; CHECK-NEXT: fmov d0, x9 ; CHECK-NEXT: add x10, x10, x11 -; CHECK-NEXT: mov x9, #8589934591 +; CHECK-NEXT: mov x9, #8589934591 // =0x1ffffffff ; CHECK-NEXT: adrp x11, .LCPI3_0 ; CHECK-NEXT: adrp x12, .LCPI3_1 ; CHECK-NEXT: mov v0.d[1], x8 diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll b/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll @@ -4,12 +4,12 @@ define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone { ; CHECK-LABEL: test_minsize: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #5 -; CHECK-NEXT: mov w9, #42 +; CHECK-NEXT: mov w8, #5 // =0x5 +; CHECK-NEXT: mov w9, #42 // =0x2a ; CHECK-NEXT: sdiv w8, w0, w8 ; CHECK-NEXT: add w8, w8, w8, lsl #2 ; CHECK-NEXT: cmp w0, w8 -; CHECK-NEXT: mov w8, #-10 +; CHECK-NEXT: mov w8, #-10 // =0xfffffff6 ; CHECK-NEXT: csel w0, w9, w8, eq ; CHECK-NEXT: ret %rem = srem i32 %X, 5 @@ -21,16 +21,17 @@ define i32 @test_optsize(i32 %X) optsize nounwind readnone { ; CHECK-LABEL: test_optsize: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #858993459 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: mov w8, #-10 -; CHECK-NEXT: mov w9, #42 -; CHECK-NEXT: csel w0, w9, w8, lo +; CHECK-NEXT: mov w8, #26215 // =0x6667 +; CHECK-NEXT: movk w8, #26214, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #42 // =0x2a +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: mov w8, #-10 // =0xfffffff6 +; CHECK-NEXT: csel w0, w9, w8, eq ; CHECK-NEXT: ret %rem = srem i32 %X, 5 %cmp = icmp eq i32 %rem, 0 diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll @@ -6,22 +6,23 @@ ; CHECK-LABEL: test_srem_odd_even: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 -; CHECK-NEXT: adrp x9, .LCPI0_1 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: mov v1.s[1], v0.s[1] +; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_1] ; CHECK-NEXT: adrp x8, .LCPI0_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1] -; CHECK-NEXT: adrp x9, .LCPI0_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI0_3] -; CHECK-NEXT: adrp x8, .LCPI0_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_4] -; CHECK-NEXT: orr v0.16b, v1.16b, 
v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -35,17 +36,25 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_allones_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: adrp x8, .LCPI1_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] +; CHECK-NEXT: adrp x8, .LCPI1_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_2] +; CHECK-NEXT: adrp x8, .LCPI1_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, @@ -56,18 +65,26 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_allones_ne: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: adrp x8, .LCPI2_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: adrp x8, .LCPI2_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_2] +; CHECK-NEXT: adrp x8, .LCPI2_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: cmhi v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, @@ -79,21 +96,26 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; 
CHECK-NEXT: shl v0.4s, v2.4s, #31 -; CHECK-NEXT: ushr v1.4s, v2.4s, #1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: adrp x8, .LCPI3_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -103,21 +125,26 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_ne: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: shl v0.4s, v2.4s, #31 -; CHECK-NEXT: ushr v1.4s, v2.4s, #1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: adrp x8, .LCPI4_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, @@ -130,22 +157,25 @@ ; CHECK-LABEL: test_srem_odd_even_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI5_0 -; CHECK-NEXT: adrp x9, .LCPI5_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: adrp x8, .LCPI5_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1] ; CHECK-NEXT: adrp x8, .LCPI5_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_1] -; CHECK-NEXT: adrp x9, .LCPI5_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI5_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI5_3] -; CHECK-NEXT: adrp x8, .LCPI5_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr 
q3, [x8, :lo12:.LCPI5_2] +; CHECK-NEXT: adrp x8, .LCPI5_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -156,22 +186,25 @@ ; CHECK-LABEL: test_srem_odd_even_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI6_0 -; CHECK-NEXT: adrp x9, .LCPI6_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: adrp x8, .LCPI6_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1] ; CHECK-NEXT: adrp x8, .LCPI6_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_1] -; CHECK-NEXT: adrp x9, .LCPI6_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI6_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI6_3] -; CHECK-NEXT: adrp x8, .LCPI6_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2] +; CHECK-NEXT: adrp x8, .LCPI6_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, @@ -186,22 +219,23 @@ ; CHECK-LABEL: test_srem_odd_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI7_0 -; CHECK-NEXT: adrp x9, .LCPI7_1 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: adrp x8, .LCPI7_1 +; CHECK-NEXT: mov v1.s[2], v0.s[2] +; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_1] ; CHECK-NEXT: adrp x8, .LCPI7_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI7_1] -; CHECK-NEXT: adrp x9, .LCPI7_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI7_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI7_3] -; CHECK-NEXT: adrp x8, .LCPI7_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_2] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -214,22 +248,19 @@ ; CHECK-LABEL: test_srem_even_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI8_0 -; CHECK-NEXT: adrp x9, .LCPI8_1 -; 
CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: adrp x8, .LCPI8_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_1] -; CHECK-NEXT: adrp x9, .LCPI8_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI8_3] -; CHECK-NEXT: adrp x8, .LCPI8_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: adrp x8, .LCPI8_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #3 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_1] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -242,22 +273,24 @@ ; CHECK-LABEL: test_srem_odd_even_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: adrp x9, .LCPI9_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: adrp x8, .LCPI9_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1] ; CHECK-NEXT: adrp x8, .LCPI9_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI9_1] -; CHECK-NEXT: adrp x9, .LCPI9_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI9_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI9_3] -; CHECK-NEXT: adrp x8, .LCPI9_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v2.16b, v0.16b, v2.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2] +; CHECK-NEXT: adrp x8, .LCPI9_3 +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_3] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -271,17 +304,25 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: adrp x8, .LCPI10_1 +; CHECK-NEXT: mov v1.s[2], v0.s[2] +; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI10_1] +; CHECK-NEXT: adrp x8, .LCPI10_2 +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI10_2] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add 
v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, @@ -294,21 +335,24 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: shl v0.4s, v2.4s, #31 -; CHECK-NEXT: ushr v1.4s, v2.4s, #1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] +; CHECK-NEXT: adrp x8, .LCPI11_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_1] +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: adrp x8, .LCPI11_2 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -321,22 +365,26 @@ ; CHECK-LABEL: test_srem_odd_even_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI12_0 -; CHECK-NEXT: adrp x9, .LCPI12_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: adrp x8, .LCPI12_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1] ; CHECK-NEXT: adrp x8, .LCPI12_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI12_1] -; CHECK-NEXT: adrp x9, .LCPI12_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI12_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI12_3] -; CHECK-NEXT: adrp x8, .LCPI12_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v2.16b, v0.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2] +; CHECK-NEXT: adrp x8, .LCPI12_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -440,22 +488,25 @@ ; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: adrp x9, .LCPI16_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] +; CHECK-NEXT: adrp x8, .LCPI16_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; 
CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] ; CHECK-NEXT: adrp x8, .LCPI16_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI16_1] -; CHECK-NEXT: adrp x9, .LCPI16_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI16_3] -; CHECK-NEXT: adrp x8, .LCPI16_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2] +; CHECK-NEXT: adrp x8, .LCPI16_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_3] +; CHECK-NEXT: mov v2.s[1], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -468,22 +519,25 @@ ; CHECK-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI17_0 -; CHECK-NEXT: adrp x9, .LCPI17_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: adrp x8, .LCPI17_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1] ; CHECK-NEXT: adrp x8, .LCPI17_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI17_1] -; CHECK-NEXT: adrp x9, .LCPI17_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI17_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI17_3] -; CHECK-NEXT: adrp x8, .LCPI17_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2] +; CHECK-NEXT: adrp x8, .LCPI17_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_3] +; CHECK-NEXT: mov v2.s[1], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -496,22 +550,25 @@ ; CHECK-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI18_0 -; CHECK-NEXT: adrp x9, .LCPI18_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: adrp x8, .LCPI18_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] ; CHECK-NEXT: adrp x8, .LCPI18_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_1] -; CHECK-NEXT: adrp x9, .LCPI18_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI18_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI18_3] -; CHECK-NEXT: adrp x8, .LCPI18_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_4] -; CHECK-NEXT: orr v0.16b, 
v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2] +; CHECK-NEXT: adrp x8, .LCPI18_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_3] +; CHECK-NEXT: mov v2.s[1], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -525,17 +582,25 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_allones_and_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: adrp x8, .LCPI19_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_1] +; CHECK-NEXT: adrp x8, .LCPI19_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_2] +; CHECK-NEXT: adrp x8, .LCPI19_3 +; CHECK-NEXT: ushr v3.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: and v2.16b, v3.16b, v2.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI19_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, @@ -548,21 +613,28 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_and_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI20_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: shl v0.4s, v2.4s, #31 -; CHECK-NEXT: ushr v1.4s, v2.4s, #1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: adrp x9, .LCPI20_3 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] +; CHECK-NEXT: adrp x8, .LCPI20_1 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI20_3] +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1] +; CHECK-NEXT: adrp x8, .LCPI20_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_2] +; CHECK-NEXT: adrp x8, .LCPI20_4 +; CHECK-NEXT: ushr v4.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: and v2.16b, v4.16b, v3.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_4] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ 
-575,22 +647,27 @@ ; CHECK-LABEL: test_srem_odd_even_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI21_0 -; CHECK-NEXT: adrp x9, .LCPI21_1 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI21_3 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: adrp x8, .LCPI21_1 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI21_3] +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1] ; CHECK-NEXT: adrp x8, .LCPI21_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI21_1] -; CHECK-NEXT: adrp x9, .LCPI21_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI21_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI21_3] +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_2] ; CHECK-NEXT: adrp x8, .LCPI21_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ushr v4.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: and v2.16b, v4.16b, v3.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_4] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -605,22 +682,26 @@ ; CHECK-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI22_0 -; CHECK-NEXT: adrp x9, .LCPI22_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] +; CHECK-NEXT: adrp x8, .LCPI22_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1] ; CHECK-NEXT: adrp x8, .LCPI22_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI22_1] -; CHECK-NEXT: adrp x9, .LCPI22_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI22_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI22_3] -; CHECK-NEXT: adrp x8, .LCPI22_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v2.16b, v0.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2] +; CHECK-NEXT: adrp x8, .LCPI22_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -633,22 +714,23 @@ ; CHECK-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI23_0 -; CHECK-NEXT: adrp x9, .LCPI23_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] +; CHECK-NEXT: adrp x8, .LCPI23_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_1] +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: adrp x8, .LCPI23_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI23_1] 
-; CHECK-NEXT: adrp x9, .LCPI23_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI23_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI23_3] -; CHECK-NEXT: adrp x8, .LCPI23_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -661,22 +743,26 @@ ; CHECK-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI24_0 -; CHECK-NEXT: adrp x9, .LCPI24_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] +; CHECK-NEXT: adrp x8, .LCPI24_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_1] ; CHECK-NEXT: adrp x8, .LCPI24_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI24_1] -; CHECK-NEXT: adrp x9, .LCPI24_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI24_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI24_3] -; CHECK-NEXT: adrp x8, .LCPI24_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v2.16b, v0.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2] +; CHECK-NEXT: adrp x8, .LCPI24_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -690,22 +776,26 @@ ; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI25_0 -; CHECK-NEXT: adrp x9, .LCPI25_1 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: movi v3.2d, #0x000000ffffffff ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0] +; CHECK-NEXT: adrp x8, .LCPI25_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_1] ; CHECK-NEXT: adrp x8, .LCPI25_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI25_1] -; CHECK-NEXT: adrp x9, .LCPI25_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI25_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI25_3] -; CHECK-NEXT: adrp x8, .LCPI25_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: 
ldr q2, [x8, :lo12:.LCPI25_2] +; CHECK-NEXT: adrp x8, .LCPI25_3 +; CHECK-NEXT: ushr v4.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: and v2.16b, v4.16b, v3.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI25_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -717,22 +807,26 @@ ; CHECK-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI26_0 -; CHECK-NEXT: adrp x9, .LCPI26_1 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: movi v3.2d, #0x000000ffffffff ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0] +; CHECK-NEXT: adrp x8, .LCPI26_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1] ; CHECK-NEXT: adrp x8, .LCPI26_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI26_1] -; CHECK-NEXT: adrp x9, .LCPI26_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI26_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI26_3] -; CHECK-NEXT: adrp x8, .LCPI26_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_2] +; CHECK-NEXT: adrp x8, .LCPI26_3 +; CHECK-NEXT: ushr v4.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: and v2.16b, v4.16b, v3.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI26_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -5,18 +5,18 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_25: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47185 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, lsl #16 +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: movi v3.4s, #25 ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 -; CHECK-NEXT: mov w8, #28834 -; CHECK-NEXT: movk w8, #2621, lsl #16 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #3 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: dup v0.4s, w8 -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, @@ -29,22 +29,19 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_100: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47184 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, 
lsl #16 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: movi v3.4s, #100 ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 -; CHECK-NEXT: mov w8, #23592 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: movk w8, #655, lsl #16 -; CHECK-NEXT: shl v0.4s, v2.4s, #30 -; CHECK-NEXT: ushr v1.4s, v2.4s, #2 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #5 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -58,18 +55,18 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_neg25: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47185 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 -; CHECK-NEXT: mov w8, #28834 -; CHECK-NEXT: movk w8, #2621, lsl #16 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: adrp x8, .LCPI2_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #3 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: dup v0.4s, w8 -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, @@ -82,22 +79,19 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_neg100: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47184 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 -; CHECK-NEXT: mov w8, #23592 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: movk w8, #655, lsl #16 -; CHECK-NEXT: shl v0.4s, v2.4s, #30 -; CHECK-NEXT: ushr v1.4s, v2.4s, #2 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #5 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -112,7 +106,7 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_undef1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: mov w8, #34079 // =0x851f ; CHECK-NEXT: movk w8, #20971, lsl #16 ; 
CHECK-NEXT: movi v3.4s, #25 ; CHECK-NEXT: dup v1.4s, w8 @@ -135,7 +129,7 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_undef1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: mov w8, #34079 // =0x851f ; CHECK-NEXT: movk w8, #20971, lsl #16 ; CHECK-NEXT: movi v3.4s, #100 ; CHECK-NEXT: dup v1.4s, w8 diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -8,14 +8,15 @@ define i32 @test_srem_odd(i32 %X) nounwind { ; CHECK-LABEL: test_srem_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #858993459 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #26215 // =0x6667 +; CHECK-NEXT: movk w8, #26214, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 5 %cmp = icmp eq i32 %srem, 0 @@ -26,15 +27,16 @@ define i32 @test_srem_odd_25(i32 %X) nounwind { ; CHECK-LABEL: test_srem_odd_25: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47185 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #28835 -; CHECK-NEXT: movk w9, #2621, lsl #16 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #35 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #25 // =0x19 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 25 %cmp = icmp eq i32 %srem, 0 @@ -46,12 +48,18 @@ define i32 @test_srem_odd_bit30(i32 %X) nounwind { ; CHECK-LABEL: test_srem_odd_bit30: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: mov w9, #1 -; CHECK-NEXT: movk w8, #27306, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: cmp w8, #3 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: sbfiz x10, x0, #29, #32 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov w8, #3 // =0x3 +; CHECK-NEXT: lsr x10, x9, #63 +; CHECK-NEXT: asr x9, x9, #59 +; CHECK-NEXT: movk w8, #16384, lsl #16 +; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: msub w8, w9, w8, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 1073741827 %cmp = icmp eq i32 %srem, 0 @@ -63,12 +71,17 @@ define i32 @test_srem_odd_bit31(i32 %X) nounwind { ; CHECK-LABEL: test_srem_odd_bit31: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #21845 -; CHECK-NEXT: mov w9, #1 -; CHECK-NEXT: movk w8, #54613, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: cmp w8, #3 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: mov w8, #-2147483645 // =0x80000003 +; CHECK-NEXT: add x9, x9, x9, lsl #29 +; CHECK-NEXT: neg x9, x9 +; CHECK-NEXT: lsr x10, x9, #63 +; CHECK-NEXT: asr x9, x9, #60 +; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: msub w8, w9, w8, w0 +; CHECK-NEXT: cmp w8, 
#0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 2147483651 %cmp = icmp eq i32 %srem, 0 @@ -83,13 +96,15 @@ define i16 @test_srem_even(i16 %X) nounwind { ; CHECK-LABEL: test_srem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #4680 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: lsl w10, w8, #15 -; CHECK-NEXT: bfxil w10, w8, #1, #15 -; CHECK-NEXT: cmp w9, w10, uxth -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #18725 // =0x4925 +; CHECK-NEXT: sxth w9, w0 +; CHECK-NEXT: mul w8, w9, w8 +; CHECK-NEXT: asr w9, w8, #18 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #14 // =0xe +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: tst w8, #0xffff +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %srem = srem i16 %X, 14 %cmp = icmp ne i16 %srem, 0 @@ -100,16 +115,16 @@ define i32 @test_srem_even_100(i32 %X) nounwind { ; CHECK-LABEL: test_srem_even_100: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47184 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #23593 -; CHECK-NEXT: movk w9, #655, lsl #16 -; CHECK-NEXT: ror w8, w8, #2 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #37 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #100 // =0x64 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 100 %cmp = icmp eq i32 %srem, 0 @@ -121,13 +136,17 @@ define i32 @test_srem_even_bit30(i32 %X) nounwind { ; CHECK-LABEL: test_srem_even_bit30: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #20165 -; CHECK-NEXT: mov w9, #8 -; CHECK-NEXT: movk w8, #64748, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: ror w8, w8, #3 -; CHECK-NEXT: cmp w8, #3 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #65433 // =0xff99 +; CHECK-NEXT: movk w8, #16383, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #60 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #104 // =0x68 +; CHECK-NEXT: movk w9, #16384, lsl #16 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 1073741928 %cmp = icmp eq i32 %srem, 0 @@ -139,13 +158,18 @@ define i32 @test_srem_even_bit31(i32 %X) nounwind { ; CHECK-LABEL: test_srem_even_bit31: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #1285 -; CHECK-NEXT: mov w9, #2 -; CHECK-NEXT: movk w8, #50437, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: cmp w8, #3 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #65433 // =0xff99 +; CHECK-NEXT: movk w8, #32767, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: sub w8, w8, w0 +; CHECK-NEXT: asr w9, w8, #30 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #102 // =0x66 +; CHECK-NEXT: movk w9, #32768, lsl #16 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 2147483750 %cmp = icmp eq i32 %srem, 0 @@ -161,15 +185,15 @@ define i32 @test_srem_odd_setne(i32 %X) nounwind { ; CHECK-LABEL: test_srem_odd_setne: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 
-; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #13106 -; CHECK-NEXT: movk w9, #13107, lsl #16 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mov w8, #26215 // =0x6667 +; CHECK-NEXT: movk w8, #26214, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %srem = srem i32 %X, 5 %cmp = icmp ne i32 %srem, 0 @@ -181,15 +205,14 @@ define i32 @test_srem_negative_odd(i32 %X) nounwind { ; CHECK-LABEL: test_srem_negative_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #13106 -; CHECK-NEXT: movk w9, #13107, lsl #16 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mov w8, #-1717986919 // =0x99999999 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmn w0, w8 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %srem = srem i32 %X, -5 %cmp = icmp ne i32 %srem, 0 @@ -199,14 +222,17 @@ define i32 @test_srem_negative_even(i32 %X) nounwind { ; CHECK-LABEL: test_srem_negative_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mov w8, #56173 // =0xdb6d +; CHECK-NEXT: movk w8, #28086, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: sub w8, w8, w0 +; CHECK-NEXT: asr w9, w8, #3 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #-14 // =0xfffffff2 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %srem = srem i32 %X, -14 %cmp = icmp ne i32 %srem, 0 @@ -222,7 +248,7 @@ define i32 @test_srem_one(i32 %X) nounwind { ; CHECK-LABEL: test_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret %srem = srem i32 %X, 1 %cmp = icmp eq i32 %srem, 0 @@ -268,7 +294,7 @@ define i32 @test_srem_allones(i32 %X) nounwind { ; CHECK-LABEL: test_srem_allones: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret %srem = srem i32 %X, 4294967295 %cmp = icmp eq i32 %srem, 0 diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll --- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -7,12 +7,12 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w8, v0.h[1] ; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: mov w10, #63421 -; CHECK-NEXT: mov w11, #37253 +; CHECK-NEXT: mov w10, #63421 // =0xf7bd +; CHECK-NEXT: mov w11, #37253 // =0x9185 ; CHECK-NEXT: movk w10, #31710, lsl #16 ; CHECK-NEXT: movk w11, #44150, lsl #16 ; CHECK-NEXT: smov w13, v0.h[2] -; CHECK-NEXT: mov w12, #33437 +; CHECK-NEXT: mov w12, #33437 // =0x829d ; CHECK-NEXT: smull x10, w8, w10 ; CHECK-NEXT: movk w12, #21399, lsl #16 ; CHECK-NEXT: smull x11, w9, w11 @@ -24,8 +24,8 @@ ; CHECK-NEXT: asr w15, w11, #6 ; CHECK-NEXT: add w10, w14, w10, lsr #31 ; CHECK-NEXT: add w11, w15, w11, lsr #31 -; CHECK-NEXT: mov 
w14, #95 -; CHECK-NEXT: mov w15, #-124 +; CHECK-NEXT: mov w14, #95 // =0x5f +; CHECK-NEXT: mov w15, #-124 // =0xffffff84 ; CHECK-NEXT: smull x12, w13, w12 ; CHECK-NEXT: msub w9, w11, w14, w9 ; CHECK-NEXT: msub w8, w10, w15, w8 @@ -33,9 +33,9 @@ ; CHECK-NEXT: asr x11, x12, #37 ; CHECK-NEXT: smov w12, v0.h[3] ; CHECK-NEXT: add w10, w11, w10 -; CHECK-NEXT: mov w11, #98 +; CHECK-NEXT: mov w11, #98 // =0x62 ; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov w9, #63249 +; CHECK-NEXT: mov w9, #63249 // =0xf711 ; CHECK-NEXT: movk w9, #48808, lsl #16 ; CHECK-NEXT: msub w10, w10, w11, w13 ; CHECK-NEXT: smull x9, w12, w9 @@ -43,7 +43,7 @@ ; CHECK-NEXT: lsr x8, x9, #63 ; CHECK-NEXT: asr x9, x9, #40 ; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: mov w9, #-1003 +; CHECK-NEXT: mov w9, #-1003 // =0xfffffc15 ; CHECK-NEXT: mov v0.h[2], w10 ; CHECK-NEXT: msub w8, w8, w9, w12 ; CHECK-NEXT: mov v0.h[3], w8 @@ -58,11 +58,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: mov w8, #37253 // =0x9185 ; CHECK-NEXT: movk w8, #44150, lsl #16 ; CHECK-NEXT: smov w10, v0.h[1] ; CHECK-NEXT: smov w14, v0.h[2] -; CHECK-NEXT: mov w12, #95 +; CHECK-NEXT: mov w12, #95 // =0x5f ; CHECK-NEXT: smull x11, w9, w8 ; CHECK-NEXT: smull x13, w10, w8 ; CHECK-NEXT: lsr x11, x11, #32 @@ -105,12 +105,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: mov w8, #37253 // =0x9185 ; CHECK-NEXT: movk w8, #44150, lsl #16 ; CHECK-NEXT: smov w10, v0.h[1] ; CHECK-NEXT: smov w11, v0.h[2] ; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: mov w14, #95 +; CHECK-NEXT: mov w14, #95 // =0x5f ; CHECK-NEXT: smull x13, w9, w8 ; CHECK-NEXT: smull x15, w10, w8 ; CHECK-NEXT: lsr x13, x13, #32 @@ -158,7 +158,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w9, v0.h[1] ; CHECK-NEXT: smov w10, v0.h[0] -; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: mov w8, #37253 // =0x9185 ; CHECK-NEXT: smov w12, v0.h[2] ; CHECK-NEXT: movk w8, #44150, lsl #16 ; CHECK-NEXT: negs w11, w9 @@ -181,7 +181,7 @@ ; CHECK-NEXT: csneg w9, w9, w10, mi ; CHECK-NEXT: asr w10, w8, #6 ; CHECK-NEXT: add w8, w10, w8, lsr #31 -; CHECK-NEXT: mov w10, #95 +; CHECK-NEXT: mov w10, #95 // =0x5f ; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: msub w8, w8, w10, w11 ; CHECK-NEXT: mov v0.h[3], w8 @@ -197,11 +197,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: mov w9, #30865 +; CHECK-NEXT: mov w9, #30865 // =0x7891 ; CHECK-NEXT: movk w9, #51306, lsl #16 ; CHECK-NEXT: smov w10, v0.h[2] -; CHECK-NEXT: mov w11, #17097 -; CHECK-NEXT: mov w12, #654 +; CHECK-NEXT: mov w11, #17097 // =0x42c9 +; CHECK-NEXT: mov w12, #654 // =0x28e ; CHECK-NEXT: movk w11, #45590, lsl #16 ; CHECK-NEXT: smull x9, w8, w9 ; CHECK-NEXT: smull x11, w10, w11 @@ -211,13 +211,13 @@ ; CHECK-NEXT: asr w13, w9, #9 ; CHECK-NEXT: add w11, w11, w10 ; CHECK-NEXT: add w9, w13, w9, lsr #31 -; CHECK-NEXT: mov w13, #23 +; CHECK-NEXT: mov w13, #23 // =0x17 ; CHECK-NEXT: msub w8, w9, w12, w8 ; CHECK-NEXT: asr w9, w11, #4 ; CHECK-NEXT: smov w12, v0.h[3] ; CHECK-NEXT: add w9, w9, w11, lsr #31 ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov w11, #47143 +; CHECK-NEXT: mov w11, #47143 // =0xb827 ; CHECK-NEXT: movk w11, #24749, lsl #16 ; CHECK-NEXT: msub w9, w9, w13, w10 ; CHECK-NEXT: smull x10, w12, w11 @@ -225,7 +225,7 @@ ; CHECK-NEXT: lsr x8, x10, #63 ; CHECK-NEXT: asr 
x10, x10, #43 ; CHECK-NEXT: add w8, w10, w8 -; CHECK-NEXT: mov w10, #5423 +; CHECK-NEXT: mov w10, #5423 // =0x152f ; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: msub w8, w8, w10, w12 ; CHECK-NEXT: mov v0.h[3], w8 @@ -241,11 +241,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w9, v0.h[2] -; CHECK-NEXT: mov w8, #17097 +; CHECK-NEXT: mov w8, #17097 // =0x42c9 ; CHECK-NEXT: movk w8, #45590, lsl #16 ; CHECK-NEXT: smov w10, v0.h[1] ; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: mov w11, #23 +; CHECK-NEXT: mov w11, #23 // =0x17 ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: smull x8, w9, w8 ; CHECK-NEXT: lsr x8, x8, #32 @@ -256,7 +256,7 @@ ; CHECK-NEXT: and w10, w10, #0x7fff ; CHECK-NEXT: and w13, w13, #0x7fff ; CHECK-NEXT: csneg w10, w10, w13, mi -; CHECK-NEXT: mov w13, #47143 +; CHECK-NEXT: mov w13, #47143 // =0xb827 ; CHECK-NEXT: movk w13, #24749, lsl #16 ; CHECK-NEXT: msub w8, w8, w11, w9 ; CHECK-NEXT: smull x9, w12, w13 @@ -264,7 +264,7 @@ ; CHECK-NEXT: lsr x10, x9, #63 ; CHECK-NEXT: asr x9, x9, #43 ; CHECK-NEXT: add w9, w9, w10 -; CHECK-NEXT: mov w10, #5423 +; CHECK-NEXT: mov w10, #5423 // =0x152f ; CHECK-NEXT: mov v1.h[2], w8 ; CHECK-NEXT: msub w8, w9, w10, w12 ; CHECK-NEXT: mov v1.h[3], w8 @@ -278,14 +278,14 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; CHECK-LABEL: dont_fold_srem_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #8549 +; CHECK-NEXT: mov x8, #8549 // =0x2165 ; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: movk x8, #22795, lsl #16 -; CHECK-NEXT: mov x12, #6055 +; CHECK-NEXT: mov x12, #6055 // =0x17a7 ; CHECK-NEXT: movk x8, #17096, lsl #32 ; CHECK-NEXT: movk x12, #58853, lsl #16 ; CHECK-NEXT: movk x8, #45590, lsl #48 -; CHECK-NEXT: mov x14, #21445 +; CHECK-NEXT: mov x14, #21445 // =0x53c5 ; CHECK-NEXT: mov x10, v1.d[1] ; CHECK-NEXT: movk x12, #47142, lsl #32 ; CHECK-NEXT: smulh x8, x9, x8 @@ -297,16 +297,16 @@ ; CHECK-NEXT: asr x13, x8, #4 ; CHECK-NEXT: movk x14, #25653, lsl #48 ; CHECK-NEXT: add x8, x13, x8, lsr #63 -; CHECK-NEXT: mov w13, #23 +; CHECK-NEXT: mov w13, #23 // =0x17 ; CHECK-NEXT: smulh x12, x10, x12 ; CHECK-NEXT: smulh x14, x11, x14 ; CHECK-NEXT: msub x8, x8, x13, x9 ; CHECK-NEXT: asr x13, x12, #11 ; CHECK-NEXT: add x12, x13, x12, lsr #63 ; CHECK-NEXT: asr x13, x14, #8 -; CHECK-NEXT: mov w9, #5423 +; CHECK-NEXT: mov w9, #5423 // =0x152f ; CHECK-NEXT: add x13, x13, x14, lsr #63 -; CHECK-NEXT: mov w14, #654 +; CHECK-NEXT: mov w14, #654 // =0x28e ; CHECK-NEXT: msub x9, x12, x9, x10 ; CHECK-NEXT: fmov d1, x8 ; CHECK-NEXT: msub x10, x13, x14, x11 diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -134,15 +134,17 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-LABEL: v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1 { v0.b }[0], [x1] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: ld1 { v1.b }[0], [x0] -; CHECK-NEXT: add x9, x0, #1 -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: ldrsb w8, [x1] +; CHECK-NEXT: ldrsb w9, [x0] +; CHECK-NEXT: ldrsb w10, [x1, #1] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: ldrsb w9, [x0, #1] +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: shl v1.2s, v1.2s, #24 -; CHECK-NEXT: sqsub v0.2s, v1.2s, v0.2s +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushr 
v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 @@ -174,15 +176,17 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-LABEL: v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1 { v0.h }[0], [x1] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: ld1 { v1.h }[0], [x0] -; CHECK-NEXT: add x9, x0, #2 -; CHECK-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: ldrsh w8, [x1] +; CHECK-NEXT: ldrsh w9, [x0] +; CHECK-NEXT: ldrsh w10, [x1, #2] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: ldrsh w9, [x0, #2] +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: sqsub v0.2s, v1.2s, v0.2s +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 diff --git a/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll b/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll --- a/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll +++ b/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll @@ -16,12 +16,12 @@ ; CHECK-NEXT: bl return_in_block ; CHECK-NEXT: adrp x8, in_block_store ; CHECK-NEXT: add x8, x8, :lo12:in_block_store -; CHECK-NEXT: str d0, [x8] -; CHECK-NEXT: str d1, [x8, #8] -; CHECK-NEXT: str d2, [x8, #16] ; CHECK-NEXT: str d3, [x8, #24] -; CHECK-NEXT: str d4, [x8, #32] ; CHECK-NEXT: str d5, [x8, #40] +; CHECK-NEXT: str d4, [x8, #32] +; CHECK-NEXT: str d2, [x8, #16] +; CHECK-NEXT: str d1, [x8, #8] +; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w30 @@ -40,9 +40,10 @@ ; CHECK-NEXT: bl return_in_block ; CHECK-NEXT: adrp x8, in_block_store ; CHECK-NEXT: add x8, x8, :lo12:in_block_store -; CHECK-NEXT: stp d0, d1, [x8] -; CHECK-NEXT: stp d2, d3, [x8, #16] -; CHECK-NEXT: stp d4, d5, [x8, #32] +; CHECK-NEXT: stp d3, d4, [x8, #24] +; CHECK-NEXT: str d5, [x8, #40] +; CHECK-NEXT: stp d1, d2, [x8, #8] +; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %1 = call %T_IN_BLOCK @return_in_block() @@ -59,9 +60,10 @@ ; CHECK-NEXT: bl return_in_block ; CHECK-NEXT: adrp x8, in_block_store ; CHECK-NEXT: add x8, x8, :lo12:in_block_store -; CHECK-NEXT: stp d0, d1, [x8] -; CHECK-NEXT: stp d2, d3, [x8, #16] -; CHECK-NEXT: stp d4, d5, [x8, #32] +; CHECK-NEXT: stp d3, d4, [x8, #24] +; CHECK-NEXT: str d5, [x8, #40] +; CHECK-NEXT: stp d1, d2, [x8, #8] +; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w30 diff --git a/llvm/test/CodeGen/AArch64/sve-aba.ll b/llvm/test/CodeGen/AArch64/sve-aba.ll --- a/llvm/test/CodeGen/AArch64/sve-aba.ll +++ b/llvm/test/CodeGen/AArch64/sve-aba.ll @@ -77,7 +77,9 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: sxtb z1.h, p0/m, z1.h ; CHECK-NEXT: sxtb z2.h, p0/m, z2.h -; CHECK-NEXT: saba z0.h, z1.h, z2.h +; CHECK-NEXT: sub z1.h, z1.h, z2.h +; CHECK-NEXT: abs z1.h, p0/m, z1.h +; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: ret %b.sext = sext %b to %c.sext = sext %c to @@ -128,7 +130,9 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxth z1.s, p0/m, z1.s ; CHECK-NEXT: sxth z2.s, p0/m, z2.s -; CHECK-NEXT: saba z0.s, z1.s, z2.s +; CHECK-NEXT: sub z1.s, z1.s, z2.s +; CHECK-NEXT: abs z1.s, p0/m, z1.s +; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: ret %b.sext = sext %b to 
 %c.sext = sext %c to
@@ -179,7 +183,9 @@
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: sxtw z1.d, p0/m, z1.d
 ; CHECK-NEXT: sxtw z2.d, p0/m, z2.d
-; CHECK-NEXT: saba z0.d, z1.d, z2.d
+; CHECK-NEXT: sub z1.d, z1.d, z2.d
+; CHECK-NEXT: abs z1.d, p0/m, z1.d
+; CHECK-NEXT: add z0.d, z0.d, z1.d
 ; CHECK-NEXT: ret
 %b.sext = sext %b to
 %c.sext = sext %c to
@@ -231,9 +237,13 @@
 define @uaba_b_promoted_ops( %a, %b, %c) #0 {
 ; CHECK-LABEL: uaba_b_promoted_ops:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
-; CHECK-NEXT: mov z2.b, p1/z, #1 // =0x1
-; CHECK-NEXT: uaba z0.b, z1.b, z2.b
+; CHECK-NEXT: mov z1.b, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z2.b, p0/z, #1 // =0x1
+; CHECK-NEXT: add z2.b, p1/m, z2.b, z1.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: movprfx z1, z2
+; CHECK-NEXT: abs z1.b, p0/m, z2.b
+; CHECK-NEXT: add z0.b, z0.b, z1.b
 ; CHECK-NEXT: ret
 %b.zext = zext %b to
 %c.zext = zext %c to
@@ -283,7 +293,10 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: and z1.h, z1.h, #0xff
 ; CHECK-NEXT: and z2.h, z2.h, #0xff
-; CHECK-NEXT: uaba z0.h, z1.h, z2.h
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: sub z1.h, z1.h, z2.h
+; CHECK-NEXT: abs z1.h, p0/m, z1.h
+; CHECK-NEXT: add z0.h, z0.h, z1.h
 ; CHECK-NEXT: ret
 %b.zext = zext %b to
 %c.zext = zext %c to
@@ -333,7 +346,10 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: and z1.s, z1.s, #0xffff
 ; CHECK-NEXT: and z2.s, z2.s, #0xffff
-; CHECK-NEXT: uaba z0.s, z1.s, z2.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sub z1.s, z1.s, z2.s
+; CHECK-NEXT: abs z1.s, p0/m, z1.s
+; CHECK-NEXT: add z0.s, z0.s, z1.s
 ; CHECK-NEXT: ret
 %b.zext = zext %b to
 %c.zext = zext %c to
@@ -383,7 +399,10 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
 ; CHECK-NEXT: and z2.d, z2.d, #0xffffffff
-; CHECK-NEXT: uaba z0.d, z1.d, z2.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sub z1.d, z1.d, z2.d
+; CHECK-NEXT: abs z1.d, p0/m, z1.d
+; CHECK-NEXT: add z0.d, z0.d, z1.d
 ; CHECK-NEXT: ret
 %b.zext = zext %b to
 %c.zext = zext %c to
diff --git a/llvm/test/CodeGen/AArch64/sve-abd.ll b/llvm/test/CodeGen/AArch64/sve-abd.ll
--- a/llvm/test/CodeGen/AArch64/sve-abd.ll
+++ b/llvm/test/CodeGen/AArch64/sve-abd.ll
@@ -56,7 +56,8 @@
 ; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: sxtb z0.h, p0/m, z0.h
 ; CHECK-NEXT: sxtb z1.h, p0/m, z1.h
-; CHECK-NEXT: sabd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: sub z0.h, z0.h, z1.h
+; CHECK-NEXT: abs z0.h, p0/m, z0.h
 ; CHECK-NEXT: ret
 %a.sext = sext %a to
 %b.sext = sext %b to
@@ -85,7 +86,8 @@
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: sxth z0.s, p0/m, z0.s
 ; CHECK-NEXT: sxth z1.s, p0/m, z1.s
-; CHECK-NEXT: sabd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: sub z0.s, z0.s, z1.s
+; CHECK-NEXT: abs z0.s, p0/m, z0.s
 ; CHECK-NEXT: ret
 %a.sext = sext %a to
 %b.sext = sext %b to
@@ -114,7 +116,8 @@
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
 ; CHECK-NEXT: sxtw z1.d, p0/m, z1.d
-; CHECK-NEXT: sabd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: sub z0.d, z0.d, z1.d
+; CHECK-NEXT: abs z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
 %a.sext = sext %a to
 %b.sext = sext %b to
@@ -144,10 +147,12 @@
 define @uabd_b_promoted_ops( %a, %b) #0 {
 ; CHECK-LABEL: uabd_b_promoted_ops:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p2.b
-; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
-; CHECK-NEXT: mov z1.b, p1/z, #1 // =0x1
-; CHECK-NEXT: uabd z0.b, p2/m, z0.b, z1.b
+; CHECK-NEXT: mov z0.b, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
+; CHECK-NEXT: add z1.b, p1/m, z1.b, z0.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: abs z0.b, p0/m, z1.b
 ; CHECK-NEXT: ret
 %a.zext = zext %a to
 %b.zext = zext %b to
@@ -173,10 +178,11 @@
 define @uabd_h_promoted_ops( %a, %b) #0 {
 ; CHECK-LABEL: uabd_h_promoted_ops:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: and z0.h, z0.h, #0xff
 ; CHECK-NEXT: and z1.h, z1.h, #0xff
-; CHECK-NEXT: uabd z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: sub z0.h, z0.h, z1.h
+; CHECK-NEXT: abs z0.h, p0/m, z0.h
 ; CHECK-NEXT: ret
 %a.zext = zext %a to
 %b.zext = zext %b to
@@ -202,10 +208,11 @@
 define @uabd_s_promoted_ops( %a, %b) #0 {
 ; CHECK-LABEL: uabd_s_promoted_ops:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: and z0.s, z0.s, #0xffff
 ; CHECK-NEXT: and z1.s, z1.s, #0xffff
-; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sub z0.s, z0.s, z1.s
+; CHECK-NEXT: abs z0.s, p0/m, z0.s
 ; CHECK-NEXT: ret
 %a.zext = zext %a to
 %b.zext = zext %b to
@@ -231,10 +238,11 @@
 define @uabd_d_promoted_ops( %a, %b) #0 {
 ; CHECK-LABEL: uabd_d_promoted_ops:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
-; CHECK-NEXT: uabd z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sub z0.d, z0.d, z1.d
+; CHECK-NEXT: abs z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
 %a.zext = zext %a to
 %b.zext = zext %b to
@@ -248,17 +256,9 @@
 define @uabd_non_matching_extension( %a, %b) #0 {
 ; CHECK-LABEL: uabd_non_matching_extension:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: and z1.s, z1.s, #0xff
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: uunpkhi z3.d, z1.s
-; CHECK-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sub z0.d, z0.d, z1.d
-; CHECK-NEXT: sub z1.d, z2.d, z3.d
-; CHECK-NEXT: abs z1.d, p0/m, z1.d
-; CHECK-NEXT: abs z0.d, p0/m, z0.d
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %a.zext = zext %a to
 %b.zext = zext %b to
@@ -273,10 +273,11 @@
 define @uabd_non_matching_promoted_ops( %a, %b) #0 {
 ; CHECK-LABEL: uabd_non_matching_promoted_ops:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: and z0.s, z0.s, #0xff
 ; CHECK-NEXT: and z1.s, z1.s, #0xffff
-; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sub z0.s, z0.s, z1.s
+; CHECK-NEXT: abs z0.s, p0/m, z0.s
 ; CHECK-NEXT: ret
 %a.zext = zext %a to
 %b.zext = zext %b to
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -64,13 +64,13 @@
 ; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1]
 ; CHECK-NEXT: fmov s0, #1.00000000
 ; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov w1, #1
-; CHECK-NEXT: mov w2, #2
-; CHECK-NEXT: mov w3, #3
-; CHECK-NEXT: mov w4, #4
-; CHECK-NEXT: mov w5, #5
-; CHECK-NEXT: mov w6, #6
-; CHECK-NEXT: mov w7, #7
+; CHECK-NEXT: mov w1, #1 // =0x1
+; CHECK-NEXT: mov w2, #2 // =0x2
+; CHECK-NEXT: mov w3, #3 // =0x3
+; CHECK-NEXT: mov w4, #4 // =0x4
+; CHECK-NEXT: mov w5, #5 // =0x5
+; CHECK-NEXT: mov w6, #6 // =0x6
+; CHECK-NEXT: mov w7, #7 // =0x7
 ; CHECK-NEXT: add x9, sp, #16
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: st1d { z16.d }, p0, [x9]
@@ -158,18 +158,18 @@
 ; CHECK-LABEL: foo4:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z6.d }, p0/z, [x3, #1, mul vl]
-; CHECK-NEXT: ld1d { z7.d }, p0/z, [x3]
-; CHECK-NEXT: ld1d { z24.d }, p0/z, [x3, #3, mul vl]
-; CHECK-NEXT: ld1d { z25.d }, p0/z, [x3, #2, mul vl]
+; CHECK-NEXT: ld1d { z6.d }, p0/z, [x3]
+; CHECK-NEXT: ld1d { z7.d }, p0/z, [x3, #1, mul vl]
+; CHECK-NEXT: ld1d { z24.d }, p0/z, [x3, #2, mul vl]
+; CHECK-NEXT: ld1d { z25.d }, p0/z, [x3, #3, mul vl]
 ; CHECK-NEXT: st1d { z4.d }, p0, [x0, #3, mul vl]
 ; CHECK-NEXT: st1d { z3.d }, p0, [x0, #2, mul vl]
 ; CHECK-NEXT: st1d { z2.d }, p0, [x0, #1, mul vl]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x0]
-; CHECK-NEXT: st1d { z25.d }, p0, [x1, #2, mul vl]
-; CHECK-NEXT: st1d { z24.d }, p0, [x1, #3, mul vl]
-; CHECK-NEXT: st1d { z7.d }, p0, [x1]
-; CHECK-NEXT: st1d { z6.d }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z25.d }, p0, [x1, #3, mul vl]
+; CHECK-NEXT: st1d { z24.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: st1d { z7.d }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z6.d }, p0, [x1]
 ; CHECK-NEXT: st1d { z5.d }, p0, [x2]
 ; CHECK-NEXT: ret
 entry:
@@ -184,18 +184,18 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ldr x8, [sp]
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8, #1, mul vl]
-; CHECK-NEXT: ld1d { z6.d }, p0/z, [x8]
-; CHECK-NEXT: ld1d { z7.d }, p0/z, [x8, #3, mul vl]
-; CHECK-NEXT: ld1d { z24.d }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8]
+; CHECK-NEXT: ld1d { z6.d }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: ld1d { z7.d }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT: ld1d { z24.d }, p0/z, [x8, #3, mul vl]
 ; CHECK-NEXT: st1d { z4.d }, p0, [x6, #3, mul vl]
 ; CHECK-NEXT: st1d { z3.d }, p0, [x6, #2, mul vl]
 ; CHECK-NEXT: st1d { z2.d }, p0, [x6, #1, mul vl]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x6]
-; CHECK-NEXT: st1d { z24.d }, p0, [x7, #2, mul vl]
-; CHECK-NEXT: st1d { z7.d }, p0, [x7, #3, mul vl]
-; CHECK-NEXT: st1d { z6.d }, p0, [x7]
-; CHECK-NEXT: st1d { z5.d }, p0, [x7, #1, mul vl]
+; CHECK-NEXT: st1d { z24.d }, p0, [x7, #3, mul vl]
+; CHECK-NEXT: st1d { z7.d }, p0, [x7, #2, mul vl]
+; CHECK-NEXT: st1d { z6.d }, p0, [x7, #1, mul vl]
+; CHECK-NEXT: st1d { z5.d }, p0, [x7]
 ; CHECK-NEXT: ret
 entry:
 store volatile %x1, * %ptr1
@@ -208,14 +208,14 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x2]
-; CHECK-NEXT: ld1d { z6.d }, p0/z, [x2, #2, mul vl]
-; CHECK-NEXT: ld1d { z7.d }, p0/z, [x2, #1, mul vl]
+; CHECK-NEXT: ld1d { z6.d }, p0/z, [x2, #1, mul vl]
+; CHECK-NEXT: ld1d { z7.d }, p0/z, [x2, #2, mul vl]
 ; CHECK-NEXT: st1d { z5.d }, p0, [x0, #3, mul vl]
 ; CHECK-NEXT: st1d { z4.d }, p0, [x0, #2, mul vl]
 ; CHECK-NEXT: st1d { z3.d }, p0, [x0, #1, mul vl]
 ; CHECK-NEXT: st1d { z2.d }, p0, [x0]
-; CHECK-NEXT: st1d { z7.d }, p0, [x1, #1, mul vl]
-; CHECK-NEXT: st1d { z6.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: st1d { z7.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: st1d { z6.d }, p0, [x1, #1, mul vl]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x1]
 ; CHECK-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll
@@ -21,7 +21,7 @@
 define void @masked_gather_base_plus_stride_v4f64(ptr %dst, ptr %src) #0 {
 ; CHECK-LABEL: masked_gather_base_plus_stride_v4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-32
+; CHECK-NEXT: mov x8, #-32 // =0xffffffffffffffe0
 ; CHECK-NEXT: ptrue p0.d, vl4
 ; CHECK-NEXT: index z0.d, #-2, x8
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1, z0.d, lsl #3]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
@@ -46,7 +46,7 @@
 define void @ctlz_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctlz_v64i8:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
@@ -134,7 +134,7 @@
 define void @ctlz_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctlz_v32i16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -222,7 +222,7 @@
 define void @ctlz_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctlz_v16i32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -314,7 +314,7 @@
 define void @ctlz_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctlz_v8i64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -406,7 +406,7 @@
 define void @ctpop_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctpop_v64i8:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
@@ -496,7 +496,7 @@
 define void @ctpop_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctpop_v32i16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -588,7 +588,7 @@
 define void @ctpop_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctpop_v16i32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -682,7 +682,7 @@
 define void @ctpop_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: ctpop_v8i64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -779,7 +779,7 @@
 define void @cttz_v64i8(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: cttz_v64i8:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
@@ -877,7 +877,7 @@
 define void @cttz_v32i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: cttz_v32i16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -977,7 +977,7 @@
 define void @cttz_v16i32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: cttz_v16i32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1077,7 +1077,7 @@
 define void @cttz_v8i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: cttz_v8i64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
@@ -47,7 +47,7 @@
 define void @bitcast_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: bitcast_v32i16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -135,7 +135,7 @@
 define void @bitcast_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: bitcast_v16i32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -223,7 +223,7 @@
 define void @bitcast_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: bitcast_v8i64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll
@@ -42,7 +42,7 @@
 define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: build_vector_minus2_dec32_v4i64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #-32
+; VBITS_GE_256-NEXT: mov x8, #-32 // =0xffffffffffffffe0
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: index z0.d, #-2, x8
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0]
@@ -53,11 +53,6 @@
 ; Constant but not a sequence.
 define void @build_vector_no_stride_v4i64(ptr %a) #0 {
-; VBITS_GE_256-LABEL: .LCPI4_0:
-; VBITS_GE_256: .xword 0
-; VBITS_GE_256-NEXT: .xword 4
-; VBITS_GE_256-NEXT: .xword 1
-; VBITS_GE_256-NEXT: .xword 8
 ; VBITS_GE_256-LABEL: build_vector_no_stride_v4i64:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: adrp x8, .LCPI4_0
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
@@ -56,7 +56,7 @@
 ; VBITS_GE_256-LABEL: concat_v64i8:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
-; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2, x8]
@@ -214,7 +214,7 @@
 ; VBITS_GE_256-LABEL: concat_v32i16:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1]
@@ -343,7 +343,7 @@
 ; VBITS_GE_256-LABEL: concat_v16i32:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2]
@@ -448,7 +448,7 @@
 ; VBITS_GE_256-LABEL: concat_v8i64:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
@@ -557,7 +557,7 @@
 ; VBITS_GE_256-LABEL: concat_v32f16:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1]
@@ -686,7 +686,7 @@
 ; VBITS_GE_256-LABEL: concat_v16f32:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2]
@@ -791,7 +791,7 @@
 ; VBITS_GE_256-LABEL: concat_v8f64:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -45,7 +45,7 @@
 define void @extract_subvector_v64i8(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v64i8:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1]
@@ -137,7 +137,7 @@
 define void @extract_subvector_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v32i16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
@@ -228,7 +228,7 @@
 define void @extract_subvector_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v16i32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
@@ -308,7 +308,7 @@
 define void @extract_subvector_v8i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extract_subvector_v8i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #4
+; CHECK-NEXT: mov x8, #4 // =0x4
 ; CHECK-NEXT: ptrue p0.d, vl4
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
@@ -322,12 +322,12 @@
 define void @extract_subvector_v16i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v16i64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
-; VBITS_GE_256-NEXT: mov x9, #12
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
+; VBITS_GE_256-NEXT: mov x9, #12 // =0xc
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
 ; VBITS_GE_256-NEXT: ret
@@ -340,7 +340,7 @@
 define void @extract_subvector_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extract_subvector_v32i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #16
+; CHECK-NEXT: mov x8, #16 // =0x10
 ; CHECK-NEXT: ptrue p0.d, vl16
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
@@ -392,7 +392,7 @@
 define void @extract_subvector_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1]
@@ -483,7 +483,7 @@
 define void @extract_subvector_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1]
@@ -563,7 +563,7 @@
 define void @extract_subvector_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: extract_subvector_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
@@ -46,7 +46,7 @@
 define half @extractelement_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: extractelement_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: mov z0.h, z0.h[15]
@@ -69,7 +69,7 @@
 ; CHECK-LABEL: extractelement_v64f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h, vl64
-; CHECK-NEXT: mov w8, #63
+; CHECK-NEXT: mov w8, #63 // =0x3f
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT: whilels p0.h, xzr, x8
 ; CHECK-NEXT: lastb h0, p0, z0.h
@@ -83,7 +83,7 @@
 ; CHECK-LABEL: extractelement_v128f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h, vl128
-; CHECK-NEXT: mov w8, #127
+; CHECK-NEXT: mov w8, #127 // =0x7f
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT: whilels p0.h, xzr, x8
 ; CHECK-NEXT: lastb h0, p0, z0.h
@@ -130,7 +130,7 @@
 define float @extractelement_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: extractelement_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: mov z0.s, z0.s[7]
@@ -153,7 +153,7 @@
 ; CHECK-LABEL: extractelement_v32f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s, vl32
-; CHECK-NEXT: mov w8, #31
+; CHECK-NEXT: mov w8, #31 // =0x1f
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT: whilels p0.s, xzr, x8
 ; CHECK-NEXT: lastb s0, p0, z0.s
@@ -167,7 +167,7 @@
 ; CHECK-LABEL: extractelement_v64f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s, vl64
-; CHECK-NEXT: mov w8, #63
+; CHECK-NEXT: mov w8, #63 // =0x3f
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT: whilels p0.s, xzr, x8
 ; CHECK-NEXT: lastb s0, p0, z0.s
@@ -212,7 +212,7 @@
 define double @extractelement_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: extractelement_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: mov z0.d, z0.d[3]
@@ -235,7 +235,7 @@
 ; CHECK-LABEL: extractelement_v16f64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.d, vl16
-; CHECK-NEXT: mov w8, #15
+; CHECK-NEXT: mov w8, #15 // =0xf
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: whilels p0.d, xzr, x8
 ; CHECK-NEXT: lastb d0, p0, z0.d
@@ -249,7 +249,7 @@
 ; CHECK-LABEL: extractelement_v32f64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.d, vl32
-; CHECK-NEXT: mov w8, #31
+; CHECK-NEXT: mov w8, #31 // =0x1f
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: whilels p0.d, xzr, x8
 ; CHECK-NEXT: lastb d0, p0, z0.d
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll
@@ -65,7 +65,7 @@
 define void @test_copysign_v32f16_v32f16(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: test_copysign_v32f16_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -189,7 +189,7 @@
 define void @test_copysign_v16f32_v16f32(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: test_copysign_v16f32_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -298,7 +298,7 @@
 define void @test_copysign_v8f64_v8f64(ptr %ap, ptr %bp) #0 {
 ; VBITS_GE_256-LABEL: test_copysign_v8f64_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -436,30 +436,17 @@
 ; SplitVecRes mismatched
 define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
-; CHECK_NO_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32:
-; CHECK_NO_EXTEND_ROUND: // %bb.0:
-; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d, vl4
-; CHECK_NO_EXTEND_ROUND-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK_NO_EXTEND_ROUND-NEXT: ld1w { z1.d }, p0/z, [x1]
-; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z1.d, p0/m, z1.s
-; CHECK_NO_EXTEND_ROUND-NEXT: and z0.d, z0.d, #0x7fffffffffffffff
-; CHECK_NO_EXTEND_ROUND-NEXT: and z1.d, z1.d, #0x8000000000000000
-; CHECK_NO_EXTEND_ROUND-NEXT: orr z0.d, z0.d, z1.d
-; CHECK_NO_EXTEND_ROUND-NEXT: st1d { z0.d }, p0, [x0]
-; CHECK_NO_EXTEND_ROUND-NEXT: ret
-;
-; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32:
-; CHECK_EXTEND_ROUND: // %bb.0:
-; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d, vl4
-; CHECK_EXTEND_ROUND-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK_EXTEND_ROUND-NEXT: ldr q1, [x1]
-; CHECK_EXTEND_ROUND-NEXT: uunpklo z1.d, z1.s
-; CHECK_EXTEND_ROUND-NEXT: fcvt z1.d, p0/m, z1.s
-; CHECK_EXTEND_ROUND-NEXT: and z0.d, z0.d, #0x7fffffffffffffff
-; CHECK_EXTEND_ROUND-NEXT: and z1.d, z1.d, #0x8000000000000000
-; CHECK_EXTEND_ROUND-NEXT: orr z0.d, z0.d, z1.d
-; CHECK_EXTEND_ROUND-NEXT: st1d { z0.d }, p0, [x0]
-; CHECK_EXTEND_ROUND-NEXT: ret
+; CHECK-LABEL: test_copysign_v4f64_v4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x1]
+; CHECK-NEXT: fcvt z1.d, p0/m, z1.s
+; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff
+; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
 %a = load <4 x double>, ptr %ap
 %b = load <4 x float>, ptr %bp
 %tmp0 = fpext <4 x float> %b to <4 x double>
@@ -556,3 +543,6 @@
 declare <32 x double> @llvm.copysign.v32f64(<32 x double> %a, <32 x double> %b) #0
 
 attributes #0 = { "target-features"="+sve" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK_EXTEND_ROUND: {{.*}}
+; CHECK_NO_EXTEND_ROUND: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll
@@ -48,7 +48,7 @@
 define void @fadd_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fadd_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -146,7 +146,7 @@
 define void @fadd_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fadd_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -244,7 +244,7 @@
 define void @fadd_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fadd_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -346,7 +346,7 @@
 define void @fdiv_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fdiv_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -444,7 +444,7 @@
 define void @fdiv_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fdiv_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -542,7 +542,7 @@
 define void @fdiv_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fdiv_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -648,7 +648,7 @@
 define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -758,7 +758,7 @@
 define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -867,7 +867,7 @@
 define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -977,7 +977,7 @@
 define void @fmul_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fmul_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -1075,7 +1075,7 @@
 define void @fmul_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fmul_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1173,7 +1173,7 @@
 define void @fmul_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fmul_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -1273,7 +1273,7 @@
 define void @fneg_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fneg_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -1361,7 +1361,7 @@
 define void @fneg_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fneg_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1449,7 +1449,7 @@
 define void @fneg_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fneg_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -1541,7 +1541,7 @@
 define void @fsqrt_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fsqrt_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -1629,7 +1629,7 @@
 define void @fsqrt_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fsqrt_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1717,7 +1717,7 @@
 define void @fsqrt_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fsqrt_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -1811,7 +1811,7 @@
 define void @fsub_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fsub_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -1909,7 +1909,7 @@
 define void @fsub_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fsub_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -2007,7 +2007,7 @@
 define void @fsub_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fsub_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -2107,7 +2107,7 @@
 define void @fabs_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fabs_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -2195,7 +2195,7 @@
 define void @fabs_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fabs_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -2283,7 +2283,7 @@
 define void @fabs_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fabs_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll
@@ -52,7 +52,7 @@
 define void @fcmp_oeq_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fcmp_oeq_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -162,7 +162,7 @@
 define void @fcmp_oeq_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fcmp_oeq_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -272,7 +272,7 @@
 define void @fcmp_oeq_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fcmp_oeq_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -54,7 +54,7 @@
 define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v16f16_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0]
@@ -157,7 +157,7 @@
 define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v8f16_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [x0]
@@ -257,7 +257,7 @@
 define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v8f32_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0]
@@ -357,7 +357,7 @@
 define void @fcvt_v16f32_v16f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v16f32_v16f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -460,7 +460,7 @@
 define void @fcvt_v8f64_v8f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v8f64_v8f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -562,7 +562,7 @@
 define void @fcvt_v8f64_v8f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvt_v8f64_v8f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll
@@ -55,7 +55,7 @@
 define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -171,7 +171,7 @@
 define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -286,7 +286,7 @@
 define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
 ; VBITS_GE_256-LABEL: fma_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll
@@ -48,7 +48,7 @@
 define void @fmaxnm_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmaxnm_v32f16:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #16
+; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
 ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -146,7 +146,7 @@
 define void @fmaxnm_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmaxnm_v16f32:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #8
+; VBITS_EQ_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
 ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -244,7 +244,7 @@
 define void @fmaxnm_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmaxnm_v8f64:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #4
+; VBITS_EQ_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -346,7 +346,7 @@
 define void @fminnm_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fminnm_v32f16:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #16
+; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
 ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -444,7 +444,7 @@
 define void @fminnm_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fminnm_v16f32:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #8
+; VBITS_EQ_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
 ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -542,7 +542,7 @@
 define void @fminnm_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fminnm_v8f64:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #4
+; VBITS_EQ_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -644,7 +644,7 @@
 define void @fmax_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmax_v32f16:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #16
+; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
 ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -742,7 +742,7 @@
 define void @fmax_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmax_v16f32:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #8
+; VBITS_EQ_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
 ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -840,7 +840,7 @@
 define void @fmax_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmax_v8f64:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #4
+; VBITS_EQ_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -942,7 +942,7 @@
 define void @fmin_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmin_v32f16:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #16
+; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
 ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -1040,7 +1040,7 @@
 define void @fmin_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmin_v16f32:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #8
+; VBITS_EQ_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
 ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1138,7 +1138,7 @@
 define void @fmin_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_EQ_256-LABEL: fmin_v8f64:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: mov x8, #4
+; VBITS_EQ_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll
@@ -46,7 +46,7 @@
 define void @frintp_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintp_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -134,7 +134,7 @@
 define void @frintp_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintp_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -222,7 +222,7 @@
 define void @frintp_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintp_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -314,7 +314,7 @@
 define void @frintm_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintm_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -402,7 +402,7 @@
 define void @frintm_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintm_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -490,7 +490,7 @@
 define void @frintm_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintm_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -582,7 +582,7 @@
 define void @frinti_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinti_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -670,7 +670,7 @@
 define void @frinti_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinti_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -758,7 +758,7 @@
 define void @frinti_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinti_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -850,7 +850,7 @@
 define void @frintx_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintx_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -938,7 +938,7 @@
 define void @frintx_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintx_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1026,7 +1026,7 @@
 define void @frintx_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintx_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -1118,7 +1118,7 @@
 define void @frinta_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinta_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -1206,7 +1206,7 @@
 define void @frinta_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinta_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1294,7 +1294,7 @@
 define void @frinta_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frinta_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -1386,7 +1386,7 @@
 define void @frintn_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintn_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -1474,7 +1474,7 @@
 define void @frintn_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintn_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1562,7 +1562,7 @@
 define void @frintn_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintn_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -1654,7 +1654,7 @@
 define void @frintz_v32f16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintz_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -1742,7 +1742,7 @@
 define void @frintz_v16f32(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintz_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1830,7 +1830,7 @@
 define void @frintz_v8f64(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: frintz_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
@@ -54,7 +54,7 @@
 define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 {
 ; VBITS_GE_256-LABEL: select_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: and w9, w2, #0x1
 ; VBITS_GE_256-NEXT: ptrue p1.h
@@ -178,7 +178,7 @@
 define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 {
 ; VBITS_GE_256-LABEL: select_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: and w9, w2, #0x1
 ; VBITS_GE_256-NEXT: ptrue p1.s
@@ -282,9 +282,8 @@
 define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
 ; CHECK-LABEL: select_v4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT: and w8, w2, #0x1
 ; CHECK-NEXT: ptrue p0.d, vl4
-; CHECK-NEXT: and x8, x2, #0x1
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT: ptrue p1.d
@@ -303,10 +302,9 @@
 define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 {
 ; VBITS_GE_256-LABEL: select_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
-; VBITS_GE_256-NEXT: // kill: def $w2 killed $w2 def $x2
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
-; VBITS_GE_256-NEXT: and x9, x2, #0x1
+; VBITS_GE_256-NEXT: and w9, w2, #0x1
 ; VBITS_GE_256-NEXT: ptrue p1.d
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -322,9 +320,8 @@
 ;
 ; VBITS_GE_512-LABEL: select_v8f64:
 ; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2
+; VBITS_GE_512-NEXT: and w8, w2, #0x1
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: and x8, x2, #0x1
 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: ptrue p1.d
@@ -343,9 +340,8 @@
 define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
 ; CHECK-LABEL: select_v16f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT: and w8, w2, #0x1
 ; CHECK-NEXT: ptrue p0.d, vl16
-; CHECK-NEXT: and x8, x2, #0x1
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT: ptrue p1.d
@@ -364,9 +360,8 @@
 define void @select_v32f64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
 ; CHECK-LABEL: select_v32f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
+; CHECK-NEXT: and w8, w2, #0x1
 ; CHECK-NEXT: ptrue p0.d, vl32
-; CHECK-NEXT: and x8, x2, #0x1
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT: ptrue p1.d
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
@@ -50,7 +50,7 @@
 define void @fcvtzu_v32f16_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v32f16_v32i16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -147,7 +147,7 @@
 ; VBITS_GE_256-LABEL: fcvtzu_v16f16_v16i32:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
@@ -250,7 +250,7 @@
 ; VBITS_GE_256-LABEL: fcvtzu_v8f16_v8i64:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ldr q0, [x0]
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
@@ -355,7 +355,7 @@
 define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v16f32_v16i16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -451,7 +451,7 @@
 define void @fcvtzu_v16f32_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v16f32_v16i32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -548,7 +548,7 @@
 ; VBITS_GE_256-LABEL: fcvtzu_v8f32_v8i64:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
@@ -650,7 +650,7 @@
 define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -750,7 +750,7 @@
 define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -847,7 +847,7 @@
 define void @fcvtzu_v8f64_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -943,7 +943,7 @@
 define void @fcvtzs_v32f16_v32i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v32f16_v32i16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -1040,7 +1040,7 @@
 ; VBITS_GE_256-LABEL: fcvtzs_v16f16_v16i32:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h
@@ -1143,7 +1143,7 @@
 ; VBITS_GE_256-LABEL: fcvtzs_v8f16_v8i64:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ldr q0, [x0]
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h
@@ -1248,7 +1248,7 @@
 define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v16f32_v16i16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1344,7 +1344,7 @@
 define void @fcvtzs_v16f32_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v16f32_v16i32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1441,7 +1441,7 @@
 ; VBITS_GE_256-LABEL: fcvtzs_v8f32_v8i64:
 ; VBITS_GE_256: // %bb.0:
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s
@@ -1543,7 +1543,7 @@
 define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -1643,7 +1643,7 @@
 define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -1740,7 +1740,7 @@
 define void @fcvtzs_v8f64_v8i64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
@@ -51,7 +51,7 @@
 define void @select_v32f16(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: select_v32f16:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -164,7 +164,7 @@
 define void @select_v16f32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: select_v16f32:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -278,7 +278,7 @@
 define void @select_v8f64(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-LABEL: select_v8f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll
@@ -114,7 +114,7 @@
 ; CHECK-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT: add x9, sp, #128
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
-; CHECK-NEXT: mov x8, #4
+; CHECK-NEXT: mov x8, #4 // =0x4
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: mov v0.d[1], v1.d[0]
 ; CHECK-NEXT: ldr z1, [x9] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll
@@ -11,58 +11,45 @@
 define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8,
 ; CHECK-LABEL: func1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x25, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w20, -16
-; CHECK-NEXT: .cfi_offset w21, -24
-; CHECK-NEXT: .cfi_offset w22, -32
-; CHECK-NEXT: .cfi_offset w23, -40
-; CHECK-NEXT: .cfi_offset w24, -48
-; CHECK-NEXT: .cfi_offset w25, -56
-; CHECK-NEXT: .cfi_offset w29, -64
-; CHECK-NEXT: add x8, sp, #64
-; CHECK-NEXT: add x9, sp, #128
-; CHECK-NEXT: add x10, sp, #160
-; CHECK-NEXT: add x11, sp, #192
+; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x8, sp, #56 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x20, sp, #192 +; CHECK-NEXT: add x9, sp, #88 +; CHECK-NEXT: add x10, sp, #120 +; CHECK-NEXT: add x11, sp, #152 +; CHECK-NEXT: add x12, sp, #240 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: add x8, sp, #272 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9] ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x10] ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x11] -; CHECK-NEXT: ldp x18, x19, [sp, #368] -; CHECK-NEXT: add x21, sp, #160 -; CHECK-NEXT: add x22, sp, #128 -; CHECK-NEXT: ldp x24, x14, [sp, #296] -; CHECK-NEXT: add x23, sp, #64 -; CHECK-NEXT: ldr x25, [sp, #288] -; CHECK-NEXT: ldp x9, x8, [sp, #344] -; CHECK-NEXT: ldp x11, x10, [sp, #328] -; CHECK-NEXT: ldp x13, x12, [sp, #312] -; CHECK-NEXT: ldr x15, [sp, #120] -; CHECK-NEXT: ldur q4, [sp, #104] -; CHECK-NEXT: ldp x16, x17, [sp, #224] -; CHECK-NEXT: st1d { z3.d }, p0, [x20] -; CHECK-NEXT: st1d { z2.d }, p0, [x21] -; CHECK-NEXT: st1d { z1.d }, p0, [x22] -; CHECK-NEXT: st1d { z0.d }, p0, [x23] -; CHECK-NEXT: stp x18, x19, [sp, #368] -; CHECK-NEXT: stp x25, x24, [sp, #288] -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: stp x16, x17, [sp, #224] -; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: stur q4, [sp, #104] -; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: str x15, [sp, #120] -; CHECK-NEXT: stp x14, x13, [sp, #304] -; CHECK-NEXT: stp x12, x11, [sp, #320] -; CHECK-NEXT: stp x10, x9, [sp, #336] -; CHECK-NEXT: str x8, [sp, #352] -; CHECK-NEXT: ldp x29, x25, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x12] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8] +; CHECK-NEXT: add x14, sp, #272 +; CHECK-NEXT: ldp x8, x9, [sp, #32] +; CHECK-NEXT: add x15, sp, #240 +; CHECK-NEXT: add x16, sp, #152 +; CHECK-NEXT: ldp x12, x13, [sp, #320] +; CHECK-NEXT: add x17, sp, #120 +; CHECK-NEXT: add x18, sp, #88 +; CHECK-NEXT: ldr q6, [sp, #16] +; CHECK-NEXT: ldr x10, [sp, #184] +; CHECK-NEXT: ldr x11, [sp, #304] +; CHECK-NEXT: st1d { z5.d }, p0, [x14] +; CHECK-NEXT: add x14, sp, #56 +; CHECK-NEXT: st1d { z4.d }, p0, [x15] +; CHECK-NEXT: st1d { z3.d }, p0, [x16] +; CHECK-NEXT: st1d { z2.d }, p0, [x17] +; CHECK-NEXT: st1d { z1.d }, p0, [x18] +; CHECK-NEXT: st1d { z0.d }, p0, [x14] +; CHECK-NEXT: stp x12, x13, [sp, #320] +; CHECK-NEXT: str x11, [sp, #304] +; CHECK-NEXT: str x10, [sp, #184] +; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: str q6, [sp, #16] +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: b func2 ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr %v13, ptr %v14, ptr %v15, ptr %v16, ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24, diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll @@ -36,7 +36,7 @@ define <16 x half> @insertelement_v16f16(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #15 +; CHECK-NEXT: mov w9, #15 // =0xf ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: fmov h2, #5.00000000 @@ -55,8 +55,8 @@ define <32 x half> @insertelement_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: insertelement_v32f16: ; 
VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #16 -; VBITS_GE_256-NEXT: mov w10, #15 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov w10, #15 // =0xf ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: fmov h3, #5.00000000 ; VBITS_GE_256-NEXT: index z4.h, #0, #1 @@ -72,7 +72,7 @@ ; ; VBITS_GE_512-LABEL: insertelement_v32f16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov w9, #31 +; VBITS_GE_512-NEXT: mov w9, #31 // =0x1f ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: fmov h2, #5.00000000 @@ -91,7 +91,7 @@ define <64 x half> @insertelement_v64f16(ptr %a) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #63 +; CHECK-NEXT: mov w9, #63 // =0x3f ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: fmov h2, #5.00000000 @@ -110,7 +110,7 @@ define <128 x half> @insertelement_v128f16(ptr %a) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #127 +; CHECK-NEXT: mov w9, #127 // =0x7f ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: fmov h2, #5.00000000 @@ -153,7 +153,7 @@ define <8 x float> @insertelement_v8f32(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #7 +; CHECK-NEXT: mov w9, #7 // =0x7 ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fmov s2, #5.00000000 @@ -172,8 +172,8 @@ define <16 x float> @insertelement_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: insertelement_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 -; VBITS_GE_256-NEXT: mov w10, #7 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov w10, #7 // =0x7 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: fmov s3, #5.00000000 ; VBITS_GE_256-NEXT: index z4.s, #0, #1 @@ -189,7 +189,7 @@ ; ; VBITS_GE_512-LABEL: insertelement_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov w9, #15 +; VBITS_GE_512-NEXT: mov w9, #15 // =0xf ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: fmov s2, #5.00000000 @@ -208,7 +208,7 @@ define <32 x float> @insertelement_v32f32(ptr %a) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #31 +; CHECK-NEXT: mov w9, #31 // =0x1f ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fmov s2, #5.00000000 @@ -227,7 +227,7 @@ define <64 x float> @insertelement_v64f32(ptr %a) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #63 +; CHECK-NEXT: mov w9, #63 // =0x3f ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fmov s2, #5.00000000 @@ -247,7 +247,7 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v1f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #4617315517961601024 +; CHECK-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000 ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret %r = insertelement <1 x double> %op1, double 5.0, i64 0 @@ -268,7 +268,7 @@ define <4 x double> @insertelement_v4f64(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #3 +; CHECK-NEXT: mov w9, #3 // =0x3 ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fmov 
d2, #5.00000000 @@ -287,8 +287,8 @@ define <8 x double> @insertelement_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: insertelement_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 -; VBITS_GE_256-NEXT: mov w10, #3 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 +; VBITS_GE_256-NEXT: mov w10, #3 // =0x3 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: fmov d3, #5.00000000 ; VBITS_GE_256-NEXT: index z4.d, #0, #1 @@ -304,7 +304,7 @@ ; ; VBITS_GE_512-LABEL: insertelement_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov w9, #7 +; VBITS_GE_512-NEXT: mov w9, #7 // =0x7 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: fmov d2, #5.00000000 @@ -323,7 +323,7 @@ define <16 x double> @insertelement_v16f64(ptr %a) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #15 +; CHECK-NEXT: mov w9, #15 // =0xf ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fmov d2, #5.00000000 @@ -342,7 +342,7 @@ define <32 x double> @insertelement_v32f64(ptr %a) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #31 +; CHECK-NEXT: mov w9, #31 // =0x1f ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fmov d2, #5.00000000 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll @@ -48,7 +48,7 @@ define void @add_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: add_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -146,7 +146,7 @@ define void @add_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: add_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -244,7 +244,7 @@ define void @add_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: add_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -342,7 +342,7 @@ define void @add_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: add_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -388,7 +388,7 @@ define void @add_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: add_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -449,7 +449,7 @@ define void @mul_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: mul_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { 
z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -547,7 +547,7 @@ define void @mul_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: mul_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -645,7 +645,7 @@ define void @mul_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: mul_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -749,7 +749,7 @@ define void @mul_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: mul_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -851,7 +851,7 @@ define void @sub_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: sub_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -949,7 +949,7 @@ define void @sub_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: sub_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1047,7 +1047,7 @@ define void @sub_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: sub_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1145,7 +1145,7 @@ define void @sub_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: sub_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1246,7 +1246,7 @@ define void @abs_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: abs_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -1334,7 +1334,7 @@ define void @abs_v32i16(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: abs_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1352,9 +1352,9 @@ define void @abs_v64i16(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: abs_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #48 -; CHECK-NEXT: mov x9, #16 -; CHECK-NEXT: mov x10, #32 +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: mov x9, #16 // =0x10 +; CHECK-NEXT: mov x10, #48 // =0x30 ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] 
@@ -1378,13 +1378,13 @@ define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: abs_v128i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #96 -; CHECK-NEXT: mov x9, #48 -; CHECK-NEXT: mov x10, #16 -; CHECK-NEXT: mov x11, #80 -; CHECK-NEXT: mov x12, #32 -; CHECK-NEXT: mov x13, #112 -; CHECK-NEXT: mov x14, #64 +; CHECK-NEXT: mov x8, #112 // =0x70 +; CHECK-NEXT: mov x9, #32 // =0x20 +; CHECK-NEXT: mov x10, #16 // =0x10 +; CHECK-NEXT: mov x11, #64 // =0x40 +; CHECK-NEXT: mov x12, #48 // =0x30 +; CHECK-NEXT: mov x13, #96 // =0x60 +; CHECK-NEXT: mov x14, #80 // =0x50 ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] @@ -1454,7 +1454,7 @@ define void @abs_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: abs_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1542,7 +1542,7 @@ define void @abs_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: abs_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll @@ -52,7 +52,7 @@ define void @icmp_eq_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: icmp_eq_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -162,7 +162,7 @@ define void @icmp_eq_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: icmp_eq_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -272,7 +272,7 @@ define void @icmp_eq_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: icmp_eq_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -382,7 +382,7 @@ define void @icmp_eq_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: icmp_eq_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -370,25 +370,25 @@ define void @sdiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q3, q0, [x1] +; VBITS_GE_128-NEXT: ldp q0, q3, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: 
sshll2 v6.4s, v3.8h, #0 -; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0 -; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: sshll2 v4.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 -; VBITS_GE_128-NEXT: sshll2 v7.4s, v1.8h, #0 -; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: ldp q2, q1, [x0] +; VBITS_GE_128-NEXT: sshll2 v6.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 -; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: sshll2 v7.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z2.s ; VBITS_GE_128-NEXT: movprfx z2, z7 ; VBITS_GE_128-NEXT: sdiv z2.s, p0/m, z2.s, z6.s +; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z3.s ; VBITS_GE_128-NEXT: uzp1 v1.8h, v1.8h, v2.8h ; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v4.8h -; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: stp q0, q1, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v16i16: @@ -543,24 +543,24 @@ define void @sdiv_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] -; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z5.s -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q6, q4, [x1] -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q4, q6, [x1] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: movprfx z0, z2 ; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z6.s ; VBITS_GE_128-NEXT: movprfx z1, z3 ; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z4.s -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -666,24 +666,24 @@ define void @sdiv_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.d, vl2 -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] -; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z4.d +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z5.d -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q6, q4, [x1] -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z4.d +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q4, q6, [x1] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: movprfx z0, z2 ; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z6.d ; VBITS_GE_128-NEXT: movprfx z1, z3 ; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z4.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; 
VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1093,25 +1093,25 @@ define void @udiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q3, q0, [x1] +; VBITS_GE_128-NEXT: ldp q0, q3, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0 -; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0 -; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ushll2 v4.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 -; VBITS_GE_128-NEXT: ushll2 v7.4s, v1.8h, #0 -; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: ldp q2, q1, [x0] +; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 -; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: ushll2 v7.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z2.s ; VBITS_GE_128-NEXT: movprfx z2, z7 ; VBITS_GE_128-NEXT: udiv z2.s, p0/m, z2.s, z6.s +; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z3.s ; VBITS_GE_128-NEXT: uzp1 v1.8h, v1.8h, v2.8h ; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v4.8h -; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: stp q0, q1, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v16i16: @@ -1257,24 +1257,24 @@ define void @udiv_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] -; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z5.s -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q6, q4, [x1] -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q4, q6, [x1] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: movprfx z0, z2 ; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z6.s ; VBITS_GE_128-NEXT: movprfx z1, z3 ; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z4.s -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1380,24 +1380,24 @@ define void @udiv_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.d, vl2 -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] -; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z4.d +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z5.d -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q6, q4, [x1] -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z4.d +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q4, q6, [x1] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; 
VBITS_GE_128-NEXT: movprfx z0, z2 ; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z6.d ; VBITS_GE_128-NEXT: movprfx z1, z3 ; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z4.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -73,7 +73,7 @@ ; VBITS_GE_256-LABEL: sext_v32i8_v32i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: add z0.b, z0.b, z0.b @@ -157,7 +157,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h @@ -242,7 +242,7 @@ ; VBITS_GE_256-LABEL: sext_v8i8_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h @@ -322,7 +322,7 @@ ; VBITS_GE_256-LABEL: sext_v16i16_v16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: add z0.h, z0.h, z0.h @@ -406,7 +406,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s @@ -486,7 +486,7 @@ ; VBITS_GE_256-LABEL: sext_v8i32_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: add z0.s, z0.s, z0.s @@ -569,7 +569,7 @@ ; VBITS_GE_256-LABEL: zext_v32i8_v32i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: add z0.b, z0.b, z0.b @@ -653,7 +653,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -738,7 +738,7 @@ ; VBITS_GE_256-LABEL: zext_v8i8_v8i64: ; 
VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -818,7 +818,7 @@ ; VBITS_GE_256-LABEL: zext_v16i16_v16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: add z0.h, z0.h, z0.h @@ -902,7 +902,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s @@ -982,7 +982,7 @@ ; VBITS_GE_256-LABEL: zext_v8i32_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: add z0.s, z0.s, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll @@ -48,7 +48,7 @@ define void @and_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: and_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -146,7 +146,7 @@ define void @and_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: and_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -244,7 +244,7 @@ define void @and_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: and_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -342,7 +342,7 @@ define void @and_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: and_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -444,7 +444,7 @@ define void @or_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: or_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -542,7 +542,7 @@ define void @or_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: or_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ 
-640,7 +640,7 @@ define void @or_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: or_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -738,7 +738,7 @@ define void @or_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: or_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -840,7 +840,7 @@ define void @xor_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: xor_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -938,7 +938,7 @@ define void @xor_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: xor_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1036,7 +1036,7 @@ define void @xor_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: xor_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1134,7 +1134,7 @@ define void @xor_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: xor_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll @@ -48,7 +48,7 @@ define void @smax_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smax_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -146,7 +146,7 @@ define void @smax_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smax_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -244,7 +244,7 @@ define void @smax_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smax_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -350,7 +350,7 @@ define void @smax_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smax_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue 
p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -452,7 +452,7 @@ define void @smin_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smin_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -550,7 +550,7 @@ define void @smin_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smin_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -648,7 +648,7 @@ define void @smin_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smin_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -754,7 +754,7 @@ define void @smin_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smin_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -856,7 +856,7 @@ define void @umax_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umax_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -954,7 +954,7 @@ define void @umax_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umax_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1052,7 +1052,7 @@ define void @umax_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umax_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1158,7 +1158,7 @@ define void @umax_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umax_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1260,7 +1260,7 @@ define void @umin_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umin_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -1358,7 +1358,7 @@ define void @umin_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umin_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] 
@@ -1456,7 +1456,7 @@ define void @umin_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umin_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1562,7 +1562,7 @@ define void @umin_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umin_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -53,11 +53,22 @@ define void @smulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: smulh_v32i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 // =0x10 +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1sb { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -73,26 +84,37 @@ define void @smulh_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smulh_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 +; VBITS_GE_256-NEXT: mov w8, #16 // =0x10 +; VBITS_GE_256-NEXT: mov w9, #32 // =0x20 +; VBITS_GE_256-NEXT: mov w10, #48 // =0x30 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1sb { z0.h }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1sb { z1.h }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1sb { z2.h }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1sb { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1sb { z4.h }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1sb { z5.h }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1sb { z6.h }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1sb { z7.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.h, p0/m, z2.h, z5.h +; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z4.h +; VBITS_GE_256-NEXT: mul z0.h, p0/m, z0.h, z6.h +; VBITS_GE_256-NEXT: mul z3.h, p0/m, z3.h, z7.h +; VBITS_GE_256-NEXT: lsr z2.h, z2.h, #8 +; VBITS_GE_256-NEXT: lsr z1.h, z1.h, #8 +; VBITS_GE_256-NEXT: lsr z0.h, z0.h, #8 +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: lsr z3.h, z3.h, #8 +; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z3.b +; VBITS_GE_256-NEXT: splice z2.b, p0, z2.b, z0.b ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] -; 
VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, z2.b -; VBITS_GE_256-NEXT: smulh z1.b, p0/m, z1.b, z3.b -; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: smulh_v64i8: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 -; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <64 x i8>, ptr %a %op2 = load <64 x i8>, ptr %b %insert = insertelement <64 x i16> undef, i16 8, i64 0 @@ -109,11 +131,22 @@ define void @smulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: smulh_v128i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #64 // =0x40 +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1sb { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl64 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -129,11 +162,22 @@ define void @smulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: smulh_v256i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #128 // =0x80 +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1sb { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl128 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl256 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <256 x i8>, ptr %a %op2 = load <256 x i8>, ptr %b @@ -185,11 +229,22 @@ define void @smulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: smulh_v16i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 // =0x8 +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1sh { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ld1h { z0.h }, 
p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -205,26 +260,37 @@ define void @smulh_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smulh_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x10, #24 // =0x18 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z1.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z2.s }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1sh { z4.s }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z5.s }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z6.s }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.s, p0/m, z2.s, z5.s +; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z4.s +; VBITS_GE_256-NEXT: mul z0.s, p0/m, z0.s, z6.s +; VBITS_GE_256-NEXT: mul z3.s, p0/m, z3.s, z7.s +; VBITS_GE_256-NEXT: lsr z2.s, z2.s, #16 +; VBITS_GE_256-NEXT: lsr z1.s, z1.s, #16 +; VBITS_GE_256-NEXT: lsr z0.s, z0.s, #16 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: lsr z3.s, z3.s, #16 +; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z2.h -; VBITS_GE_256-NEXT: smulh z1.h, p0/m, z1.h, z3.h -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: smulh_v32i16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <32 x i16>, ptr %a %op2 = load <32 x i16>, ptr %b %1 = sext <32 x i16> %op1 to <32 x i32> @@ -239,11 +305,22 @@ define void @smulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: smulh_v64i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1sh { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, 
p0/z, [x1] -; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a %op2 = load <64 x i16>, ptr %b @@ -259,11 +336,22 @@ define void @smulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: smulh_v128i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #64 // =0x40 +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1sh { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i16>, ptr %a %op2 = load <128 x i16>, ptr %b @@ -315,11 +403,22 @@ define void @smulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: smulh_v8i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1sw { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1sw { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -335,26 +434,37 @@ define void @smulh_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smulh_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x10, #12 // =0xc +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z1.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z2.d }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1sw { z4.d }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z5.d }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z6.d }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.d, p0/m, z2.d, z5.d +; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z4.d +; VBITS_GE_256-NEXT: mul z0.d, p0/m, z0.d, z6.d +; VBITS_GE_256-NEXT: mul z3.d, p0/m, z3.d, z7.d +; VBITS_GE_256-NEXT: lsr z2.d, z2.d, #32 +; VBITS_GE_256-NEXT: lsr z1.d, z1.d, #32 +; VBITS_GE_256-NEXT: lsr z0.d, z0.d, #32 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 +; VBITS_GE_256-NEXT: lsr z3.d, z3.d, #32 +; VBITS_GE_256-NEXT: 
splice z1.s, p0, z1.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p0, z2.s, z0.s ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z2.s -; VBITS_GE_256-NEXT: smulh z1.s, p0/m, z1.s, z3.s -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: smulh_v16i32: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <16 x i32>, ptr %a %op2 = load <16 x i32>, ptr %b %1 = sext <16 x i32> %op1 to <16 x i64> @@ -369,11 +479,22 @@ define void @smulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: smulh_v32i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 // =0x10 +; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1sw { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1sw { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl16 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i32>, ptr %a %op2 = load <32 x i32>, ptr %b @@ -389,11 +510,22 @@ define void @smulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: smulh_v64i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1sw { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1sw { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i32>, ptr %a %op2 = load <64 x i32>, ptr %b @@ -582,11 +714,22 @@ define void @umulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: umulh_v32i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 // =0x10 +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: 
ld1b { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -602,26 +745,37 @@ define void @umulh_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umulh_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 +; VBITS_GE_256-NEXT: mov w8, #16 // =0x10 +; VBITS_GE_256-NEXT: mov w9, #32 // =0x20 +; VBITS_GE_256-NEXT: mov w10, #48 // =0x30 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1b { z0.h }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.h }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.h }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z4.h }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z5.h }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1b { z6.h }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z7.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.h, p0/m, z2.h, z5.h +; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z4.h +; VBITS_GE_256-NEXT: mul z0.h, p0/m, z0.h, z6.h +; VBITS_GE_256-NEXT: mul z3.h, p0/m, z3.h, z7.h +; VBITS_GE_256-NEXT: lsr z2.h, z2.h, #8 +; VBITS_GE_256-NEXT: lsr z1.h, z1.h, #8 +; VBITS_GE_256-NEXT: lsr z0.h, z0.h, #8 +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: lsr z3.h, z3.h, #8 +; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z3.b +; VBITS_GE_256-NEXT: splice z2.b, p0, z2.b, z0.b ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] -; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z2.b -; VBITS_GE_256-NEXT: umulh z1.b, p0/m, z1.b, z3.b -; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: umulh_v64i8: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 -; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <64 x i8>, ptr %a %op2 = load <64 x i8>, ptr %b %1 = zext <64 x i8> %op1 to <64 x i16> @@ -636,11 +790,22 @@ define void @umulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: umulh_v128i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #64 // =0x40 +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: 
lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl64 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -658,11 +823,22 @@ define void @umulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: umulh_v256i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #128 // =0x80 +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl128 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl256 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <256 x i8>, ptr %a %op2 = load <256 x i8>, ptr %b @@ -715,11 +891,22 @@ define void @umulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: umulh_v16i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 // =0x8 +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -735,26 +922,37 @@ define void @umulh_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umulh_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x10, #24 // =0x18 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z4.s }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.s }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.s }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.s, p0/m, z2.s, z5.s +; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z4.s +; VBITS_GE_256-NEXT: mul z0.s, p0/m, z0.s, z6.s +; VBITS_GE_256-NEXT: mul z3.s, p0/m, z3.s, z7.s +; VBITS_GE_256-NEXT: lsr z2.s, z2.s, #16 +; VBITS_GE_256-NEXT: lsr z1.s, 
z1.s, #16 +; VBITS_GE_256-NEXT: lsr z0.s, z0.s, #16 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: lsr z3.s, z3.s, #16 +; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z2.h -; VBITS_GE_256-NEXT: umulh z1.h, p0/m, z1.h, z3.h -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: umulh_v32i16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <32 x i16>, ptr %a %op2 = load <32 x i16>, ptr %b %1 = zext <32 x i16> %op1 to <32 x i32> @@ -769,11 +967,22 @@ define void @umulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: umulh_v64i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a %op2 = load <64 x i16>, ptr %b @@ -789,11 +998,22 @@ define void @umulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: umulh_v128i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #64 // =0x40 +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i16>, ptr %a %op2 = load <128 x i16>, ptr %b @@ -845,11 +1065,22 @@ define void @umulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: 
umulh_v8i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -867,26 +1098,37 @@ define void @umulh_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umulh_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x10, #12 // =0xc +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.d }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.d }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.d }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.d, p0/m, z2.d, z5.d +; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z4.d +; VBITS_GE_256-NEXT: mul z0.d, p0/m, z0.d, z6.d +; VBITS_GE_256-NEXT: mul z3.d, p0/m, z3.d, z7.d +; VBITS_GE_256-NEXT: lsr z2.d, z2.d, #32 +; VBITS_GE_256-NEXT: lsr z1.d, z1.d, #32 +; VBITS_GE_256-NEXT: lsr z0.d, z0.d, #32 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 +; VBITS_GE_256-NEXT: lsr z3.d, z3.d, #32 +; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p0, z2.s, z0.s ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z2.s -; VBITS_GE_256-NEXT: umulh z1.s, p0/m, z1.s, z3.s -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: umulh_v16i32: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <16 x i32>, ptr %a %op2 = load <16 x i32>, ptr %b %1 = zext <16 x i32> %op1 to <16 x i64> @@ -901,11 +1143,22 @@ define void @umulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: umulh_v32i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 // 
=0x10 +; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl16 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i32>, ptr %a %op2 = load <32 x i32>, ptr %b @@ -921,11 +1174,22 @@ define void @umulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: umulh_v64i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i32>, ptr %a %op2 = load <64 x i32>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll @@ -48,7 +48,7 @@ define i8 @uaddv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: uaddv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -138,7 +138,7 @@ define i16 @uaddv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: uaddv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -228,7 +228,7 @@ define i32 @uaddv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: uaddv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -317,7 +317,7 @@ define i64 @uaddv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: uaddv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -406,7 +406,7 @@ define i8 @smaxv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: smaxv_v64i8: ; 
VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -491,7 +491,7 @@ define i16 @smaxv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: smaxv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -576,7 +576,7 @@ define i32 @smaxv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: smaxv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -663,7 +663,7 @@ define i64 @smaxv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: smaxv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -752,7 +752,7 @@ define i8 @sminv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: sminv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -837,7 +837,7 @@ define i16 @sminv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: sminv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -922,7 +922,7 @@ define i32 @sminv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: sminv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1009,7 +1009,7 @@ define i64 @sminv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: sminv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1098,7 +1098,7 @@ define i8 @umaxv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: umaxv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -1183,7 +1183,7 @@ define i16 @umaxv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: umaxv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1268,7 +1268,7 @@ define i32 @umaxv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: umaxv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, 
lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1355,7 +1355,7 @@ define i64 @umaxv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: umaxv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1444,7 +1444,7 @@ define i8 @uminv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: uminv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -1529,7 +1529,7 @@ define i16 @uminv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: uminv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1614,7 +1614,7 @@ define i32 @uminv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: uminv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1701,7 +1701,7 @@ define i64 @uminv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: uminv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -391,26 +391,26 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: sshll2 v4.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] ; VBITS_GE_128-NEXT: sshll2 v5.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: sshll v7.4s, v0.4h, #0 -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: sshll2 v4.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: sshll2 v17.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v6.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z17.s ; VBITS_GE_128-NEXT: sshll v16.4s, v2.4h, #0 -; VBITS_GE_128-NEXT: sshll2 v17.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s ; VBITS_GE_128-NEXT: sshll v6.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sdivr z7.s, p0/m, z7.s, z16.s ; VBITS_GE_128-NEXT: sshll v16.4s, v3.4h, #0 -; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z17.s ; VBITS_GE_128-NEXT: uzp1 v5.8h, v7.8h, v5.8h ; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z16.s ; VBITS_GE_128-NEXT: uzp1 v4.8h, v6.8h, v4.8h ; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v0.8h ; VBITS_GE_128-NEXT: mls v3.8h, v4.8h, v1.8h -; VBITS_GE_128-NEXT: stp q2, q3, [x0] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v16i16: @@ -583,25 +583,25 @@ define void @srem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, 
#32] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z16, z0 ; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z4.s +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] ; VBITS_GE_128-NEXT: mls v0.4s, v16.4s, v4.4s -; VBITS_GE_128-NEXT: movprfx z16, z1 -; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z5.s -; VBITS_GE_128-NEXT: ldp q7, q6, [x1] ; VBITS_GE_128-NEXT: movprfx z4, z3 ; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z6.s +; VBITS_GE_128-NEXT: movprfx z16, z1 +; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z5.s ; VBITS_GE_128-NEXT: mls v1.4s, v16.4s, v5.4s ; VBITS_GE_128-NEXT: movprfx z5, z2 ; VBITS_GE_128-NEXT: sdiv z5.s, p0/m, z5.s, z7.s -; VBITS_GE_128-NEXT: mls v2.4s, v5.4s, v7.4s ; VBITS_GE_128-NEXT: mls v3.4s, v4.4s, v6.4s -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] -; VBITS_GE_128-NEXT: stp q2, q3, [x0] +; VBITS_GE_128-NEXT: mls v2.4s, v5.4s, v7.4s +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v16i32: @@ -730,27 +730,27 @@ define void @srem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.d, vl2 -; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_128-NEXT: ldp q3, q2, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z16, z1 ; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z3.d ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z3.d ; VBITS_GE_128-NEXT: movprfx z3, z0 ; VBITS_GE_128-NEXT: sdiv z3.d, p0/m, z3.d, z2.d ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z3.d, z2.d -; VBITS_GE_128-NEXT: ldp q4, q5, [x0] -; VBITS_GE_128-NEXT: ldp q7, q6, [x1] +; VBITS_GE_128-NEXT: ldp q5, q4, [x0] +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] ; VBITS_GE_128-NEXT: movprfx z16, z5 ; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z6.d ; VBITS_GE_128-NEXT: movprfx z2, z4 ; VBITS_GE_128-NEXT: sdiv z2.d, p0/m, z2.d, z7.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: movprfx z0, z4 ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z2.d, z7.d ; VBITS_GE_128-NEXT: movprfx z1, z5 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z6.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v8i64: @@ -1209,26 +1209,26 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ushll2 v4.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] ; VBITS_GE_128-NEXT: ushll2 v5.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: ushll v7.4s, v0.4h, #0 -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ushll2 v4.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: ushll2 v17.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v6.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z17.s ; VBITS_GE_128-NEXT: ushll v16.4s, v2.4h, #0 -; VBITS_GE_128-NEXT: ushll2 v17.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s ; VBITS_GE_128-NEXT: ushll v6.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: udivr z7.s, p0/m, z7.s, z16.s ; VBITS_GE_128-NEXT: ushll v16.4s, v3.4h, #0 -; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z17.s ; VBITS_GE_128-NEXT: uzp1 v5.8h, v7.8h, v5.8h ; 
VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z16.s ; VBITS_GE_128-NEXT: uzp1 v4.8h, v6.8h, v4.8h ; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v0.8h ; VBITS_GE_128-NEXT: mls v3.8h, v4.8h, v1.8h -; VBITS_GE_128-NEXT: stp q2, q3, [x0] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v16i16: @@ -1401,25 +1401,25 @@ define void @urem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z16, z0 ; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z4.s +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] ; VBITS_GE_128-NEXT: mls v0.4s, v16.4s, v4.4s -; VBITS_GE_128-NEXT: movprfx z16, z1 -; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z5.s -; VBITS_GE_128-NEXT: ldp q7, q6, [x1] ; VBITS_GE_128-NEXT: movprfx z4, z3 ; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z6.s +; VBITS_GE_128-NEXT: movprfx z16, z1 +; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z5.s ; VBITS_GE_128-NEXT: mls v1.4s, v16.4s, v5.4s ; VBITS_GE_128-NEXT: movprfx z5, z2 ; VBITS_GE_128-NEXT: udiv z5.s, p0/m, z5.s, z7.s -; VBITS_GE_128-NEXT: mls v2.4s, v5.4s, v7.4s ; VBITS_GE_128-NEXT: mls v3.4s, v4.4s, v6.4s -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] -; VBITS_GE_128-NEXT: stp q2, q3, [x0] +; VBITS_GE_128-NEXT: mls v2.4s, v5.4s, v7.4s +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v16i32: @@ -1548,27 +1548,27 @@ define void @urem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.d, vl2 -; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_128-NEXT: ldp q3, q2, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z16, z1 ; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z3.d ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z3.d ; VBITS_GE_128-NEXT: movprfx z3, z0 ; VBITS_GE_128-NEXT: udiv z3.d, p0/m, z3.d, z2.d ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z3.d, z2.d -; VBITS_GE_128-NEXT: ldp q4, q5, [x0] -; VBITS_GE_128-NEXT: ldp q7, q6, [x1] +; VBITS_GE_128-NEXT: ldp q5, q4, [x0] +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] ; VBITS_GE_128-NEXT: movprfx z16, z5 ; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z6.d ; VBITS_GE_128-NEXT: movprfx z2, z4 ; VBITS_GE_128-NEXT: udiv z2.d, p0/m, z2.d, z7.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: movprfx z0, z4 ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z2.d, z7.d ; VBITS_GE_128-NEXT: movprfx z1, z5 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z6.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll @@ -54,7 +54,7 @@ define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue 
p0.b, vl32 ; VBITS_GE_256-NEXT: and w9, w2, #0x1 ; VBITS_GE_256-NEXT: ptrue p1.b @@ -178,7 +178,7 @@ define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: and w9, w2, #0x1 ; VBITS_GE_256-NEXT: ptrue p1.h @@ -302,7 +302,7 @@ define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: and w9, w2, #0x1 ; VBITS_GE_256-NEXT: ptrue p1.s @@ -406,9 +406,8 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: and x8, x2, #0x1 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p1.d @@ -427,10 +426,9 @@ define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 -; VBITS_GE_256-NEXT: // kill: def $w2 killed $w2 def $x2 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: and x9, x2, #0x1 +; VBITS_GE_256-NEXT: and w9, w2, #0x1 ; VBITS_GE_256-NEXT: ptrue p1.d ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -446,9 +444,8 @@ ; ; VBITS_GE_512-LABEL: select_v8i64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2 +; VBITS_GE_512-NEXT: and w8, w2, #0x1 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: and x8, x2, #0x1 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: ptrue p1.d @@ -467,9 +464,8 @@ define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: and x8, x2, #0x1 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p1.d @@ -488,9 +484,8 @@ define void @select_v32i64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: and x8, x2, #0x1 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll @@ -50,7 +50,7 @@ define void @ashr_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ashr_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -150,7 +150,7 @@ define void @ashr_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ashr_v32i16: ; VBITS_GE_256: // %bb.0: -; 
VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -250,7 +250,7 @@ define void @ashr_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ashr_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -350,7 +350,7 @@ define void @ashr_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ashr_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -454,7 +454,7 @@ define void @lshr_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: lshr_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -554,7 +554,7 @@ define void @lshr_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: lshr_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -654,7 +654,7 @@ define void @lshr_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: lshr_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -754,7 +754,7 @@ define void @lshr_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: lshr_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -856,7 +856,7 @@ define void @shl_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shl_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -954,7 +954,7 @@ define void @shl_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shl_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1052,7 +1052,7 @@ define void @shl_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shl_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1150,7 +1150,7 @@ define void @shl_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shl_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; 
VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -50,7 +50,7 @@ define void @ucvtf_v32i16_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -147,7 +147,7 @@ ; VBITS_GE_256-LABEL: ucvtf_v16i16_v16f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h @@ -252,7 +252,7 @@ ; VBITS_GE_256-LABEL: ucvtf_v8i16_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -352,7 +352,7 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -454,7 +454,7 @@ define void @ucvtf_v16i32_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -551,7 +551,7 @@ ; VBITS_GE_256-LABEL: ucvtf_v8i32_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s @@ -653,7 +653,7 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -757,7 +757,7 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -861,7 +861,7 @@ define void @ucvtf_v8i64_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -957,7 +957,7 @@ define void @scvtf_v32i16_v32f16(ptr %a, ptr %b) #0 { ; 
VBITS_GE_256-LABEL: scvtf_v32i16_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1054,7 +1054,7 @@ ; VBITS_GE_256-LABEL: scvtf_v16i16_v16f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h @@ -1165,7 +1165,7 @@ ; VBITS_GE_256-LABEL: scvtf_v8i16_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h @@ -1271,7 +1271,7 @@ define void @scvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: scvtf_v16i32_v16f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1373,7 +1373,7 @@ define void @scvtf_v16i32_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: scvtf_v16i32_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1470,7 +1470,7 @@ ; VBITS_GE_256-LABEL: scvtf_v8i32_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s @@ -1578,7 +1578,7 @@ define <8 x half> @scvtf_v8i64_v8f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1682,7 +1682,7 @@ define void @scvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1786,7 +1786,7 @@ define void @scvtf_v8i64_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll @@ -50,7 +50,7 @@ define void @select_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: select_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue 
p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -163,7 +163,7 @@ define void @select_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: select_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -276,7 +276,7 @@ define void @select_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: select_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -390,7 +390,7 @@ define void @select_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: select_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll @@ -52,7 +52,7 @@ define <16 x float> @load_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: load_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -87,9 +87,9 @@ define <32 x float> @load_v32f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: load_v32f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #16 -; VBITS_GE_256-NEXT: mov x10, #24 -; VBITS_GE_256-NEXT: mov x11, #8 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x10, #24 // =0x18 +; VBITS_GE_256-NEXT: mov x11, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] @@ -103,7 +103,7 @@ ; ; VBITS_GE_512-LABEL: load_v32f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov x9, #16 +; VBITS_GE_512-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -131,13 +131,13 @@ define <64 x float> @load_v64f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: load_v64f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 -; VBITS_GE_256-NEXT: mov x10, #48 -; VBITS_GE_256-NEXT: mov x11, #56 -; VBITS_GE_256-NEXT: mov x12, #32 -; VBITS_GE_256-NEXT: mov x13, #40 -; VBITS_GE_256-NEXT: mov x14, #16 -; VBITS_GE_256-NEXT: mov x15, #24 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x10, #48 // =0x30 +; VBITS_GE_256-NEXT: mov x11, #56 // =0x38 +; VBITS_GE_256-NEXT: mov x12, #32 // =0x20 +; VBITS_GE_256-NEXT: mov x13, #40 // =0x28 +; VBITS_GE_256-NEXT: mov x14, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x15, #24 // =0x18 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x10, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x11, lsl #2] @@ -159,9 +159,9 @@ ; ; VBITS_GE_512-LABEL: load_v64f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov x9, #32 -; VBITS_GE_512-NEXT: mov x10, #48 -; 
VBITS_GE_512-NEXT: mov x11, #16 +; VBITS_GE_512-NEXT: mov x9, #32 // =0x20 +; VBITS_GE_512-NEXT: mov x10, #48 // =0x30 +; VBITS_GE_512-NEXT: mov x11, #16 // =0x10 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] @@ -175,7 +175,7 @@ ; ; VBITS_GE_1024-LABEL: load_v64f32: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: mov x9, #32 +; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll @@ -42,7 +42,7 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -146,7 +146,7 @@ define void @masked_gather_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -244,7 +244,7 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -338,7 +338,7 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -60,7 +60,7 @@ ; VBITS_GE_256-LABEL: masked_gather_v8i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr d0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] @@ -214,7 +214,7 @@ ; VBITS_GE_256-LABEL: masked_gather_v8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] @@ -350,7 +350,7 @@ ; VBITS_GE_256-LABEL: masked_gather_v8i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: 
ld1d { z1.d }, p1/z, [x1, x8, lsl #3] @@ -497,7 +497,7 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -623,7 +623,7 @@ ; VBITS_GE_256-LABEL: masked_gather_v8f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: fcmeq v0.8h, v0.8h, #0.0 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] @@ -759,7 +759,7 @@ ; VBITS_GE_256-LABEL: masked_gather_v8f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] @@ -906,7 +906,7 @@ define void @masked_gather_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -980,13 +980,29 @@ ; CHECK-LABEL: masked_gather_32b_scaled_sext_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, sxtw #1] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x1] +; CHECK-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #32 +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: and p2.b, p2/z, p2.b, p1.b +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x2, z0.d, lsl #1] +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; CHECK-NEXT: ld1h { z1.d }, p1/z, [x2, z1.d, lsl #1] +; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1002,11 +1018,25 @@ ; CHECK-LABEL: masked_gather_32b_scaled_sext_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2] -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1sw { z2.d }, p1/z, [x1] +; CHECK-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64 +; CHECK-NEXT: and p2.b, p2/z, p2.b, 
p1.b +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: ld1w { z2.d }, p2/z, [x2, z2.d, lsl #2] +; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; CHECK-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2] +; CHECK-NEXT: ptrue p1.s, vl16 +; CHECK-NEXT: uzp1 z1.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: splice z1.s, p1, z1.s, z0.s +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1042,13 +1072,29 @@ ; CHECK-LABEL: masked_gather_32b_scaled_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, uxtw #1] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ld1w { z1.d }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1] +; CHECK-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #32 +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: and p2.b, p2/z, p2.b, p1.b +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x2, z0.d, lsl #1] +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; CHECK-NEXT: ld1h { z1.d }, p1/z, [x2, z1.d, lsl #1] +; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1064,13 +1110,29 @@ ; CHECK-LABEL: masked_gather_32b_unscaled_sext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, sxtw] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x1] +; CHECK-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #32 +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: and p2.b, p2/z, p2.b, p1.b +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x2, z0.d] +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; CHECK-NEXT: ld1h { z1.d }, p1/z, [x2, z1.d] +; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1087,13 +1149,29 @@ ; CHECK-LABEL: masked_gather_32b_unscaled_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: 
punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, uxtw] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ld1w { z1.d }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1] +; CHECK-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #32 +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: and p2.b, p2/z, p2.b, p1.b +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x2, z0.d] +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; CHECK-NEXT: ld1h { z1.d }, p1/z, [x2, z1.d] +; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -88,7 +88,7 @@ define <16 x float> @masked_load_v16f32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -155,7 +155,7 @@ define <64 x i8> @masked_load_v64i8(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w9, #32 +; VBITS_GE_256-NEXT: mov w9, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -188,7 +188,7 @@ define <32 x i16> @masked_load_v32i16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -221,7 +221,7 @@ define <16 x i32> @masked_load_v16i32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -254,7 +254,7 @@ define <8 x i64> @masked_load_v8i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -287,7 +287,7 @@ define <8 x i64> @masked_load_passthru_v8i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_passthru_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -323,7 +323,7 @@ define <8 x double> 
@masked_load_passthru_v8f64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_passthru_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -360,7 +360,7 @@ ; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -392,7 +392,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -426,7 +426,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr d0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -460,7 +460,7 @@ ; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -492,7 +492,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -525,7 +525,7 @@ ; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -556,7 +556,7 @@ ; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -588,7 +588,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -622,7 +622,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr d0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -656,7 +656,7 @@ ; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: 
mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -688,7 +688,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -721,7 +721,7 @@ ; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -751,7 +751,7 @@ define <32 x i16> @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] @@ -791,7 +791,7 @@ define <16 x i32> @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] @@ -834,7 +834,7 @@ define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -879,7 +879,7 @@ define <16 x i32> @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] @@ -921,7 +921,7 @@ define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -964,7 +964,7 @@ define <8 x i64> @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -1004,7 +1004,7 @@ define <32 x i16> @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1] ; 
VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] @@ -1044,7 +1044,7 @@ define <16 x i32> @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] @@ -1087,7 +1087,7 @@ define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -1132,7 +1132,7 @@ define <16 x i32> @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] @@ -1174,7 +1174,7 @@ define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -1217,7 +1217,7 @@ define <8 x i64> @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -1450,7 +1450,7 @@ ; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -1481,7 +1481,7 @@ ; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpgt p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -57,7 +57,7 @@ ; VBITS_GE_256-LABEL: masked_scatter_v8i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr d0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: cmeq v1.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: zip1 v5.8b, v0.8b, v0.8b @@ -203,7 +203,7 @@ ; VBITS_GE_256-LABEL: masked_scatter_v8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: cmeq 
v1.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] @@ -332,7 +332,7 @@ ; VBITS_GE_256-LABEL: masked_scatter_v8i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] @@ -467,7 +467,7 @@ define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_scatter_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -581,7 +581,7 @@ ; VBITS_GE_256-LABEL: masked_scatter_v8f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: fcmeq v1.8h, v0.8h, #0.0 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] @@ -710,7 +710,7 @@ ; VBITS_GE_256-LABEL: masked_scatter_v8f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] @@ -845,7 +845,7 @@ define void @masked_scatter_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_scatter_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -911,13 +911,15 @@ ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw #1] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: st1h { z0.d }, p0, [x2, z1.d, lsl #1] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -932,10 +934,13 @@ ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: st1w { z0.s }, p0, [x2, z1.s, sxtw #2] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2] ; CHECK-NEXT: ret %vals = load <32 x float>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -968,13 +973,15 @@ ; CHECK-LABEL: masked_scatter_32b_scaled_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: ld1w { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw #1] 
+; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: st1h { z0.d }, p0, [x2, z1.d, lsl #1] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -989,13 +996,15 @@ ; CHECK-LABEL: masked_scatter_32b_unscaled_sext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: st1h { z0.d }, p0, [x2, z1.d] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1011,13 +1020,15 @@ ; CHECK-LABEL: masked_scatter_32b_unscaled_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: ld1w { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: st1h { z0.d }, p0, [x2, z1.d] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -84,7 +84,7 @@ define void @masked_store_v16f32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_store_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -146,7 +146,7 @@ define void @masked_store_trunc_v8i64i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -187,7 +187,7 @@ define void @masked_store_trunc_v8i64i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -231,7 +231,7 @@ define void @masked_store_trunc_v8i64i32(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -272,7 +272,7 @@ define void @masked_store_trunc_v16i32i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: 
mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -316,7 +316,7 @@ define void @masked_store_trunc_v16i32i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -360,7 +360,7 @@ define void @masked_store_trunc_v32i16i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v32i16i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll @@ -165,7 +165,7 @@ define void @test_revhv32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: test_revhv32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ptrue p1.d ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] @@ -472,16 +472,16 @@ ; CHECK-NEXT: st1 { v1.h }[4], [x9] ; CHECK-NEXT: orr x9, x8, #0x4 ; CHECK-NEXT: st1 { v1.h }[5], [x10] -; CHECK-NEXT: mov w10, #26 +; CHECK-NEXT: mov w10, #26 // =0x1a ; CHECK-NEXT: orr x10, x8, x10 ; CHECK-NEXT: st1 { v0.h }[3], [x12] ; CHECK-NEXT: st1 { v1.h }[1], [x9] ; CHECK-NEXT: orr x9, x8, #0x2 ; CHECK-NEXT: st1 { v1.h }[7], [x11] -; CHECK-NEXT: mov w11, #20 -; CHECK-NEXT: mov w12, #18 +; CHECK-NEXT: mov w11, #20 // =0x14 +; CHECK-NEXT: mov w12, #18 // =0x12 ; CHECK-NEXT: st1 { v0.h }[6], [x10] -; CHECK-NEXT: mov w10, #10 +; CHECK-NEXT: mov w10, #10 // =0xa ; CHECK-NEXT: orr x11, x8, x11 ; CHECK-NEXT: st1 { v1.h }[2], [x9] ; CHECK-NEXT: orr x9, x8, x12 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll @@ -32,7 +32,7 @@ define void @zip_v32i16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: zip_v32i16: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10 ; VBITS_EQ_256-NEXT: ptrue p0.h ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -241,7 +241,7 @@ define void @trn_v32i16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: trn_v32i16: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10 ; VBITS_EQ_256-NEXT: ptrue p0.h ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -514,7 +514,7 @@ define void @uzp_v32i16(ptr %a, ptr %b) #1 { ; CHECK-LABEL: uzp_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll @@ -50,7 +50,7 @@ define void @bitreverse_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: bitreverse_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -142,7 +142,7 @@ define void @bitreverse_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: bitreverse_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -234,7 +234,7 @@ define void @bitreverse_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: bitreverse_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -326,7 +326,7 @@ define void @bitreverse_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: bitreverse_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -418,7 +418,7 @@ define void @bswap_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: bswap_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -506,7 +506,7 @@ define void @bswap_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: bswap_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -594,7 +594,7 @@ define void @bswap_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: bswap_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll @@ -47,7 +47,7 @@ define void @sdiv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: sdiv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -141,7 +141,7 @@ define void @sdiv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: sdiv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -236,7 +236,7 @@ define void @sdiv_v16i32(ptr %a) #0 { ; 
VBITS_GE_256-LABEL: sdiv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -331,7 +331,7 @@ define void @sdiv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: sdiv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -56,9 +56,9 @@ ; CHECK-NEXT: mov v1.b[5], w10 ; CHECK-NEXT: umov w10, v0.b[14] ; CHECK-NEXT: mov v2.b[5], w8 -; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: mov v1.b[6], w9 -; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x9, #24 // =0x18 ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: mov v2.b[6], w10 ; CHECK-NEXT: umov w10, v0.b[15] @@ -71,7 +71,7 @@ ; CHECK-NEXT: mov v2.b[7], w10 ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: mov x11, #8 +; CHECK-NEXT: mov x11, #8 // =0x8 ; CHECK-NEXT: lsl z0.s, z0.s, #31 ; CHECK-NEXT: lsl z3.s, z3.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -47,7 +47,7 @@ define void @splat_v64i8(i8 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: mov z0.b, w0 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] @@ -130,7 +130,7 @@ define void @splat_v32i16(i16 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: mov z0.h, w0 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] @@ -213,7 +213,7 @@ define void @splat_v16i32(i32 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: mov z0.s, w0 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] @@ -296,7 +296,7 @@ define void @splat_v8i64(i64 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: mov z0.d, x0 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] @@ -386,7 +386,7 @@ define void @splat_v32f16(half %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, h0 @@ -476,7 +476,7 @@ define void @splat_v16f32(float %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov 
x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, s0 @@ -564,7 +564,7 @@ define void @splat_v8f64(double %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov z0.d, d0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll @@ -52,7 +52,7 @@ define void @store_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: store_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] @@ -86,9 +86,9 @@ define void @store_v32f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: store_v32f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #24 -; VBITS_GE_256-NEXT: mov x9, #16 -; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x8, #24 // =0x18 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x10, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -99,7 +99,7 @@ ; ; VBITS_GE_512-LABEL: store_v32f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov x8, #16 +; VBITS_GE_512-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] @@ -126,17 +126,17 @@ define void @store_v64f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: store_v64f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #56 -; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x8, #56 // =0x38 +; VBITS_GE_256-NEXT: mov x9, #48 // =0x30 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0 -; VBITS_GE_256-NEXT: mov x10, #40 -; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x10, #40 // =0x28 +; VBITS_GE_256-NEXT: mov x11, #32 // =0x20 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: mov x8, #24 -; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x8, #24 // =0x18 +; VBITS_GE_256-NEXT: mov x12, #16 // =0x10 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -147,9 +147,9 @@ ; ; VBITS_GE_512-LABEL: store_v64f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov x8, #48 -; VBITS_GE_512-NEXT: mov x9, #32 -; VBITS_GE_512-NEXT: mov x10, #16 +; VBITS_GE_512-NEXT: mov x8, #48 // =0x30 +; VBITS_GE_512-NEXT: mov x9, #32 // =0x20 +; VBITS_GE_512-NEXT: mov x10, #16 // =0x10 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -160,7 +160,7 @@ ; ; VBITS_GE_1024-LABEL: store_v64f32: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: mov x8, #32 +; VBITS_GE_1024-NEXT: mov x8, #32 // =0x20 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 ; VBITS_GE_1024-NEXT: mov z0.s, 
#0 // =0x0 ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll @@ -46,7 +46,7 @@ define void @subvector_v32i16(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: subvector_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -101,7 +101,7 @@ define void @subvector_v16i32(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: subvector_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -157,7 +157,7 @@ define void @subvector_v8i64(ptr %in, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: subvector_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x8, #4 // =0x4 ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -234,7 +234,7 @@ define void @subvector_v32f16(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: subvector_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -289,7 +289,7 @@ define void @subvector_v16f32(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: subvector_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -343,7 +343,7 @@ define void @subvector_v8f64(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: subvector_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll @@ -34,7 +34,7 @@ define void @store_trunc_v8i64i8(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256-LABEL: store_trunc_v8i64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -88,7 +88,7 @@ ; Currently does not use the truncating store ; VBITS_GE_256-LABEL: store_trunc_v8i64i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -115,7 +115,7 @@ define void @store_trunc_v8i64i32(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256-LABEL: store_trunc_v8i64i32: ; VBITS_GE_256: // %bb.0: -; 
VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -143,7 +143,7 @@ ; Currently does not use the truncating store ; VBITS_GE_256-LABEL: store_trunc_v16i32i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -170,7 +170,7 @@ define void @store_trunc_v16i32i16(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256-LABEL: store_trunc_v16i32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -197,7 +197,7 @@ define void @store_trunc_v32i16i8(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256-LABEL: store_trunc_v32i16i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll @@ -26,7 +26,7 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: trunc_v32i16_v32i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -112,7 +112,7 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) #0 { ; VBITS_GE_256-LABEL: trunc_v16i32_v16i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -196,7 +196,7 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: trunc_v16i32_v16i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -283,7 +283,7 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) #0 { ; VBITS_GE_256-LABEL: trunc_v8i64_v8i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -366,7 +366,7 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) #0 { ; VBITS_GE_256-LABEL: trunc_v8i64_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -450,7 +450,7 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: trunc_v8i64_v8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; 
VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll @@ -50,7 +50,7 @@ define void @shuffle_ext_byone_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] @@ -93,7 +93,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v128i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: mov w8, #127 +; CHECK-NEXT: mov w8, #127 // =0x7f ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: whilels p1.b, xzr, x8 @@ -127,7 +127,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v256i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl256 -; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov w8, #255 // =0xff ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: whilels p1.b, xzr, x8 @@ -215,7 +215,7 @@ define void @shuffle_ext_byone_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] @@ -254,7 +254,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v64i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: whilels p1.h, xzr, x8 @@ -280,7 +280,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v128i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: mov w8, #127 +; CHECK-NEXT: mov w8, #127 // =0x7f ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: whilels p1.h, xzr, x8 @@ -351,7 +351,7 @@ define void @shuffle_ext_byone_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] @@ -388,7 +388,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v32i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: whilels p1.s, xzr, x8 @@ -410,7 +410,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v64i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: whilels p1.s, xzr, x8 @@ -463,7 +463,7 @@ define void @shuffle_ext_byone_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, 
vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] @@ -499,7 +499,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: mov w8, #15 +; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: whilels p1.d, xzr, x8 @@ -519,7 +519,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v32i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: whilels p1.d, xzr, x8 @@ -578,7 +578,7 @@ define void @shuffle_ext_byone_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] @@ -614,7 +614,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v64f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: whilels p1.h, xzr, x8 @@ -640,7 +640,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v128f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: mov w8, #127 +; CHECK-NEXT: mov w8, #127 // =0x7f ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: whilels p1.h, xzr, x8 @@ -710,7 +710,7 @@ define void @shuffle_ext_byone_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] @@ -744,7 +744,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v32f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: whilels p1.s, xzr, x8 @@ -766,7 +766,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v64f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: whilels p1.s, xzr, x8 @@ -818,7 +818,7 @@ define void @shuffle_ext_byone_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] @@ -851,7 +851,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: mov w8, #15 +; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: whilels p1.d, xzr, x8 @@ -871,7 +871,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v32f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, 
[x1] ; CHECK-NEXT: whilels p1.d, xzr, x8 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -opaque-pointers -aarch64-sve-vector-bits-min=256 -verify-machineinstrs | FileCheck %s --check-prefixes=SVE256 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -opaque-pointers -aarch64-sve-vector-bits-min=128 -verify-machineinstrs | FileCheck %s --check-prefixes=NEON @@ -6,24 +7,38 @@ define internal i32 @test(ptr nocapture readonly %p1, i32 %i1, ptr nocapture readonly %p2, i32 %i2) { ; SVE256-LABEL: test: -; SVE256: ld1b { z0.h }, p0/z, -; SVE256: ld1b { z1.h }, p0/z, -; SVE256: sub z0.h, z0.h, z1.h -; SVE256-NEXT: sunpklo z1.s, z0.h -; SVE256-NEXT: ext z0.b, z0.b, z0.b, #16 -; SVE256-NEXT: sunpklo z0.s, z0.h -; SVE256-NEXT: add z0.s, z1.s, z0.s -; SVE256-NEXT: uaddv d0, p1, z0.s +; SVE256: // %bb.0: // %L.entry +; SVE256-NEXT: ptrue p0.s, vl8 +; SVE256-NEXT: mov w9, wzr +; SVE256-NEXT: mov w10, wzr +; SVE256-NEXT: mov w8, wzr +; SVE256-NEXT: mov w11, #-16 // =0xfffffff0 +; SVE256-NEXT: mov w12, #8 // =0x8 +; SVE256-NEXT: .p2align 5, , 16 +; SVE256-NEXT: .LBB0_1: // %L1 +; SVE256-NEXT: // =>This Inner Loop Header: Depth=1 +; SVE256-NEXT: sxtw x13, w9 +; SVE256-NEXT: sxtw x15, w10 +; SVE256-NEXT: adds w11, w11, #1 +; SVE256-NEXT: add w10, w10, w3 +; SVE256-NEXT: ld1b { z1.s }, p0/z, [x0, x13] +; SVE256-NEXT: add x14, x0, x13 +; SVE256-NEXT: add x16, x2, x15 +; SVE256-NEXT: ld1b { z3.s }, p0/z, [x2, x15] +; SVE256-NEXT: add w9, w9, w1 +; SVE256-NEXT: ld1b { z0.s }, p0/z, [x14, x12] +; SVE256-NEXT: ld1b { z2.s }, p0/z, [x16, x12] +; SVE256-NEXT: sub z1.s, z1.s, z3.s +; SVE256-NEXT: sub z0.s, z0.s, z2.s +; SVE256-NEXT: add z0.s, z1.s, z0.s +; SVE256-NEXT: uaddv d0, p0, z0.s +; SVE256-NEXT: fmov x13, d0 +; SVE256-NEXT: add w8, w13, w8 +; SVE256-NEXT: b.lo .LBB0_1 +; SVE256-NEXT: // %bb.2: // %L2 +; SVE256-NEXT: mov w0, w8 +; SVE256-NEXT: ret -; NEON-LABEL: test: -; NEON: ldr q0, [x0, w9, sxtw] -; NEON: ldr q1, [x2, w10, sxtw] -; NEON: usubl2 v2.8h, v0.16b, v1.16b -; NEON-NEXT: usubl v0.8h, v0.8b, v1.8b -; NEON: saddl2 v1.4s, v0.8h, v2.8h -; NEON-NEXT: saddl v0.4s, v0.4h, v2.4h -; NEON-NEXT: add v0.4s, v0.4s, v1.4s -; NEON-NEXT: addv s0, v0.4s L.entry: br label %L1 @@ -55,3 +70,5 @@ } declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line:
+; NEON: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
--- a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
+++ b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
@@ -33,9 +33,8 @@
; CHECK-LABEL: sti32ldi32ext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
-; CHECK-NEXT: st1w { z1.d }, p0, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x0]
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
entry:
%0 = trunc <vscale x 2 x i64> %v to <vscale x 2 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
--- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll
@@ -6,7 +6,7 @@
define void @scatter_i8_index_offset_maximum(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i8> %data) #0 {
; CHECK-LABEL: scatter_i8_index_offset_maximum:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #33554431
+; CHECK-NEXT: mov w8, #33554431 // =0x1ffffff
; CHECK-NEXT: add x9, x0, x1
; CHECK-NEXT: index z1.s, #0, w8
; CHECK-NEXT: st1b { z0.s }, p0, [x9, z1.s, sxtw]
@@ -27,7 +27,7 @@
define void @scatter_i16_index_offset_minimum(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x i16> %data) #0 {
; CHECK-LABEL: scatter_i16_index_offset_minimum:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-33554432
+; CHECK-NEXT: mov w8, #-33554432 // =0xfe000000
; CHECK-NEXT: add x9, x0, x1, lsl #1
; CHECK-NEXT: index z1.s, #0, w8
; CHECK-NEXT: st1h { z0.s }, p0, [x9, z1.s, sxtw #1]
@@ -102,10 +102,10 @@
; CHECK-LABEL: scatter_i8_index_offset_maximum_plus_one:
; CHECK: // %bb.0:
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: mov w9, #67108864
+; CHECK-NEXT: mov w9, #67108864 // =0x4000000
; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: add x11, x0, x1
-; CHECK-NEXT: mov w10, #33554432
+; CHECK-NEXT: mov w10, #33554432 // =0x2000000
; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: madd x8, x8, x9, x11
; CHECK-NEXT: uunpklo z2.d, z0.s
@@ -132,11 +132,11 @@
; CHECK-LABEL: scatter_i8_index_offset_minimum_minus_one:
; CHECK: // %bb.0:
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: mov x9, #-2
+; CHECK-NEXT: mov x9, #-2 // =0xfffffffffffffffe
; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: movk x9, #64511, lsl #16
; CHECK-NEXT: add x11, x0, x1
-; CHECK-NEXT: mov x10, #-33554433
+; CHECK-NEXT: mov x10, #-33554433 // =0xfffffffffdffffff
; CHECK-NEXT: madd x8, x8, x9, x11
; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: uunpklo z2.d, z0.s
@@ -163,10 +163,10 @@
; CHECK-LABEL: scatter_i8_index_stride_too_big:
; CHECK: // %bb.0:
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: mov x9, #-9223372036854775808
+; CHECK-NEXT: mov x9, #-9223372036854775808 // =0x8000000000000000
; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: add x11, x0, x1
-; CHECK-NEXT: mov x10, #4611686018427387904
+; CHECK-NEXT: mov x10, #4611686018427387904 // =0x4000000000000000
; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: madd x8, x8, x9, x11
; CHECK-NEXT: uunpklo z2.d, z0.s
@@ -214,7 +214,7 @@
define <vscale x 4 x float> @gather_f32_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg) #0 {
; CHECK-LABEL: gather_f32_index_offset_8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: mov w8, #32 // =0x20
; CHECK-NEXT: add x9, x0, x1, lsl #5
; CHECK-NEXT: index z0.s, #0, w8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9, z0.s, sxtw]
@@ -255,7 +255,7 @@
define void @scatter_f16_index_offset_8(ptr %base, i64 %offset, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
; CHECK-LABEL: scatter_f16_index_offset_8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: mov w8, #16 // =0x10
; CHECK-NEXT: add x9, x0, x1, lsl #4
; CHECK-NEXT: index z1.s, #0, w8
; CHECK-NEXT: st1h { z0.s }, p0, [x9, z1.s, sxtw]
@@ -274,7 +274,7 @@
define void @scatter_f16_index_add_add(ptr %base, i64 %offset, i64 %offset2, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
; CHECK-LABEL: scatter_f16_index_add_add:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: mov w8, #16 // =0x10
; CHECK-NEXT: add x9, x0, x2, lsl #4
; CHECK-NEXT: add x9, x9, x1, lsl #4
; CHECK-NEXT: index z1.s, #0, w8
@@ -297,7 +297,7 @@
define void @scatter_f16_index_add_add_mul(ptr %base, i64 %offset, i64 %offset2, <vscale x 4 x i1> %pg, <vscale x 4 x half> %data) #0 {
; CHECK-LABEL: scatter_f16_index_add_add_mul:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #128
+; CHECK-NEXT: mov w8, #128 // =0x80
; CHECK-NEXT: add x9, x0, x2, lsl #7
; CHECK-NEXT: add x9, x9, x1, lsl #7
; CHECK-NEXT: index z1.s, #0, w8
@@ -322,7 +322,7 @@
define <vscale x 2 x i64> @masked_gather_nxv2i64_const_with_vec_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg) #0 {
; CHECK-LABEL: masked_gather_nxv2i64_const_with_vec_offsets:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8
+; CHECK-NEXT: mov w8, #8 // =0x8
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3]
; CHECK-NEXT: ret
%ptrs = getelementptr i64, ptr inttoptr (i64 8 to ptr), <vscale x 2 x i64> %vector_offsets
@@ -347,7 +347,7 @@
define <vscale x 2 x i64> @masked_gather_nxv2i64_null_with__vec_plus_imm_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg) #0 {
; CHECK-LABEL: masked_gather_nxv2i64_null_with__vec_plus_imm_offsets:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8
+; CHECK-NEXT: mov w8, #8 // =0x8
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3]
; CHECK-NEXT: ret
%scalar_offset.ins = insertelement <vscale x 2 x i64> undef, i64 1, i64 0
@@ -375,7 +375,7 @@
; CHECK-LABEL: masked_gather_nxv4i32_u8_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: and z0.s, z0.s, #0xff
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i8> %offsets to <vscale x 4 x i32>
%ptrs = getelementptr i32, ptr %base, <vscale x 4 x i32> %offsets.zext
@@ -400,7 +400,7 @@
define void @masked_scatter_nxv2i64_const_with_vec_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %data) #0 {
; CHECK-LABEL: masked_scatter_nxv2i64_const_with_vec_offsets:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8
+; CHECK-NEXT: mov w8, #8 // =0x8
; CHECK-NEXT: st1d { z1.d }, p0, [x8, z0.d, lsl #3]
; CHECK-NEXT: ret
%ptrs = getelementptr i64, ptr inttoptr (i64 8 to ptr), <vscale x 2 x i64> %vector_offsets
@@ -425,7 +425,7 @@
define void @masked_scatter_nxv2i64_null_with__vec_plus_imm_offsets(<vscale x 2 x i64> %vector_offsets, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %data) #0 {
; CHECK-LABEL: masked_scatter_nxv2i64_null_with__vec_plus_imm_offsets:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8
+; CHECK-NEXT: mov w8, #8 // =0x8
; CHECK-NEXT: st1d { z1.d }, p0, [x8, z0.d, lsl #3]
; CHECK-NEXT: ret
%scalar_offset.ins = insertelement <vscale x 2 x i64> undef, i64 1, i64 0
@@ -453,7 +453,7 @@
; CHECK-LABEL: masked_scatter_nxv4i32_u8_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: and z0.s, z0.s, #0xff
-; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i8> %offsets to <vscale x 4 x i32>
%ptrs = getelementptr i32, ptr %base, <vscale x 4 x i32> %offsets.zext
diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
--- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll
@@ -77,18 +77,30 @@
; CHECK-LABEL: narrow_i64_gather_index_i8_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: add x8, x1, x2
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1, x2]
-; CHECK-NEXT: ld1b { z1.s }, p0/z, [x8, #1, mul vl]
-; CHECK-NEXT: ld1b { z2.s }, p0/z, [x8, #2, mul vl]
-; CHECK-NEXT: ld1b { z3.s }, p0/z, [x8, #3, mul vl]
-; CHECK-NEXT: ld1b { z3.s }, p0/z, [x1, z3.s, uxtw]
-; CHECK-NEXT: ld1b { z2.s }, p0/z, [x1, z2.s, uxtw]
-; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1, z0.s, uxtw]
-; CHECK-NEXT: ld1b { z1.s }, p0/z, [x1, z1.s, uxtw]
-; CHECK-NEXT: uzp1 z2.h, z2.h, z3.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1, x2]
+; CHECK-NEXT: ld1b { z1.d }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: ld1b { z2.d }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT: ld1b { z3.d }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT: ld1b { z4.d }, p0/z, [x8, #4, mul vl]
+; CHECK-NEXT: ld1b { z5.d }, p0/z, [x8, #5, mul vl]
+; CHECK-NEXT: ld1b { z6.d }, p0/z, [x8, #6, mul vl]
+; CHECK-NEXT: ld1b { z7.d }, p0/z, [x8, #7, mul vl]
+; CHECK-NEXT: ld1b { z7.d }, p0/z, [x1, z7.d]
+; CHECK-NEXT: ld1b { z6.d }, p0/z, [x1, z6.d]
+; CHECK-NEXT: ld1b { z5.d }, p0/z, [x1, z5.d]
+; CHECK-NEXT: ld1b { z4.d }, p0/z, [x1, z4.d]
+; CHECK-NEXT: ld1b { z3.d }, p0/z, [x1, z3.d]
+; CHECK-NEXT: ld1b { z2.d }, p0/z, [x1, z2.d]
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1, z0.d]
+; CHECK-NEXT: ld1b { z1.d }, p0/z, [x1, z1.d]
+; CHECK-NEXT: uzp1 z6.s, z6.s, z7.s
+; CHECK-NEXT: uzp1 z4.s, z4.s, z5.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z4.h, z6.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: ret
%1 = getelementptr inbounds i8, i8* %in, i64 %ptr
%2 = bitcast i8* %1 to <vscale x 16 x i8>*
@@ -103,18 +115,30 @@
; CHECK-LABEL: narrow_i64_gather_index_i8_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: add x8, x1, x2
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x1, x2]
-; CHECK-NEXT: ld1sb { z1.s }, p0/z, [x8, #1, mul vl]
-; CHECK-NEXT: ld1sb { z2.s }, p0/z, [x8, #2, mul vl]
-; CHECK-NEXT: ld1sb { z3.s }, p0/z, [x8, #3, mul vl]
-; CHECK-NEXT: ld1b { z3.s }, p0/z, [x1, z3.s, sxtw]
-; CHECK-NEXT: ld1b { z2.s }, p0/z, [x1, z2.s, sxtw]
-; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1, z0.s, sxtw]
-; CHECK-NEXT: ld1b { z1.s }, p0/z, [x1, z1.s, sxtw]
-; CHECK-NEXT: uzp1 z2.h, z2.h, z3.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x1, x2]
+; CHECK-NEXT: ld1sb { z1.d }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: ld1sb { z2.d }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT: ld1sb { z3.d }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT: ld1sb { z4.d }, p0/z, [x8, #4, mul vl]
+; CHECK-NEXT: ld1sb { z5.d }, p0/z, [x8, #5, mul vl]
+; CHECK-NEXT: ld1sb { z6.d }, p0/z, [x8, #6, mul vl]
+; CHECK-NEXT: ld1sb { z7.d }, p0/z, [x8, #7, mul vl]
+; CHECK-NEXT: ld1b { z7.d }, p0/z, [x1, z7.d]
+; CHECK-NEXT: ld1b { z6.d }, p0/z, [x1, z6.d]
+; CHECK-NEXT: ld1b { z5.d }, p0/z, [x1, z5.d]
+; CHECK-NEXT: ld1b { z4.d }, p0/z, [x1, z4.d]
+; CHECK-NEXT: ld1b { z3.d }, p0/z, [x1, z3.d]
+; CHECK-NEXT: ld1b { z2.d }, p0/z, [x1, z2.d]
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1, z0.d]
+; CHECK-NEXT: ld1b { z1.d }, p0/z, [x1, z1.d]
+; CHECK-NEXT: uzp1 z6.s, z6.s, z7.s
+; CHECK-NEXT: uzp1 z4.s, z4.s, z5.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z4.h, z6.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT: ret
%1 = getelementptr inbounds i8, i8* %in, i64 %ptr
%2 = bitcast i8* %1 to <vscale x 16 x i8>*
@@ -129,12 +153,18 @@
; CHECK-LABEL: narrow_i64_gather_index_i16_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: add x8, x1, x2, lsl #1
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1, x2, lsl #1]
-; CHECK-NEXT: ld1h { z1.s }, p0/z, [x8, #1, mul vl]
-; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1, z0.s, uxtw #1]
-; CHECK-NEXT: ld1h { z1.s }, p0/z, [x1, z1.s, uxtw #1]
-; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1, x2, lsl #1]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: ld1h { z2.d }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x1, z3.d, lsl #1]
+; CHECK-NEXT: ld1h { z2.d }, p0/z, [x1, z2.d, lsl #1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1, z0.d, lsl #1]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x1, z1.d, lsl #1]
+; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%1 = getelementptr inbounds i16, i16* %in, i64 %ptr
%2 = bitcast i16* %1 to <vscale x 8 x i16>*
@@ -149,12 +179,18 @@
; CHECK-LABEL: narrow_i64_gather_index_i16_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: add x8, x1, x2, lsl #1
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x1, x2, lsl #1]
-; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x8, #1, mul vl]
-; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1, z0.s, sxtw #1]
-; CHECK-NEXT: ld1h { z1.s }, p0/z, [x1, z1.s, sxtw #1]
-; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x1, x2, lsl #1]
+; CHECK-NEXT: ld1sh { z1.d }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: ld1sh { z2.d }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT: ld1sh { z3.d }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x1, z3.d, lsl #1]
+; CHECK-NEXT: ld1h { z2.d }, p0/z, [x1, z2.d, lsl #1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1, z0.d, lsl #1]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x1, z1.d, lsl #1]
+; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%1 = getelementptr inbounds i16, i16* %in, i64 %ptr
%2 = bitcast i16* %1 to <vscale x 8 x i16>*
@@ -168,9 +204,13 @@
define <vscale x 4 x i32> @no_narrow_i64_gather_index_i32(i32* %out, i32* %in, <vscale x 4 x i32> %d, i64 %ptr){
; CHECK-LABEL: no_narrow_i64_gather_index_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1, x2, lsl #2]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1, z0.s, uxtw #2]
+; CHECK-NEXT: add x8, x1, x2, lsl #2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1, x2, lsl #2]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1, z0.d, lsl #2]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x1, z1.d, lsl #2]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: ret
%1 = getelementptr inbounds i32, i32* %in, i64 %ptr
%2 = bitcast i32* %1 to <vscale x 4 x i32>*
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
@@ -44,8 +44,8 @@
define <vscale x 2 x i64> @index_ii_range() {
; CHECK-LABEL: index_ii_range:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16
-; CHECK-NEXT: mov x9, #-17
+; CHECK-NEXT: mov w8, #16 // =0x10
+; CHECK-NEXT: mov x9, #-17 // =0xffffffffffffffef
; CHECK-NEXT: index z0.d, x9, x8
; CHECK-NEXT: ret
%out = call <vscale x 2 x i64> @llvm.aarch64.sve.index.nxv2i64(i64 -17, i64 16)
@@ -55,8 +55,7 @@
define <vscale x 8 x i16> @index_ii_range_combine(i16 %a) {
; CHECK-LABEL: index_ii_range_combine:
; CHECK: // %bb.0:
-; CHECK-NEXT: index z0.h, #0, #8
-; CHECK-NEXT: orr z0.h, z0.h, #0x2
+; CHECK-NEXT: index z0.h, #2, #8
; CHECK-NEXT: ret
%val = insertelement <vscale x 8 x i16> poison, i16 2, i32 0
%val1 = shufflevector <vscale x 8 x i16> %val, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
@@ -109,7 +108,7 @@
define <vscale x 4 x i32> @index_ir_range(i32 %a) {
; CHECK-LABEL: index_ir_range:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-17
+; CHECK-NEXT: mov w8, #-17 // =0xffffffef
; CHECK-NEXT: index z0.s, w8, w0
; CHECK-NEXT: ret
%out = call <vscale x 4 x i32> @llvm.aarch64.sve.index.nxv4i32(i32 -17, i32 %a)
@@ -174,7 +173,7 @@
define <vscale x 8 x i16> @index_ri_range(i16 %a) {
; CHECK-LABEL: index_ri_range:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: mov w8, #16 // =0x10
; CHECK-NEXT: index z0.h, w0, w8
; CHECK-NEXT: ret
%out = call <vscale x 8 x i16> @llvm.aarch64.sve.index.nxv8i16(i16 %a, i16 16)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll
--- a/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll
@@ -100,7 +100,7 @@
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s
; CHECK-NEXT: ret
%wide.load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %in, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
%zext = zext <vscale x 2 x i16> %wide.load to <vscale x 2 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -48,11 +48,11 @@
define void @ctlz_v32i8(ptr %a) #0 {
; CHECK-LABEL: ctlz_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: clz z0.b, p0/m, z0.b
; CHECK-NEXT: clz z1.b, p0/m, z1.b
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: clz z0.b, p0/m, z0.b
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op = load <32 x i8>, ptr %a
%res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op)
@@ -101,11 +101,11 @@
define void @ctlz_v16i16(ptr %a) #0 {
; CHECK-LABEL: ctlz_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: clz z0.h, p0/m, z0.h
; CHECK-NEXT: clz z1.h, p0/m, z1.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: clz z0.h, p0/m, z0.h
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op = load <16 x i16>, ptr %a
%res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op)
@@ -140,11 +140,11 @@
define void @ctlz_v8i32(ptr %a) #0 {
; CHECK-LABEL: ctlz_v8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: clz z0.s, p0/m, z0.s
; CHECK-NEXT: clz z1.s, p0/m, z1.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: clz z0.s, p0/m, z0.s
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op = load <8 x i32>, ptr %a
%res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op)
@@ -179,11 +179,11 @@
define void @ctlz_v4i64(ptr %a) #0 {
; CHECK-LABEL: ctlz_v4i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: clz z0.d, p0/m, z0.d
; CHECK-NEXT: clz z1.d, p0/m, z1.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: clz z0.d, p0/m, z0.d
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op = load <4 x i64>, ptr %a
%res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op)
@@ -235,11 +235,11 @@
define void @ctpop_v32i8(ptr %a) #0 {
; CHECK-LABEL: ctpop_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: cnt z0.b, p0/m, z0.b ; CHECK-NEXT: cnt z1.b, p0/m, z1.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: cnt z0.b, p0/m, z0.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op) @@ -287,11 +287,11 @@ define void @ctpop_v16i16(ptr %a) #0 { ; CHECK-LABEL: ctpop_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: cnt z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: cnt z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op) @@ -326,11 +326,11 @@ define void @ctpop_v8i32(ptr %a) #0 { ; CHECK-LABEL: ctpop_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: cnt z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: cnt z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op) @@ -365,11 +365,11 @@ define void @ctpop_v4i64(ptr %a) #0 { ; CHECK-LABEL: ctpop_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: cnt z0.d, p0/m, z0.d ; CHECK-NEXT: cnt z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: cnt z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op) @@ -424,13 +424,13 @@ define void @cttz_v32i8(ptr %a) #0 { ; CHECK-LABEL: cttz_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: rbit z0.b, p0/m, z0.b -; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: rbit z1.b, p0/m, z1.b ; CHECK-NEXT: clz z1.b, p0/m, z1.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.b, p0/m, z0.b +; CHECK-NEXT: clz z0.b, p0/m, z0.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op) @@ -481,13 +481,13 @@ define void @cttz_v16i16(ptr %a) #0 { ; CHECK-LABEL: cttz_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: rbit z0.h, p0/m, z0.h -; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: rbit z1.h, p0/m, z1.h ; CHECK-NEXT: clz z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.h, p0/m, z0.h +; CHECK-NEXT: clz z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op) @@ -524,13 +524,13 @@ define void @cttz_v8i32(ptr %a) #0 { ; CHECK-LABEL: cttz_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: rbit z0.s, p0/m, z0.s -; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: rbit z1.s, p0/m, z1.s ; CHECK-NEXT: clz z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.s, p0/m, z0.s +; CHECK-NEXT: clz z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 
x i32> @llvm.cttz.v8i32(<8 x i32> %op) @@ -567,13 +567,13 @@ define void @cttz_v4i64(ptr %a) #0 { ; CHECK-LABEL: cttz_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: rbit z0.d, p0/m, z0.d -; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: rbit z1.d, p0/m, z1.d ; CHECK-NEXT: clz z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.d, p0/m, z0.d +; CHECK-NEXT: clz z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll @@ -77,10 +77,10 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -156,10 +156,10 @@ define void @concat_v32i16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -219,10 +219,10 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -267,10 +267,10 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -334,10 +334,10 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v32f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -397,10 +397,10 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp 
q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -445,10 +445,10 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -44,15 +44,15 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) #0 { ; CHECK-LABEL: test_copysign_v16f16_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: and z0.h, z0.h, #0x8000 -; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: and z2.h, z2.h, #0x7fff -; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: and z0.h, z0.h, #0x8000 ; CHECK-NEXT: and z3.h, z3.h, #0x7fff ; CHECK-NEXT: orr z1.d, z3.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z2.h, z2.h, #0x7fff +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %a = load <16 x half>, ptr %ap %b = load <16 x half>, ptr %bp @@ -100,15 +100,15 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) #0 { ; CHECK-LABEL: test_copysign_v8f32_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: and z0.s, z0.s, #0x80000000 -; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: and z2.s, z2.s, #0x7fffffff -; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: and z0.s, z0.s, #0x80000000 ; CHECK-NEXT: and z3.s, z3.s, #0x7fffffff ; CHECK-NEXT: orr z1.d, z3.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z2.s, z2.s, #0x7fffffff +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %a = load <8 x float>, ptr %ap %b = load <8 x float>, ptr %bp @@ -139,15 +139,15 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) #0 { ; CHECK-LABEL: test_copysign_v4f64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: and z0.d, z0.d, #0x8000000000000000 -; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 -; CHECK-NEXT: and z2.d, z2.d, #0x7fffffffffffffff -; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: and z0.d, z0.d, #0x8000000000000000 ; CHECK-NEXT: and z3.d, z3.d, #0x7fffffffffffffff ; CHECK-NEXT: orr z1.d, z3.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z2.d, z2.d, #0x7fffffffffffffff +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x double>, ptr %bp @@ -237,17 +237,17 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #2 // =0x2 ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ld1w { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: 
ldp q0, q2, [x0] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x1, x8, lsl #2] ; CHECK-NEXT: ld1w { z3.d }, p0/z, [x1] ; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; CHECK-NEXT: fcvt z1.d, p0/m, z1.s ; CHECK-NEXT: fcvt z3.d, p0/m, z3.s -; CHECK-NEXT: fcvt z2.d, p0/m, z2.s -; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff +; CHECK-NEXT: and z2.d, z2.d, #0x7fffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 ; CHECK-NEXT: and z3.d, z3.d, #0x8000000000000000 -; CHECK-NEXT: and z2.d, z2.d, #0x8000000000000000 +; CHECK-NEXT: orr z1.d, z2.d, z1.d ; CHECK-NEXT: orr z0.d, z0.d, z3.d -; CHECK-NEXT: orr z1.d, z1.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %a = load <4 x double>, ptr %ap diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll @@ -49,12 +49,12 @@ define void @fadd_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fadd_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -92,12 +92,12 @@ define void @fadd_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fadd_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -122,12 +122,12 @@ define void @fadd_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fadd_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -182,12 +182,12 @@ define void @fdiv_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fdiv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fdiv z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -225,12 +225,12 @@ define void @fdiv_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fdiv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fdiv z1.s, p0/m, z1.s, 
z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -255,12 +255,12 @@ define void @fdiv_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fdiv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fdiv z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -318,14 +318,14 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fma_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ldp q5, q4, [x2] ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z4.h ; CHECK-NEXT: movprfx z1, z5 ; CHECK-NEXT: fmla z1.h, p0/m, z2.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -366,14 +366,14 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fma_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ldp q5, q4, [x2] ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z4.s ; CHECK-NEXT: movprfx z1, z5 ; CHECK-NEXT: fmla z1.s, p0/m, z2.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -400,14 +400,14 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fma_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ldp q5, q4, [x2] ; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z4.d ; CHECK-NEXT: movprfx z1, z5 ; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -463,12 +463,12 @@ define void @fmul_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmul_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -506,12 +506,12 @@ define void @fmul_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmul_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] 
; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -536,12 +536,12 @@ define void @fmul_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmul_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmul z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -593,11 +593,11 @@ define void @fneg_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fneg_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: fneg z0.h, p0/m, z0.h ; CHECK-NEXT: fneg z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fneg z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = fneg <16 x half> %op @@ -632,11 +632,11 @@ define void @fneg_v8f32(ptr %a) #0 { ; CHECK-LABEL: fneg_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: fneg z0.s, p0/m, z0.s ; CHECK-NEXT: fneg z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fneg z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = fneg <8 x float> %op @@ -659,11 +659,11 @@ define void @fneg_v4f64(ptr %a) #0 { ; CHECK-LABEL: fneg_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fneg z0.d, p0/m, z0.d ; CHECK-NEXT: fneg z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fneg z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = fneg <4 x double> %op @@ -714,11 +714,11 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fsqrt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h ; CHECK-NEXT: fsqrt z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op) @@ -753,11 +753,11 @@ define void @fsqrt_v8f32(ptr %a) #0 { ; CHECK-LABEL: fsqrt_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s ; CHECK-NEXT: fsqrt z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op) @@ -780,11 +780,11 @@ define void @fsqrt_v4f64(ptr %a) #0 { ; CHECK-LABEL: fsqrt_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d ; CHECK-NEXT: fsqrt z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op) @@ -838,12 +838,12 @@ define void @fsub_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: 
fsub_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fsub z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -881,12 +881,12 @@ define void @fsub_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fsub_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -911,12 +911,12 @@ define void @fsub_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fsub_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fsub z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -968,11 +968,11 @@ define void @fabs_v16f16(ptr %a) #0 { ; CHECK-LABEL: fabs_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: fabs z0.h, p0/m, z0.h ; CHECK-NEXT: fabs z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fabs z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op) @@ -1007,11 +1007,11 @@ define void @fabs_v8f32(ptr %a) #0 { ; CHECK-LABEL: fabs_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: fabs z0.s, p0/m, z0.s ; CHECK-NEXT: fabs z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fabs z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op) @@ -1034,11 +1034,11 @@ define void @fabs_v4f64(ptr %a) #0 { ; CHECK-LABEL: fabs_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fabs z0.d, p0/m, z0.d ; CHECK-NEXT: fabs z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fabs z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll @@ -56,14 +56,14 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_oeq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; 
CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -106,14 +106,14 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_oeq_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s -; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -156,14 +156,14 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_oeq_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d -; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -180,18 +180,18 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ueq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p2.h, p0/z, z1.h, z0.h ; CHECK-NEXT: mov p1.b, p2/m, p2.b ; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z3.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: sel p0.b, p0, p0.b, p2.b ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: sel p0.b, p0, p0.b, p2.b ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -208,18 +208,18 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_one_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmgt p2.h, p0/z, z1.h, z0.h ; CHECK-NEXT: mov p1.b, p2/m, p2.b ; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z2.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: sel p0.b, p0, p0.b, p2.b ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: sel p0.b, p0, p0.b, p2.b ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -236,14 +236,14 @@ define 
void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_une_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmne p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmne p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -260,14 +260,14 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ogt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -284,17 +284,17 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ugt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h ; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h -; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -311,14 +311,14 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_olt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -335,17 +335,17 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ult_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h ; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d ; 
CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -362,14 +362,14 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_oge_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -386,17 +386,17 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_uge_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h ; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h -; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -413,14 +413,14 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ole_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -437,17 +437,17 @@ define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ule_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h ; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -464,14 +464,14 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_uno_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmuo 
p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -488,17 +488,17 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ord_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h ; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -515,14 +515,14 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_eq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -539,14 +539,14 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ne_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmne p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmne p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -563,14 +563,14 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_gt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -587,14 +587,14 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_lt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] 
+; CHECK-NEXT: ldp q2, q1, [x0]
; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: stp q0, q1, [x2]
+; CHECK-NEXT: stp q1, q0, [x2]
; CHECK-NEXT: ret
%op1 = load <16 x half>, ptr %a
%op2 = load <16 x half>, ptr %b
@@ -611,14 +611,14 @@
define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fcmp_ge_v16f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: ldp q2, q1, [x0]
; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: stp q0, q1, [x2]
+; CHECK-NEXT: stp q1, q0, [x2]
; CHECK-NEXT: ret
%op1 = load <16 x half>, ptr %a
%op2 = load <16 x half>, ptr %b
@@ -635,14 +635,14 @@
define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fcmp_le_v16f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: ldp q2, q1, [x0]
; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: stp q0, q1, [x2]
+; CHECK-NEXT: stp q1, q0, [x2]
; CHECK-NEXT: ret
%op1 = load <16 x half>, ptr %a
%op2 = load <16 x half>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -7,14 +7,14 @@
define void @fp_convert_combine_crash(ptr %a, ptr %b) #0 {
; CHECK-LABEL: fp_convert_combine_crash:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: fmov z2.s, #8.00000000
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z2.s
; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s
-; CHECK-NEXT: stp q0, q1, [x1]
+; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: stp q1, q0, [x1]
; CHECK-NEXT: ret
%f = load <8 x float>, ptr %a
%mul.i = fmul <8 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend.ll
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend.ll
%op1 = load <16 x half>, ptr %a
%res = fpext <16 x half> %op1 to <16 x float>
@@ -204,8 +204,8 @@
define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) #0 {
; CHECK-LABEL: fcvt_v8f16_v8f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #4 // =0x4
-; CHECK-NEXT: mov x9, #6 // =0x6
+; CHECK-NEXT: mov x8, #6 // =0x6
+; CHECK-NEXT: mov x9, #4 // =0x4
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: mov x10, #2 // =0x2
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
@@ -214,12 +214,12 @@
; CHECK-NEXT: ld1h { z3.d }, p0/z, [x0]
; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
; CHECK-NEXT: fcvt z1.d, p0/m, z1.h
-; CHECK-NEXT: stp q0, q1, [x1, #32]
-; CHECK-NEXT: movprfx z0, z3
-; CHECK-NEXT: fcvt z0.d, p0/m, z3.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fcvt z1.d, p0/m, z2.h
-; CHECK-NEXT: stp q0, q1,
[x1] +; CHECK-NEXT: stp q1, q0, [x1, #32] +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvt z0.d, p0/m, z2.h +; CHECK-NEXT: movprfx z1, z3 +; CHECK-NEXT: fcvt z1.d, p0/m, z3.h +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fpext <8 x half> %op1 to <8 x double> @@ -230,16 +230,16 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fcvt_v16f16_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x9, #14 // =0xe -; CHECK-NEXT: mov x10, #12 // =0xc +; CHECK-NEXT: mov x9, #12 // =0xc +; CHECK-NEXT: mov x10, #14 // =0xe ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov x8, #2 // =0x2 -; CHECK-NEXT: mov x11, #6 // =0x6 -; CHECK-NEXT: mov x12, #4 // =0x4 +; CHECK-NEXT: mov x11, #4 // =0x4 +; CHECK-NEXT: mov x12, #6 // =0x6 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x9, lsl #1] ; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, x10, lsl #1] -; CHECK-NEXT: mov x9, #8 // =0x8 -; CHECK-NEXT: mov x10, #10 // =0xa +; CHECK-NEXT: mov x9, #10 // =0xa +; CHECK-NEXT: mov x10, #8 // =0x8 ; CHECK-NEXT: ld1h { z2.d }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ld1h { z3.d }, p0/z, [x0, x11, lsl #1] ; CHECK-NEXT: ld1h { z5.d }, p0/z, [x0, x12, lsl #1] @@ -248,22 +248,22 @@ ; CHECK-NEXT: ld1h { z4.d }, p0/z, [x0, x9, lsl #1] ; CHECK-NEXT: ld1h { z6.d }, p0/z, [x0, x10, lsl #1] ; CHECK-NEXT: ld1h { z7.d }, p0/z, [x0] -; CHECK-NEXT: stp q1, q0, [x1, #96] +; CHECK-NEXT: stp q0, q1, [x1, #96] ; CHECK-NEXT: movprfx z1, z4 ; CHECK-NEXT: fcvt z1.d, p0/m, z4.h ; CHECK-NEXT: movprfx z0, z6 ; CHECK-NEXT: fcvt z0.d, p0/m, z6.h -; CHECK-NEXT: stp q1, q0, [x1, #64] +; CHECK-NEXT: stp q0, q1, [x1, #64] ; CHECK-NEXT: movprfx z1, z5 ; CHECK-NEXT: fcvt z1.d, p0/m, z5.h ; CHECK-NEXT: movprfx z0, z3 ; CHECK-NEXT: fcvt z0.d, p0/m, z3.h -; CHECK-NEXT: stp q1, q0, [x1, #32] -; CHECK-NEXT: movprfx z1, z7 -; CHECK-NEXT: fcvt z1.d, p0/m, z7.h -; CHECK-NEXT: movprfx z0, z2 -; CHECK-NEXT: fcvt z0.d, p0/m, z2.h -; CHECK-NEXT: stp q1, q0, [x1] +; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fcvt z1.d, p0/m, z2.h +; CHECK-NEXT: movprfx z0, z7 +; CHECK-NEXT: fcvt z0.d, p0/m, z7.h +; CHECK-NEXT: stp q0, q1, [x1] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fpext <16 x half> %op1 to <16 x double> @@ -322,8 +322,8 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fcvt_v8f32_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #4 // =0x4 -; CHECK-NEXT: mov x9, #6 // =0x6 +; CHECK-NEXT: mov x8, #6 // =0x6 +; CHECK-NEXT: mov x9, #4 // =0x4 ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov x10, #2 // =0x2 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] @@ -332,12 +332,12 @@ ; CHECK-NEXT: ld1w { z3.d }, p0/z, [x0] ; CHECK-NEXT: fcvt z0.d, p0/m, z0.s ; CHECK-NEXT: fcvt z1.d, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x1, #32] -; CHECK-NEXT: movprfx z0, z3 -; CHECK-NEXT: fcvt z0.d, p0/m, z3.s -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: fcvt z1.d, p0/m, z2.s -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: stp q1, q0, [x1, #32] +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fcvt z0.d, p0/m, z2.s +; CHECK-NEXT: movprfx z1, z3 +; CHECK-NEXT: fcvt z1.d, p0/m, z3.s +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fpext <8 x float> %op1 to <8 x double> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll +++ 
@@ -40,14 +40,14 @@
define void @fma_v16f16(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fma_v16f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: ldp q4, q5, [x2]
+; CHECK-NEXT: ldp q2, q1, [x0]
+; CHECK-NEXT: ldp q5, q4, [x2]
; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z4.h
; CHECK-NEXT: movprfx z1, z5
; CHECK-NEXT: fmla z1.h, p0/m, z2.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x half>, ptr %a
%op2 = load <16 x half>, ptr %b
@@ -91,14 +91,14 @@
define void @fma_v8f32(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fma_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: ldp q4, q5, [x2]
+; CHECK-NEXT: ldp q2, q1, [x0]
+; CHECK-NEXT: ldp q5, q4, [x2]
; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z4.s
; CHECK-NEXT: movprfx z1, z5
; CHECK-NEXT: fmla z1.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x float>, ptr %a
%op2 = load <8 x float>, ptr %b
@@ -140,14 +140,14 @@
define void @fma_v4f64(ptr %a, ptr %b, ptr %c) #0 {
; CHECK-LABEL: fma_v4f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: ldp q4, q5, [x2]
+; CHECK-NEXT: ldp q2, q1, [x0]
+; CHECK-NEXT: ldp q5, q4, [x2]
; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z4.d
; CHECK-NEXT: movprfx z1, z5
; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x double>, ptr %a
%op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -36,12 +36,12 @@
define void @fmaxnm_v16f16(ptr %a, ptr %b) #0 {
; CHECK-LABEL: fmaxnm_v16f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: ldp q3, q2, [x1]
; CHECK-NEXT: fmaxnm z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x half>, ptr %a
%op2 = load <16 x half>, ptr %b
@@ -79,12 +79,12 @@
define void @fmaxnm_v8f32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: fmaxnm_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: ldp q3, q2, [x1]
; CHECK-NEXT: fmaxnm z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x float>, ptr %a
%op2 = load <8 x float>, ptr %b
@@ -120,12 +120,12 @@
define void @fmaxnm_v4f64(ptr %a, ptr %b) #0 {
; CHECK-LABEL: fmaxnm_v4f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: ldp q3, q2, [x1]
; CHECK-NEXT: fmaxnm z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x double>, ptr %a
%op2 = load <4 x double>, ptr %b
@@ -167,12 +167,12 @@
define void @fminnm_v16f16(ptr %a, ptr %b) #0 {
; CHECK-LABEL: fminnm_v16f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: ldp q3, q2, [x1]
; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x half>, ptr %a
%op2 = load <16 x half>, ptr %b
@@ -210,12 +210,12 @@
define void @fminnm_v8f32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: fminnm_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: ldp q3, q2, [x1]
; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x float>, ptr %a
%op2 = load <8 x float>, ptr %b
@@ -251,12 +251,12 @@
define void @fminnm_v4f64(ptr %a, ptr %b) #0 {
; CHECK-LABEL: fminnm_v4f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: ldp q3, q2, [x1]
; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x double>, ptr %a
%op2 = load <4 x double>, ptr %b
@@ -298,12 +298,12 @@
define void @fmax_v16f16(ptr %a, ptr %b) #0 {
; CHECK-LABEL: fmax_v16f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: ldp q3, q2, [x1]
; CHECK-NEXT: fmax z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x half>, ptr %a
%op2 = load <16 x half>, ptr %b
@@ -341,12 +341,12 @@
define void @fmax_v8f32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: fmax_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: ldp q3, q2, [x1]
; CHECK-NEXT: fmax z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x float>, ptr %a
%op2 = load <8 x float>, ptr %b
@@ -382,12 +382,12 @@
define void @fmax_v4f64(ptr %a, ptr %b) #0 {
; CHECK-LABEL: fmax_v4f64:
; CHECK: // %bb.0:
-;