diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1775,13 +1775,15 @@ WorklistInserter AddNodes(*this); + DAG.AssignTopologicalOrder(); + // Add all the dag nodes to the worklist. // // Note: All nodes are not added to PruningList here, this is because the only // nodes which can be deleted are those which have no uses and all other nodes // which would otherwise be added to the worklist by the first call to // getNextWorklistEntry are already present in it. - for (SDNode &Node : DAG.allnodes()) + for (SDNode &Node : reverse(DAG.allnodes())) AddToWorklist(&Node, /* IsCandidateForPruning */ Node.use_empty()); // Create a dummy node (which is not added to allnodes), that adds a reference diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-lse2.ll @@ -925,14 +925,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -944,14 +944,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -963,14 +963,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -982,14 +982,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] 
+; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1001,14 +1001,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] @@ -1675,14 +1675,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -1694,14 +1694,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -1713,14 +1713,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -1732,14 +1732,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: 
atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1751,14 +1751,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-outline_atomics.ll @@ -557,8 +557,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O0: adds x2, x0, x9 ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -572,8 +572,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O0: adds x2, x0, x9 ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -587,8 +587,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_release: ; -O0: adds x2, x0, x9 ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -602,8 +602,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O0: adds x2, x0, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -617,8 +617,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O0: adds x2, x0, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] @@ -1132,8 +1132,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O0: subs x2, x0, x9 ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -1147,8 +1147,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O0: subs x2, x0, x9 ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -1162,8 +1162,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: ; -O0: subs x2, x0, x9 ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -1177,8 +1177,8 @@ ; -O0-LABEL: 
atomicrmw_sub_i128_aligned_acq_rel: ; -O0: subs x2, x0, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1192,8 +1192,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O0: subs x2, x0, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x1, x9 +; -O0: ccmp x0, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc.ll @@ -925,14 +925,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -944,14 +944,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -963,14 +963,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -982,14 +982,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1001,14 +1001,14 @@ define 
dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] @@ -1675,14 +1675,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -1694,14 +1694,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -1713,14 +1713,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -1732,14 +1732,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1751,14 +1751,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; 
-O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-rcpc3.ll @@ -925,14 +925,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -944,14 +944,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -963,14 +963,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -982,14 +982,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1001,14 +1001,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, 
x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] @@ -1675,14 +1675,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -1694,14 +1694,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -1713,14 +1713,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -1732,14 +1732,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1751,14 +1751,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: 
subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomicrmw-v8a.ll @@ -925,14 +925,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -944,14 +944,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -963,14 +963,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -982,14 +982,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1001,14 +1001,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, 
x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] @@ -1675,14 +1675,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x0, x1, [x8] @@ -1694,14 +1694,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x0, x1, [x8] @@ -1713,14 +1713,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x0, x1, [x8] @@ -1732,14 +1732,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; -O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x0, x1, [x8] @@ -1751,14 +1751,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x9, x8, [x13] +; -O0: cmp x9, x10 +; -O0: cmp x8, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x9, x8, [x13] +; 
-O0: subs x11, x8, x11 +; -O0: ccmp x9, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x0, x1, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-outline_atomics.ll @@ -118,8 +118,8 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_unordered: ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_unordered: ; -O1: ldxp xzr, x8, [x2] @@ -131,8 +131,8 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_monotonic: ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_monotonic: ; -O1: ldxp xzr, x8, [x2] @@ -144,8 +144,8 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_release: ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_release: ; -O1: ldxp xzr, x8, [x2] @@ -157,8 +157,8 @@ define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_seq_cst: ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_seq_cst: ; -O1: ldaxp xzr, x8, [x2] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc.ll @@ -117,13 +117,13 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_unordered: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_unordered: ; -O1: ldxp xzr, x8, [x2] @@ -134,13 +134,13 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_monotonic: ; -O1: ldxp xzr, x8, [x2] @@ -151,13 +151,13 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr 
%ptr) { ; -O0-LABEL: store_atomic_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_release: ; -O1: ldxp xzr, x8, [x2] @@ -168,13 +168,13 @@ define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_seq_cst: ; -O1: ldaxp xzr, x8, [x2] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-v8a.ll @@ -117,13 +117,13 @@ define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_unordered: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_unordered: ; -O1: ldxp xzr, x8, [x2] @@ -134,13 +134,13 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_monotonic: ; -O1: ldxp xzr, x8, [x2] @@ -151,13 +151,13 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_release: ; -O1: ldxp xzr, x8, [x2] @@ -168,13 +168,13 @@ define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) { ; -O0-LABEL: store_atomic_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs 
x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: store_atomic_i128_aligned_seq_cst: ; -O1: ldaxp xzr, x8, [x2] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-lse2.ll @@ -305,13 +305,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic: ; -O1: ldxp x1, x8, [x0] @@ -322,13 +322,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire: ; -O1: ldaxp x1, x8, [x0] @@ -339,13 +339,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release: ; -O1: ldxp x1, x8, [x0] @@ -356,13 +356,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: ; -O1: ldaxp x1, x8, [x0] @@ -373,13 +373,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: 
subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: ; -O1: ldaxp x1, x8, [x0] @@ -945,14 +945,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -964,14 +964,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -983,14 +983,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1002,14 +1002,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1021,14 +1021,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -1710,14 +1710,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr 
%ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -1729,14 +1729,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -1748,14 +1748,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1767,14 +1767,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1786,14 +1786,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -2475,15 +2475,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp 
w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -2496,15 +2496,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -2517,15 +2517,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_release: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -2538,15 +2538,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -2559,15 +2559,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -3300,17 +3300,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; 
-O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -3325,17 +3325,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -3350,17 +3350,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_release: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -3375,17 +3375,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -3400,17 +3400,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; 
-O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -4165,15 +4165,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -4186,15 +4186,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -4207,15 +4207,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_release: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -4228,15 +4228,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -4249,15 +4249,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp 
x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -4950,15 +4950,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -4971,15 +4971,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -4992,15 +4992,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_release: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -5013,15 +5013,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5034,15 +5034,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: 
subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -5795,16 +5795,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -5818,16 +5818,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -5841,16 +5841,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -5864,16 +5864,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5887,16 +5887,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; 
-O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -6720,16 +6720,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -6743,16 +6743,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -6766,16 +6766,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -6789,16 +6789,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs 
x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -6812,16 +6812,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -7645,16 +7645,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -7668,16 +7668,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -7691,16 +7691,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -7714,16 +7714,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, 
[x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -7737,16 +7737,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -8570,16 +8570,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -8593,16 +8593,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -8616,16 +8616,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, 
[x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -8639,16 +8639,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -8662,16 +8662,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-outline_atomics.ll @@ -146,8 +146,8 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic: ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic: ; -O1: ldxp x1, x8, [x0] @@ -159,8 +159,8 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire: ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire: ; -O1: ldaxp x1, x8, [x0] @@ -172,8 +172,8 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release: ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release: ; -O1: ldxp x1, x8, [x0] @@ -185,8 +185,8 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: ; -O1: ldaxp x1, x8, [x0] @@ -198,8 +198,8 @@ 
define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: ; -O1: ldaxp x1, x8, [x0] @@ -527,8 +527,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O0: adds x3, x1, x9 ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -542,8 +542,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O0: adds x3, x1, x9 ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -557,8 +557,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_release: ; -O0: adds x3, x1, x9 ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -572,8 +572,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O0: adds x3, x1, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -587,8 +587,8 @@ ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O0: adds x3, x1, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -1102,8 +1102,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O0: subs x3, x1, x9 ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -1117,8 +1117,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O0: subs x3, x1, x9 ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -1132,8 +1132,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: ; -O0: subs x3, x1, x9 ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1147,8 +1147,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O0: subs x3, x1, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1162,8 +1162,8 @@ ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O0: subs x3, x1, x9 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -1718,8 +1718,8 @@ ; -O0: and x2, x0, x9 ; -O0: and x3, x1, x8 ; -O0: bl __aarch64_cas16_relax -; -O0: 
subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -1735,8 +1735,8 @@ ; -O0: and x2, x0, x9 ; -O0: and x3, x1, x8 ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -1752,8 +1752,8 @@ ; -O0: and x2, x0, x9 ; -O0: and x3, x1, x8 ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1769,8 +1769,8 @@ ; -O0: and x2, x0, x9 ; -O0: and x3, x1, x8 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1786,8 +1786,8 @@ ; -O0: and x2, x0, x9 ; -O0: and x3, x1, x8 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -2460,8 +2460,8 @@ ; -O0: mvn x2, x9 ; -O0: mvn x3, x8 ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -2481,8 +2481,8 @@ ; -O0: mvn x2, x9 ; -O0: mvn x3, x8 ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -2502,8 +2502,8 @@ ; -O0: mvn x2, x9 ; -O0: mvn x3, x8 ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -2523,8 +2523,8 @@ ; -O0: mvn x2, x9 ; -O0: mvn x3, x8 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -2544,8 +2544,8 @@ ; -O0: mvn x2, x9 ; -O0: mvn x3, x8 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -3093,8 +3093,8 @@ ; -O0: orr x2, x0, x9 ; -O0: orr x3, x1, x8 ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -3110,8 +3110,8 @@ ; -O0: orr x2, x0, x9 ; -O0: orr x3, x1, x8 ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -3127,8 +3127,8 @@ ; -O0: orr x2, x0, x9 ; -O0: orr x3, x1, x8 ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -3144,8 +3144,8 @@ ; -O0: orr x2, 
x0, x9 ; -O0: orr x3, x1, x8 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -3161,8 +3161,8 @@ ; -O0: orr x2, x0, x9 ; -O0: orr x3, x1, x8 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -3608,8 +3608,8 @@ ; -O0: eor x2, x0, x9 ; -O0: eor x3, x1, x8 ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -3625,8 +3625,8 @@ ; -O0: eor x2, x0, x9 ; -O0: eor x3, x1, x8 ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -3642,8 +3642,8 @@ ; -O0: eor x2, x0, x9 ; -O0: eor x3, x1, x8 ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -3659,8 +3659,8 @@ ; -O0: eor x2, x0, x9 ; -O0: eor x3, x1, x8 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -3676,8 +3676,8 @@ ; -O0: eor x2, x0, x9 ; -O0: eor x3, x1, x8 ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -4344,8 +4344,8 @@ ; -O0: csel x2, x0, x9, lt ; -O0: csel x3, x1, x8, lt ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -4363,8 +4363,8 @@ ; -O0: csel x2, x0, x9, lt ; -O0: csel x3, x1, x8, lt ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -4382,8 +4382,8 @@ ; -O0: csel x2, x0, x9, lt ; -O0: csel x3, x1, x8, lt ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -4401,8 +4401,8 @@ ; -O0: csel x2, x0, x9, lt ; -O0: csel x3, x1, x8, lt ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -4420,8 +4420,8 @@ ; -O0: csel x2, x0, x9, lt ; -O0: csel x3, x1, x8, lt ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -5199,8 +5199,8 @@ ; -O0: csel x2, x0, x9, ge ; -O0: csel x3, x1, x8, ge ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, 
#0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -5218,8 +5218,8 @@ ; -O0: csel x2, x0, x9, ge ; -O0: csel x3, x1, x8, ge ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -5237,8 +5237,8 @@ ; -O0: csel x2, x0, x9, ge ; -O0: csel x3, x1, x8, ge ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -5256,8 +5256,8 @@ ; -O0: csel x2, x0, x9, ge ; -O0: csel x3, x1, x8, ge ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5275,8 +5275,8 @@ ; -O0: csel x2, x0, x9, ge ; -O0: csel x3, x1, x8, ge ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -6054,8 +6054,8 @@ ; -O0: csel x2, x0, x9, lo ; -O0: csel x3, x1, x8, lo ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -6073,8 +6073,8 @@ ; -O0: csel x2, x0, x9, lo ; -O0: csel x3, x1, x8, lo ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -6092,8 +6092,8 @@ ; -O0: csel x2, x0, x9, lo ; -O0: csel x3, x1, x8, lo ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -6111,8 +6111,8 @@ ; -O0: csel x2, x0, x9, lo ; -O0: csel x3, x1, x8, lo ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -6130,8 +6130,8 @@ ; -O0: csel x2, x0, x9, lo ; -O0: csel x3, x1, x8, lo ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -6909,8 +6909,8 @@ ; -O0: csel x2, x0, x9, hs ; -O0: csel x3, x1, x8, hs ; -O0: bl __aarch64_cas16_relax -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -6928,8 +6928,8 @@ ; -O0: csel x2, x0, x9, hs ; -O0: csel x3, x1, x8, hs ; -O0: bl __aarch64_cas16_acq -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -6947,8 +6947,8 @@ ; -O0: csel x2, x0, x9, hs ; -O0: csel x3, x1, x8, hs ; -O0: bl __aarch64_cas16_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp 
x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -6966,8 +6966,8 @@ ; -O0: csel x2, x0, x9, hs ; -O0: csel x3, x1, x8, hs ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -6985,8 +6985,8 @@ ; -O0: csel x2, x0, x9, hs ; -O0: csel x3, x1, x8, hs ; -O0: bl __aarch64_cas16_acq_rel -; -O0: subs x10, x10, x11 -; -O0: ccmp x8, x9, #0, eq +; -O0: subs x9, x0, x9 +; -O0: ccmp x1, x8, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc.ll @@ -305,13 +305,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic: ; -O1: ldxp x1, x8, [x0] @@ -322,13 +322,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire: ; -O1: ldaxp x1, x8, [x0] @@ -339,13 +339,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release: ; -O1: ldxp x1, x8, [x0] @@ -356,13 +356,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: ; -O1: ldaxp x1, x8, [x0] @@ -373,13 +373,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: 
atomicrmw_xchg_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: ; -O1: ldaxp x1, x8, [x0] @@ -945,14 +945,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -964,14 +964,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -983,14 +983,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1002,14 +1002,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1021,14 +1021,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: 
adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -1710,14 +1710,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -1729,14 +1729,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -1748,14 +1748,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1767,14 +1767,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1786,14 +1786,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, 
eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -2475,15 +2475,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -2496,15 +2496,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -2517,15 +2517,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_release: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -2538,15 +2538,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -2559,15 +2559,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, 
x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -3300,17 +3300,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -3325,17 +3325,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -3350,17 +3350,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_release: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -3375,17 +3375,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -3400,17 +3400,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp 
x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -4165,15 +4165,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -4186,15 +4186,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -4207,15 +4207,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_release: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -4228,15 +4228,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -4249,15 +4249,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp 
x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -4950,15 +4950,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -4971,15 +4971,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -4992,15 +4992,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_release: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -5013,15 +5013,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5034,15 +5034,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: 
ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -5795,16 +5795,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -5818,16 +5818,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -5841,16 +5841,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -5864,16 +5864,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 
+; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5887,16 +5887,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -6720,16 +6720,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -6743,16 +6743,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -6766,16 +6766,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -6789,16 +6789,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 
-; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -6812,16 +6812,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -7645,16 +7645,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -7668,16 +7668,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -7691,16 +7691,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, 
x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -7714,16 +7714,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -7737,16 +7737,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -8570,16 +8570,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -8593,16 +8593,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -8616,16 +8616,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] 
-; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -8639,16 +8639,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -8662,16 +8662,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-rcpc3.ll @@ -305,13 +305,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic: ; -O1: ldxp x1, x8, [x0] @@ -322,13 +322,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; 
-O1-LABEL: atomicrmw_xchg_i128_aligned_acquire: ; -O1: ldaxp x1, x8, [x0] @@ -339,13 +339,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release: ; -O1: ldxp x1, x8, [x0] @@ -356,13 +356,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: ; -O1: ldaxp x1, x8, [x0] @@ -373,13 +373,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: ; -O1: ldaxp x1, x8, [x0] @@ -945,14 +945,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -964,14 +964,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -983,14 +983,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: 
stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1002,14 +1002,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1021,14 +1021,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -1710,14 +1710,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -1729,14 +1729,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -1748,14 +1748,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, 
[x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1767,14 +1767,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1786,14 +1786,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -2475,15 +2475,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -2496,15 +2496,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -2517,15 +2517,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_release: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, 
x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -2538,15 +2538,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -2559,15 +2559,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -3300,17 +3300,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -3325,17 +3325,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -3350,17 +3350,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_release: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; 
-O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -3375,17 +3375,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -3400,17 +3400,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -4165,15 +4165,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -4186,15 +4186,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -4207,15 +4207,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_release: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp 
x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -4228,15 +4228,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -4249,15 +4249,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -4950,15 +4950,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -4971,15 +4971,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -4992,15 +4992,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_release: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldxp 
x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -5013,15 +5013,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5034,15 +5034,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -5795,16 +5795,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -5818,16 +5818,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -5841,16 +5841,16 @@ define dso_local 
i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -5864,16 +5864,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5887,16 +5887,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -6720,16 +6720,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -6743,16 +6743,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq 
+; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -6766,16 +6766,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -6789,16 +6789,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -6812,16 +6812,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -7645,16 +7645,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -7668,16 +7668,16 @@ define 
dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -7691,16 +7691,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -7714,16 +7714,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -7737,16 +7737,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -8570,16 +8570,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; 
-O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -8593,16 +8593,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -8616,16 +8616,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -8639,16 +8639,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -8662,16 +8662,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] diff 
--git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomicrmw-v8a.ll @@ -305,13 +305,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_monotonic: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_monotonic: ; -O1: ldxp x1, x8, [x0] @@ -322,13 +322,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acquire: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acquire: ; -O1: ldaxp x1, x8, [x0] @@ -339,13 +339,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_release: -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_release: ; -O1: ldxp x1, x8, [x0] @@ -356,13 +356,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_acq_rel: ; -O1: ldaxp x1, x8, [x0] @@ -373,13 +373,13 @@ define dso_local i128 @atomicrmw_xchg_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xchg_i128_aligned_seq_cst: ; -O1: ldaxp x1, x8, [x0] @@ -945,14 +945,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_monotonic: -; -O0: adds x14, x11, x10 -; -O0: ldxp 
x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -964,14 +964,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acquire: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -983,14 +983,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_release: -; -O0: adds x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1002,14 +1002,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_acq_rel: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1021,14 +1021,14 @@ define dso_local i128 @atomicrmw_add_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_add_i128_aligned_seq_cst: -; -O0: adds x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: adds x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_add_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -1710,14 +1710,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_monotonic: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: 
ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -1729,14 +1729,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acquire: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -1748,14 +1748,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_release: -; -O0: subs x14, x11, x10 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -1767,14 +1767,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_acq_rel: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -1786,14 +1786,14 @@ define dso_local i128 @atomicrmw_sub_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_sub_i128_aligned_seq_cst: -; -O0: subs x14, x11, x10 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x14, x10, x9 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_sub_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -2475,15 +2475,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_monotonic: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: 
ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -2496,15 +2496,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acquire: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -2517,15 +2517,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_release: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -2538,15 +2538,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_acq_rel: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -2559,15 +2559,15 @@ define dso_local i128 @atomicrmw_and_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_and_i128_aligned_seq_cst: -; -O0: and x15, x13, x10 -; -O0: and x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: and x15, x11, x9 +; -O0: and x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_and_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -3300,17 +3300,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_monotonic: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: 
stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -3325,17 +3325,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acquire: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -3350,17 +3350,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_release: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -3375,17 +3375,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_acq_rel: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -3400,17 +3400,17 @@ define dso_local i128 @atomicrmw_nand_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_nand_i128_aligned_seq_cst: -; -O0: and x8, x11, x8 -; -O0: and x10, x13, x10 -; -O0: mvn x15, x10 +; -O0: and x8, x10, x8 +; -O0: and x9, x11, x9 +; -O0: mvn x15, x9 ; -O0: mvn x14, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_nand_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -4165,15 +4165,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_monotonic: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, 
x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -4186,15 +4186,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acquire: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -4207,15 +4207,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_release: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -4228,15 +4228,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_acq_rel: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -4249,15 +4249,15 @@ define dso_local i128 @atomicrmw_or_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_or_i128_aligned_seq_cst: -; -O0: orr x15, x13, x10 -; -O0: orr x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: orr x15, x11, x9 +; -O0: orr x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_or_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -4950,15 +4950,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_monotonic: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldxp x10, x12, 
[x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -4971,15 +4971,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acquire: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -4992,15 +4992,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_release: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -5013,15 +5013,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_acq_rel: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5034,15 +5034,15 @@ define dso_local i128 @atomicrmw_xor_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_xor_i128_aligned_seq_cst: -; -O0: eor x15, x13, x10 -; -O0: eor x14, x11, x8 -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: eor x15, x11, x9 +; -O0: eor x14, x10, x8 +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_xor_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -5795,16 +5795,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel 
x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -5818,16 +5818,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -5841,16 +5841,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -5864,16 +5864,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -5887,16 +5887,16 @@ define dso_local i128 @atomicrmw_max_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_max_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lt -; -O0: csel x14, x11, x8, lt -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lt +; -O0: csel x14, x10, x8, lt +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; 
-O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_max_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -6720,16 +6720,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -6743,16 +6743,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -6766,16 +6766,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -6789,16 +6789,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -6812,16 +6812,16 @@ define dso_local i128 @atomicrmw_min_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_min_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, 
x13, x10, ge -; -O0: csel x14, x11, x8, ge -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, ge +; -O0: csel x14, x10, x8, ge +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_min_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -7645,16 +7645,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -7668,16 +7668,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -7691,16 +7691,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -7714,16 +7714,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_acq_rel: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, 
x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -7737,16 +7737,16 @@ define dso_local i128 @atomicrmw_umax_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umax_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, lo -; -O0: csel x14, x11, x8, lo -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, lo +; -O0: csel x14, x10, x8, lo +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umax_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] @@ -8570,16 +8570,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_monotonic(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_monotonic: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_monotonic: ; -O1: ldxp x1, x0, [x8] @@ -8593,16 +8593,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acquire(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acquire: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stxp w8, x14, x15, [x9] -; -O0: stxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stxp w12, x14, x15, [x13] +; -O0: stxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acquire: ; -O1: ldaxp x1, x0, [x8] @@ -8616,16 +8616,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_release(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_release: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_release: ; -O1: ldxp x1, x0, [x8] @@ -8639,16 +8639,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_acq_rel(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_acq_rel: -; -O0: subs x12, x8, 
x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_acq_rel: ; -O1: ldaxp x1, x0, [x8] @@ -8662,16 +8662,16 @@ define dso_local i128 @atomicrmw_umin_i128_aligned_seq_cst(ptr %ptr, i128 %value) { ; -O0-LABEL: atomicrmw_umin_i128_aligned_seq_cst: -; -O0: subs x12, x8, x11 -; -O0: csel x15, x13, x10, hs -; -O0: csel x14, x11, x8, hs -; -O0: ldaxp x10, x12, [x9] -; -O0: cmp x10, x11 -; -O0: cmp x12, x13 -; -O0: stlxp w8, x14, x15, [x9] -; -O0: stlxp w8, x10, x12, [x9] -; -O0: subs x12, x12, x13 -; -O0: ccmp x10, x11, #0, eq +; -O0: subs x12, x8, x10 +; -O0: csel x15, x11, x9, hs +; -O0: csel x14, x10, x8, hs +; -O0: ldaxp x8, x9, [x13] +; -O0: cmp x8, x10 +; -O0: cmp x9, x11 +; -O0: stlxp w12, x14, x15, [x13] +; -O0: stlxp w12, x8, x9, [x13] +; -O0: subs x11, x9, x11 +; -O0: ccmp x8, x10, #0, eq ; ; -O1-LABEL: atomicrmw_umin_i128_aligned_seq_cst: ; -O1: ldaxp x1, x0, [x8] diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -73,8 +73,13 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: uabdl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: uaddlv s0, v0.8h +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: abs v1.4s, v1.4s +; CHECK-NEXT: abs v0.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: @@ -93,16 +98,16 @@ declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) define i32 @oversized_ADDV_512(ptr %arr) { -; SDAG-LABEL: oversized_ADDV_512: -; SDAG: // %bb.0: -; SDAG-NEXT: ldp q0, q1, [x0, #32] -; SDAG-NEXT: ldp q3, q2, [x0] -; SDAG-NEXT: add v0.4s, v3.4s, v0.4s -; SDAG-NEXT: add v1.4s, v2.4s, v1.4s -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; SDAG-LABEL: oversized_ADDV_512: +; SDAG: // %bb.0: +; SDAG-NEXT: ldp q0, q1, [x0, #32] +; SDAG-NEXT: ldp q3, q2, [x0] +; SDAG-NEXT: add v0.4s, v3.4s, v0.4s +; SDAG-NEXT: add v1.4s, v2.4s, v1.4s +; SDAG-NEXT: add v0.4s, v0.4s, v1.4s +; SDAG-NEXT: addv s0, v0.4s +; SDAG-NEXT: fmov w0, s0 +; SDAG-NEXT: ret ; ; GISEL-LABEL: oversized_ADDV_512: ; GISEL: // %bb.0: @@ -148,19 +153,19 @@ } define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) { -; SDAG-LABEL: addv_combine_i32: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; SDAG-LABEL: addv_combine_i32: +; SDAG: // %bb.0: // %entry +; SDAG-NEXT: add v0.4s, v0.4s, v1.4s +; SDAG-NEXT: addv s0, v0.4s +; SDAG-NEXT: fmov w0, s0 +; SDAG-NEXT: ret ; ; GISEL-LABEL: addv_combine_i32: ; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: addv s1, v1.4s -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 +; GISEL-NEXT: addv s0, v0.4s +; GISEL-NEXT: addv s1, v1.4s +; GISEL-NEXT: 
fmov w8, s0 +; GISEL-NEXT: fmov w9, s1 ; GISEL-NEXT: add w0, w8, w9 ; GISEL-NEXT: ret entry: @@ -171,19 +176,19 @@ } define i64 @addv_combine_i64(<2 x i64> %a1, <2 x i64> %a2) { -; SDAG-LABEL: addv_combine_i64: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.2d, v0.2d, v1.2d -; SDAG-NEXT: addp d0, v0.2d -; SDAG-NEXT: fmov x0, d0 -; SDAG-NEXT: ret +; SDAG-LABEL: addv_combine_i64: +; SDAG: // %bb.0: // %entry +; SDAG-NEXT: add v0.2d, v0.2d, v1.2d +; SDAG-NEXT: addp d0, v0.2d +; SDAG-NEXT: fmov x0, d0 +; SDAG-NEXT: ret ; ; GISEL-LABEL: addv_combine_i64: ; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addp d0, v0.2d -; GISEL-NEXT: addp d1, v1.2d -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 +; GISEL-NEXT: addp d0, v0.2d +; GISEL-NEXT: addp d1, v1.2d +; GISEL-NEXT: fmov x8, d0 +; GISEL-NEXT: fmov x9, d1 ; GISEL-NEXT: add x0, x8, x9 ; GISEL-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll b/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll --- a/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll @@ -51,7 +51,8 @@ define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) { ; CHECK-LABEL: test_vbfdot_laneq_f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3] +; CHECK-NEXT: dup v2.2s, v2.s[3] +; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h ; CHECK-NEXT: ret entry: %.cast = bitcast <8 x bfloat> %b to <4 x float> diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -194,8 +194,9 @@ ; CHECK-NEXT: cmtst v0.8h, v0.8h, v0.8h ; CHECK-NEXT: dup v1.8h, w8 ; CHECK-NEXT: cmeq v1.8h, v1.8h, #0 -; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b ; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: uzp1 v1.16b, v1.16b, v1.16b +; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b ; CHECK-NEXT: str q0, [x1] ; CHECK-NEXT: ret %tmp = xor <16 x i1> zeroinitializer, diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll --- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll @@ -320,8 +320,8 @@ ; CHECK-LABEL: insert_vec_v12i16_uaddlv_from_v4i16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: stp xzr, xzr, [x0, #16] ; CHECK-NEXT: stp xzr, xzr, [x0, #32] +; CHECK-NEXT: stp xzr, xzr, [x0, #16] ; CHECK-NEXT: uaddlv.4h s1, v0 ; CHECK-NEXT: mov.h v0[0], v1[0] ; CHECK-NEXT: ushll.4s v0, v0, #0 diff --git a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll --- a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll +++ b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll @@ -9,21 +9,32 @@ ; CHECK-NEXT: add x8, sp, #40 ; CHECK-NEXT: add x0, sp, #40 ; CHECK-NEXT: stp x30, x18, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x3, x4, [sp, #56] -; CHECK-NEXT: stp x1, x2, [sp, #40] -; CHECK-NEXT: stp x5, x6, [sp, #72] -; CHECK-NEXT: str x7, [sp, #88] +; CHECK-NEXT: stp x6, x7, [sp, #80] +; CHECK-NEXT: stp x4, x5, [sp, #64] +; CHECK-NEXT: stp x2, x3, [sp, #48] +; CHECK-NEXT: str x1, [sp, #40] ; CHECK-NEXT: str x8, [sp, #8] ; CHECK-NEXT: bl other_func ; CHECK-NEXT: ldp x30, x18, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret ; 
+; DARWIN-LABEL: pass_va: ; DARWIN: ; %bb.0: ; %entry -; DARWIN-DAG: stp x3, x4, [sp, #56] -; DARWIN-DAG: stp x1, x2, [sp, #40] -; DARWIN-DAG: stp x5, x6, [sp, #72] -; DARWIN-DAG: str x7, [sp, #88] +; DARWIN-NEXT: str x18, [sp, #-96]! ; 8-byte Folded Spill +; DARWIN-NEXT: add x8, sp, #8 +; DARWIN-NEXT: add x9, sp, #40 +; DARWIN-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; DARWIN-NEXT: stp x1, x2, [sp, #40] +; DARWIN-NEXT: str x9, [x8] +; DARWIN-NEXT: ldr x0, [sp, #8] +; DARWIN-NEXT: stp x3, x4, [sp, #56] +; DARWIN-NEXT: stp x5, x6, [sp, #72] +; DARWIN-NEXT: str x7, [sp, #88] +; DARWIN-NEXT: bl _other_func +; DARWIN-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; DARWIN-NEXT: ldr x18, [sp], #96 ; 8-byte Folded Reload +; DARWIN-NEXT: ret entry: %ap = alloca ptr, align 8 call void @llvm.va_start(ptr %ap) @@ -47,15 +58,15 @@ ; CHECK-NEXT: ldr x18, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret ; -; DARWIN-LABEL: _f9: -; DARWIN: ; %bb.0: ; %entry -; DARWIN-NEXT: str x18, [sp, #-16]! ; 8-byte Folded Spill -; DARWIN-NEXT: add x8, sp, #8 -; DARWIN-NEXT: add x9, sp, #24 -; DARWIN-NEXT: str x9, [x8] -; DARWIN-NEXT: ldr x0, [sp, #8] -; DARWIN-NEXT: ldr x18, [sp], #16 ; 8-byte Folded Reload -; DARWIN-NEXT: ret +; DARWIN-LABEL: f9: +; DARWIN: ; %bb.0: ; %entry +; DARWIN-NEXT: str x18, [sp, #-16]! ; 8-byte Folded Spill +; DARWIN-NEXT: add x8, sp, #8 +; DARWIN-NEXT: add x9, sp, #24 +; DARWIN-NEXT: str x9, [x8] +; DARWIN-NEXT: ldr x0, [sp, #8] +; DARWIN-NEXT: ldr x18, [sp], #16 ; 8-byte Folded Reload +; DARWIN-NEXT: ret entry: %ap = alloca ptr, align 8 call void @llvm.va_start(ptr %ap) @@ -73,15 +84,15 @@ ; CHECK-NEXT: ldr x18, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret ; -; DARWIN-LABEL: _f8: -; DARWIN: ; %bb.0: ; %entry -; DARWIN-NEXT: str x18, [sp, #-16]! ; 8-byte Folded Spill -; DARWIN-NEXT: add x8, sp, #8 -; DARWIN-NEXT: add x9, sp, #16 -; DARWIN-NEXT: str x9, [x8] -; DARWIN-NEXT: ldr x0, [sp, #8] -; DARWIN-NEXT: ldr x18, [sp], #16 ; 8-byte Folded Reload -; DARWIN-NEXT: ret +; DARWIN-LABEL: f8: +; DARWIN: ; %bb.0: ; %entry +; DARWIN-NEXT: str x18, [sp, #-16]! ; 8-byte Folded Spill +; DARWIN-NEXT: add x8, sp, #8 +; DARWIN-NEXT: add x9, sp, #16 +; DARWIN-NEXT: str x9, [x8] +; DARWIN-NEXT: ldr x0, [sp, #8] +; DARWIN-NEXT: ldr x18, [sp], #16 ; 8-byte Folded Reload +; DARWIN-NEXT: ret entry: %ap = alloca ptr, align 8 call void @llvm.va_start(ptr %ap) @@ -100,16 +111,16 @@ ; CHECK-NEXT: ldr x18, [sp], #32 // 8-byte Folded Reload ; CHECK-NEXT: ret ; -; DARWIN-LABEL: _f7: -; DARWIN: ; %bb.0: ; %entry -; DARWIN-NEXT: str x18, [sp, #-32]! ; 8-byte Folded Spill -; DARWIN-NEXT: add x8, sp, #8 -; DARWIN-NEXT: add x9, sp, #24 -; DARWIN-NEXT: str x7, [sp, #24] -; DARWIN-NEXT: str x9, [x8] -; DARWIN-NEXT: ldr x0, [sp, #8] -; DARWIN-NEXT: ldr x18, [sp], #32 ; 8-byte Folded Reload -; DARWIN-NEXT: ret +; DARWIN-LABEL: f7: +; DARWIN: ; %bb.0: ; %entry +; DARWIN-NEXT: str x18, [sp, #-32]! 
; 8-byte Folded Spill +; DARWIN-NEXT: add x8, sp, #8 +; DARWIN-NEXT: add x9, sp, #24 +; DARWIN-NEXT: str x7, [sp, #24] +; DARWIN-NEXT: str x9, [x8] +; DARWIN-NEXT: ldr x0, [sp, #8] +; DARWIN-NEXT: ldr x18, [sp], #32 ; 8-byte Folded Reload +; DARWIN-NEXT: ret entry: %ap = alloca ptr, align 8 call void @llvm.va_start(ptr %ap) diff --git a/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll b/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll --- a/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll +++ b/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -frame-pointer=all -mtriple=arm64-windows | FileCheck %s ; Test generated from C code: @@ -15,18 +16,59 @@ declare ptr @llvm.addressofreturnaddress() define dso_local ptr @"foo"() { +; CHECK-LABEL: foo: +; CHECK: .seh_proc foo +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr_x 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .seh_set_fp +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: add x0, x29, #8 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr_x 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc entry: %0 = call ptr @llvm.addressofreturnaddress() ret ptr %0 -; CHECK-LABEL: foo -; CHECK: stp x29, x30, [sp, #-16]! -; CHECK: mov x29, sp -; CHECK: add x0, x29, #8 -; CHECK: ldp x29, x30, [sp], #16 } define dso_local i32 @"bar"(ptr %x, ...) { +; CHECK-LABEL: bar: +; CHECK: .seh_proc bar +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: .seh_stackalloc 96 +; CHECK-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 16 +; CHECK-NEXT: add x29, sp, #16 +; CHECK-NEXT: .seh_add_fp 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: add x9, x29, #24 +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: str x1, [x29, #24] +; CHECK-NEXT: add x1, x29, #8 +; CHECK-NEXT: stp x6, x7, [x29, #64] +; CHECK-NEXT: stp x9, x0, [sp] +; CHECK-NEXT: add x0, x29, #24 +; CHECK-NEXT: stp x4, x5, [x29, #48] +; CHECK-NEXT: stp x2, x3, [x29, #32] +; CHECK-NEXT: blr x8 +; CHECK-NEXT: add w0, w0, #1 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 16 +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: .seh_stackalloc 96 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc entry: %x.addr = alloca ptr, align 8 %y = alloca ptr, align 8 @@ -39,12 +81,4 @@ %add = add nsw i32 %call, 1 ret i32 %add -; CHECK-LABEL: bar -; CHECK: sub sp, sp, #96 -; CHECK: stp x29, x30, [sp, #16] -; CHECK: add x29, sp, #16 -; CHECK: stp x1, x2, [x29, #24] -; CHECK: add x1, x29, #8 -; CHECK: ldp x29, x30, [sp, #16] -; CHECK: add sp, sp, #96 } diff --git a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll --- a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll +++ b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll @@ -385,12 +385,12 @@ ; CHECK-NEXT: bl return_in_block ; CHECK-NEXT: adrp x8, in_block_store ; CHECK-NEXT: add x8, x8, :lo12:in_block_store -; CHECK-NEXT: str d0, [x8] -; CHECK-NEXT: str d1, [x8, #8] -; CHECK-NEXT: str d2, [x8, #16] ; CHECK-NEXT: str d3, [x8, #24] -; CHECK-NEXT: str d4, [x8, #32] ; 
CHECK-NEXT: str d5, [x8, #40] +; CHECK-NEXT: str d4, [x8, #32] +; CHECK-NEXT: str d2, [x8, #16] +; CHECK-NEXT: str d1, [x8, #8] +; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %1 = call %T_IN_BLOCK @return_in_block() @@ -457,17 +457,21 @@ ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: add x8, sp, #8 ; CHECK-NEXT: bl return_in_memory -; CHECK-NEXT: ldur q0, [sp, #24] +; CHECK-NEXT: ldr d0, [sp, #24] ; CHECK-NEXT: adrp x8, in_memory_store ; CHECK-NEXT: add x8, x8, :lo12:in_memory_store -; CHECK-NEXT: ldur q1, [sp, #8] +; CHECK-NEXT: ldr d1, [sp, #48] ; CHECK-NEXT: ldur q2, [sp, #56] -; CHECK-NEXT: ldur q3, [sp, #40] -; CHECK-NEXT: ldr d4, [sp, #72] -; CHECK-NEXT: stp q1, q0, [x8] +; CHECK-NEXT: ldur q3, [sp, #32] +; CHECK-NEXT: ldur q4, [sp, #8] +; CHECK-NEXT: ldr d5, [sp, #72] +; CHECK-NEXT: str q2, [x8, #48] ; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload -; CHECK-NEXT: stp q3, q2, [x8, #32] -; CHECK-NEXT: str d4, [x8, #64] +; CHECK-NEXT: stur q3, [x8, #24] +; CHECK-NEXT: str q4, [x8] +; CHECK-NEXT: str d5, [x8, #64] +; CHECK-NEXT: str d1, [x8, #40] +; CHECK-NEXT: str d0, [x8, #16] ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %1 = call %T_IN_MEMORY @return_in_memory() @@ -540,10 +544,10 @@ ; CHECK-NEXT: bl return_no_block ; CHECK-NEXT: adrp x8, no_block_store ; CHECK-NEXT: add x8, x8, :lo12:no_block_store -; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: str w0, [x8, #8] -; CHECK-NEXT: str d1, [x8, #16] ; CHECK-NEXT: str w1, [x8, #24] +; CHECK-NEXT: str d1, [x8, #16] +; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %1 = call %T_NO_BLOCK @return_no_block() diff --git a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll --- a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -14,15 +14,14 @@ ; CHECK-NEXT: stp w6, w5, [sp, #36] ; CHECK-NEXT: str w7, [sp, #32] ; CHECK-NEXT: str w8, [x0] -; CHECK-NEXT: add x8, sp, #72 -; CHECK-NEXT: add x8, x8, #8 ; CHECK-NEXT: ldr w9, [sp, #72] -; CHECK-NEXT: str w9, [sp, #20] -; CHECK-NEXT: ldr w9, [x8], #8 -; CHECK-NEXT: str w9, [sp, #16] -; CHECK-NEXT: ldr w9, [x8], #8 +; CHECK-NEXT: ldr w8, [sp, #80] +; CHECK-NEXT: stp w8, w9, [sp, #16] +; CHECK-NEXT: add x8, sp, #72 +; CHECK-NEXT: add x8, x8, #24 ; CHECK-NEXT: str x8, [sp, #24] -; CHECK-NEXT: str w9, [sp, #12] +; CHECK-NEXT: ldr w8, [sp, #88] +; CHECK-NEXT: str w8, [sp, #12] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret %1 = alloca i32, align 4 @@ -64,37 +63,37 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #96 ; CHECK-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill -; CHECK-NEXT: mov w9, #1 -; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: mov w9, #1 ; =0x1 +; CHECK-NEXT: mov w8, #2 ; =0x2 ; CHECK-NEXT: stp w8, w9, [sp, #72] -; CHECK-NEXT: mov w9, #3 -; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: mov w9, #3 ; =0x3 +; CHECK-NEXT: mov w8, #4 ; =0x4 ; CHECK-NEXT: stp w8, w9, [sp, #64] -; CHECK-NEXT: mov w9, #5 -; CHECK-NEXT: mov w8, #6 +; CHECK-NEXT: mov w9, #5 ; =0x5 +; CHECK-NEXT: mov w8, #6 ; =0x6 ; CHECK-NEXT: stp w8, w9, [sp, #56] -; CHECK-NEXT: mov w9, #7 -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w9, #7 ; =0x7 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: stp w8, w9, [sp, #48] -; CHECK-NEXT: mov w8, #9 -; CHECK-NEXT: mov w9, #10 +; CHECK-NEXT: mov w8, #9 ; =0x9 +; CHECK-NEXT: mov w9, #10 ; =0xa ; CHECK-NEXT: stp w9, w8, [sp, #40] -; CHECK-NEXT: mov w10, #11 -; CHECK-NEXT: mov w11, #12 +; 
CHECK-NEXT: mov w10, #11 ; =0xb +; CHECK-NEXT: mov w11, #12 ; =0xc ; CHECK-NEXT: stp w11, w10, [sp, #32] ; CHECK-NEXT: stp x10, x11, [sp, #16] ; CHECK-NEXT: str x9, [sp, #8] ; CHECK-NEXT: str w8, [sp] ; CHECK-NEXT: add x0, sp, #76 -; CHECK-NEXT: mov w1, #2 -; CHECK-NEXT: mov w2, #3 -; CHECK-NEXT: mov w3, #4 -; CHECK-NEXT: mov w4, #5 -; CHECK-NEXT: mov w5, #6 -; CHECK-NEXT: mov w6, #7 -; CHECK-NEXT: mov w7, #8 +; CHECK-NEXT: mov w1, #2 ; =0x2 +; CHECK-NEXT: mov w2, #3 ; =0x3 +; CHECK-NEXT: mov w3, #4 ; =0x4 +; CHECK-NEXT: mov w4, #5 ; =0x5 +; CHECK-NEXT: mov w5, #6 ; =0x6 +; CHECK-NEXT: mov w6, #7 ; =0x7 +; CHECK-NEXT: mov w7, #8 ; =0x8 ; CHECK-NEXT: bl _fn9 -; CHECK-NEXT: mov w0, #0 +; CHECK-NEXT: mov w0, #0 ; =0x0 ; CHECK-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll --- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll @@ -24,7 +24,7 @@ define <8 x i16> @build_all_zero(<8 x i16> %a) #1 { ; CHECK-LABEL: build_all_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #44672 +; CHECK-NEXT: mov w8, #44672 // =0xae80 ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret @@ -56,9 +56,9 @@ define void @widen_f16_build_vector(ptr %addr) { ; CHECK-LABEL: widen_f16_build_vector: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #13294 -; CHECK-NEXT: movk w8, #13294, lsl #16 -; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: mov w8, #13294 // =0x33ee +; CHECK-NEXT: dup v0.8h, w8 +; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret store <2 x half> , ptr %addr, align 2 ret void @@ -68,7 +68,7 @@ define <1 x i64> @single_element_vector_i64(<1 x i64> %arg) { ; CHECK-LABEL: single_element_vector_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: fmov d1, x8 ; CHECK-NEXT: add d0, d0, d1 ; CHECK-NEXT: ret @@ -94,7 +94,7 @@ ; CHECK-LABEL: convert_single_fp_vector_constant: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: mov x8, #4607182418800017408 +; CHECK-NEXT: mov x8, #4607182418800017408 // =0x3ff0000000000000 ; CHECK-NEXT: csetm x9, ne ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: fmov d1, x9 @@ -120,7 +120,7 @@ define <2 x double> @negzero_v2f64(<2 x double> %a) { ; CHECK-LABEL: negzero_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret @@ -141,7 +141,7 @@ define <1 x double> @negzero_v1f64(<1 x double> %a) { ; CHECK-LABEL: negzero_v1f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 ; CHECK-NEXT: fmov d1, x8 ; CHECK-NEXT: fmul d0, d0, d1 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -14,7 +14,7 @@ ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: LBB0_2: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 5 @@ -42,7 +42,7 @@ ; SDISEL-NEXT: bl _foo ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; SDISEL-NEXT: LBB1_2: ; %if.end -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: 
mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; ; GISEL-LABEL: single_different: @@ -55,7 +55,7 @@ ; GISEL-NEXT: bl _foo ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; GISEL-NEXT: LBB1_2: ; %if.end -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret entry: %cmp = icmp sle i32 %a, 5 @@ -88,7 +88,7 @@ ; SDISEL-NEXT: bl _foo ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; SDISEL-NEXT: LBB2_3: ; %if.end -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; ; GISEL-LABEL: single_flagclobber: @@ -106,7 +106,7 @@ ; GISEL-NEXT: bl _foo ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; GISEL-NEXT: LBB2_3: ; %if.end -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret entry: %cmp = icmp eq i32 %a, 5 @@ -144,7 +144,7 @@ ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: LBB3_3: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 5 @@ -178,13 +178,13 @@ ; SDISEL-NEXT: ccmp w8, #16, #0, ge ; SDISEL-NEXT: b.le LBB4_2 ; SDISEL-NEXT: ; %bb.1: ; %if.end -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; SDISEL-NEXT: LBB4_2: ; %if.then ; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill ; SDISEL-NEXT: bl _foo ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; ; GISEL-LABEL: speculate_division: @@ -194,13 +194,13 @@ ; GISEL-NEXT: ccmp w8, #17, #0, gt ; GISEL-NEXT: b.lt LBB4_2 ; GISEL-NEXT: ; %bb.1: ; %if.end -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret ; GISEL-NEXT: LBB4_2: ; %if.then ; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill ; GISEL-NEXT: bl _foo ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret entry: %cmp = icmp sgt i32 %a, 0 @@ -230,13 +230,13 @@ ; SDISEL-NEXT: fccmp s0, s1, #8, ge ; SDISEL-NEXT: b.ge LBB5_2 ; SDISEL-NEXT: ; %bb.1: ; %if.end -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; SDISEL-NEXT: LBB5_2: ; %if.then ; SDISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill ; SDISEL-NEXT: bl _foo ; SDISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; SDISEL-NEXT: mov w0, #7 +; SDISEL-NEXT: mov w0, #7 ; =0x7 ; SDISEL-NEXT: ret ; ; GISEL-LABEL: single_fcmp: @@ -248,13 +248,13 @@ ; GISEL-NEXT: fccmp s0, s1, #8, gt ; GISEL-NEXT: b.ge LBB5_2 ; GISEL-NEXT: ; %bb.1: ; %if.end -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret ; GISEL-NEXT: LBB5_2: ; %if.then ; GISEL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill ; GISEL-NEXT: bl _foo ; GISEL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; GISEL-NEXT: mov w0, #7 +; GISEL-NEXT: mov w0, #7 ; =0x7 ; GISEL-NEXT: ret entry: %cmp = icmp sgt i32 %a, 0 @@ -318,7 +318,7 @@ ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: LBB7_2: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 0 @@ -346,13 +346,13 @@ ; CHECK-NEXT: cmp w1, #32 ; CHECK-NEXT: b.eq LBB8_3 ; CHECK-NEXT: ; %bb.2: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret ; CHECK-NEXT: LBB8_3: ; %if.then ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 5 @@ -380,7 +380,7 @@ ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: LBB9_2: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 0 @@ -408,7 +408,7 @@ ; CHECK-NEXT: bl _foo ; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-NEXT: LBB10_2: ; %if.end -; CHECK-NEXT: mov w0, #7 +; CHECK-NEXT: mov w0, #7 ; =0x7 ; CHECK-NEXT: ret entry: %cmp = icmp eq i32 %a, 0 @@ -466,7 +466,7 @@ ; ; GISEL-LABEL: select_and: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #5 +; GISEL-NEXT: mov w8, #5 ; =0x5 ; GISEL-NEXT: cmp w8, w1 ; GISEL-NEXT: ccmp w0, w1, #0, ne ; GISEL-NEXT: csel x0, x2, x3, lt @@ -488,7 +488,7 @@ ; ; GISEL-LABEL: select_or: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #5 +; GISEL-NEXT: mov w8, #5 ; =0x5 ; GISEL-NEXT: cmp w8, w1 ; GISEL-NEXT: ccmp w0, w1, #8, eq ; GISEL-NEXT: csel x0, x2, x3, lt @@ -510,7 +510,7 @@ ; ; GISEL-LABEL: select_or_float: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #5 +; GISEL-NEXT: mov w8, #5 ; =0x5 ; GISEL-NEXT: cmp w8, w1 ; GISEL-NEXT: ccmp w0, w1, #8, eq ; GISEL-NEXT: fcsel s0, s0, s1, lt @@ -528,13 +528,13 @@ ; SDISEL-NEXT: cmp x0, #2 ; SDISEL-NEXT: ccmp x0, #4, #4, ne ; SDISEL-NEXT: ccmp x1, #0, #0, eq -; SDISEL-NEXT: mov w8, #1 +; SDISEL-NEXT: mov w8, #1 ; =0x1 ; SDISEL-NEXT: cinc x0, x8, eq ; SDISEL-NEXT: ret ; ; GISEL-LABEL: gccbug: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #2 +; GISEL-NEXT: mov w8, #2 ; =0x2 ; GISEL-NEXT: cmp x0, #2 ; GISEL-NEXT: ccmp x0, #4, #4, ne ; GISEL-NEXT: ccmp x1, #0, #0, eq @@ -592,7 +592,7 @@ ; SDISEL-LABEL: select_andor32: ; SDISEL: ; %bb.0: ; SDISEL-NEXT: cmp w1, w2 -; SDISEL-NEXT: mov w8, #32 +; SDISEL-NEXT: mov w8, #32 ; =0x20 ; SDISEL-NEXT: ccmp w0, w8, #4, lt ; SDISEL-NEXT: ccmp w0, w1, #0, eq ; SDISEL-NEXT: csel w0, w0, w1, eq @@ -600,7 +600,7 @@ ; ; GISEL-LABEL: select_andor32: ; GISEL: ; %bb.0: -; GISEL-NEXT: mov w8, #32 +; GISEL-NEXT: mov w8, #32 ; =0x20 ; GISEL-NEXT: cmp w1, w2 ; GISEL-NEXT: ccmp w0, w8, #4, lt ; GISEL-NEXT: ccmp w0, w1, #0, eq @@ -663,8 +663,7 @@ ; SDISEL-NEXT: cmp x0, #0 ; SDISEL-NEXT: ccmp x0, #13, #0, ge ; SDISEL-NEXT: cset w8, gt -; SDISEL-NEXT: cmp w8, #0 -; SDISEL-NEXT: csel x0, xzr, x3, ne +; SDISEL-NEXT: csel x0, xzr, x3, gt ; SDISEL-NEXT: sbfx w8, w8, #0, #1 ; SDISEL-NEXT: adrp x9, _g@PAGE ; SDISEL-NEXT: str w8, [x9, _g@PAGEOFF] @@ -701,11 +700,11 @@ ; SDISEL-NEXT: ccmp w0, #13, #0, ge ; SDISEL-NEXT: cset w8, gt ; SDISEL-NEXT: cmp w0, #22 -; SDISEL-NEXT: mov w9, #44 +; SDISEL-NEXT: mov w9, #44 ; =0x2c ; SDISEL-NEXT: ccmp w0, w9, #0, ge ; SDISEL-NEXT: csel w8, wzr, w8, le ; SDISEL-NEXT: cmp w0, #99 -; SDISEL-NEXT: mov w9, #77 +; SDISEL-NEXT: mov w9, #77 ; =0x4d ; SDISEL-NEXT: ccmp w0, w9, #4, ne ; SDISEL-NEXT: cset w9, eq ; SDISEL-NEXT: tst w8, w9 diff --git a/llvm/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll --- a/llvm/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll +++ b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll @@ -14,7 +14,9 @@ define void @test(ptr nocapture %su) { ; CHECK-LABEL: test: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: str wzr, [x0, #96] +; CHECK-NEXT: ldrh w8, [x0, #100] +; CHECK-NEXT: lsl x8, x8, #32 +; CHECK-NEXT: str w8, [x0, #96] ; 
CHECK-NEXT: ret entry: %r1 = getelementptr inbounds %"struct.SU", ptr %su, i64 1, i32 5 diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll --- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll @@ -593,7 +593,7 @@ define ptr @test_v16i8_post_reg_st1_lane(<16 x i8> %in, ptr %addr) { ; CHECK-LABEL: test_v16i8_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: mov w8, #2 ; =0x2 ; CHECK-NEXT: st1.b { v0 }[3], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <16 x i8> %in, i32 3 @@ -619,7 +619,7 @@ define ptr @test_v8i16_post_reg_st1_lane(<8 x i16> %in, ptr %addr) { ; CHECK-LABEL: test_v8i16_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: mov w8, #4 ; =0x4 ; CHECK-NEXT: st1.h { v0 }[3], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <8 x i16> %in, i32 3 @@ -644,7 +644,7 @@ define ptr @test_v4i32_post_reg_st1_lane(<4 x i32> %in, ptr %addr) { ; CHECK-LABEL: test_v4i32_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: st1.s { v0 }[3], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <4 x i32> %in, i32 3 @@ -669,7 +669,7 @@ define ptr @test_v4f32_post_reg_st1_lane(<4 x float> %in, ptr %addr) { ; CHECK-LABEL: test_v4f32_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: st1.s { v0 }[3], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <4 x float> %in, i32 3 @@ -694,7 +694,7 @@ define ptr @test_v2i64_post_reg_st1_lane(<2 x i64> %in, ptr %addr) { ; CHECK-LABEL: test_v2i64_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: mov w8, #16 ; =0x10 ; CHECK-NEXT: st1.d { v0 }[1], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <2 x i64> %in, i64 1 @@ -719,7 +719,7 @@ define ptr @test_v2f64_post_reg_st1_lane(<2 x double> %in, ptr %addr) { ; CHECK-LABEL: test_v2f64_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: mov w8, #16 ; =0x10 ; CHECK-NEXT: st1.d { v0 }[1], [x0], x8 ; CHECK-NEXT: ret %elt = extractelement <2 x double> %in, i32 1 @@ -745,7 +745,7 @@ define ptr @test_v8i8_post_reg_st1_lane(<8 x i8> %in, ptr %addr) { ; CHECK-LABEL: test_v8i8_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: mov w8, #2 ; =0x2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: st1.b { v0 }[3], [x0], x8 ; CHECK-NEXT: ret @@ -772,7 +772,7 @@ define ptr @test_v4i16_post_reg_st1_lane(<4 x i16> %in, ptr %addr) { ; CHECK-LABEL: test_v4i16_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: mov w8, #4 ; =0x4 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: st1.h { v0 }[3], [x0], x8 ; CHECK-NEXT: ret @@ -799,7 +799,7 @@ define ptr @test_v2i32_post_reg_st1_lane(<2 x i32> %in, ptr %addr) { ; CHECK-LABEL: test_v2i32_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: st1.s { v0 }[1], [x0], x8 ; CHECK-NEXT: ret @@ -826,7 +826,7 @@ define ptr @test_v2f32_post_reg_st1_lane(<2 x float> %in, ptr %addr) { ; CHECK-LABEL: test_v2f32_post_reg_st1_lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: st1.s { v0 }[1], [x0], x8 ; CHECK-NEXT: ret @@ -8271,8 +8271,9 @@ define <16 x i8> 
@test_v16i8_post_imm_ld1r(ptr %bar, ptr %ptr) { ; CHECK-LABEL: test_v16i8_post_imm_ld1r: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.16b { v0 }, [x0], #1 +; CHECK-NEXT: ldrb w8, [x0], #1 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: dup.16b v0, w8 ; CHECK-NEXT: ret %tmp1 = load i8, ptr %bar %tmp2 = insertelement <16 x i8> , i8 %tmp1, i32 0 @@ -8327,8 +8328,9 @@ define <8 x i8> @test_v8i8_post_imm_ld1r(ptr %bar, ptr %ptr) { ; CHECK-LABEL: test_v8i8_post_imm_ld1r: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.8b { v0 }, [x0], #1 +; CHECK-NEXT: ldrb w8, [x0], #1 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: dup.8b v0, w8 ; CHECK-NEXT: ret %tmp1 = load i8, ptr %bar %tmp2 = insertelement <8 x i8> , i8 %tmp1, i32 0 @@ -8367,8 +8369,9 @@ define <8 x i16> @test_v8i16_post_imm_ld1r(ptr %bar, ptr %ptr) { ; CHECK-LABEL: test_v8i16_post_imm_ld1r: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.8h { v0 }, [x0], #2 +; CHECK-NEXT: ldrh w8, [x0], #2 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: dup.8h v0, w8 ; CHECK-NEXT: ret %tmp1 = load i16, ptr %bar %tmp2 = insertelement <8 x i16> , i16 %tmp1, i32 0 @@ -8408,8 +8411,9 @@ define <4 x i16> @test_v4i16_post_imm_ld1r(ptr %bar, ptr %ptr) { ; CHECK-LABEL: test_v4i16_post_imm_ld1r: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 +; CHECK-NEXT: ldrh w8, [x0], #2 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: dup.4h v0, w8 ; CHECK-NEXT: ret %tmp1 = load i16, ptr %bar %tmp2 = insertelement <4 x i16> , i16 %tmp1, i32 0 @@ -8441,8 +8445,9 @@ define <4 x i32> @test_v4i32_post_imm_ld1r(ptr %bar, ptr %ptr) { ; CHECK-LABEL: test_v4i32_post_imm_ld1r: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.4s { v0 }, [x0], #4 +; CHECK-NEXT: ldr w8, [x0], #4 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: dup.4s v0, w8 ; CHECK-NEXT: ret %tmp1 = load i32, ptr %bar %tmp2 = insertelement <4 x i32> , i32 %tmp1, i32 0 @@ -8474,8 +8479,9 @@ define <2 x i32> @test_v2i32_post_imm_ld1r(ptr %bar, ptr %ptr) { ; CHECK-LABEL: test_v2i32_post_imm_ld1r: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.2s { v0 }, [x0], #4 +; CHECK-NEXT: ldr w8, [x0], #4 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: dup.2s v0, w8 ; CHECK-NEXT: ret %tmp1 = load i32, ptr %bar %tmp2 = insertelement <2 x i32> , i32 %tmp1, i32 0 @@ -8503,8 +8509,9 @@ define <2 x i64> @test_v2i64_post_imm_ld1r(ptr %bar, ptr %ptr) { ; CHECK-LABEL: test_v2i64_post_imm_ld1r: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.2d { v0 }, [x0], #8 +; CHECK-NEXT: ldr x8, [x0], #8 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: dup.2d v0, x8 ; CHECK-NEXT: ret %tmp1 = load i64, ptr %bar %tmp2 = insertelement <2 x i64> , i64 %tmp1, i32 0 @@ -8532,8 +8539,9 @@ define <4 x float> @test_v4f32_post_imm_ld1r(ptr %bar, ptr %ptr) { ; CHECK-LABEL: test_v4f32_post_imm_ld1r: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.4s { v0 }, [x0], #4 +; CHECK-NEXT: ldr s0, [x0], #4 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: dup.4s v0, v0[0] ; CHECK-NEXT: ret %tmp1 = load float, ptr %bar %tmp2 = insertelement <4 x float> , float %tmp1, i32 0 @@ -8565,8 +8573,9 @@ define <2 x float> @test_v2f32_post_imm_ld1r(ptr %bar, ptr %ptr) { ; CHECK-LABEL: test_v2f32_post_imm_ld1r: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.2s { v0 }, [x0], #4 +; CHECK-NEXT: ldr s0, [x0], #4 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: dup.2s v0, v0[0] ; CHECK-NEXT: ret %tmp1 = load float, ptr %bar %tmp2 = insertelement <2 x float> , float %tmp1, i32 0 @@ -8594,8 +8603,9 @@ define <2 x double> @test_v2f64_post_imm_ld1r(ptr %bar, ptr %ptr) { ; CHECK-LABEL: test_v2f64_post_imm_ld1r: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.2d { v0 }, [x0], #8 +; CHECK-NEXT: ldr d0, [x0], #8 ; CHECK-NEXT: str x0, 
[x1] +; CHECK-NEXT: dup.2d v0, v0[0] ; CHECK-NEXT: ret %tmp1 = load double, ptr %bar %tmp2 = insertelement <2 x double> , double %tmp1, i32 0 @@ -8623,8 +8633,9 @@ define <16 x i8> @test_v16i8_post_imm_ld1lane(ptr %bar, ptr %ptr, <16 x i8> %A) { ; CHECK-LABEL: test_v16i8_post_imm_ld1lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1.b { v0 }[1], [x0], #1 +; CHECK-NEXT: ldrb w8, [x0], #1 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: mov.b v0[1], w8 ; CHECK-NEXT: ret %tmp1 = load i8, ptr %bar %tmp2 = insertelement <16 x i8> %A, i8 %tmp1, i32 1 @@ -8649,10 +8660,11 @@ define <8 x i8> @test_v8i8_post_imm_ld1lane(ptr %bar, ptr %ptr, <8 x i8> %A) { ; CHECK-LABEL: test_v8i8_post_imm_ld1lane: ; CHECK: ; %bb.0: +; CHECK-NEXT: ldrb w8, [x0], #1 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ld1.b { v0 }[1], [x0], #1 -; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: mov.b v0[1], w8 +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp1 = load i8, ptr %bar %tmp2 = insertelement <8 x i8> %A, i8 %tmp1, i32 1 @@ -8679,8 +8691,9 @@ define <8 x i16> @test_v8i16_post_imm_ld1lane(ptr %bar, ptr %ptr, <8 x i16> %A) { ; CHECK-LABEL: test_v8i16_post_imm_ld1lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1.h { v0 }[1], [x0], #2 +; CHECK-NEXT: ldrh w8, [x0], #2 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: mov.h v0[1], w8 ; CHECK-NEXT: ret %tmp1 = load i16, ptr %bar %tmp2 = insertelement <8 x i16> %A, i16 %tmp1, i32 1 @@ -8706,10 +8719,11 @@ define <4 x i16> @test_v4i16_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x i16> %A) { ; CHECK-LABEL: test_v4i16_post_imm_ld1lane: ; CHECK: ; %bb.0: +; CHECK-NEXT: ldrh w8, [x0], #2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ld1.h { v0 }[1], [x0], #2 -; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: mov.h v0[1], w8 +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp1 = load i16, ptr %bar %tmp2 = insertelement <4 x i16> %A, i16 %tmp1, i32 1 @@ -8737,8 +8751,9 @@ define <4 x i32> @test_v4i32_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x i32> %A) { ; CHECK-LABEL: test_v4i32_post_imm_ld1lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1.s { v0 }[1], [x0], #4 +; CHECK-NEXT: ldr w8, [x0], #4 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: mov.s v0[1], w8 ; CHECK-NEXT: ret %tmp1 = load i32, ptr %bar %tmp2 = insertelement <4 x i32> %A, i32 %tmp1, i32 1 @@ -8764,10 +8779,11 @@ define <2 x i32> @test_v2i32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x i32> %A) { ; CHECK-LABEL: test_v2i32_post_imm_ld1lane: ; CHECK: ; %bb.0: +; CHECK-NEXT: ldr w8, [x0], #4 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ld1.s { v0 }[1], [x0], #4 -; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: mov.s v0[1], w8 +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp1 = load i32, ptr %bar %tmp2 = insertelement <2 x i32> %A, i32 %tmp1, i32 1 @@ -8795,8 +8811,9 @@ define <2 x i64> @test_v2i64_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x i64> %A) { ; CHECK-LABEL: test_v2i64_post_imm_ld1lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1.d { v0 }[1], [x0], #8 +; CHECK-NEXT: ldr x8, [x0], #8 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: mov.d v0[1], x8 ; CHECK-NEXT: ret %tmp1 = load i64, ptr %bar %tmp2 = insertelement <2 x i64> %A, i64 %tmp1, i32 1 @@ -8822,8 +8839,9 @@ define <4 x float> @test_v4f32_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x float> %A) { ; CHECK-LABEL: test_v4f32_post_imm_ld1lane: ; CHECK: ; %bb.0: -; 
CHECK-NEXT: ld1.s { v0 }[1], [x0], #4 +; CHECK-NEXT: ldr s1, [x0], #4 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: mov.s v0[1], v1[0] ; CHECK-NEXT: ret %tmp1 = load float, ptr %bar %tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1 @@ -8849,10 +8867,11 @@ define <2 x float> @test_v2f32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x float> %A) { ; CHECK-LABEL: test_v2f32_post_imm_ld1lane: ; CHECK: ; %bb.0: +; CHECK-NEXT: ldr s1, [x0], #4 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ld1.s { v0 }[1], [x0], #4 -; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: mov.s v0[1], v1[0] +; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp1 = load float, ptr %bar %tmp2 = insertelement <2 x float> %A, float %tmp1, i32 1 @@ -8880,8 +8899,9 @@ define <2 x double> @test_v2f64_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x double> %A) { ; CHECK-LABEL: test_v2f64_post_imm_ld1lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1.d { v0 }[1], [x0], #8 +; CHECK-NEXT: ldr d1, [x0], #8 ; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: mov.d v0[1], v1[0] ; CHECK-NEXT: ret %tmp1 = load double, ptr %bar %tmp2 = insertelement <2 x double> %A, double %tmp1, i32 1 @@ -9143,7 +9163,7 @@ ; CHECK-LABEL: load_single_extract_variable_index_v3i32_small_align: ; CHECK: ; %bb.0: ; CHECK-NEXT: mov w9, w1 -; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: mov w8, #2 ; =0x2 ; CHECK-NEXT: cmp x9, #2 ; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: ldr w0, [x0, x8, lsl #2] @@ -9157,7 +9177,7 @@ ; CHECK-LABEL: load_single_extract_variable_index_v3i32_default_align: ; CHECK: ; %bb.0: ; CHECK-NEXT: mov w9, w1 -; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: mov w8, #2 ; =0x2 ; CHECK-NEXT: cmp x9, #2 ; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: ldr w0, [x0, x8, lsl #2] diff --git a/llvm/test/CodeGen/AArch64/arm64-ld-from-st.ll b/llvm/test/CodeGen/AArch64/arm64-ld-from-st.ll --- a/llvm/test/CodeGen/AArch64/arm64-ld-from-st.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ld-from-st.ll @@ -319,9 +319,10 @@ define i16 @Str16Ldr16(ptr nocapture %P, i16 %v, i64 %n) { ; CHECK-LABEL: Str16Ldr16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: mov w0, w1 -; CHECK-NEXT: strh w1, [x8, #2] +; CHECK-NEXT: and w8, w1, #0xffff +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: strh w1, [x9, #2] ; CHECK-NEXT: ret entry: %arrayidx0 = getelementptr inbounds i16, ptr %P, i64 1 @@ -334,9 +335,8 @@ define i8 @Str16Ldr8_0(ptr nocapture %P, i16 %v, i64 %n) { ; CHECK-LABEL: Str16Ldr8_0: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: mov w0, w1 -; CHECK-NEXT: strh w1, [x8, #2] +; CHECK-NEXT: strh w1, [x0, #2] +; CHECK-NEXT: and w0, w1, #0xff ; CHECK-NEXT: ret entry: %arrayidx0 = getelementptr inbounds i16, ptr %P, i64 1 @@ -679,9 +679,10 @@ define i16 @Unscaled_Str16Ldr16(ptr nocapture %P, i16 %v, i64 %n) { ; CHECK-LABEL: Unscaled_Str16Ldr16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: mov w0, w1 -; CHECK-NEXT: sturh w1, [x8, #-2] +; CHECK-NEXT: and w8, w1, #0xffff +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: sturh w1, [x9, #-2] ; CHECK-NEXT: ret entry: %arrayidx0 = getelementptr inbounds i16, ptr %P, i64 -1 @@ -694,9 +695,8 @@ define i8 @Unscaled_Str16Ldr8_0(ptr nocapture %P, i16 %v, i64 %n) { ; CHECK-LABEL: Unscaled_Str16Ldr8_0: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: mov w0, w1 -; CHECK-NEXT: sturh w1, [x8, #-2] +; CHECK-NEXT: sturh w1, [x0, #-2] +; CHECK-NEXT: and w0, w1, #0xff ; 
CHECK-NEXT: ret entry: %arrayidx0 = getelementptr inbounds i16, ptr %P, i64 -1 diff --git a/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll --- a/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll +++ b/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll @@ -19,12 +19,12 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, src ; CHECK-NEXT: add x8, x8, :lo12:src -; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: ldur w9, [x8, #7] ; CHECK-NEXT: adrp x10, dst ; CHECK-NEXT: add x10, x10, :lo12:dst -; CHECK-NEXT: str x9, [x10] -; CHECK-NEXT: ldur w8, [x8, #7] -; CHECK-NEXT: stur w8, [x10, #7] +; CHECK-NEXT: stur w9, [x10, #7] +; CHECK-NEXT: ldr x8, [x8] +; CHECK-NEXT: str x8, [x10] ; CHECK-NEXT: mov w0, #0 // =0x0 ; CHECK-NEXT: ret entry: @@ -37,10 +37,10 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, .L.str1 ; CHECK-NEXT: add x8, x8, :lo12:.L.str1 -; CHECK-NEXT: ldr q0, [x8] -; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ldur q0, [x8, #15] ; CHECK-NEXT: stur q0, [x0, #15] +; CHECK-NEXT: ldr q0, [x8] +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str1, i64 31, i1 false) @@ -55,8 +55,8 @@ ; CHECK-NEXT: str w8, [x0, #32] ; CHECK-NEXT: adrp x8, .L.str2 ; CHECK-NEXT: add x8, x8, :lo12:.L.str2 -; CHECK-NEXT: ldp q0, q1, [x8] -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x8] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str2, i64 36, i1 false) @@ -68,10 +68,10 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, .L.str3 ; CHECK-NEXT: add x8, x8, :lo12:.L.str3 +; CHECK-NEXT: ldr x9, [x8, #16] +; CHECK-NEXT: str x9, [x0, #16] ; CHECK-NEXT: ldr q0, [x8] ; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: ldr x8, [x8, #16] -; CHECK-NEXT: str x8, [x0, #16] ; CHECK-NEXT: ret entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %C, ptr @.str3, i64 24, i1 false) @@ -113,12 +113,12 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, .L.str6 ; CHECK-NEXT: add x8, x8, :lo12:.L.str6 -; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: ldur x9, [x8, #6] ; CHECK-NEXT: adrp x10, spool.splbuf ; CHECK-NEXT: add x10, x10, :lo12:spool.splbuf -; CHECK-NEXT: str x9, [x10] -; CHECK-NEXT: ldur x8, [x8, #6] -; CHECK-NEXT: stur x8, [x10, #6] +; CHECK-NEXT: stur x9, [x10, #6] +; CHECK-NEXT: ldr x8, [x8] +; CHECK-NEXT: str x8, [x10] ; CHECK-NEXT: ret entry: call void @llvm.memcpy.p0.p0.i64(ptr @spool.splbuf, ptr @.str6, i64 14, i1 false) diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll @@ -2541,9 +2541,9 @@ ; CHECK-NEXT: adrp x9, .LCPI196_0 ; CHECK-NEXT: fmov d4, x0 ; CHECK-NEXT: rev32 v5.8h, v0.8h -; CHECK-NEXT: dup v1.8h, w8 +; CHECK-NEXT: dup v2.8h, w8 ; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI196_0] -; CHECK-NEXT: sqneg v2.8h, v1.8h +; CHECK-NEXT: sqneg v1.8h, v2.8h ; CHECK-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b ; CHECK-NEXT: sqdmull v2.4s, v0.4h, v4.h[0] ; CHECK-NEXT: sqdmull2 v0.4s, v0.8h, v4.h[0] diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1147,7 +1147,15 @@ ; CHECK-LABEL: testDUP.v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.8b, v0.b[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; 
CHECK-NEXT: mov v1.b[1], v0.b[0] +; CHECK-NEXT: mov v1.b[2], v0.b[0] +; CHECK-NEXT: mov v1.b[3], v0.b[0] +; CHECK-NEXT: mov v1.b[4], v0.b[0] +; CHECK-NEXT: mov v1.b[5], v0.b[0] +; CHECK-NEXT: mov v1.b[6], v0.b[0] +; CHECK-NEXT: mov v1.b[7], v0.b[0] +; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret %b = extractelement <1 x i8> %a, i32 0 %c = insertelement <8 x i8> undef, i8 %b, i32 0 @@ -1165,7 +1173,15 @@ ; CHECK-LABEL: testDUP.v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.8h, v0.h[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v1.h[1], v0.h[0] +; CHECK-NEXT: mov v1.h[2], v0.h[0] +; CHECK-NEXT: mov v1.h[3], v0.h[0] +; CHECK-NEXT: mov v1.h[4], v0.h[0] +; CHECK-NEXT: mov v1.h[5], v0.h[0] +; CHECK-NEXT: mov v1.h[6], v0.h[0] +; CHECK-NEXT: mov v1.h[7], v0.h[0] +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %b = extractelement <1 x i16> %a, i32 0 %c = insertelement <8 x i16> undef, i16 %b, i32 0 @@ -1183,7 +1199,11 @@ ; CHECK-LABEL: testDUP.v1i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v1.s[1], v0.s[0] +; CHECK-NEXT: mov v1.s[2], v0.s[0] +; CHECK-NEXT: mov v1.s[3], v0.s[0] +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %b = extractelement <1 x i32> %a, i32 0 %c = insertelement <4 x i32> undef, i32 %b, i32 0 @@ -1196,7 +1216,15 @@ define <8 x i8> @getl(<16 x i8> %x) #0 { ; CHECK-LABEL: getl: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v1.b[1], v0.b[1] +; CHECK-NEXT: mov v1.b[2], v0.b[2] +; CHECK-NEXT: mov v1.b[3], v0.b[3] +; CHECK-NEXT: mov v1.b[4], v0.b[4] +; CHECK-NEXT: mov v1.b[5], v0.b[5] +; CHECK-NEXT: mov v1.b[6], v0.b[6] +; CHECK-NEXT: mov v1.b[7], v0.b[7] +; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret %vecext = extractelement <16 x i8> %x, i32 0 %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0 @@ -1310,7 +1338,11 @@ ; CHECK-LABEL: test_dup_v1i64_v4i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v1.h[1], v0.h[0] +; CHECK-NEXT: mov v1.h[2], v0.h[0] +; CHECK-NEXT: mov v1.h[3], v0.h[0] +; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret entry: %x = extractelement <1 x i64> %a, i32 0 @@ -1326,7 +1358,8 @@ ; CHECK-LABEL: test_dup_v1i64_v2i32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: mov v0.s[1], v0.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: %x = extractelement <1 x i64> %a, i32 0 @@ -1388,7 +1421,11 @@ define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v1.h[1], v0.h[0] +; CHECK-NEXT: mov v1.h[2], v0.h[0] +; CHECK-NEXT: mov v1.h[3], v0.h[0] +; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret entry: %x = extractelement <2 x i64> %a, i32 0 @@ -1403,7 +1440,8 @@ define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: mov v0.s[1], v0.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: %x = extractelement <2 x i64> %a, i32 0 @@ -1472,7 +1510,8 @@ ; CHECK-LABEL: test_concat_same_v1i32_v1i32: ; CHECK: // 
%bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: dup v0.2s, v0.s[0] +; CHECK-NEXT: mov v0.s[1], v0.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: %0 = extractelement <2 x i32> %a, i32 0 @@ -1515,7 +1554,16 @@ ; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v2.b[1], v0.b[1] +; CHECK-NEXT: mov v2.b[2], v0.b[2] +; CHECK-NEXT: mov v2.b[3], v0.b[3] +; CHECK-NEXT: mov v2.b[4], v0.b[4] +; CHECK-NEXT: mov v2.b[5], v0.b[5] +; CHECK-NEXT: mov v2.b[6], v0.b[6] +; CHECK-NEXT: mov v2.b[7], v0.b[7] +; CHECK-NEXT: mov v2.d[1], v1.d[0] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -1542,7 +1590,14 @@ ; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v0.b[8], v1.b[0] +; CHECK-NEXT: mov v0.b[9], v1.b[1] +; CHECK-NEXT: mov v0.b[10], v1.b[2] +; CHECK-NEXT: mov v0.b[11], v1.b[3] +; CHECK-NEXT: mov v0.b[12], v1.b[4] +; CHECK-NEXT: mov v0.b[13], v1.b[5] +; CHECK-NEXT: mov v0.b[14], v1.b[6] +; CHECK-NEXT: mov v0.b[15], v1.b[7] ; CHECK-NEXT: ret entry: %vecext = extractelement <16 x i8> %x, i32 0 @@ -1584,8 +1639,24 @@ ; CHECK-LABEL: test_concat_v16i8_v8i8_v8i8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v2.16b, v0.16b ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v2.b[1], v0.b[1] +; CHECK-NEXT: mov v2.b[2], v0.b[2] +; CHECK-NEXT: mov v2.b[3], v0.b[3] +; CHECK-NEXT: mov v2.b[4], v0.b[4] +; CHECK-NEXT: mov v2.b[5], v0.b[5] +; CHECK-NEXT: mov v2.b[6], v0.b[6] +; CHECK-NEXT: mov v2.b[7], v0.b[7] +; CHECK-NEXT: mov v2.b[8], v1.b[0] +; CHECK-NEXT: mov v2.b[9], v1.b[1] +; CHECK-NEXT: mov v2.b[10], v1.b[2] +; CHECK-NEXT: mov v2.b[11], v1.b[3] +; CHECK-NEXT: mov v2.b[12], v1.b[4] +; CHECK-NEXT: mov v2.b[13], v1.b[5] +; CHECK-NEXT: mov v2.b[14], v1.b[6] +; CHECK-NEXT: mov v2.b[15], v1.b[7] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -1637,7 +1708,12 @@ ; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov v2.h[1], v0.h[1] +; CHECK-NEXT: mov v2.h[2], v0.h[2] +; CHECK-NEXT: mov v2.h[3], v0.h[3] +; CHECK-NEXT: mov v2.d[1], v1.d[0] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -1656,7 +1732,10 @@ ; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v0.h[4], v1.h[0] +; CHECK-NEXT: mov v0.h[5], v1.h[1] +; CHECK-NEXT: mov v0.h[6], v1.h[2] +; CHECK-NEXT: mov v0.h[7], v1.h[3] ; CHECK-NEXT: ret entry: %vecext = extractelement <8 x i16> %x, i32 0 @@ -1682,8 +1761,16 @@ ; CHECK-LABEL: test_concat_v8i16_v4i16_v4i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v2.16b, v0.16b ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v2.h[1], v0.h[1] +; CHECK-NEXT: mov v2.h[2], v0.h[2] +; CHECK-NEXT: mov 
v2.h[3], v0.h[3] +; CHECK-NEXT: mov v2.h[4], v1.h[0] +; CHECK-NEXT: mov v2.h[5], v1.h[1] +; CHECK-NEXT: mov v2.h[6], v1.h[2] +; CHECK-NEXT: mov v2.h[7], v1.h[3] +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -1719,6 +1806,7 @@ ; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.s[1], v0.s[1] ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret entry: @@ -1734,7 +1822,8 @@ ; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: mov v0.s[3], v1.s[1] ; CHECK-NEXT: ret entry: %vecext = extractelement <4 x i32> %x, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll b/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-vector-shuffle-extract.ll @@ -4,10 +4,10 @@ define void @test(ptr %p1, ptr %p2) { ; CHECK-LABEL: test: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #3 // =0x3 -; CHECK-NEXT: mov w9, #1 // =0x1 -; CHECK-NEXT: str w8, [x0] -; CHECK-NEXT: str w9, [x1] +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: st1 { v0.s }[1], [x0] +; CHECK-NEXT: st1 { v0.s }[2], [x1] ; CHECK-NEXT: ret %tmp = shufflevector <1 x i32> , <1 x i32> undef, <3 x i32> %tmp2 = shufflevector <3 x i32> , <3 x i32> %tmp, <3 x i32> diff --git a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll --- a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll @@ -128,15 +128,9 @@ define void @i56_or(ptr %a) { ; CHECK-LABEL: i56_or: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: ldrh w10, [x8, #4]! -; CHECK-NEXT: ldrb w11, [x8, #2] -; CHECK-NEXT: orr w9, w9, #0x180 -; CHECK-NEXT: orr w10, w10, w11, lsl #16 -; CHECK-NEXT: str w9, [x0] -; CHECK-NEXT: strb w11, [x8, #2] -; CHECK-NEXT: strh w10, [x8] +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: orr w8, w8, #0x180 +; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret %aa = load i56, ptr %a, align 1 %b = or i56 %aa, 384 @@ -147,16 +141,10 @@ define void @i56_and_or(ptr %a) { ; CHECK-LABEL: i56_and_or: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: ldrh w10, [x8, #4]! -; CHECK-NEXT: ldrb w11, [x8, #2] -; CHECK-NEXT: orr w9, w9, #0x180 -; CHECK-NEXT: and w9, w9, #0xffffff80 -; CHECK-NEXT: orr w10, w10, w11, lsl #16 -; CHECK-NEXT: strb w11, [x8, #2] -; CHECK-NEXT: str w9, [x0] -; CHECK-NEXT: strh w10, [x8] +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: orr w8, w8, #0x180 +; CHECK-NEXT: and w8, w8, #0xffffff80 +; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret %b = load i56, ptr %a, align 1 %c = and i56 %b, -128 @@ -168,17 +156,18 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) { ; CHECK-LABEL: i56_insert_bit: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: ldr w11, [x0] -; CHECK-NEXT: ldrh w9, [x8, #4]! 
-; CHECK-NEXT: ldrb w10, [x8, #2] -; CHECK-NEXT: orr w9, w9, w10, lsl #16 -; CHECK-NEXT: strb w10, [x8, #2] -; CHECK-NEXT: orr x11, x11, x9, lsl #32 -; CHECK-NEXT: and x11, x11, #0xffffffffffffdfff -; CHECK-NEXT: strh w9, [x8] -; CHECK-NEXT: orr w11, w11, w1, lsl #13 -; CHECK-NEXT: str w11, [x0] +; CHECK-NEXT: ldrb w8, [x0, #6] +; CHECK-NEXT: ldrh w9, [x0, #4] +; CHECK-NEXT: ldr w10, [x0] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: orr x8, x10, x8, lsl #32 +; CHECK-NEXT: and x8, x8, #0xffffffffffffdfff +; CHECK-NEXT: lsr x9, x8, #48 +; CHECK-NEXT: lsr x10, x8, #32 +; CHECK-NEXT: orr w8, w8, w1, lsl #13 +; CHECK-NEXT: strb w9, [x0, #6] +; CHECK-NEXT: strh w10, [x0, #4] +; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret %extbit = zext i1 %bit to i56 %b = load i56, ptr %a, align 1 diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll --- a/llvm/test/CodeGen/AArch64/arm64-rev.ll +++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll @@ -60,8 +60,7 @@ ; CHECK-LABEL: test_rev_w_srl16_load: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: rev w8, w8 -; CHECK-NEXT: lsr w0, w8, #16 +; CHECK-NEXT: rev16 w0, w8 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_rev_w_srl16_load: @@ -129,8 +128,7 @@ ; CHECK-LABEL: test_rev_x_srl32_load: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: rev x8, x8 -; CHECK-NEXT: lsr x0, x8, #32 +; CHECK-NEXT: rev32 x0, x8 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_rev_x_srl32_load: diff --git a/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll b/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll --- a/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll +++ b/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll @@ -34,8 +34,9 @@ define signext i16 @extendedLeftShiftcharToshortBy8(i8 signext %a) nounwind readnone ssp { ; CHECK-LABEL: extendedLeftShiftcharToshortBy8: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: add w8, w0, #1 -; CHECK-NEXT: sbfiz w0, w8, #8, #8 +; CHECK-NEXT: lsl w8, w0, #8 +; CHECK-NEXT: add w8, w8, #256 +; CHECK-NEXT: sxth w0, w8 ; CHECK-NEXT: ret entry: %inc = add i8 %a, 1 @@ -328,8 +329,9 @@ define i64 @sign_extend_inreg_isdef32(i64) { ; CHECK-LABEL: sign_extend_inreg_isdef32: ; CHECK: ; %bb.0: -; CHECK-NEXT: sbfx x8, x0, #32, #16 -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: lsr x8, x0, #16 +; CHECK-NEXT: and w8, w8, #0xffff0000 +; CHECK-NEXT: asr w0, w8, #16 ; CHECK-NEXT: ret %2 = lshr i64 %0, 32 %3 = shl i64 %2, 16 diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -49,12 +49,12 @@ ; FALLBACK-NOT: remark:{{.*}} sabdl2_8h define <8 x i16> @sabdl2_8h(ptr %A, ptr %B) nounwind { -; DAG-LABEL: sabdl2_8h: -; DAG: // %bb.0: -; DAG-NEXT: ldr d0, [x0, #8] -; DAG-NEXT: ldr d1, [x1, #8] -; DAG-NEXT: sabdl.8h v0, v0, v1 -; DAG-NEXT: ret +; DAG-LABEL: sabdl2_8h: +; DAG: // %bb.0: +; DAG-NEXT: ldr d0, [x0, #8] +; DAG-NEXT: ldr d1, [x1, #8] +; DAG-NEXT: sabdl.8h v0, v0, v1 +; DAG-NEXT: ret ; ; GISEL-LABEL: sabdl2_8h: ; GISEL: // %bb.0: @@ -62,7 +62,7 @@ ; GISEL-NEXT: ldr q1, [x1] ; GISEL-NEXT: ext.16b v0, v0, v0, #8 ; GISEL-NEXT: ext.16b v1, v1, v0, #8 -; GISEL-NEXT: sabdl.8h v0, v0, v1 +; GISEL-NEXT: sabdl.8h v0, v0, v1 ; GISEL-NEXT: ret %load1 = load <16 x i8>, ptr %A %load2 = load <16 x i8>, ptr %B @@ -75,12 +75,12 @@ ; FALLBACK-NOT: remark:{{.*}} sabdl2_4s define <4 x i32> @sabdl2_4s(ptr %A, ptr %B) nounwind { -; DAG-LABEL: sabdl2_4s: -; DAG: // %bb.0: -; DAG-NEXT: ldr d0, [x0, #8] 
-; DAG-NEXT: ldr d1, [x1, #8] -; DAG-NEXT: sabdl.4s v0, v0, v1 -; DAG-NEXT: ret +; DAG-LABEL: sabdl2_4s: +; DAG: // %bb.0: +; DAG-NEXT: ldr d0, [x0, #8] +; DAG-NEXT: ldr d1, [x1, #8] +; DAG-NEXT: sabdl.4s v0, v0, v1 +; DAG-NEXT: ret ; ; GISEL-LABEL: sabdl2_4s: ; GISEL: // %bb.0: @@ -101,12 +101,12 @@ ; FALLBACK-NOT: remark:{{.*}} sabdl2_2d define <2 x i64> @sabdl2_2d(ptr %A, ptr %B) nounwind { -; DAG-LABEL: sabdl2_2d: -; DAG: // %bb.0: -; DAG-NEXT: ldr d0, [x0, #8] -; DAG-NEXT: ldr d1, [x1, #8] -; DAG-NEXT: sabdl.2d v0, v0, v1 -; DAG-NEXT: ret +; DAG-LABEL: sabdl2_2d: +; DAG: // %bb.0: +; DAG-NEXT: ldr d0, [x0, #8] +; DAG-NEXT: ldr d1, [x1, #8] +; DAG-NEXT: sabdl.2d v0, v0, v1 +; DAG-NEXT: ret ; ; GISEL-LABEL: sabdl2_2d: ; GISEL: // %bb.0: @@ -172,12 +172,12 @@ ; FALLBACK-NOT: remark:{{.*}} uabdl2_8h define <8 x i16> @uabdl2_8h(ptr %A, ptr %B) nounwind { -; DAG-LABEL: uabdl2_8h: -; DAG: // %bb.0: -; DAG-NEXT: ldr d0, [x0, #8] -; DAG-NEXT: ldr d1, [x1, #8] -; DAG-NEXT: uabdl.8h v0, v0, v1 -; DAG-NEXT: ret +; DAG-LABEL: uabdl2_8h: +; DAG: // %bb.0: +; DAG-NEXT: ldr d0, [x0, #8] +; DAG-NEXT: ldr d1, [x1, #8] +; DAG-NEXT: uabdl.8h v0, v0, v1 +; DAG-NEXT: ret ; ; GISEL-LABEL: uabdl2_8h: ; GISEL: // %bb.0: @@ -185,7 +185,7 @@ ; GISEL-NEXT: ldr q1, [x1] ; GISEL-NEXT: ext.16b v0, v0, v0, #8 ; GISEL-NEXT: ext.16b v1, v1, v0, #8 -; GISEL-NEXT: uabdl.8h v0, v0, v1 +; GISEL-NEXT: uabdl.8h v0, v0, v1 ; GISEL-NEXT: ret %load1 = load <16 x i8>, ptr %A %load2 = load <16 x i8>, ptr %B @@ -199,12 +199,12 @@ ; FALLBACK-NOT: remark:{{.*}} uabdl2_4s define <4 x i32> @uabdl2_4s(ptr %A, ptr %B) nounwind { -; DAG-LABEL: uabdl2_4s: -; DAG: // %bb.0: -; DAG-NEXT: ldr d0, [x0, #8] -; DAG-NEXT: ldr d1, [x1, #8] -; DAG-NEXT: uabdl.4s v0, v0, v1 -; DAG-NEXT: ret +; DAG-LABEL: uabdl2_4s: +; DAG: // %bb.0: +; DAG-NEXT: ldr d0, [x0, #8] +; DAG-NEXT: ldr d1, [x1, #8] +; DAG-NEXT: uabdl.4s v0, v0, v1 +; DAG-NEXT: ret ; ; GISEL-LABEL: uabdl2_4s: ; GISEL: // %bb.0: @@ -212,7 +212,7 @@ ; GISEL-NEXT: ldr q1, [x1] ; GISEL-NEXT: ext.16b v0, v0, v0, #8 ; GISEL-NEXT: ext.16b v1, v1, v0, #8 -; GISEL-NEXT: uabdl.4s v0, v0, v1 +; GISEL-NEXT: uabdl.4s v0, v0, v1 ; GISEL-NEXT: ret %load1 = load <8 x i16>, ptr %A %load2 = load <8 x i16>, ptr %B @@ -225,12 +225,12 @@ ; FALLBACK-NOT: remark:{{.*}} uabdl2_2d define <2 x i64> @uabdl2_2d(ptr %A, ptr %B) nounwind { -; DAG-LABEL: uabdl2_2d: -; DAG: // %bb.0: -; DAG-NEXT: ldr d0, [x0, #8] -; DAG-NEXT: ldr d1, [x1, #8] -; DAG-NEXT: uabdl.2d v0, v0, v1 -; DAG-NEXT: ret +; DAG-LABEL: uabdl2_2d: +; DAG: // %bb.0: +; DAG-NEXT: ldr d0, [x0, #8] +; DAG-NEXT: ldr d1, [x1, #8] +; DAG-NEXT: uabdl.2d v0, v0, v1 +; DAG-NEXT: ret ; ; GISEL-LABEL: uabdl2_2d: ; GISEL: // %bb.0: @@ -238,7 +238,7 @@ ; GISEL-NEXT: ldr q1, [x1] ; GISEL-NEXT: ext.16b v0, v0, v0, #8 ; GISEL-NEXT: ext.16b v1, v1, v0, #8 -; GISEL-NEXT: uabdl.2d v0, v0, v1 +; GISEL-NEXT: uabdl.2d v0, v0, v1 ; GISEL-NEXT: ret %load1 = load <4 x i32>, ptr %A %load2 = load <4 x i32>, ptr %B @@ -276,9 +276,20 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: uabd16b_rdx_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: uabdl.8h v2, v0, v1 -; CHECK-NEXT: uabal2.8h v2, v0, v1 -; CHECK-NEXT: uaddlv.8h s0, v2 +; CHECK-NEXT: usubl.8h v2, v0, v1 +; CHECK-NEXT: usubl2.8h v0, v0, v1 +; CHECK-NEXT: sshll2.4s v1, v2, #0 +; CHECK-NEXT: sshll2.4s v3, v0, #0 +; CHECK-NEXT: sshll.4s v0, v0, #0 +; CHECK-NEXT: sshll.4s v2, v2, #0 +; CHECK-NEXT: abs.4s v0, v0 +; CHECK-NEXT: abs.4s v3, v3 +; CHECK-NEXT: abs.4s v1, v1 +; CHECK-NEXT: abs.4s v2, v2 +; 
CHECK-NEXT: add.4s v1, v1, v3 +; CHECK-NEXT: add.4s v0, v2, v0 +; CHECK-NEXT: add.4s v0, v0, v1 +; CHECK-NEXT: addv.4s s0, v0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %aext = zext <16 x i8> %a to <16 x i32> @@ -294,9 +305,20 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: sabd16b_rdx_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sabdl.8h v2, v0, v1 -; CHECK-NEXT: sabal2.8h v2, v0, v1 -; CHECK-NEXT: uaddlv.8h s0, v2 +; CHECK-NEXT: ssubl.8h v2, v0, v1 +; CHECK-NEXT: ssubl2.8h v0, v0, v1 +; CHECK-NEXT: sshll2.4s v1, v2, #0 +; CHECK-NEXT: sshll2.4s v3, v0, #0 +; CHECK-NEXT: sshll.4s v0, v0, #0 +; CHECK-NEXT: sshll.4s v2, v2, #0 +; CHECK-NEXT: abs.4s v0, v0 +; CHECK-NEXT: abs.4s v3, v3 +; CHECK-NEXT: abs.4s v1, v1 +; CHECK-NEXT: abs.4s v2, v2 +; CHECK-NEXT: add.4s v1, v1, v3 +; CHECK-NEXT: add.4s v0, v2, v0 +; CHECK-NEXT: add.4s v0, v0, v1 +; CHECK-NEXT: addv.4s s0, v0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %aext = sext <16 x i8> %a to <16 x i32> @@ -1033,13 +1055,13 @@ ; FALLBACK-NOT: remark:{{.*}} sabal2_8h define <8 x i16> @sabal2_8h(ptr %A, ptr %B, ptr %C) nounwind { -; DAG-LABEL: sabal2_8h: -; DAG: // %bb.0: -; DAG-NEXT: ldr q0, [x2] -; DAG-NEXT: ldr d1, [x0, #8] -; DAG-NEXT: ldr d2, [x1, #8] -; DAG-NEXT: sabal.8h v0, v1, v2 -; DAG-NEXT: ret +; DAG-LABEL: sabal2_8h: +; DAG: // %bb.0: +; DAG-NEXT: ldr q0, [x2] +; DAG-NEXT: ldr d1, [x0, #8] +; DAG-NEXT: ldr d2, [x1, #8] +; DAG-NEXT: sabal.8h v0, v1, v2 +; DAG-NEXT: ret ; ; GISEL-LABEL: sabal2_8h: ; GISEL: // %bb.0: @@ -1063,13 +1085,13 @@ ; FALLBACK-NOT: remark:{{.*}} sabal2_4s define <4 x i32> @sabal2_4s(ptr %A, ptr %B, ptr %C) nounwind { -; DAG-LABEL: sabal2_4s: -; DAG: // %bb.0: -; DAG-NEXT: ldr q0, [x2] -; DAG-NEXT: ldr d1, [x0, #8] -; DAG-NEXT: ldr d2, [x1, #8] -; DAG-NEXT: sabal.4s v0, v1, v2 -; DAG-NEXT: ret +; DAG-LABEL: sabal2_4s: +; DAG: // %bb.0: +; DAG-NEXT: ldr q0, [x2] +; DAG-NEXT: ldr d1, [x0, #8] +; DAG-NEXT: ldr d2, [x1, #8] +; DAG-NEXT: sabal.4s v0, v1, v2 +; DAG-NEXT: ret ; ; GISEL-LABEL: sabal2_4s: ; GISEL: // %bb.0: @@ -1093,13 +1115,13 @@ ; FALLBACK-NOT: remark:{{.*}} sabal2_2d define <2 x i64> @sabal2_2d(ptr %A, ptr %B, ptr %C) nounwind { -; DAG-LABEL: sabal2_2d: -; DAG: // %bb.0: -; DAG-NEXT: ldr q0, [x2] -; DAG-NEXT: ldr d1, [x0, #8] -; DAG-NEXT: ldr d2, [x1, #8] -; DAG-NEXT: sabal.2d v0, v1, v2 -; DAG-NEXT: ret +; DAG-LABEL: sabal2_2d: +; DAG: // %bb.0: +; DAG-NEXT: ldr q0, [x2] +; DAG-NEXT: ldr d1, [x0, #8] +; DAG-NEXT: ldr d2, [x1, #8] +; DAG-NEXT: sabal.2d v0, v1, v2 +; DAG-NEXT: ret ; ; GISEL-LABEL: sabal2_2d: ; GISEL: // %bb.0: @@ -1201,13 +1223,13 @@ ; FALLBACK-NOT: remark:{{.*}} uabal2_8h define <8 x i16> @uabal2_8h(ptr %A, ptr %B, ptr %C) nounwind { -; DAG-LABEL: uabal2_8h: -; DAG: // %bb.0: -; DAG-NEXT: ldr q0, [x2] -; DAG-NEXT: ldr d1, [x0, #8] -; DAG-NEXT: ldr d2, [x1, #8] -; DAG-NEXT: uabal.8h v0, v1, v2 -; DAG-NEXT: ret +; DAG-LABEL: uabal2_8h: +; DAG: // %bb.0: +; DAG-NEXT: ldr q0, [x2] +; DAG-NEXT: ldr d1, [x0, #8] +; DAG-NEXT: ldr d2, [x1, #8] +; DAG-NEXT: uabal.8h v0, v1, v2 +; DAG-NEXT: ret ; ; GISEL-LABEL: uabal2_8h: ; GISEL: // %bb.0: @@ -1231,13 +1253,13 @@ ; FALLBACK-NOT: remark:{{.*}} uabal2_4s define <4 x i32> @uabal2_4s(ptr %A, ptr %B, ptr %C) nounwind { -; DAG-LABEL: uabal2_4s: -; DAG: // %bb.0: -; DAG-NEXT: ldr q0, [x2] -; DAG-NEXT: ldr d1, [x0, #8] -; DAG-NEXT: ldr d2, [x1, #8] -; DAG-NEXT: uabal.4s v0, v1, v2 -; DAG-NEXT: ret +; DAG-LABEL: uabal2_4s: +; DAG: // %bb.0: +; DAG-NEXT: ldr q0, [x2] +; DAG-NEXT: ldr d1, [x0, #8] +; DAG-NEXT: ldr d2, [x1, #8] +; 
DAG-NEXT: uabal.4s v0, v1, v2 +; DAG-NEXT: ret ; ; GISEL-LABEL: uabal2_4s: ; GISEL: // %bb.0: @@ -1261,13 +1283,13 @@ ; FALLBACK-NOT: remark:{{.*}} uabal2_2d define <2 x i64> @uabal2_2d(ptr %A, ptr %B, ptr %C) nounwind { -; DAG-LABEL: uabal2_2d: -; DAG: // %bb.0: -; DAG-NEXT: ldr q0, [x2] -; DAG-NEXT: ldr d1, [x0, #8] -; DAG-NEXT: ldr d2, [x1, #8] -; DAG-NEXT: uabal.2d v0, v1, v2 -; DAG-NEXT: ret +; DAG-LABEL: uabal2_2d: +; DAG: // %bb.0: +; DAG-NEXT: ldr q0, [x2] +; DAG-NEXT: ldr d1, [x0, #8] +; DAG-NEXT: ldr d2, [x1, #8] +; DAG-NEXT: uabal.2d v0, v1, v2 +; DAG-NEXT: ret ; ; GISEL-LABEL: uabal2_2d: ; GISEL: // %bb.0: @@ -1624,12 +1646,18 @@ ; FALLBACK-NOT: remark:{{.*}} uabdl_from_extract_dup define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { -; CHECK-LABEL: uabdl_from_extract_dup: -; CHECK: // %bb.0: -; CHECK-NEXT: dup.2s v1, w0 +; DAG-LABEL: uabdl_from_extract_dup: +; DAG: // %bb.0: +; DAG-NEXT: dup.2s v1, w0 +; DAG-NEXT: uabdl.2d v0, v0, v1 +; DAG-NEXT: ret +; +; GISEL-LABEL: uabdl_from_extract_dup: +; GISEL: // %bb.0: +; GISEL-NEXT: dup.2s v1, w0 ; GISEL-NEXT: ext.16b v0, v0, v0, #0 -; CHECK-NEXT: uabdl.2d v0, v0, v1 -; CHECK-NEXT: ret +; GISEL-NEXT: uabdl.2d v0, v0, v1 +; GISEL-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 @@ -1642,11 +1670,11 @@ ; FALLBACK-NOT: remark:{{.*}} uabdl2_from_extract_dup define <2 x i64> @uabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { -; DAG-LABEL: uabdl2_from_extract_dup: -; DAG: // %bb.0: -; DAG-NEXT: dup.4s v1, w0 -; DAG-NEXT: uabdl2.2d v0, v0, v1 -; DAG-NEXT: ret +; DAG-LABEL: uabdl2_from_extract_dup: +; DAG: // %bb.0: +; DAG-NEXT: dup.4s v1, w0 +; DAG-NEXT: uabdl2.2d v0, v0, v1 +; DAG-NEXT: ret ; ; GISEL-LABEL: uabdl2_from_extract_dup: ; GISEL: // %bb.0: @@ -1666,12 +1694,18 @@ ; FALLBACK-NOT: remark:{{.*}} sabdl_from_extract_dup define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { -; CHECK-LABEL: sabdl_from_extract_dup: -; CHECK: // %bb.0: -; CHECK-NEXT: dup.2s v1, w0 +; DAG-LABEL: sabdl_from_extract_dup: +; DAG: // %bb.0: +; DAG-NEXT: dup.2s v1, w0 +; DAG-NEXT: sabdl.2d v0, v0, v1 +; DAG-NEXT: ret +; +; GISEL-LABEL: sabdl_from_extract_dup: +; GISEL: // %bb.0: +; GISEL-NEXT: dup.2s v1, w0 ; GISEL-NEXT: ext.16b v0, v0, v0, #0 -; CHECK-NEXT: sabdl.2d v0, v0, v1 -; CHECK-NEXT: ret +; GISEL-NEXT: sabdl.2d v0, v0, v1 +; GISEL-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 @@ -1684,11 +1718,11 @@ ; FALLBACK-NOT: remark:{{.*}} sabdl2_from_extract_dup define <2 x i64> @sabdl2_from_extract_dup(<4 x i32> %lhs, i32 %rhs) { -; DAG-LABEL: sabdl2_from_extract_dup: -; DAG: // %bb.0: -; DAG-NEXT: dup.4s v1, w0 -; DAG-NEXT: sabdl2.2d v0, v0, v1 -; DAG-NEXT: ret +; DAG-LABEL: sabdl2_from_extract_dup: +; DAG: // %bb.0: +; DAG-NEXT: dup.4s v1, w0 +; DAG-NEXT: sabdl2.2d v0, v0, v1 +; DAG-NEXT: ret ; ; GISEL-LABEL: sabdl2_from_extract_dup: ; GISEL: // %bb.0: diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll --- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll @@ -329,7 +329,11 @@ define void @testLowerToSRHADD8b(<8 x i8> %src1, <8 x i8> %src2, ptr nocapture writeonly %dest) { ; CHECK-LABEL: testLowerToSRHADD8b: ; CHECK: // %bb.0: -; CHECK-NEXT: srhadd.8b v0, v0, v1 +; CHECK-NEXT: sshll.8h v0, v0, #0 +; CHECK-NEXT: sshll.8h v1, v1, #0 +; CHECK-NEXT: mvn.16b v0, v0 +; 
CHECK-NEXT: sub.8h v0, v1, v0 +; CHECK-NEXT: shrn.8b v0, v0, #1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <8 x i8> %src1 to <8 x i16> @@ -345,7 +349,11 @@ define void @testLowerToSRHADD4h(<4 x i16> %src1, <4 x i16> %src2, ptr nocapture writeonly %dest) { ; CHECK-LABEL: testLowerToSRHADD4h: ; CHECK: // %bb.0: -; CHECK-NEXT: srhadd.4h v0, v0, v1 +; CHECK-NEXT: sshll.4s v0, v0, #0 +; CHECK-NEXT: sshll.4s v1, v1, #0 +; CHECK-NEXT: mvn.16b v0, v0 +; CHECK-NEXT: sub.4s v0, v1, v0 +; CHECK-NEXT: shrn.4h v0, v0, #1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <4 x i16> %src1 to <4 x i32> @@ -361,7 +369,11 @@ define void @testLowerToSRHADD2s(<2 x i32> %src1, <2 x i32> %src2, ptr nocapture writeonly %dest) { ; CHECK-LABEL: testLowerToSRHADD2s: ; CHECK: // %bb.0: -; CHECK-NEXT: srhadd.2s v0, v0, v1 +; CHECK-NEXT: sshll.2d v0, v0, #0 +; CHECK-NEXT: sshll.2d v1, v1, #0 +; CHECK-NEXT: mvn.16b v0, v0 +; CHECK-NEXT: sub.2d v0, v1, v0 +; CHECK-NEXT: shrn.2s v0, v0, #1 ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <2 x i32> %src1 to <2 x i64> @@ -377,8 +389,17 @@ define void @testLowerToSRHADD16b(<16 x i8> %src1, <16 x i8> %src2, ptr nocapture writeonly %dest) { ; CHECK-LABEL: testLowerToSRHADD16b: ; CHECK: // %bb.0: -; CHECK-NEXT: srhadd.16b v0, v0, v1 -; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: sshll.8h v2, v0, #0 +; CHECK-NEXT: sshll.8h v3, v1, #0 +; CHECK-NEXT: sshll2.8h v0, v0, #0 +; CHECK-NEXT: mvn.16b v2, v2 +; CHECK-NEXT: sshll2.8h v1, v1, #0 +; CHECK-NEXT: sub.8h v2, v3, v2 +; CHECK-NEXT: mvn.16b v0, v0 +; CHECK-NEXT: sub.8h v0, v1, v0 +; CHECK-NEXT: shrn.8b v1, v2, #1 +; CHECK-NEXT: shrn2.16b v1, v0, #1 +; CHECK-NEXT: str q1, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <16 x i8> %src1 to <16 x i16> %sextsrc2 = sext <16 x i8> %src2 to <16 x i16> @@ -393,8 +414,17 @@ define void @testLowerToSRHADD8h(<8 x i16> %src1, <8 x i16> %src2, ptr nocapture writeonly %dest) { ; CHECK-LABEL: testLowerToSRHADD8h: ; CHECK: // %bb.0: -; CHECK-NEXT: srhadd.8h v0, v0, v1 -; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: sshll.4s v2, v0, #0 +; CHECK-NEXT: sshll.4s v3, v1, #0 +; CHECK-NEXT: sshll2.4s v0, v0, #0 +; CHECK-NEXT: mvn.16b v2, v2 +; CHECK-NEXT: sshll2.4s v1, v1, #0 +; CHECK-NEXT: sub.4s v2, v3, v2 +; CHECK-NEXT: mvn.16b v0, v0 +; CHECK-NEXT: sub.4s v0, v1, v0 +; CHECK-NEXT: shrn.4h v1, v2, #1 +; CHECK-NEXT: shrn2.8h v1, v0, #1 +; CHECK-NEXT: str q1, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <8 x i16> %src1 to <8 x i32> %sextsrc2 = sext <8 x i16> %src2 to <8 x i32> @@ -409,8 +439,17 @@ define void @testLowerToSRHADD4s(<4 x i32> %src1, <4 x i32> %src2, ptr nocapture writeonly %dest) { ; CHECK-LABEL: testLowerToSRHADD4s: ; CHECK: // %bb.0: -; CHECK-NEXT: srhadd.4s v0, v0, v1 -; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: sshll.2d v2, v0, #0 +; CHECK-NEXT: sshll.2d v3, v1, #0 +; CHECK-NEXT: sshll2.2d v0, v0, #0 +; CHECK-NEXT: mvn.16b v2, v2 +; CHECK-NEXT: sshll2.2d v1, v1, #0 +; CHECK-NEXT: sub.2d v2, v3, v2 +; CHECK-NEXT: mvn.16b v0, v0 +; CHECK-NEXT: sub.2d v0, v1, v0 +; CHECK-NEXT: shrn.2s v1, v2, #1 +; CHECK-NEXT: shrn2.4s v1, v0, #1 +; CHECK-NEXT: str q1, [x0] ; CHECK-NEXT: ret %sextsrc1 = sext <4 x i32> %src1 to <4 x i64> %sextsrc2 = sext <4 x i32> %src2 to <4 x i64> @@ -1004,7 +1043,9 @@ ; CHECK-NEXT: shl.2s v1, v1, #24 ; CHECK-NEXT: sshr.2s v0, v0, #24 ; CHECK-NEXT: sshr.2s v1, v1, #24 -; CHECK-NEXT: srhadd.2s v0, v0, v1 +; CHECK-NEXT: mvn.8b v0, v0 +; CHECK-NEXT: sub.2s v0, v1, v0 +; CHECK-NEXT: sshr.2s v0, v0, #1 ; CHECK-NEXT: ret %zextsrc1 = sext <2 x i8> %src1 to 
<2 x i16> %zextsrc2 = sext <2 x i8> %src2 to <2 x i16> @@ -1020,7 +1061,9 @@ ; CHECK-NEXT: movi d2, #0x0000ff000000ff ; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: and.8b v1, v1, v2 -; CHECK-NEXT: urhadd.2s v0, v0, v1 +; CHECK-NEXT: mvn.8b v0, v0 +; CHECK-NEXT: sub.2s v0, v1, v0 +; CHECK-NEXT: ushr.2s v0, v0, #1 ; CHECK-NEXT: ret %zextsrc1 = zext <2 x i8> %src1 to <2 x i16> %zextsrc2 = zext <2 x i8> %src2 to <2 x i16> @@ -1057,7 +1100,9 @@ ; CHECK-NEXT: movi d2, #0x0000ff000000ff ; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: and.8b v1, v1, v2 -; CHECK-NEXT: urhadd.2s v0, v0, v1 +; CHECK-NEXT: mvn.8b v0, v0 +; CHECK-NEXT: sub.2s v0, v1, v0 +; CHECK-NEXT: ushr.2s v0, v0, #1 ; CHECK-NEXT: ret %zextsrc1 = zext <2 x i8> %src1 to <2 x i16> %zextsrc2 = zext <2 x i8> %src2 to <2 x i16> @@ -1304,6 +1349,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: movi.8b v2, #7 ; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: bic.8h v1, #255, lsl #8 ; CHECK-NEXT: xtn.8b v1, v1 ; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: uhadd.8b v0, v0, v1 diff --git a/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll b/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll --- a/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll +++ b/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll @@ -40,10 +40,10 @@ ; CHECK-NEXT: stp x28, x27, [sp, #384] ; 16-byte Folded Spill ; CHECK-NEXT: .cfi_offset w27, -8 ; CHECK-NEXT: .cfi_offset w28, -16 -; CHECK-NEXT: ldr q0, [x0, #272] ; CHECK-NEXT: ldr x8, [x0, #288] -; CHECK-NEXT: stur q0, [sp, #216] +; CHECK-NEXT: ldr q0, [x0, #272] ; CHECK-NEXT: str x8, [sp, #232] +; CHECK-NEXT: stur q0, [sp, #216] ; CHECK-NEXT: ldp x28, x27, [sp, #384] ; 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #400 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -406,7 +406,7 @@ define void @smlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { ; CHECK-LABEL: smlal2d_chain_with_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #257 +; CHECK-NEXT: mov w8, #257 // =0x101 ; CHECK-NEXT: dup.2d v3, x8 ; CHECK-NEXT: smlal.2d v3, v0, v2 ; CHECK-NEXT: mvn.8b v0, v2 @@ -475,7 +475,7 @@ define void @smlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { ; CHECK-LABEL: smlsl2d_chain_with_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #257 +; CHECK-NEXT: mov w8, #257 // =0x101 ; CHECK-NEXT: dup.2d v3, x8 ; CHECK-NEXT: smlsl.2d v3, v0, v2 ; CHECK-NEXT: mvn.8b v0, v2 @@ -685,7 +685,7 @@ define void @umlal2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { ; CHECK-LABEL: umlal2d_chain_with_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #257 +; CHECK-NEXT: mov w8, #257 // =0x101 ; CHECK-NEXT: dup.2d v3, x8 ; CHECK-NEXT: umlal.2d v3, v0, v2 ; CHECK-NEXT: mvn.8b v0, v2 @@ -754,7 +754,7 @@ define void @umlsl2d_chain_with_constant(ptr %dst, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { ; CHECK-LABEL: umlsl2d_chain_with_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #257 +; CHECK-NEXT: mov w8, #257 // =0x101 ; CHECK-NEXT: dup.2d v3, x8 ; CHECK-NEXT: umlsl.2d v3, v0, v2 ; CHECK-NEXT: mvn.8b v0, v2 @@ -2416,7 +2416,15 @@ ; CHECK-LABEL: vmulq_built_dup_fromsmall_test: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mul.8h v0, v0, v1[0] +; CHECK-NEXT: mov.16b v2, v1 +; CHECK-NEXT: mov.h v2[1], v1[0] +; CHECK-NEXT: mov.h v2[2], v1[0] +; CHECK-NEXT: mov.h v2[3], v1[0] +; 
CHECK-NEXT: mov.h v2[4], v1[0] +; CHECK-NEXT: mov.h v2[5], v1[0] +; CHECK-NEXT: mov.h v2[6], v1[0] +; CHECK-NEXT: mov.h v2[7], v1[0] +; CHECK-NEXT: mul.8h v0, v0, v2 ; CHECK-NEXT: ret %vget_lane = extractelement <4 x i16> %b, i32 0 %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll --- a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll +++ b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll @@ -152,8 +152,8 @@ ; CHECK-NEXT: add x19, x19, :lo12:Pod ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl copy_pod -; CHECK-NEXT: str d0, [x19] ; CHECK-NEXT: str d1, [x19, #8] +; CHECK-NEXT: str d0, [x19] ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x30, 8 @@ -186,8 +186,8 @@ ; CHECK-NEXT: mov x0, sp ; CHECK-NEXT: mov x1, x19 ; CHECK-NEXT: bl copy_notcxx14aggregate -; CHECK-NEXT: ldp d0, d1, [sp] -; CHECK-NEXT: stp d0, d1, [x19] +; CHECK-NEXT: ldp d1, d0, [sp] +; CHECK-NEXT: stp d1, d0, [x19] ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp, #24] // 8-byte Folded Reload ; CHECK-NEXT: .seh_save_reg x30, 24 diff --git a/llvm/test/CodeGen/AArch64/arm64-xaluo.ll b/llvm/test/CodeGen/AArch64/arm64-xaluo.ll --- a/llvm/test/CodeGen/AArch64/arm64-xaluo.ll +++ b/llvm/test/CodeGen/AArch64/arm64-xaluo.ll @@ -102,7 +102,7 @@ define zeroext i1 @saddo4.i32(i32 %v1, ptr %res) { ; SDAG-LABEL: saddo4.i32: ; SDAG: // %bb.0: // %entry -; SDAG-NEXT: mov w8, #16777215 +; SDAG-NEXT: mov w8, #16777215 // =0xffffff ; SDAG-NEXT: adds w8, w0, w8 ; SDAG-NEXT: cset w0, vs ; SDAG-NEXT: str w8, [x1] @@ -110,7 +110,7 @@ ; ; FAST-LABEL: saddo4.i32: ; FAST: // %bb.0: // %entry -; FAST-NEXT: mov w8, #16777215 +; FAST-NEXT: mov w8, #16777215 // =0xffffff ; FAST-NEXT: adds w8, w0, w8 ; FAST-NEXT: cset w9, vs ; FAST-NEXT: and w0, w9, #0x1 @@ -119,7 +119,7 @@ ; ; GISEL-LABEL: saddo4.i32: ; GISEL: // %bb.0: // %entry -; GISEL-NEXT: mov w8, #16777215 +; GISEL-NEXT: mov w8, #16777215 // =0xffffff ; GISEL-NEXT: adds w8, w0, w8 ; GISEL-NEXT: cset w0, vs ; GISEL-NEXT: str w8, [x1] @@ -1327,25 +1327,27 @@ ; SDAG-LABEL: uaddo.selectboth.i8: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: and w8, w0, #0xff -; SDAG-NEXT: mov w9, #10 ; SDAG-NEXT: add w8, w8, w1, uxtb -; SDAG-NEXT: tst w8, #0x100 +; SDAG-NEXT: lsr w9, w8, #8 +; SDAG-NEXT: cmp w9, #0 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: csel w0, w8, w9, ne ; SDAG-NEXT: ret ; ; FAST-LABEL: uaddo.selectboth.i8: ; FAST: // %bb.0: // %entry ; FAST-NEXT: and w8, w0, #0xff -; FAST-NEXT: mov w9, #10 ; FAST-NEXT: add w8, w8, w1, uxtb -; FAST-NEXT: tst w8, #0x100 +; FAST-NEXT: lsr w9, w8, #8 +; FAST-NEXT: cmp w9, #0 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: csel w0, w8, w9, ne ; FAST-NEXT: ret ; ; GISEL-LABEL: uaddo.selectboth.i8: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: and w8, w1, #0xff -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: add w8, w8, w0, uxtb ; GISEL-NEXT: cmp w8, w8, uxtb ; GISEL-NEXT: csel w0, w8, w9, ne @@ -1362,7 +1364,7 @@ ; SDAG-LABEL: saddo.selectboth.i8: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: sxtb w8, w0 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: add w8, w8, w1, sxtb ; SDAG-NEXT: cmp w8, w8, sxtb ; SDAG-NEXT: csel w0, w8, w9, ne @@ -1371,7 +1373,7 @@ ; FAST-LABEL: saddo.selectboth.i8: ; FAST: // %bb.0: // %entry ; FAST-NEXT: sxtb w8, w0 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: add w8, w8, 
w1, sxtb ; FAST-NEXT: cmp w8, w8, sxtb ; FAST-NEXT: csel w0, w8, w9, ne @@ -1380,7 +1382,7 @@ ; GISEL-LABEL: saddo.selectboth.i8: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: sxtb w8, w1 -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: add w8, w8, w0, sxtb ; GISEL-NEXT: cmp w8, w8, sxtb ; GISEL-NEXT: csel w0, w8, w9, ne @@ -1397,25 +1399,27 @@ ; SDAG-LABEL: uaddo.selectboth.i16: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: and w8, w0, #0xffff -; SDAG-NEXT: mov w9, #10 ; SDAG-NEXT: add w8, w8, w1, uxth -; SDAG-NEXT: tst w8, #0x10000 +; SDAG-NEXT: lsr w9, w8, #16 +; SDAG-NEXT: cmp w9, #0 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: csel w0, w8, w9, ne ; SDAG-NEXT: ret ; ; FAST-LABEL: uaddo.selectboth.i16: ; FAST: // %bb.0: // %entry ; FAST-NEXT: and w8, w0, #0xffff -; FAST-NEXT: mov w9, #10 ; FAST-NEXT: add w8, w8, w1, uxth -; FAST-NEXT: tst w8, #0x10000 +; FAST-NEXT: lsr w9, w8, #16 +; FAST-NEXT: cmp w9, #0 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: csel w0, w8, w9, ne ; FAST-NEXT: ret ; ; GISEL-LABEL: uaddo.selectboth.i16: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: and w8, w1, #0xffff -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: add w8, w8, w0, uxth ; GISEL-NEXT: cmp w8, w8, uxth ; GISEL-NEXT: csel w0, w8, w9, ne @@ -1432,7 +1436,7 @@ ; SDAG-LABEL: saddo.selectboth.i16: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: sxth w8, w0 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: add w8, w8, w1, sxth ; SDAG-NEXT: cmp w8, w8, sxth ; SDAG-NEXT: csel w0, w8, w9, ne @@ -1441,7 +1445,7 @@ ; FAST-LABEL: saddo.selectboth.i16: ; FAST: // %bb.0: // %entry ; FAST-NEXT: sxth w8, w0 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: add w8, w8, w1, sxth ; FAST-NEXT: cmp w8, w8, sxth ; FAST-NEXT: csel w0, w8, w9, ne @@ -1450,7 +1454,7 @@ ; GISEL-LABEL: saddo.selectboth.i16: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: sxth w8, w1 -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: add w8, w8, w0, sxth ; GISEL-NEXT: cmp w8, w8, sxth ; GISEL-NEXT: csel w0, w8, w9, ne @@ -1467,21 +1471,21 @@ ; SDAG-LABEL: uaddo.selectboth.i32: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: adds w8, w0, w1 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: csel w0, w8, w9, hs ; SDAG-NEXT: ret ; ; FAST-LABEL: uaddo.selectboth.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: adds w8, w0, w1 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: csel w0, w8, w9, hs ; FAST-NEXT: ret ; ; GISEL-LABEL: uaddo.selectboth.i32: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: adds w8, w0, w1 -; GISEL-NEXT: mov w10, #10 +; GISEL-NEXT: mov w10, #10 // =0xa ; GISEL-NEXT: cset w9, hs ; GISEL-NEXT: tst w9, #0x1 ; GISEL-NEXT: csel w0, w8, w10, ne @@ -1498,21 +1502,21 @@ ; SDAG-LABEL: saddo.selectboth.i32: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: adds w8, w0, w1 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: csel w0, w8, w9, vs ; SDAG-NEXT: ret ; ; FAST-LABEL: saddo.selectboth.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: adds w8, w0, w1 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: csel w0, w8, w9, vs ; FAST-NEXT: ret ; ; GISEL-LABEL: saddo.selectboth.i32: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: adds w8, w0, w1 -; GISEL-NEXT: mov w10, #10 +; GISEL-NEXT: mov w10, #10 // =0xa ; GISEL-NEXT: cset w9, vs ; GISEL-NEXT: tst w9, #0x1 ; GISEL-NEXT: csel w0, w8, w10, ne @@ -1529,21 +1533,21 @@ ; SDAG-LABEL: uaddo.selectboth.i64: ; SDAG: // %bb.0: // %entry 
; SDAG-NEXT: adds x8, x0, x1 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: csel x0, x8, x9, hs ; SDAG-NEXT: ret ; ; FAST-LABEL: uaddo.selectboth.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: adds x8, x0, x1 -; FAST-NEXT: mov x9, #10 +; FAST-NEXT: mov x9, #10 // =0xa ; FAST-NEXT: csel x0, x8, x9, hs ; FAST-NEXT: ret ; ; GISEL-LABEL: uaddo.selectboth.i64: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: adds x8, x0, x1 -; GISEL-NEXT: mov w10, #10 +; GISEL-NEXT: mov w10, #10 // =0xa ; GISEL-NEXT: cset w9, hs ; GISEL-NEXT: tst w9, #0x1 ; GISEL-NEXT: csel x0, x8, x10, ne @@ -1560,21 +1564,21 @@ ; SDAG-LABEL: saddo.selectboth.i64: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: adds x8, x0, x1 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: csel x0, x8, x9, vs ; SDAG-NEXT: ret ; ; FAST-LABEL: saddo.selectboth.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: adds x8, x0, x1 -; FAST-NEXT: mov x9, #10 +; FAST-NEXT: mov x9, #10 // =0xa ; FAST-NEXT: csel x0, x8, x9, vs ; FAST-NEXT: ret ; ; GISEL-LABEL: saddo.selectboth.i64: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: adds x8, x0, x1 -; GISEL-NEXT: mov w10, #10 +; GISEL-NEXT: mov w10, #10 // =0xa ; GISEL-NEXT: cset w9, vs ; GISEL-NEXT: tst w9, #0x1 ; GISEL-NEXT: csel x0, x8, x10, ne @@ -1591,7 +1595,7 @@ ; SDAG-LABEL: usubo.selectboth.i8: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: and w8, w0, #0xff -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: sub w8, w8, w1, uxtb ; SDAG-NEXT: tst w8, #0xffffff00 ; SDAG-NEXT: csel w0, w8, w9, ne @@ -1600,7 +1604,7 @@ ; FAST-LABEL: usubo.selectboth.i8: ; FAST: // %bb.0: // %entry ; FAST-NEXT: and w8, w0, #0xff -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: sub w8, w8, w1, uxtb ; FAST-NEXT: tst w8, #0xffffff00 ; FAST-NEXT: csel w0, w8, w9, ne @@ -1609,7 +1613,7 @@ ; GISEL-LABEL: usubo.selectboth.i8: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: and w8, w0, #0xff -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: sub w8, w8, w1, uxtb ; GISEL-NEXT: cmp w8, w8, uxtb ; GISEL-NEXT: csel w0, w8, w9, ne @@ -1626,7 +1630,7 @@ ; CHECK-LABEL: ssubo.selectboth.i8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: mov w9, #10 +; CHECK-NEXT: mov w9, #10 // =0xa ; CHECK-NEXT: sub w8, w8, w1, sxtb ; CHECK-NEXT: cmp w8, w8, sxtb ; CHECK-NEXT: csel w0, w8, w9, ne @@ -1643,7 +1647,7 @@ ; SDAG-LABEL: usubo.selectboth.i16: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: and w8, w0, #0xffff -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: sub w8, w8, w1, uxth ; SDAG-NEXT: tst w8, #0xffff0000 ; SDAG-NEXT: csel w0, w8, w9, ne @@ -1652,7 +1656,7 @@ ; FAST-LABEL: usubo.selectboth.i16: ; FAST: // %bb.0: // %entry ; FAST-NEXT: and w8, w0, #0xffff -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: sub w8, w8, w1, uxth ; FAST-NEXT: tst w8, #0xffff0000 ; FAST-NEXT: csel w0, w8, w9, ne @@ -1661,7 +1665,7 @@ ; GISEL-LABEL: usubo.selectboth.i16: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: and w8, w0, #0xffff -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: sub w8, w8, w1, uxth ; GISEL-NEXT: cmp w8, w8, uxth ; GISEL-NEXT: csel w0, w8, w9, ne @@ -1678,7 +1682,7 @@ ; CHECK-LABEL: ssubo.selectboth.i16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: mov w9, #10 +; CHECK-NEXT: mov w9, #10 // =0xa ; CHECK-NEXT: sub w8, w8, w1, sxth ; CHECK-NEXT: cmp w8, w8, sxth ; CHECK-NEXT: csel w0, w8, w9, ne @@ -1695,21 +1699,21 @@ ; SDAG-LABEL: 
usubo.selectboth.i32: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: subs w8, w0, w1 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: csel w0, w8, w9, lo ; SDAG-NEXT: ret ; ; FAST-LABEL: usubo.selectboth.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: subs w8, w0, w1 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: csel w0, w8, w9, lo ; FAST-NEXT: ret ; ; GISEL-LABEL: usubo.selectboth.i32: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: subs w8, w0, w1 -; GISEL-NEXT: mov w10, #10 +; GISEL-NEXT: mov w10, #10 // =0xa ; GISEL-NEXT: cset w9, lo ; GISEL-NEXT: tst w9, #0x1 ; GISEL-NEXT: csel w0, w8, w10, ne @@ -1726,21 +1730,21 @@ ; SDAG-LABEL: ssubo.selectboth.i32: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: subs w8, w0, w1 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: csel w0, w8, w9, vs ; SDAG-NEXT: ret ; ; FAST-LABEL: ssubo.selectboth.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: subs w8, w0, w1 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: csel w0, w8, w9, vs ; FAST-NEXT: ret ; ; GISEL-LABEL: ssubo.selectboth.i32: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: subs w8, w0, w1 -; GISEL-NEXT: mov w10, #10 +; GISEL-NEXT: mov w10, #10 // =0xa ; GISEL-NEXT: cset w9, vs ; GISEL-NEXT: tst w9, #0x1 ; GISEL-NEXT: csel w0, w8, w10, ne @@ -1757,21 +1761,21 @@ ; SDAG-LABEL: usubo.selectboth.i64: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: subs x8, x0, x1 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: csel x0, x8, x9, lo ; SDAG-NEXT: ret ; ; FAST-LABEL: usubo.selectboth.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: subs x8, x0, x1 -; FAST-NEXT: mov x9, #10 +; FAST-NEXT: mov x9, #10 // =0xa ; FAST-NEXT: csel x0, x8, x9, lo ; FAST-NEXT: ret ; ; GISEL-LABEL: usubo.selectboth.i64: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: subs x8, x0, x1 -; GISEL-NEXT: mov w10, #10 +; GISEL-NEXT: mov w10, #10 // =0xa ; GISEL-NEXT: cset w9, lo ; GISEL-NEXT: tst w9, #0x1 ; GISEL-NEXT: csel x0, x8, x10, ne @@ -1788,21 +1792,21 @@ ; SDAG-LABEL: ssubo.selectboth.i64: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: subs x8, x0, x1 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: csel x0, x8, x9, vs ; SDAG-NEXT: ret ; ; FAST-LABEL: ssubo.selectboth.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: subs x8, x0, x1 -; FAST-NEXT: mov x9, #10 +; FAST-NEXT: mov x9, #10 // =0xa ; FAST-NEXT: csel x0, x8, x9, vs ; FAST-NEXT: ret ; ; GISEL-LABEL: ssubo.selectboth.i64: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: subs x8, x0, x1 -; GISEL-NEXT: mov w10, #10 +; GISEL-NEXT: mov w10, #10 // =0xa ; GISEL-NEXT: cset w9, vs ; GISEL-NEXT: tst w9, #0x1 ; GISEL-NEXT: csel x0, x8, x10, ne @@ -1822,7 +1826,7 @@ ; SDAG-NEXT: and w8, w1, #0xff ; SDAG-NEXT: and w9, w0, #0xff ; SDAG-NEXT: mul w8, w9, w8 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: tst w8, #0xff00 ; SDAG-NEXT: csel w0, w8, w9, ne ; SDAG-NEXT: ret @@ -1832,7 +1836,7 @@ ; FAST-NEXT: and w8, w1, #0xff ; FAST-NEXT: and w9, w0, #0xff ; FAST-NEXT: mul w8, w9, w8 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: tst w8, #0xff00 ; FAST-NEXT: csel w0, w8, w9, ne ; FAST-NEXT: ret @@ -1842,7 +1846,7 @@ ; GISEL-NEXT: and w8, w0, #0xff ; GISEL-NEXT: and w9, w1, #0xff ; GISEL-NEXT: mul w8, w8, w9 -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: cmp w8, w8, uxtb ; GISEL-NEXT: csel w0, w8, w9, ne ; GISEL-NEXT: ret @@ -1860,7 +1864,7 @@ ; SDAG-NEXT: sxtb w8, w1 ; SDAG-NEXT: sxtb w9, w0 ; SDAG-NEXT: mul w8, w9, w8 -; 
SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: cmp w8, w8, sxtb ; SDAG-NEXT: csel w0, w8, w9, ne ; SDAG-NEXT: ret @@ -1870,7 +1874,7 @@ ; FAST-NEXT: sxtb w8, w1 ; FAST-NEXT: sxtb w9, w0 ; FAST-NEXT: mul w8, w9, w8 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: cmp w8, w8, sxtb ; FAST-NEXT: csel w0, w8, w9, ne ; FAST-NEXT: ret @@ -1880,7 +1884,7 @@ ; GISEL-NEXT: sxtb w8, w0 ; GISEL-NEXT: sxtb w9, w1 ; GISEL-NEXT: mul w8, w8, w9 -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: cmp w8, w8, sxtb ; GISEL-NEXT: csel w0, w8, w9, ne ; GISEL-NEXT: ret @@ -1898,7 +1902,7 @@ ; SDAG-NEXT: and w8, w1, #0xffff ; SDAG-NEXT: and w9, w0, #0xffff ; SDAG-NEXT: mul w8, w9, w8 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: tst w8, #0xffff0000 ; SDAG-NEXT: csel w0, w8, w9, ne ; SDAG-NEXT: ret @@ -1908,7 +1912,7 @@ ; FAST-NEXT: and w8, w1, #0xffff ; FAST-NEXT: and w9, w0, #0xffff ; FAST-NEXT: mul w8, w9, w8 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: tst w8, #0xffff0000 ; FAST-NEXT: csel w0, w8, w9, ne ; FAST-NEXT: ret @@ -1918,7 +1922,7 @@ ; GISEL-NEXT: and w8, w0, #0xffff ; GISEL-NEXT: and w9, w1, #0xffff ; GISEL-NEXT: mul w8, w8, w9 -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: cmp w8, w8, uxth ; GISEL-NEXT: csel w0, w8, w9, ne ; GISEL-NEXT: ret @@ -1936,7 +1940,7 @@ ; SDAG-NEXT: sxth w8, w1 ; SDAG-NEXT: sxth w9, w0 ; SDAG-NEXT: mul w8, w9, w8 -; SDAG-NEXT: mov w9, #10 +; SDAG-NEXT: mov w9, #10 // =0xa ; SDAG-NEXT: cmp w8, w8, sxth ; SDAG-NEXT: csel w0, w8, w9, ne ; SDAG-NEXT: ret @@ -1946,7 +1950,7 @@ ; FAST-NEXT: sxth w8, w1 ; FAST-NEXT: sxth w9, w0 ; FAST-NEXT: mul w8, w9, w8 -; FAST-NEXT: mov w9, #10 +; FAST-NEXT: mov w9, #10 // =0xa ; FAST-NEXT: cmp w8, w8, sxth ; FAST-NEXT: csel w0, w8, w9, ne ; FAST-NEXT: ret @@ -1956,7 +1960,7 @@ ; GISEL-NEXT: sxth w8, w0 ; GISEL-NEXT: sxth w9, w1 ; GISEL-NEXT: mul w8, w8, w9 -; GISEL-NEXT: mov w9, #10 +; GISEL-NEXT: mov w9, #10 // =0xa ; GISEL-NEXT: cmp w8, w8, sxth ; GISEL-NEXT: csel w0, w8, w9, ne ; GISEL-NEXT: ret @@ -1972,7 +1976,7 @@ ; SDAG-LABEL: umulo.selectboth.i32: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: umull x9, w0, w1 -; SDAG-NEXT: mov w8, #10 +; SDAG-NEXT: mov w8, #10 // =0xa ; SDAG-NEXT: tst x9, #0xffffffff00000000 ; SDAG-NEXT: csel w0, w9, w8, ne ; SDAG-NEXT: ret @@ -1980,7 +1984,7 @@ ; FAST-LABEL: umulo.selectboth.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: umull x9, w0, w1 -; FAST-NEXT: mov w8, #10 +; FAST-NEXT: mov w8, #10 // =0xa ; FAST-NEXT: tst x9, #0xffffffff00000000 ; FAST-NEXT: csel w0, w9, w8, ne ; FAST-NEXT: ret @@ -1988,7 +1992,7 @@ ; GISEL-LABEL: umulo.selectboth.i32: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: umull x9, w0, w1 -; GISEL-NEXT: mov w8, #10 +; GISEL-NEXT: mov w8, #10 // =0xa ; GISEL-NEXT: mul w10, w0, w1 ; GISEL-NEXT: lsr x9, x9, #32 ; GISEL-NEXT: cmp w9, #0 @@ -2006,7 +2010,7 @@ ; SDAG-LABEL: smulo.selectboth.i32: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: smull x9, w0, w1 -; SDAG-NEXT: mov w8, #10 +; SDAG-NEXT: mov w8, #10 // =0xa ; SDAG-NEXT: cmp x9, w9, sxtw ; SDAG-NEXT: csel w0, w9, w8, ne ; SDAG-NEXT: ret @@ -2014,7 +2018,7 @@ ; FAST-LABEL: smulo.selectboth.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: smull x9, w0, w1 -; FAST-NEXT: mov w8, #10 +; FAST-NEXT: mov w8, #10 // =0xa ; FAST-NEXT: cmp x9, w9, sxtw ; FAST-NEXT: csel w0, w9, w8, ne ; FAST-NEXT: ret @@ -2022,7 +2026,7 @@ ; GISEL-LABEL: smulo.selectboth.i32: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: smull 
x9, w0, w1 -; GISEL-NEXT: mov w8, #10 +; GISEL-NEXT: mov w8, #10 // =0xa ; GISEL-NEXT: mul w10, w0, w1 ; GISEL-NEXT: asr x9, x9, #32 ; GISEL-NEXT: cmp w9, w10, asr #31 @@ -2040,7 +2044,7 @@ ; SDAG-LABEL: umulo.selectboth.i64: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: umulh x9, x0, x1 -; SDAG-NEXT: mov w8, #10 +; SDAG-NEXT: mov w8, #10 // =0xa ; SDAG-NEXT: mul x10, x0, x1 ; SDAG-NEXT: cmp xzr, x9 ; SDAG-NEXT: csel x0, x10, x8, ne @@ -2049,7 +2053,7 @@ ; FAST-LABEL: umulo.selectboth.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: umulh x9, x0, x1 -; FAST-NEXT: mov x8, #10 +; FAST-NEXT: mov x8, #10 // =0xa ; FAST-NEXT: mul x10, x0, x1 ; FAST-NEXT: cmp xzr, x9 ; FAST-NEXT: csel x0, x10, x8, ne @@ -2058,7 +2062,7 @@ ; GISEL-LABEL: umulo.selectboth.i64: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: umulh x9, x0, x1 -; GISEL-NEXT: mov w8, #10 +; GISEL-NEXT: mov w8, #10 // =0xa ; GISEL-NEXT: mul x10, x0, x1 ; GISEL-NEXT: cmp x9, #0 ; GISEL-NEXT: csel x0, x10, x8, ne @@ -2075,7 +2079,7 @@ ; SDAG-LABEL: smulo.selectboth.i64: ; SDAG: // %bb.0: // %entry ; SDAG-NEXT: mul x9, x0, x1 -; SDAG-NEXT: mov w8, #10 +; SDAG-NEXT: mov w8, #10 // =0xa ; SDAG-NEXT: smulh x10, x0, x1 ; SDAG-NEXT: cmp x10, x9, asr #63 ; SDAG-NEXT: csel x0, x9, x8, ne @@ -2084,7 +2088,7 @@ ; FAST-LABEL: smulo.selectboth.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: mul x9, x0, x1 -; FAST-NEXT: mov x8, #10 +; FAST-NEXT: mov x8, #10 // =0xa ; FAST-NEXT: smulh x10, x0, x1 ; FAST-NEXT: cmp x10, x9, asr #63 ; FAST-NEXT: csel x0, x9, x8, ne @@ -2093,7 +2097,7 @@ ; GISEL-LABEL: smulo.selectboth.i64: ; GISEL: // %bb.0: // %entry ; GISEL-NEXT: mul x9, x0, x1 -; GISEL-NEXT: mov w8, #10 +; GISEL-NEXT: mov w8, #10 // =0xa ; GISEL-NEXT: smulh x10, x0, x1 ; GISEL-NEXT: cmp x10, x9, asr #63 ; GISEL-NEXT: csel x0, x9, x8, ne @@ -2120,7 +2124,7 @@ ; FAST-LABEL: saddo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn w0, w1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, vs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2155,7 +2159,7 @@ ; FAST-LABEL: saddo.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn x0, x1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, vs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2190,7 +2194,7 @@ ; FAST-LABEL: uaddo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn w0, w1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, hs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2225,7 +2229,7 @@ ; FAST-LABEL: uaddo.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn x0, x1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, hs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2260,7 +2264,7 @@ ; FAST-LABEL: ssubo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmp w0, w1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, vs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2295,7 +2299,7 @@ ; FAST-LABEL: ssubo.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmp x0, x1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, vs ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2330,7 +2334,7 @@ ; FAST-LABEL: usubo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmp w0, w1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, lo ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2365,7 +2369,7 @@ ; FAST-LABEL: usubo.br.i64: ; FAST: // 
%bb.0: // %entry ; FAST-NEXT: cmp x0, x1 -; FAST-NEXT: mov w9, #1 +; FAST-NEXT: mov w9, #1 // =0x1 ; FAST-NEXT: cset w8, lo ; FAST-NEXT: bic w8, w9, w8 ; FAST-NEXT: and w0, w8, #0x1 @@ -2401,7 +2405,7 @@ ; FAST-LABEL: smulo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: smull x9, w0, w1 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: cmp x9, w9, sxtw ; FAST-NEXT: cset w9, ne ; FAST-NEXT: bic w8, w8, w9 @@ -2442,7 +2446,7 @@ ; FAST-LABEL: smulo.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: mul x9, x0, x1 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: smulh x10, x0, x1 ; FAST-NEXT: cmp x10, x9, asr #63 ; FAST-NEXT: cset w9, ne @@ -2481,7 +2485,7 @@ ; FAST-LABEL: smulo2.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn x0, x0 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: cset w9, vs ; FAST-NEXT: bic w8, w8, w9 ; FAST-NEXT: and w0, w8, #0x1 @@ -2517,7 +2521,7 @@ ; FAST-LABEL: umulo.br.i32: ; FAST: // %bb.0: // %entry ; FAST-NEXT: umull x9, w0, w1 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: tst x9, #0xffffffff00000000 ; FAST-NEXT: cset w9, ne ; FAST-NEXT: bic w8, w8, w9 @@ -2556,7 +2560,7 @@ ; FAST-LABEL: umulo.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: umulh x9, x0, x1 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: cmp xzr, x9 ; FAST-NEXT: cset w9, ne ; FAST-NEXT: bic w8, w8, w9 @@ -2593,7 +2597,7 @@ ; FAST-LABEL: umulo2.br.i64: ; FAST: // %bb.0: // %entry ; FAST-NEXT: cmn x0, x0 -; FAST-NEXT: mov w8, #1 +; FAST-NEXT: mov w8, #1 // =0x1 ; FAST-NEXT: cset w9, hs ; FAST-NEXT: bic w8, w8, w9 ; FAST-NEXT: and w0, w8, #0x1 @@ -2621,17 +2625,17 @@ define i8 @pr60530() { ; SDAG-LABEL: pr60530: ; SDAG: // %bb.0: -; SDAG-NEXT: mov w0, #-1 +; SDAG-NEXT: mov w0, #-1 // =0xffffffff ; SDAG-NEXT: ret ; ; FAST-LABEL: pr60530: ; FAST: // %bb.0: -; FAST-NEXT: mov w0, #-1 +; FAST-NEXT: mov w0, #-1 // =0xffffffff ; FAST-NEXT: ret ; ; GISEL-LABEL: pr60530: ; GISEL: // %bb.0: -; GISEL-NEXT: mov w8, #1 +; GISEL-NEXT: mov w8, #1 // =0x1 ; GISEL-NEXT: sbfx w0, w8, #0, #1 ; GISEL-NEXT: ret %1 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 0, i8 1) diff --git a/llvm/test/CodeGen/AArch64/arm64_32-neon.ll b/llvm/test/CodeGen/AArch64/arm64_32-neon.ll --- a/llvm/test/CodeGen/AArch64/arm64_32-neon.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32-neon.ll @@ -1,22 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=arm64_32-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s define <2 x double> @test_insert_elt(<2 x double> %vec, double %val) { ; CHECK-LABEL: test_insert_elt: -; CHECK: mov.d v0[0], v1[0] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov.d v0[0], v1[0] +; CHECK-NEXT: ret %res = insertelement <2 x double> %vec, double %val, i32 0 ret <2 x double> %res } define void @test_split_16B(<4 x float> %val, ptr %addr) { ; CHECK-LABEL: test_split_16B: -; CHECK: str q0, [x0] +; CHECK: ; %bb.0: +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret store <4 x float> %val, ptr %addr, align 8 ret void } define void @test_split_16B_splat(<4 x i32>, ptr %addr) { ; CHECK-LABEL: test_split_16B_splat: -; CHECK: str {{q[0-9]+}} +; CHECK: ; %bb.0: +; CHECK-NEXT: movi.4s v0, #42 +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret %vec.tmp0 = insertelement <4 x i32> undef, i32 42, i32 0 %vec.tmp1 = insertelement <4 x i32> %vec.tmp0, i32 42, i32 1 @@ -33,7 +42,9 @@ declare {%vec, %vec} 
@llvm.aarch64.neon.ld2r.v2f64.p0(ptr) define {%vec, %vec} @test_neon_load(ptr %addr) { ; CHECK-LABEL: test_neon_load: -; CHECK: ld2r.2d { v0, v1 }, [x0] +; CHECK: ; %bb.0: +; CHECK-NEXT: ld2r.2d { v0, v1 }, [x0] +; CHECK-NEXT: ret %res = call {%vec, %vec} @llvm.aarch64.neon.ld2r.v2f64.p0(ptr %addr) ret {%vec, %vec} %res } @@ -41,7 +52,11 @@ declare {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0(%vec, %vec, i64, ptr) define {%vec, %vec} @test_neon_load_lane(ptr %addr, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_load_lane: -; CHECK: ld2.d { v0, v1 }[0], [x0] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ld2.d { v0, v1 }[0], [x0] +; CHECK-NEXT: ret %res = call {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0(%vec %in1, %vec %in2, i64 0, ptr %addr) ret {%vec, %vec} %res } @@ -49,7 +64,11 @@ declare void @llvm.aarch64.neon.st2.v2f64.p0(%vec, %vec, ptr) define void @test_neon_store(ptr %addr, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_store: -; CHECK: st2.2d { v0, v1 }, [x0] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: st2.2d { v0, v1 }, [x0] +; CHECK-NEXT: ret call void @llvm.aarch64.neon.st2.v2f64.p0(%vec %in1, %vec %in2, ptr %addr) ret void } @@ -57,7 +76,11 @@ declare void @llvm.aarch64.neon.st2lane.v2f64.p0(%vec, %vec, i64, ptr) define void @test_neon_store_lane(ptr %addr, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_store_lane: -; CHECK: st2.d { v0, v1 }[1], [x0] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: st2.d { v0, v1 }[1], [x0] +; CHECK-NEXT: ret call void @llvm.aarch64.neon.st2lane.v2f64.p0(%vec %in1, %vec %in2, i64 1, ptr %addr) ret void } @@ -65,8 +88,11 @@ declare {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0(ptr) define {{%vec, %vec}, ptr} @test_neon_load_post(ptr %addr, i32 %offset) { ; CHECK-LABEL: test_neon_load_post: -; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 -; CHECK: ld2.2d { v0, v1 }, [x0], [[OFFSET]] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: ld2.2d { v0, v1 }, [x0], x8 +; CHECK-NEXT: ret %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0(ptr %addr) @@ -79,8 +105,13 @@ define {{%vec, %vec}, ptr} @test_neon_load_post_lane(ptr %addr, i32 %offset, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_load_post_lane: -; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 -; CHECK: ld2.d { v0, v1 }[1], [x0], [[OFFSET]] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ld2.d { v0, v1 }[1], [x0], x8 +; CHECK-NEXT: ret %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0(%vec %in1, %vec %in2, i64 1, ptr %addr) @@ -93,8 +124,13 @@ define ptr @test_neon_store_post(ptr %addr, i32 %offset, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_store_post: -; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 -; CHECK: st2.2d { v0, v1 }, [x0], [[OFFSET]] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 
def $q0_q1 +; CHECK-NEXT: st2.2d { v0, v1 }, [x0], x8 +; CHECK-NEXT: ret call void @llvm.aarch64.neon.st2.v2f64.p0(%vec %in1, %vec %in2, ptr %addr) @@ -105,8 +141,13 @@ define ptr @test_neon_store_post_lane(ptr %addr, i32 %offset, %vec %in1, %vec %in2) { ; CHECK-LABEL: test_neon_store_post_lane: -; CHECK: sxtw [[OFFSET:x[0-9]+]], w1 -; CHECK: st2.d { v0, v1 }[0], [x0], [[OFFSET]] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 +; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: st2.d { v0, v1 }[0], [x0], x8 +; CHECK-NEXT: ret call void @llvm.aarch64.neon.st2lane.v2f64.p0(%vec %in1, %vec %in2, i64 0, ptr %addr) @@ -119,8 +160,11 @@ ; rather than an intrinsic. define {%vec, ptr} @test_neon_ld1_post_lane(ptr %addr, i32 %offset, %vec %in) { ; CHECK-LABEL: test_neon_ld1_post_lane: -; CHECK: sbfiz [[OFFSET:x[0-9]+]], x1, #3, #32 -; CHECK: ld1.d { v0 }[0], [x0], [[OFFSET]] +; CHECK: ; %bb.0: +; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sbfiz x8, x1, #3, #32 +; CHECK-NEXT: ld1.d { v0 }[0], [x0], x8 +; CHECK-NEXT: ret %loaded = load double, ptr %addr, align 8 %newvec = insertelement %vec %in, double %loaded, i32 0 @@ -135,7 +179,9 @@ define {{%vec, %vec}, ptr} @test_neon_load_post_exact(ptr %addr) { ; CHECK-LABEL: test_neon_load_post_exact: -; CHECK: ld2.2d { v0, v1 }, [x0], #32 +; CHECK: ; %bb.0: +; CHECK-NEXT: ld2.2d { v0, v1 }, [x0], #32 +; CHECK-NEXT: ret %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0(ptr %addr) @@ -148,7 +194,10 @@ define {%vec, ptr} @test_neon_ld1_post_lane_exact(ptr %addr, %vec %in) { ; CHECK-LABEL: test_neon_ld1_post_lane_exact: -; CHECK: ld1.d { v0 }[0], [x0], #8 +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d1, [x0], #8 +; CHECK-NEXT: mov.d v0[0], v1[0] +; CHECK-NEXT: ret %loaded = load double, ptr %addr, align 8 %newvec = insertelement %vec %in, double %loaded, i32 0 @@ -165,9 +214,10 @@ ; address wraps. We cannot use post-indexed addressing. 
define {%vec, ptr} @test_neon_ld1_notpost_lane_exact(ptr %addr, %vec %in) { ; CHECK-LABEL: test_neon_ld1_notpost_lane_exact: -; CHECK-NOT: ld1.d { {{v[0-9]+}} }[0], [{{x[0-9]+|sp}}], #8 -; CHECK: add w0, w0, #8 -; CHECK: ret +; CHECK: ; %bb.0: +; CHECK-NEXT: ld1.d { v0 }[0], [x0] +; CHECK-NEXT: add w0, w0, #8 +; CHECK-NEXT: ret %loaded = load double, ptr %addr, align 8 %newvec = insertelement %vec %in, double %loaded, i32 0 @@ -182,9 +232,10 @@ define {%vec, ptr} @test_neon_ld1_notpost_lane(ptr %addr, i32 %offset, %vec %in) { ; CHECK-LABEL: test_neon_ld1_notpost_lane: -; CHECK-NOT: ld1.d { {{v[0-9]+}} }[0], [{{x[0-9]+|sp}}], {{x[0-9]+|sp}} -; CHECK: add w0, w0, w1, lsl #3 -; CHECK: ret +; CHECK: ; %bb.0: +; CHECK-NEXT: ld1.d { v0 }[0], [x0] +; CHECK-NEXT: add w0, w0, w1, lsl #3 +; CHECK-NEXT: ret %loaded = load double, ptr %addr, align 8 %newvec = insertelement %vec %in, double %loaded, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64_32.ll b/llvm/test/CodeGen/AArch64/arm64_32.ll --- a/llvm/test/CodeGen/AArch64/arm64_32.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=arm64_32-apple-ios7.0 %s -filetype=obj -o - -disable-post-ra -frame-pointer=non-leaf | \ ; RUN: llvm-objdump --private-headers - | \ ; RUN: FileCheck %s --check-prefix=CHECK-MACHO @@ -13,11 +14,24 @@ @var_got = external global i8 define ptr @test_global_addr() { -; CHECK-LABEL: test_global_addr: -; CHECK: adrp [[PAGE:x[0-9]+]], _var32@PAGE -; CHECK-OPT: add x0, [[PAGE]], _var32@PAGEOFF -; CHECK-FAST: add [[TMP:x[0-9]+]], [[PAGE]], _var32@PAGEOFF -; CHECK-FAST: and x0, [[TMP]], #0xffffffff +; CHECK-OPT-LABEL: test_global_addr: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: Lloh0: +; CHECK-OPT-NEXT: adrp x0, _var32@PAGE +; CHECK-OPT-NEXT: Lloh1: +; CHECK-OPT-NEXT: add x0, x0, _var32@PAGEOFF +; CHECK-OPT-NEXT: ret +; CHECK-OPT-NEXT: .loh AdrpAdd Lloh0, Lloh1 +; +; CHECK-FAST-LABEL: test_global_addr: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: Lloh0: +; CHECK-FAST-NEXT: adrp x8, _var32@PAGE +; CHECK-FAST-NEXT: Lloh1: +; CHECK-FAST-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-FAST-NEXT: and x0, x8, #0xffffffff +; CHECK-FAST-NEXT: ret +; CHECK-FAST-NEXT: .loh AdrpAdd Lloh0, Lloh1 ret ptr @var32 } @@ -25,19 +39,36 @@ ; gets truncated to 32-bits, it's free. No need to zero out higher bits of that ; register. 
define i64 @test_global_addr_extension() { -; CHECK-LABEL: test_global_addr_extension: -; CHECK: adrp [[PAGE:x[0-9]+]], _var32@PAGE -; CHECK: add x0, [[PAGE]], _var32@PAGEOFF -; CHECK-NOT: and -; CHECK: ret +; CHECK-OPT-LABEL: test_global_addr_extension: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: Lloh2: +; CHECK-OPT-NEXT: adrp x0, _var32@PAGE +; CHECK-OPT-NEXT: Lloh3: +; CHECK-OPT-NEXT: add x0, x0, _var32@PAGEOFF +; CHECK-OPT-NEXT: ret +; CHECK-OPT-NEXT: .loh AdrpAdd Lloh2, Lloh3 +; +; CHECK-FAST-LABEL: test_global_addr_extension: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: Lloh2: +; CHECK-FAST-NEXT: adrp x8, _var32@PAGE +; CHECK-FAST-NEXT: Lloh3: +; CHECK-FAST-NEXT: add x0, x8, _var32@PAGEOFF +; CHECK-FAST-NEXT: ret +; CHECK-FAST-NEXT: .loh AdrpAdd Lloh2, Lloh3 ret i64 ptrtoint(ptr @var32 to i64) } define i32 @test_global_value() { ; CHECK-LABEL: test_global_value: -; CHECK: adrp x[[PAGE:[0-9]+]], _var32@PAGE -; CHECK: ldr w0, [x[[PAGE]], _var32@PAGEOFF] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh4: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh5: +; CHECK-NEXT: ldr w0, [x8, _var32@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5 %val = load i32, ptr @var32, align 4 ret i32 %val } @@ -45,9 +76,15 @@ ; Because the addition may wrap, it is not safe to use "ldr w0, [xN, #32]" here. define i32 @test_unsafe_indexed_add() { ; CHECK-LABEL: test_unsafe_indexed_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh6: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #32 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh6, Lloh7 %addr_int = ptrtoint ptr @var32 to i32 %addr_plus_32 = add i32 %addr_int, 32 %addr = inttoptr i32 %addr_plus_32 to ptr @@ -59,9 +96,15 @@ ; 32-bytes below 2^32, and we can use the load this time. define i32 @test_safe_indexed_add() { ; CHECK-LABEL: test_safe_indexed_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh8: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh9: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #32 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh8, Lloh9 %addr_int = ptrtoint ptr @var32 to i64 %addr_plus_32 = add nuw i64 %addr_int, 32 %addr = inttoptr i64 %addr_plus_32 to ptr @@ -71,9 +114,11 @@ define i32 @test_safe_indexed_or(i32 %in) { ; CHECK-LABEL: test_safe_indexed_or: -; CHECK: and [[TMP:w[0-9]+]], {{w[0-9]+}}, #0xfffffff0 -; CHECK: orr w[[ADDR:[0-9]+]], [[TMP]], #0x4 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: and w8, w0, #0xfffffff0 +; CHECK-NEXT: orr w8, w8, #0x4 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret %addr_int = and i32 %in, -16 %addr_plus_4 = or i32 %addr_int, 4 %addr = inttoptr i32 %addr_plus_4 to ptr @@ -87,10 +132,15 @@ ; "sext(base) + sext(offset) == base + offset". 
define i32 @test_unsafe_nsw_indexed_add() { ; CHECK-LABEL: test_unsafe_nsw_indexed_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 -; CHECK-NOT: ubfx -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh10: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh11: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #32 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh10, Lloh11 %addr_int = ptrtoint ptr @var32 to i32 %addr_plus_32 = add nsw i32 %addr_int, 32 %addr = inttoptr i32 %addr_plus_32 to ptr @@ -101,9 +151,15 @@ ; Because the addition may wrap, it is not safe to use "ldr w0, [xN, #32]" here. define i32 @test_unsafe_unscaled_add() { ; CHECK-LABEL: test_unsafe_unscaled_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh12: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh13: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #3 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh12, Lloh13 %addr_int = ptrtoint ptr @var32 to i32 %addr_plus_3 = add i32 %addr_int, 3 %addr = inttoptr i32 %addr_plus_3 to ptr @@ -115,9 +171,15 @@ ; 32-bytes below 2^32, and we can use the load this time. define i32 @test_safe_unscaled_add() { ; CHECK-LABEL: test_safe_unscaled_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh14: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh15: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #3 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh14, Lloh15 %addr_int = ptrtoint ptr @var32 to i32 %addr_plus_3 = add nuw i32 %addr_int, 3 %addr = inttoptr i32 %addr_plus_3 to ptr @@ -130,10 +192,15 @@ ; "sext(base) + sext(offset) == base + offset". define i32 @test_unsafe_nsw_unscaled_add() { ; CHECK-LABEL: test_unsafe_nsw_unscaled_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 -; CHECK-NOT: ubfx -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh16: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh17: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: add w8, w8, #3 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh16, Lloh17 %addr_int = ptrtoint ptr @var32 to i32 %addr_plus_3 = add nsw i32 %addr_int, 3 %addr = inttoptr i32 %addr_plus_3 to ptr @@ -145,9 +212,15 @@ ; here. 
define i32 @test_unsafe_negative_unscaled_add() { ; CHECK-LABEL: test_unsafe_negative_unscaled_add: -; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF -; CHECK: sub w[[ADDR:[0-9]+]], w[[VAR32]], #3 -; CHECK: ldr w0, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh18: +; CHECK-NEXT: adrp x8, _var32@PAGE +; CHECK-NEXT: Lloh19: +; CHECK-NEXT: add x8, x8, _var32@PAGEOFF +; CHECK-NEXT: sub w8, w8, #3 +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh18, Lloh19 %addr_int = ptrtoint ptr @var32 to i32 %addr_minus_3 = add i32 %addr_int, -3 %addr = inttoptr i32 %addr_minus_3 to ptr @@ -156,24 +229,39 @@ } define ptr @test_got_addr() { -; CHECK-LABEL: test_got_addr: -; CHECK: adrp x[[PAGE:[0-9]+]], _var_got@GOTPAGE -; CHECK-OPT: ldr w0, [x[[PAGE]], _var_got@GOTPAGEOFF] -; CHECK-FAST: ldr w[[TMP:[0-9]+]], [x[[PAGE]], _var_got@GOTPAGEOFF] -; CHECK-FAST: and x0, x[[TMP]], #0xffffffff +; CHECK-OPT-LABEL: test_got_addr: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: Lloh20: +; CHECK-OPT-NEXT: adrp x0, _var_got@GOTPAGE +; CHECK-OPT-NEXT: Lloh21: +; CHECK-OPT-NEXT: ldr w0, [x0, _var_got@GOTPAGEOFF] +; CHECK-OPT-NEXT: ret +; CHECK-OPT-NEXT: .loh AdrpLdrGot Lloh20, Lloh21 +; +; CHECK-FAST-LABEL: test_got_addr: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: Lloh20: +; CHECK-FAST-NEXT: adrp x8, _var_got@GOTPAGE +; CHECK-FAST-NEXT: Lloh21: +; CHECK-FAST-NEXT: ldr w8, [x8, _var_got@GOTPAGEOFF] +; CHECK-FAST-NEXT: and x0, x8, #0xffffffff +; CHECK-FAST-NEXT: ret +; CHECK-FAST-NEXT: .loh AdrpLdrGot Lloh20, Lloh21 ret ptr @var_got } define float @test_va_arg_f32(ptr %list) { ; CHECK-LABEL: test_va_arg_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: add w9, w8, #8 +; CHECK-NEXT: str w9, [x0] +; CHECK-NEXT: ldr d0, [x8] +; CHECK-NEXT: fcvt s0, d0 +; CHECK-NEXT: ret -; CHECK: ldr w[[START:[0-9]+]], [x0] -; CHECK: add [[AFTER:w[0-9]+]], w[[START]], #8 -; CHECK: str [[AFTER]], [x0] ; Floating point arguments get promoted to double as per C99. -; CHECK: ldr [[DBL:d[0-9]+]], [x[[START]]] -; CHECK: fcvt s0, [[DBL]] %res = va_arg ptr %list, float ret float %res } @@ -181,13 +269,15 @@ ; Interesting point is that the slot is 4 bytes. define i8 @test_va_arg_i8(ptr %list) { ; CHECK-LABEL: test_va_arg_i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: add w9, w8, #4 +; CHECK-NEXT: str w9, [x0] +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: ret -; CHECK: ldr w[[START:[0-9]+]], [x0] -; CHECK: add [[AFTER:w[0-9]+]], w[[START]], #4 -; CHECK: str [[AFTER]], [x0] ; i8 gets promoted to int (again, as per C99). -; CHECK: ldr w0, [x[[START]]] %res = va_arg ptr %list, i8 ret i8 %res @@ -197,16 +287,18 @@ ; bytes). define i64 @test_va_arg_i64(ptr %list) { ; CHECK-LABEL: test_va_arg_i64: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: add x8, x8, #7 +; CHECK-NEXT: and x8, x8, #0x1fffffff8 +; CHECK-NEXT: add w9, w8, #8 +; CHECK-NEXT: str w9, [x0] +; CHECK-NEXT: ldr x0, [x8] +; CHECK-NEXT: ret ; Update the list for the next user (minimum slot size is 4, but the actual ; argument is 8 which had better be reflected!) -; CHECK: ldr w[[UNALIGNED_START:[0-9]+]], [x0] -; CHECK: add [[ALIGN_TMP:x[0-9]+]], x[[UNALIGNED_START]], #7 -; CHECK: and x[[START:[0-9]+]], [[ALIGN_TMP]], #0x1fffffff8 -; CHECK: add w[[AFTER:[0-9]+]], w[[START]], #8 -; CHECK: str w[[AFTER]], [x0] -; CHECK: ldr x0, [x[[START]]] %res = va_arg ptr %list, i64 ret i64 %res @@ -214,14 +306,47 @@ declare void @bar(...) 
define void @test_va_call(i8 %l, i8 %r, float %in, ptr %ptr) { -; CHECK-LABEL: test_va_call: -; CHECK: add [[SUM:w[0-9]+]], {{w[0-9]+}}, w1 +; CHECK-OPT-LABEL: test_va_call: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: sub sp, sp, #64 +; CHECK-OPT-NEXT: .cfi_def_cfa_offset 64 +; CHECK-OPT-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-OPT-NEXT: add x29, sp, #48 +; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16 +; CHECK-OPT-NEXT: .cfi_offset w30, -8 +; CHECK-OPT-NEXT: .cfi_offset w29, -16 +; CHECK-OPT-NEXT: add w8, w0, w1 +; CHECK-OPT-NEXT: str w2, [sp, #32] +; CHECK-OPT-NEXT: str xzr, [sp, #24] +; CHECK-OPT-NEXT: str s0, [sp, #16] +; CHECK-OPT-NEXT: str xzr, [sp, #8] +; CHECK-OPT-NEXT: str w8, [sp] +; CHECK-OPT-NEXT: bl _bar +; CHECK-OPT-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-OPT-NEXT: add sp, sp, #64 +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_va_call: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: sub sp, sp, #64 +; CHECK-FAST-NEXT: .cfi_def_cfa_offset 64 +; CHECK-FAST-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-FAST-NEXT: add x29, sp, #48 +; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16 +; CHECK-FAST-NEXT: .cfi_offset w30, -8 +; CHECK-FAST-NEXT: .cfi_offset w29, -16 +; CHECK-FAST-NEXT: sxtb w8, w0 +; CHECK-FAST-NEXT: add w8, w8, w1, sxtb +; CHECK-FAST-NEXT: str w2, [sp, #32] +; CHECK-FAST-NEXT: str xzr, [sp, #24] +; CHECK-FAST-NEXT: str s0, [sp, #16] +; CHECK-FAST-NEXT: str xzr, [sp, #8] +; CHECK-FAST-NEXT: str w8, [sp] +; CHECK-FAST-NEXT: bl _bar +; CHECK-FAST-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-FAST-NEXT: add sp, sp, #64 +; CHECK-FAST-NEXT: ret -; CHECK-DAG: str w2, [sp, #32] -; CHECK-DAG: str xzr, [sp, #24] -; CHECK-DAG: str s0, [sp, #16] -; CHECK-DAG: str xzr, [sp, #8] -; CHECK-DAG: str [[SUM]], [sp] ; Add them to ensure real promotion occurs. %sum = add i8 %l, %r @@ -232,10 +357,30 @@ declare ptr @llvm.frameaddress(i32) define ptr @test_frameaddr() { -; CHECK-LABEL: test_frameaddr: -; CHECK-OPT: ldr x0, [x29] -; CHECK-FAST: ldr [[TMP:x[0-9]+]], [x29] -; CHECK-FAST: and x0, [[TMP]], #0xffffffff +; CHECK-OPT-LABEL: test_frameaddr: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OPT-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OPT-NEXT: mov x29, sp +; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16 +; CHECK-OPT-NEXT: .cfi_offset w30, -8 +; CHECK-OPT-NEXT: .cfi_offset w29, -16 +; CHECK-OPT-NEXT: ldr x0, [x29] +; CHECK-OPT-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_frameaddr: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-FAST-NEXT: .cfi_def_cfa_offset 16 +; CHECK-FAST-NEXT: mov x29, sp +; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16 +; CHECK-FAST-NEXT: .cfi_offset w30, -8 +; CHECK-FAST-NEXT: .cfi_offset w29, -16 +; CHECK-FAST-NEXT: ldr x8, [x29] +; CHECK-FAST-NEXT: and x0, x8, #0xffffffff +; CHECK-FAST-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-FAST-NEXT: ret %val = call ptr @llvm.frameaddress(i32 1) ret ptr %val } @@ -243,28 +388,77 @@ declare ptr @llvm.returnaddress(i32) define ptr @test_toplevel_returnaddr() { -; CHECK-LABEL: test_toplevel_returnaddr: -; CHECK-OPT: mov x0, x30 -; CHECK-FAST: and x0, x30, #0xffffffff +; CHECK-OPT-LABEL: test_toplevel_returnaddr: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-OPT-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OPT-NEXT: .cfi_offset w30, -8 +; CHECK-OPT-NEXT: .cfi_offset w29, -16 +; CHECK-OPT-NEXT: hint #7 +; CHECK-OPT-NEXT: mov x0, x30 +; CHECK-OPT-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_toplevel_returnaddr: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-FAST-NEXT: .cfi_def_cfa_offset 16 +; CHECK-FAST-NEXT: .cfi_offset w30, -8 +; CHECK-FAST-NEXT: .cfi_offset w29, -16 +; CHECK-FAST-NEXT: hint #7 +; CHECK-FAST-NEXT: and x0, x30, #0xffffffff +; CHECK-FAST-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-FAST-NEXT: ret %val = call ptr @llvm.returnaddress(i32 0) ret ptr %val } define ptr @test_deep_returnaddr() { -; CHECK-LABEL: test_deep_returnaddr: -; CHECK: ldr x[[FRAME_REC:[0-9]+]], [x29] -; CHECK-OPT: ldr x30, [x[[FRAME_REC]], #8] -; CHECK-OPT: hint #7 -; CHECK-OPT: mov x0, x30 -; CHECK-FAST: ldr [[TMP:x[0-9]+]], [x[[FRAME_REC]], #8] -; CHECK-FAST: and x0, [[TMP]], #0xffffffff +; CHECK-OPT-LABEL: test_deep_returnaddr: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OPT-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OPT-NEXT: mov x29, sp +; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16 +; CHECK-OPT-NEXT: .cfi_offset w30, -8 +; CHECK-OPT-NEXT: .cfi_offset w29, -16 +; CHECK-OPT-NEXT: ldr x8, [x29] +; CHECK-OPT-NEXT: ldr x30, [x8, #8] +; CHECK-OPT-NEXT: hint #7 +; CHECK-OPT-NEXT: mov x0, x30 +; CHECK-OPT-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_deep_returnaddr: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-FAST-NEXT: .cfi_def_cfa_offset 16 +; CHECK-FAST-NEXT: mov x29, sp +; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16 +; CHECK-FAST-NEXT: .cfi_offset w30, -8 +; CHECK-FAST-NEXT: .cfi_offset w29, -16 +; CHECK-FAST-NEXT: ldr x8, [x29] +; CHECK-FAST-NEXT: ldr x30, [x8, #8] +; CHECK-FAST-NEXT: hint #7 +; CHECK-FAST-NEXT: and x0, x30, #0xffffffff +; CHECK-FAST-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-FAST-NEXT: ret %val = call ptr @llvm.returnaddress(i32 1) ret ptr %val } define void @test_indirect_call(ptr %func) { ; CHECK-LABEL: test_indirect_call: -; CHECK: blr x0 +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: blr x0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret call void() %func() ret void } @@ -272,9 +466,17 @@ ; Safe to use the unextended address here define void @test_indirect_safe_call(ptr %weird_funcs) { ; CHECK-LABEL: test_indirect_safe_call: -; CHECK: add w[[ADDR32:[0-9]+]], w0, #4 -; CHECK-OPT-NOT: ubfx -; CHECK: blr x[[ADDR32]] +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add w8, w0, #4 +; CHECK-NEXT: blr x8 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret %addr = getelementptr i32, ptr %weird_funcs, i32 1 call void() %addr() ret void @@ -283,14 +485,16 @@ declare void @simple() define void @test_simple_tail_call() { ; CHECK-LABEL: test_simple_tail_call: -; CHECK: b _simple +; CHECK: ; %bb.0: +; CHECK-NEXT: b _simple tail call void @simple() ret void } define void @test_indirect_tail_call(ptr %func) { ; CHECK-LABEL: test_indirect_tail_call: -; CHECK: br x0 +; CHECK: ; %bb.0: +; CHECK-NEXT: br x0 tail call void() %func() ret void } @@ -298,9 +502,9 @@ ; Safe to use the unextended address here define void @test_indirect_safe_tail_call(ptr %weird_funcs) { ; CHECK-LABEL: test_indirect_safe_tail_call: -; CHECK: add w[[ADDR32:[0-9]+]], w0, #4 -; CHECK-OPT-NOT: ubfx -; CHECK-OPT: br x[[ADDR32]] +; CHECK: ; %bb.0: +; CHECK-NEXT: add w0, w0, #4 +; CHECK-NEXT: br x0 %addr = getelementptr i32, ptr %weird_funcs, i32 1 tail call void() %addr() ret void @@ -312,14 +516,20 @@ define i32 @test_in_smallstruct_low([3 x i32] %in) { ; CHECK-LABEL: test_in_smallstruct_low: -; CHECK: mov x0, x1 +; CHECK: ; %bb.0: +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret %val = extractvalue [3 x i32] %in, 2 ret i32 %val } define i32 @test_in_smallstruct_high([3 x i32] %in) { ; CHECK-LABEL: test_in_smallstruct_high: -; CHECK: lsr x0, x0, #32 +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr x0, x0, #32 +; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret %val = extractvalue [3 x i32] %in, 1 ret i32 %val } @@ -329,15 +539,19 @@ ; be incompatible with the armv7k ABI. define i32 @test_in_smallstruct_stack([8 x i64], i32, [3 x i32] %in) { ; CHECK-LABEL: test_in_smallstruct_stack: -; CHECK: ldr w0, [sp, #4] +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr w0, [sp, #4] +; CHECK-NEXT: ret %val = extractvalue [3 x i32] %in, 0 ret i32 %val } define [2 x i32] @test_ret_smallstruct([3 x i32] %in) { ; CHECK-LABEL: test_ret_smallstruct: -; CHECK: mov x0, #1 -; CHECK: movk x0, #2, lsl #32 +; CHECK: ; %bb.0: +; CHECK-NEXT: mov x0, #1 ; =0x1 +; CHECK-NEXT: movk x0, #2, lsl #32 +; CHECK-NEXT: ret ret [2 x i32] [i32 1, i32 2] } @@ -345,11 +559,20 @@ declare void @smallstruct_callee([4 x i32]) define void @test_call_smallstruct() { ; CHECK-LABEL: test_call_smallstruct: -; CHECK: mov x0, #1 -; CHECK: movk x0, #2, lsl #32 -; CHECK: mov x1, #3 -; CHECK: movk x1, #4, lsl #32 -; CHECK: bl _smallstruct_callee +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x0, #1 ; =0x1 +; CHECK-NEXT: movk x0, #2, lsl #32 +; CHECK-NEXT: mov x1, #3 ; =0x3 +; CHECK-NEXT: movk x1, #4, lsl #32 +; CHECK-NEXT: bl _smallstruct_callee +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret call void @smallstruct_callee([4 x i32] [i32 1, i32 2, i32 3, i32 4]) ret void @@ -358,9 +581,21 @@ declare void @smallstruct_callee_stack([8 x i64], i32, [2 x i32]) define void @test_call_smallstruct_stack() { ; CHECK-LABEL: test_call_smallstruct_stack: -; CHECK: mov [[VAL:x[0-9]+]], #1 -; CHECK: movk [[VAL]], #2, lsl #32 -; CHECK: stur [[VAL]], [sp, #4] +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #16 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, #1 ; =0x1 +; CHECK-NEXT: movk x8, #2, lsl #32 +; CHECK-NEXT: stur x8, [sp, #4] +; CHECK-NEXT: bl _smallstruct_callee_stack +; CHECK-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret call void @smallstruct_callee_stack([8 x i64] undef, i32 undef, [2 x i32] [i32 1, i32 2]) ret void @@ -369,8 +604,18 @@ declare [3 x i32] @returns_smallstruct() define i32 @test_use_smallstruct_low() { ; CHECK-LABEL: test_use_smallstruct_low: -; CHECK: bl _returns_smallstruct -; CHECK: mov x0, x1 +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: bl _returns_smallstruct +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret %struct = call [3 x i32] @returns_smallstruct() %val = extractvalue [3 x i32] %struct, 2 @@ -379,8 +624,18 @@ define i32 @test_use_smallstruct_high() { ; CHECK-LABEL: test_use_smallstruct_high: -; CHECK: bl _returns_smallstruct -; CHECK: lsr x0, x0, #32 +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: bl _returns_smallstruct +; CHECK-NEXT: lsr x0, x0, #32 +; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret %struct = call [3 x i32] @returns_smallstruct() %val = extractvalue [3 x i32] %struct, 1 @@ -391,10 +646,19 @@ ; be marked as unavailable and subsequent GPR arguments should also be on the ; stack. Obviously the struct itself should be passed entirely on the stack. 
define i32 @test_smallstruct_padding([7 x i64], [4 x i32] %struct, i32 %in) { -; CHECK-LABEL: test_smallstruct_padding: -; CHECK-DAG: ldr [[IN:w[0-9]+]], [sp, #16] -; CHECK-DAG: ldr [[LHS:w[0-9]+]], [sp] -; CHECK: add w0, [[LHS]], [[IN]] +; CHECK-OPT-LABEL: test_smallstruct_padding: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: ldr w8, [sp, #16] +; CHECK-OPT-NEXT: ldr w9, [sp] +; CHECK-OPT-NEXT: add w0, w9, w8 +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_smallstruct_padding: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: ldr w8, [sp] +; CHECK-FAST-NEXT: ldr w9, [sp, #16] +; CHECK-FAST-NEXT: add w0, w8, w9 +; CHECK-FAST-NEXT: ret %lhs = extractvalue [4 x i32] %struct, 0 %sum = add i32 %lhs, %in ret i32 %sum @@ -403,17 +667,31 @@ declare void @take_small_smallstruct(i64, [1 x i32]) define void @test_small_smallstruct() { ; CHECK-LABEL: test_small_smallstruct: -; CHECK-DAG: mov w0, #1 -; CHECK-DAG: mov w1, #2 -; CHECK: bl _take_small_smallstruct +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov w0, #1 ; =0x1 +; CHECK-NEXT: mov w1, #2 ; =0x2 +; CHECK-NEXT: bl _take_small_smallstruct +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret call void @take_small_smallstruct(i64 1, [1 x i32] [i32 2]) ret void } define void @test_bare_frameaddr(ptr %addr) { ; CHECK-LABEL: test_bare_frameaddr: -; CHECK: add x[[LOCAL:[0-9]+]], sp, #{{[0-9]+}} -; CHECK: str w[[LOCAL]], +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #15 +; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %ptr = alloca i8 store ptr %ptr, ptr %addr, align 4 @@ -422,15 +700,29 @@ define void @test_sret_use(ptr sret([8 x i64]) %out) { ; CHECK-LABEL: test_sret_use: -; CHECK: str xzr, [x8] +; CHECK: ; %bb.0: +; CHECK-NEXT: str xzr, [x8] +; CHECK-NEXT: ret store i64 0, ptr %out ret void } define i64 @test_sret_call() { ; CHECK-LABEL: test_sret_call: -; CHECK: mov x8, sp -; CHECK: bl _test_sret_use +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: bl _test_sret_use +; CHECK-NEXT: ldr x0, [sp] +; CHECK-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret %arr = alloca [8 x i64] call void @test_sret_use(ptr sret([8 x i64]) %arr) @@ -440,16 +732,27 @@ define double @test_constpool() { ; CHECK-LABEL: test_constpool: -; CHECK: adrp x[[PAGE:[0-9]+]], [[POOL:lCPI[0-9]+_[0-9]+]]@PAGE -; CHECK: ldr d0, [x[[PAGE]], [[POOL]]@PAGEOFF] +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh22: +; CHECK-NEXT: adrp x8, lCPI37_0@PAGE +; CHECK-NEXT: Lloh23: +; CHECK-NEXT: ldr d0, [x8, lCPI37_0@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh22, Lloh23 ret double 1.0e-6 } define ptr @test_blockaddress() { ; CHECK-LABEL: test_blockaddress: -; CHECK: [[BLOCK:Ltmp[0-9]+]]: -; CHECK: adrp x[[PAGE:[0-9]+]], lCPI{{[0-9]+_[0-9]+}}@PAGE -; CHECK: ldr x0, [x[[PAGE]], lCPI{{[0-9]+_[0-9]+}}@PAGEOFF] +; CHECK: ; %bb.0: +; CHECK-NEXT: Ltmp7: ; Block address taken +; CHECK-NEXT: ; %bb.1: ; %dest +; CHECK-NEXT: Lloh24: 
+; CHECK-NEXT: adrp x0, lCPI38_0@PAGE +; CHECK-NEXT: Lloh25: +; CHECK-NEXT: ldr x0, [x0, lCPI38_0@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh24, Lloh25 br label %dest dest: ret ptr blockaddress(@test_blockaddress, %dest) @@ -457,7 +760,24 @@ define ptr @test_indirectbr(ptr %dest) { ; CHECK-LABEL: test_indirectbr: -; CHECK: br x0 +; CHECK: ; %bb.0: +; CHECK-NEXT: br x0 +; CHECK-NEXT: Ltmp8: ; Block address taken +; CHECK-NEXT: LBB39_1: ; %true +; CHECK-NEXT: Lloh26: +; CHECK-NEXT: adrp x0, lCPI39_0@PAGE +; CHECK-NEXT: Lloh27: +; CHECK-NEXT: ldr x0, [x0, lCPI39_0@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: Ltmp9: ; Block address taken +; CHECK-NEXT: LBB39_2: ; %false +; CHECK-NEXT: Lloh28: +; CHECK-NEXT: adrp x0, lCPI39_1@PAGE +; CHECK-NEXT: Lloh29: +; CHECK-NEXT: ldr x0, [x0, lCPI39_1@PAGEOFF] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh26, Lloh27 +; CHECK-NEXT: .loh AdrpLdr Lloh28, Lloh29 indirectbr ptr %dest, [label %true, label %false] true: @@ -471,7 +791,12 @@ ; claim the FI in the process -- it doesn't need extending. define float @test_frameindex_offset_load() { ; CHECK-LABEL: test_frameindex_offset_load: -; CHECK: ldr s0, [sp, #4] +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr s0, [sp, #4] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %arr = alloca float, i32 4, align 8 %addr = getelementptr inbounds float, ptr %arr, i32 1 @@ -481,10 +806,15 @@ define void @test_unaligned_frameindex_offset_store() { ; CHECK-LABEL: test_unaligned_frameindex_offset_store: -; CHECK: mov x[[TMP:[0-9]+]], sp -; CHECK: orr w[[ADDR:[0-9]+]], w[[TMP]], #0x2 -; CHECK: mov [[VAL:w[0-9]+]], #42 -; CHECK: str [[VAL]], [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: orr w8, w8, #0x2 +; CHECK-NEXT: mov w9, #42 ; =0x2a +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret %arr = alloca [4 x i32] %addr.int = ptrtoint ptr %arr to i32 @@ -497,9 +827,11 @@ define {i64, ptr} @test_pre_idx(ptr %addr) { ; CHECK-LABEL: test_pre_idx: +; CHECK: ; %bb.0: +; CHECK-NEXT: add w1, w0, #8 +; CHECK-NEXT: ldr x0, [x1] +; CHECK-NEXT: ret -; CHECK: add w[[ADDR:[0-9]+]], w0, #8 -; CHECK: ldr x0, [x[[ADDR]]] %addr.int = ptrtoint ptr %addr to i32 %addr.next.int = add nuw i32 %addr.int, 8 %addr.next = inttoptr i32 %addr.next.int to ptr @@ -515,8 +847,10 @@ ; %addr wraps round to 0. 
define {i64, ptr} @test_invalid_pre_idx(ptr %addr) { ; CHECK-LABEL: test_invalid_pre_idx: -; CHECK: add w1, w0, #8 -; CHECK: ldr x0, [x1] +; CHECK: ; %bb.0: +; CHECK-NEXT: add w1, w0, #8 +; CHECK-NEXT: ldr x0, [x1] +; CHECK-NEXT: ret %addr.next = getelementptr i64, ptr %addr, i32 1 %val = load i64, ptr %addr.next @@ -528,24 +862,81 @@ declare void @callee(ptr) define void @test_stack_guard() ssp { -; CHECK-LABEL: test_stack_guard: -; CHECK: adrp x[[GUARD_GOTPAGE:[0-9]+]], ___stack_chk_guard@GOTPAGE -; CHECK: ldr w[[GUARD_ADDR:[0-9]+]], [x[[GUARD_GOTPAGE]], ___stack_chk_guard@GOTPAGEOFF] -; CHECK: ldr [[GUARD_VAL:w[0-9]+]], [x[[GUARD_ADDR]]] -; CHECK: stur [[GUARD_VAL]], [x29, #[[GUARD_OFFSET:-[0-9]+]]] - -; CHECK: add x0, sp, #{{[0-9]+}} -; CHECK: bl _callee - -; CHECK-OPT: adrp x[[GUARD_GOTPAGE:[0-9]+]], ___stack_chk_guard@GOTPAGE -; CHECK-OPT: ldr w[[GUARD_ADDR:[0-9]+]], [x[[GUARD_GOTPAGE]], ___stack_chk_guard@GOTPAGEOFF] -; CHECK-OPT: ldr [[GUARD_VAL:w[0-9]+]], [x[[GUARD_ADDR]]] -; CHECK-OPT: ldur [[NEW_VAL:w[0-9]+]], [x29, #[[GUARD_OFFSET]]] -; CHECK-OPT: cmp [[GUARD_VAL]], [[NEW_VAL]] -; CHECK-OPT: b.ne [[FAIL:LBB[0-9]+_[0-9]+]] - -; CHECK-OPT: [[FAIL]]: -; CHECK-OPT-NEXT: bl ___stack_chk_fail +; CHECK-OPT-LABEL: test_stack_guard: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: sub sp, sp, #64 +; CHECK-OPT-NEXT: .cfi_def_cfa_offset 64 +; CHECK-OPT-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-OPT-NEXT: add x29, sp, #48 +; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16 +; CHECK-OPT-NEXT: .cfi_offset w30, -8 +; CHECK-OPT-NEXT: .cfi_offset w29, -16 +; CHECK-OPT-NEXT: Lloh30: +; CHECK-OPT-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE +; CHECK-OPT-NEXT: Lloh31: +; CHECK-OPT-NEXT: ldr w8, [x8, ___stack_chk_guard@GOTPAGEOFF] +; CHECK-OPT-NEXT: Lloh32: +; CHECK-OPT-NEXT: ldr w8, [x8] +; CHECK-OPT-NEXT: stur w8, [x29, #-4] +; CHECK-OPT-NEXT: add x0, sp, #12 +; CHECK-OPT-NEXT: bl _callee +; CHECK-OPT-NEXT: Lloh33: +; CHECK-OPT-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE +; CHECK-OPT-NEXT: Lloh34: +; CHECK-OPT-NEXT: ldr w8, [x8, ___stack_chk_guard@GOTPAGEOFF] +; CHECK-OPT-NEXT: Lloh35: +; CHECK-OPT-NEXT: ldr w8, [x8] +; CHECK-OPT-NEXT: ldur w9, [x29, #-4] +; CHECK-OPT-NEXT: cmp w8, w9 +; CHECK-OPT-NEXT: b.ne LBB44_2 +; CHECK-OPT-NEXT: ; %bb.1: +; CHECK-OPT-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-OPT-NEXT: add sp, sp, #64 +; CHECK-OPT-NEXT: ret +; CHECK-OPT-NEXT: LBB44_2: +; CHECK-OPT-NEXT: bl ___stack_chk_fail +; CHECK-OPT-NEXT: .loh AdrpLdrGotLdr Lloh33, Lloh34, Lloh35 +; CHECK-OPT-NEXT: .loh AdrpLdrGotLdr Lloh30, Lloh31, Lloh32 +; +; CHECK-FAST-LABEL: test_stack_guard: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: sub sp, sp, #64 +; CHECK-FAST-NEXT: .cfi_def_cfa_offset 64 +; CHECK-FAST-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-FAST-NEXT: add x29, sp, #48 +; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16 +; CHECK-FAST-NEXT: .cfi_offset w30, -8 +; CHECK-FAST-NEXT: .cfi_offset w29, -16 +; CHECK-FAST-NEXT: Lloh30: +; CHECK-FAST-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE +; CHECK-FAST-NEXT: Lloh31: +; CHECK-FAST-NEXT: ldr w8, [x8, ___stack_chk_guard@GOTPAGEOFF] +; CHECK-FAST-NEXT: Lloh32: +; CHECK-FAST-NEXT: ldr w8, [x8] +; CHECK-FAST-NEXT: stur w8, [x29, #-4] +; CHECK-FAST-NEXT: add x0, sp, #12 +; CHECK-FAST-NEXT: bl _callee +; CHECK-FAST-NEXT: Lloh33: +; CHECK-FAST-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE +; CHECK-FAST-NEXT: Lloh34: +; CHECK-FAST-NEXT: ldr w8, [x8, ___stack_chk_guard@GOTPAGEOFF] +; CHECK-FAST-NEXT: Lloh35: +; CHECK-FAST-NEXT: ldr w8, [x8] +; 
CHECK-FAST-NEXT: ldur w9, [x29, #-4] +; CHECK-FAST-NEXT: and x8, x8, #0xffffffff +; CHECK-FAST-NEXT: cmp x8, x9 +; CHECK-FAST-NEXT: b.ne LBB44_2 +; CHECK-FAST-NEXT: ; %bb.1: ; %SP_return +; CHECK-FAST-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-FAST-NEXT: add sp, sp, #64 +; CHECK-FAST-NEXT: ret +; CHECK-FAST-NEXT: LBB44_2: ; %CallStackCheckFailBlk +; CHECK-FAST-NEXT: bl ___stack_chk_fail +; CHECK-FAST-NEXT: .loh AdrpLdrGotLdr Lloh33, Lloh34, Lloh35 +; CHECK-FAST-NEXT: .loh AdrpLdrGotLdr Lloh30, Lloh31, Lloh32 + + + %arr = alloca [8 x i32] call void @callee(ptr %arr) ret void @@ -556,9 +947,62 @@ @_ZTI8Whatever = external global i8 define void @test_landingpad_marshalling() personality ptr @__gxx_personality_v0 { ; CHECK-LABEL: test_landingpad_marshalling: -; CHECK-OPT: mov x2, x1 -; CHECK-OPT: mov x1, x0 -; CHECK: bl _eat_landingpad_args +; CHECK: Lfunc_begin0: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 155, ___gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 16, Lexception0 +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: Ltmp3: +; CHECK-NEXT: bl _callee +; CHECK-NEXT: Ltmp4: +; CHECK-NEXT: ; %bb.1: ; %done +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: LBB45_2: ; %lpad +; CHECK-NEXT: Ltmp5: +; CHECK-NEXT: mov x2, x1 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: ; kill: def $w2 killed $w2 killed $x2 +; CHECK-NEXT: bl _eat_landingpad_args +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: Lfunc_end0: +; CHECK-NEXT: .cfi_endproc +; CHECK-NEXT: .section __TEXT,__gcc_except_tab +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: GCC_except_table45: +; CHECK-NEXT: Lexception0: +; CHECK-NEXT: .byte 255 ; @LPStart Encoding = omit +; CHECK-NEXT: .byte 155 ; @TType Encoding = indirect pcrel sdata4 +; CHECK-NEXT: .uleb128 Lttbase0-Lttbaseref0 +; CHECK-NEXT: Lttbaseref0: +; CHECK-NEXT: .byte 1 ; Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 Lcst_end0-Lcst_begin0 +; CHECK-NEXT: Lcst_begin0: +; CHECK-NEXT: .uleb128 Ltmp3-Lfunc_begin0 ; >> Call Site 1 << +; CHECK-NEXT: .uleb128 Ltmp4-Ltmp3 ; Call between Ltmp3 and Ltmp4 +; CHECK-NEXT: .uleb128 Ltmp5-Lfunc_begin0 ; jumps to Ltmp5 +; CHECK-NEXT: .byte 1 ; On action: 1 +; CHECK-NEXT: .uleb128 Ltmp4-Lfunc_begin0 ; >> Call Site 2 << +; CHECK-NEXT: .uleb128 Lfunc_end0-Ltmp4 ; Call between Ltmp4 and Lfunc_end0 +; CHECK-NEXT: .byte 0 ; has no landing pad +; CHECK-NEXT: .byte 0 ; On action: cleanup +; CHECK-NEXT: Lcst_end0: +; CHECK-NEXT: .byte 1 ; >> Action Record 1 << +; CHECK-NEXT: ; Catch TypeInfo 1 +; CHECK-NEXT: .byte 0 ; No further actions +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: ; >> Catch TypeInfos << +; CHECK-NEXT: Ltmp10: ; TypeInfo 1 +; CHECK-NEXT: .long __ZTI8Whatever@GOT-Ltmp10 +; CHECK-NEXT: Lttbase0: +; CHECK-NEXT: .p2align 2, 0x0 invoke void @callee(ptr undef) to label %done unwind label %lpad lpad: ; preds = %entry @@ -575,10 +1019,19 @@ define void @test_dynamic_stackalloc() { ; CHECK-LABEL: test_dynamic_stackalloc: -; CHECK: sub [[REG:x[0-9]+]], sp, #32 -; CHECK: mov sp, [[REG]] -; CHECK-OPT-NOT: ubfx -; CHECK: bl _callee +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x0, sp, #32 +; CHECK-NEXT: mov sp, x0 +; CHECK-NEXT: bl _callee +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret br label %next next: @@ -589,8 +1042,12 @@ define void @test_asm_memory(ptr %base.addr) { ; CHECK-LABEL: test_asm_memory: -; CHECK: add w[[ADDR:[0-9]+]], w0, #4 -; CHECK: str wzr, [x[[ADDR]] +; CHECK: ; %bb.0: +; CHECK-NEXT: add w8, w0, #4 +; CHECK-NEXT: ; InlineAsm Start +; CHECK-NEXT: str wzr, [x8] +; CHECK-NEXT: ; InlineAsm End +; CHECK-NEXT: ret %addr = getelementptr i32, ptr %base.addr, i32 1 call void asm sideeffect "str wzr, $0", "*m"(ptr elementtype(i32) %addr) ret void @@ -598,8 +1055,12 @@ define void @test_unsafe_asm_memory(i64 %val) { ; CHECK-LABEL: test_unsafe_asm_memory: -; CHECK: and x[[ADDR:[0-9]+]], x0, #0xffffffff -; CHECK: str wzr, [x[[ADDR]]] +; CHECK: ; %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffff +; CHECK-NEXT: ; InlineAsm Start +; CHECK-NEXT: str wzr, [x8] +; CHECK-NEXT: ; InlineAsm End +; CHECK-NEXT: ret %addr_int = trunc i64 %val to i32 %addr = inttoptr i32 %addr_int to ptr call void asm sideeffect "str wzr, $0", "*m"(ptr elementtype(i32) %addr) @@ -608,14 +1069,22 @@ define [9 x ptr] @test_demoted_return(ptr %in) { ; CHECK-LABEL: test_demoted_return: -; CHECK: str w0, [x8, #32] +; CHECK: ; %bb.0: +; CHECK-NEXT: stp w8, w0, [x8, #28] +; CHECK-NEXT: stp w8, w8, [x8, #20] +; CHECK-NEXT: stp w8, w8, [x8, #12] +; CHECK-NEXT: stp w8, w8, [x8, #4] +; CHECK-NEXT: str w8, [x8] +; CHECK-NEXT: ret %res = insertvalue [9 x ptr] undef, ptr %in, 8 ret [9 x ptr] %res } define ptr @test_inttoptr(i64 %in) { ; CHECK-LABEL: test_inttoptr: -; CHECK: and x0, x0, #0xffffffff +; CHECK: ; %bb.0: +; CHECK-NEXT: and x0, x0, #0xffffffff +; CHECK-NEXT: ret %res = inttoptr i64 %in to ptr ret ptr %res } @@ -623,16 +1092,18 @@ declare i32 @llvm.get.dynamic.area.offset.i32() define i32 @test_dynamic_area() { ; CHECK-LABEL: test_dynamic_area: -; CHECK: mov w0, wzr +; CHECK: ; %bb.0: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret %res = call i32 @llvm.get.dynamic.area.offset.i32() ret i32 %res } define void @test_pointer_vec_store(ptr %addr) { ; CHECK-LABEL: test_pointer_vec_store: -; CHECK: str xzr, [x0] -; CHECK-NOT: str -; CHECK-NOT: stp +; CHECK: ; %bb.0: +; CHECK-NEXT: str xzr, [x0] +; CHECK-NEXT: ret store <2 x ptr> zeroinitializer, ptr %addr, align 16 ret void @@ -640,28 +1111,58 @@ define <2 x ptr> @test_pointer_vec_load(ptr %addr) { ; CHECK-LABEL: test_pointer_vec_load: -; CHECK: ldr d[[TMP:[0-9]+]], [x0] -; CHECK: ushll.2d v0, v[[TMP]], #0 +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ushll.2d v0, v0, #0 +; CHECK-NEXT: ret %val = load <2 x ptr>, ptr %addr, align 16 ret <2 x ptr> %val } define void @test_inline_asm_mem_pointer(ptr %in) { ; CHECK-LABEL: test_inline_asm_mem_pointer: -; CHECK: str w0, +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: add x8, sp, #12 +; CHECK-NEXT: str w0, [sp, #12] +; CHECK-NEXT: ; InlineAsm Start +; CHECK-NEXT: ldr x0, [x8] +; CHECK-NEXT: ; InlineAsm End +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret tail call void asm sideeffect "ldr x0, $0", "rm"(ptr %in) ret void } define void @test_struct_hi(i32 %hi) nounwind { -; CHECK-LABEL: test_struct_hi: -; CHECK: mov w[[IN:[0-9]+]], w0 -; CHECK: bl _get_int -; 
CHECK-FAST-NEXT: mov w[[DST:[0-9]+]], w0 -; CHECK-FAST-NEXT: orr x0, x[[DST]], x[[IN]], lsl #32 -; CHECK-OPT-NEXT: bfi x0, x[[IN]], #32, #32 -; CHECK-NEXT: bl _take_pair +; CHECK-OPT-LABEL: test_struct_hi: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; CHECK-OPT-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-OPT-NEXT: add x29, sp, #16 +; CHECK-OPT-NEXT: mov w19, w0 +; CHECK-OPT-NEXT: bl _get_int +; CHECK-OPT-NEXT: bfi x0, x19, #32, #32 +; CHECK-OPT-NEXT: bl _take_pair +; CHECK-OPT-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-OPT-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_struct_hi: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; CHECK-FAST-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-FAST-NEXT: add x29, sp, #16 +; CHECK-FAST-NEXT: mov w19, w0 +; CHECK-FAST-NEXT: bl _get_int +; CHECK-FAST-NEXT: mov w8, w0 +; CHECK-FAST-NEXT: orr x0, x8, x19, lsl #32 +; CHECK-FAST-NEXT: bl _take_pair +; CHECK-FAST-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-FAST-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; CHECK-FAST-NEXT: ret %val.64 = call i64 @get_int() %val.32 = trunc i64 %val.64 to i32 @@ -675,16 +1176,55 @@ declare i64 @get_int() define i1 @test_icmp_ptr(ptr %in) { -; CHECK-LABEL: test_icmp_ptr -; CHECK: ubfx x0, x0, #31, #1 +; CHECK-LABEL: test_icmp_ptr: +; CHECK: ; %bb.0: +; CHECK-NEXT: ubfx x0, x0, #31, #1 +; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret %res = icmp slt ptr %in, null ret i1 %res } define void @test_multiple_icmp_ptr(ptr %l, ptr %r) { -; CHECK-LABEL: test_multiple_icmp_ptr: -; CHECK: tbnz w0, #31, [[FALSEBB:LBB[0-9]+_[0-9]+]] -; CHECK: tbnz w1, #31, [[FALSEBB]] +; CHECK-OPT-LABEL: test_multiple_icmp_ptr: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: tbnz w0, #31, LBB57_3 +; CHECK-OPT-NEXT: ; %bb.1: +; CHECK-OPT-NEXT: tbnz w1, #31, LBB57_3 +; CHECK-OPT-NEXT: ; %bb.2: ; %true +; CHECK-OPT-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OPT-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OPT-NEXT: mov x29, sp +; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16 +; CHECK-OPT-NEXT: .cfi_offset w30, -8 +; CHECK-OPT-NEXT: .cfi_offset w29, -16 +; CHECK-OPT-NEXT: bl _bar +; CHECK-OPT-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OPT-NEXT: LBB57_3: ; %false +; CHECK-OPT-NEXT: .cfi_def_cfa wsp, 0 +; CHECK-OPT-NEXT: .cfi_same_value w30 +; CHECK-OPT-NEXT: .cfi_same_value w29 +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_multiple_icmp_ptr: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: tbnz w0, #31, LBB57_3 +; CHECK-FAST-NEXT: ; %bb.1: ; %.cond.split +; CHECK-FAST-NEXT: tbnz w1, #31, LBB57_3 +; CHECK-FAST-NEXT: ; %bb.2: ; %true +; CHECK-FAST-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-FAST-NEXT: .cfi_def_cfa_offset 16 +; CHECK-FAST-NEXT: mov x29, sp +; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16 +; CHECK-FAST-NEXT: .cfi_offset w30, -8 +; CHECK-FAST-NEXT: .cfi_offset w29, -16 +; CHECK-FAST-NEXT: bl _bar +; CHECK-FAST-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-FAST-NEXT: LBB57_3: ; %false +; CHECK-FAST-NEXT: .cfi_def_cfa wsp, 0 +; CHECK-FAST-NEXT: .cfi_same_value w30 +; CHECK-FAST-NEXT: .cfi_same_value w29 +; CHECK-FAST-NEXT: ret %tst1 = icmp sgt ptr %l, inttoptr (i32 -1 to ptr) %tst2 = icmp sgt ptr %r, inttoptr (i32 -1 to ptr) %tst = and i1 %tst1, %tst2 @@ -699,9 +1239,45 @@ } define void @test_multiple_icmp_ptr_select(ptr %l, ptr %r) { -; CHECK-LABEL: test_multiple_icmp_ptr_select: -; CHECK: tbnz w0, #31, [[FALSEBB:LBB[0-9]+_[0-9]+]] -; CHECK: tbnz w1, #31, [[FALSEBB]] +; CHECK-OPT-LABEL: test_multiple_icmp_ptr_select: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: tbnz w0, #31, LBB58_3 +; CHECK-OPT-NEXT: ; %bb.1: +; CHECK-OPT-NEXT: tbnz w1, #31, LBB58_3 +; CHECK-OPT-NEXT: ; %bb.2: ; %true +; CHECK-OPT-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; CHECK-OPT-NEXT: .cfi_def_cfa_offset 16 +; CHECK-OPT-NEXT: mov x29, sp +; CHECK-OPT-NEXT: .cfi_def_cfa w29, 16 +; CHECK-OPT-NEXT: .cfi_offset w30, -8 +; CHECK-OPT-NEXT: .cfi_offset w29, -16 +; CHECK-OPT-NEXT: bl _bar +; CHECK-OPT-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-OPT-NEXT: LBB58_3: ; %false +; CHECK-OPT-NEXT: .cfi_def_cfa wsp, 0 +; CHECK-OPT-NEXT: .cfi_same_value w30 +; CHECK-OPT-NEXT: .cfi_same_value w29 +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_multiple_icmp_ptr_select: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: tbnz w0, #31, LBB58_3 +; CHECK-FAST-NEXT: ; %bb.1: ; %.cond.split +; CHECK-FAST-NEXT: tbnz w1, #31, LBB58_3 +; CHECK-FAST-NEXT: ; %bb.2: ; %true +; CHECK-FAST-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-FAST-NEXT: .cfi_def_cfa_offset 16 +; CHECK-FAST-NEXT: mov x29, sp +; CHECK-FAST-NEXT: .cfi_def_cfa w29, 16 +; CHECK-FAST-NEXT: .cfi_offset w30, -8 +; CHECK-FAST-NEXT: .cfi_offset w29, -16 +; CHECK-FAST-NEXT: bl _bar +; CHECK-FAST-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-FAST-NEXT: LBB58_3: ; %false +; CHECK-FAST-NEXT: .cfi_def_cfa wsp, 0 +; CHECK-FAST-NEXT: .cfi_same_value w30 +; CHECK-FAST-NEXT: .cfi_same_value w29 +; CHECK-FAST-NEXT: ret %tst1 = icmp sgt ptr %l, inttoptr (i32 -1 to ptr) %tst2 = icmp sgt ptr %r, inttoptr (i32 -1 to ptr) %tst = select i1 %tst1, i1 %tst2, i1 false @@ -716,25 +1292,31 @@ } define ptr @test_gep_nonpow2(ptr %a0, i32 %a1) { -; CHECK-LABEL: test_gep_nonpow2: -; CHECK-OPT: mov w[[SIZE:[0-9]+]], #18 -; CHECK-OPT-NEXT: smaddl x0, w1, w[[SIZE]], x0 -; CHECK-OPT-NEXT: ret - -; CHECK-FAST: mov w[[SIZE:[0-9]+]], #18 -; CHECK-FAST-NEXT: smaddl [[TMP:x[0-9]+]], w1, w[[SIZE]], x0 -; CHECK-FAST-NEXT: and x0, [[TMP]], #0xffffffff -; CHECK-FAST-NEXT: ret +; CHECK-OPT-LABEL: test_gep_nonpow2: +; CHECK-OPT: ; %bb.0: +; CHECK-OPT-NEXT: mov w8, #18 ; =0x12 +; CHECK-OPT-NEXT: smaddl x0, w1, w8, x0 +; CHECK-OPT-NEXT: ret +; +; CHECK-FAST-LABEL: test_gep_nonpow2: +; CHECK-FAST: ; %bb.0: +; CHECK-FAST-NEXT: mov w8, #18 ; =0x12 +; CHECK-FAST-NEXT: smaddl x8, w1, w8, x0 +; CHECK-FAST-NEXT: and x0, x8, #0xffffffff +; CHECK-FAST-NEXT: ret + %tmp0 = getelementptr inbounds { [18 x i8] }, ptr %a0, i32 %a1 ret ptr %tmp0 } define void @test_memset(i64 %in, i8 %value) { ; CHECK-LABEL: test_memset: -; CHECK-DAG: and x8, x0, #0xffffffff -; CHECK-DAG: lsr x2, x0, #32 -; CHECK-DAG: mov x0, x8 -; CHECK: b _memset +; CHECK: ; %bb.0: +; CHECK-NEXT: and x8, x0, #0xffffffff +; CHECK-NEXT: lsr x2, x0, #32 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ; kill: def $w2 killed $w2 killed $x2 +; CHECK-NEXT: b _memset %ptr.i32 = trunc i64 %in to i32 %size.64 = lshr i64 %in, 32 @@ -746,9 +1328,11 @@ define void @test_bzero(i64 %in) { ; CHECK-LABEL: test_bzero: -; CHECK-DAG: lsr x1, x0, #32 -; CHECK-DAG: and x0, x0, #0xffffffff -; CHECK: b _bzero +; CHECK: ; %bb.0: +; CHECK-NEXT: lsr x1, x0, #32 +; CHECK-NEXT: and x0, x0, #0xffffffff +; CHECK-NEXT: ; kill: def $w1 killed $w1 killed $x1 +; CHECK-NEXT: b _bzero %ptr.i32 = trunc i64 %in to i32 %size.64 = lshr i64 %in, 32 diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll --- a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll @@ -6,8 +6,8 @@ ; CHECK-LABEL: varargs_callee: ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: stp x1, x2, [x4, #-24]! -; CHECK-NEXT: str x3, [x4, #16] +; CHECK-NEXT: str x1, [x4, #-24]! 
+; CHECK-NEXT: stp x2, x3, [x4, #8] ; CHECK-NEXT: str x4, [sp, #8] ; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret @@ -35,12 +35,12 @@ ; CHECK-NEXT: sub sp, sp, #48 ; CHECK-NEXT: mov x4, sp ; CHECK-NEXT: add x8, sp, #16 -; CHECK-NEXT: mov x9, #4617315517961601024 -; CHECK-NEXT: mov x0, #4607182418800017408 -; CHECK-NEXT: mov w1, #2 -; CHECK-NEXT: mov x2, #4613937818241073152 -; CHECK-NEXT: mov w3, #4 -; CHECK-NEXT: mov w5, #16 +; CHECK-NEXT: mov x9, #4617315517961601024 // =0x4014000000000000 +; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000 +; CHECK-NEXT: mov w1, #2 // =0x2 +; CHECK-NEXT: mov x2, #4613937818241073152 // =0x4008000000000000 +; CHECK-NEXT: mov w3, #4 // =0x4 +; CHECK-NEXT: mov w5, #16 // =0x10 ; CHECK-NEXT: stp xzr, x30, [sp, #24] // 8-byte Folded Spill ; CHECK-NEXT: stp x8, xzr, [sp, #8] ; CHECK-NEXT: str x9, [sp] @@ -71,13 +71,13 @@ ; CHECK-NEXT: sub sp, sp, #64 ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mov x4, sp -; CHECK-NEXT: mov x8, #4618441417868443648 +; CHECK-NEXT: mov x8, #4618441417868443648 // =0x4018000000000000 ; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: add x3, sp, #32 -; CHECK-NEXT: mov x0, #4607182418800017408 -; CHECK-NEXT: mov x1, #4611686018427387904 -; CHECK-NEXT: mov x2, #4613937818241073152 -; CHECK-NEXT: mov w5, #16 +; CHECK-NEXT: mov x0, #4607182418800017408 // =0x3ff0000000000000 +; CHECK-NEXT: mov x1, #4611686018427387904 // =0x4000000000000000 +; CHECK-NEXT: mov x2, #4613937818241073152 // =0x4008000000000000 +; CHECK-NEXT: mov w5, #16 // =0x10 ; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill ; CHECK-NEXT: stp q0, q0, [sp, #16] ; CHECK-NEXT: stp x9, x8, [sp] diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll --- a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll @@ -45,7 +45,7 @@ ; ; LSE-LABEL: test_rmw_add_8: ; LSE: // %bb.0: // %entry -; LSE-NEXT: mov w8, #1 +; LSE-NEXT: mov w8, #1 // =0x1 ; LSE-NEXT: ldaddalb w8, w0, [x0] ; LSE-NEXT: ret entry: @@ -94,7 +94,7 @@ ; ; LSE-LABEL: test_rmw_add_16: ; LSE: // %bb.0: // %entry -; LSE-NEXT: mov w8, #1 +; LSE-NEXT: mov w8, #1 // =0x1 ; LSE-NEXT: ldaddalh w8, w0, [x0] ; LSE-NEXT: ret entry: @@ -143,7 +143,7 @@ ; ; LSE-LABEL: test_rmw_add_32: ; LSE: // %bb.0: // %entry -; LSE-NEXT: mov w8, #1 +; LSE-NEXT: mov w8, #1 // =0x1 ; LSE-NEXT: ldaddal w8, w0, [x0] ; LSE-NEXT: ret entry: @@ -192,7 +192,7 @@ ; ; LSE-LABEL: test_rmw_add_64: ; LSE: // %bb.0: // %entry -; LSE-NEXT: mov w8, #1 +; LSE-NEXT: mov w8, #1 // =0x1 ; LSE-NEXT: // kill: def $x8 killed $w8 ; LSE-NEXT: ldaddal x8, x0, [x0] ; LSE-NEXT: ret @@ -215,37 +215,35 @@ ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Loop Header: Depth=1 ; NOLSE-NEXT: // Child Loop BB4_2 Depth 2 -; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload -; NOLSE-NEXT: adds x14, x11, #1 -; NOLSE-NEXT: cinc x15, x13, hs +; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload +; NOLSE-NEXT: adds x14, x10, #1 +; NOLSE-NEXT: cinc x15, x11, hs ; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start ; NOLSE-NEXT: // Parent Loop BB4_1 Depth=1 ; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x10, x12, [x9] -; NOLSE-NEXT: cmp x10, x11 -; NOLSE-NEXT: cset w8, ne -; NOLSE-NEXT: cmp x12, x13 -; 
NOLSE-NEXT: cinc w8, w8, ne -; NOLSE-NEXT: cbnz w8, .LBB4_4 +; NOLSE-NEXT: ldaxp x9, x8, [x13] +; NOLSE-NEXT: cmp x9, x10 +; NOLSE-NEXT: cset w12, ne +; NOLSE-NEXT: cmp x8, x11 +; NOLSE-NEXT: cinc w12, w12, ne +; NOLSE-NEXT: cbnz w12, .LBB4_4 ; NOLSE-NEXT: // %bb.3: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 -; NOLSE-NEXT: stlxp w8, x14, x15, [x9] -; NOLSE-NEXT: cbnz w8, .LBB4_2 +; NOLSE-NEXT: stlxp w12, x14, x15, [x13] +; NOLSE-NEXT: cbnz w12, .LBB4_2 ; NOLSE-NEXT: b .LBB4_5 ; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 -; NOLSE-NEXT: stlxp w8, x10, x12, [x9] -; NOLSE-NEXT: cbnz w8, .LBB4_2 +; NOLSE-NEXT: stlxp w12, x9, x8, [x13] +; NOLSE-NEXT: cbnz w12, .LBB4_2 ; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1 -; NOLSE-NEXT: mov x8, x12 +; NOLSE-NEXT: subs x11, x8, x11 +; NOLSE-NEXT: ccmp x9, x10, #0, eq ; NOLSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill -; NOLSE-NEXT: mov x9, x10 ; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill -; NOLSE-NEXT: subs x12, x12, x13 -; NOLSE-NEXT: ccmp x10, x11, #0, eq ; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill ; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill ; NOLSE-NEXT: b.ne .LBB4_1 @@ -605,41 +603,39 @@ ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Loop Header: Depth=1 ; NOLSE-NEXT: // Child Loop BB9_2 Depth 2 -; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload -; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload -; NOLSE-NEXT: mov w8, w11 -; NOLSE-NEXT: mvn w10, w8 -; NOLSE-NEXT: // implicit-def: $x8 +; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload +; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload ; NOLSE-NEXT: mov w8, w10 +; NOLSE-NEXT: mvn w9, w8 +; NOLSE-NEXT: // implicit-def: $x8 +; NOLSE-NEXT: mov w8, w9 ; NOLSE-NEXT: orr x14, x8, #0xfffffffffffffffe -; NOLSE-NEXT: mov x15, #-1 +; NOLSE-NEXT: mov x15, #-1 // =0xffffffffffffffff ; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start ; NOLSE-NEXT: // Parent Loop BB9_1 Depth=1 ; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x10, x12, [x9] -; NOLSE-NEXT: cmp x10, x11 -; NOLSE-NEXT: cset w8, ne -; NOLSE-NEXT: cmp x12, x13 -; NOLSE-NEXT: cinc w8, w8, ne -; NOLSE-NEXT: cbnz w8, .LBB9_4 +; NOLSE-NEXT: ldaxp x9, x8, [x13] +; NOLSE-NEXT: cmp x9, x10 +; NOLSE-NEXT: cset w12, ne +; NOLSE-NEXT: cmp x8, x11 +; NOLSE-NEXT: cinc w12, w12, ne +; NOLSE-NEXT: cbnz w12, .LBB9_4 ; NOLSE-NEXT: // %bb.3: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2 -; NOLSE-NEXT: stlxp w8, x14, x15, [x9] -; NOLSE-NEXT: cbnz w8, .LBB9_2 +; NOLSE-NEXT: stlxp w12, x14, x15, [x13] +; NOLSE-NEXT: cbnz w12, .LBB9_2 ; NOLSE-NEXT: b .LBB9_5 ; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2 -; NOLSE-NEXT: stlxp w8, x10, x12, [x9] -; NOLSE-NEXT: cbnz w8, .LBB9_2 +; NOLSE-NEXT: stlxp w12, x9, x8, [x13] +; NOLSE-NEXT: cbnz w12, .LBB9_2 ; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1 -; NOLSE-NEXT: mov x8, x12 +; NOLSE-NEXT: subs x11, x8, x11 +; NOLSE-NEXT: ccmp x9, x10, #0, eq ; NOLSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill -; NOLSE-NEXT: mov x9, x10 ; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill -; NOLSE-NEXT: subs x12, x12, x13 -; NOLSE-NEXT: ccmp x10, x11, #0, eq ; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill ; 
NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill ; NOLSE-NEXT: b.ne .LBB9_1 @@ -672,7 +668,7 @@ ; LSE-NEXT: // implicit-def: $x9 ; LSE-NEXT: mov w9, w12 ; LSE-NEXT: orr x2, x9, #0xfffffffffffffffe -; LSE-NEXT: mov x9, #-1 +; LSE-NEXT: mov x9, #-1 // =0xffffffffffffffff ; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3 ; LSE-NEXT: mov x3, x9 ; LSE-NEXT: caspal x0, x1, x2, x3, [x8] diff --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll --- a/llvm/test/CodeGen/AArch64/bcmp.ll +++ b/llvm/test/CodeGen/AArch64/bcmp.ll @@ -6,7 +6,7 @@ define i1 @bcmp0(ptr %a, ptr %b) { ; CHECK-LABEL: bcmp0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret %cr = call i32 @bcmp(ptr %a, ptr %b, i64 0) %r = icmp eq i32 %cr, 0 @@ -418,7 +418,7 @@ ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov w2, #89 +; CHECK-NEXT: mov w2, #89 // =0x59 ; CHECK-NEXT: bl bcmp ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cset w0, eq @@ -432,10 +432,11 @@ define i1 @bcmp_zext(i32 %0, i32 %1, i8 %2, i8 %3) { ; CHECK-LABEL: bcmp_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w2, #0xff -; CHECK-NEXT: and w9, w3, #0xff -; CHECK-NEXT: cmp w1, w0 -; CHECK-NEXT: ccmp w9, w8, #0, eq +; CHECK-NEXT: eor w8, w3, w2 +; CHECK-NEXT: eor w9, w1, w0 +; CHECK-NEXT: and w8, w8, #0xff +; CHECK-NEXT: orr w8, w9, w8 +; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %5 = xor i32 %1, %0 @@ -449,14 +450,12 @@ define i1 @bcmp_i8(i8 %a0, i8 %b0, i8 %a1, i8 %b1, i8 %a2, i8 %b2) { ; CHECK-LABEL: bcmp_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: and w9, w1, #0xff -; CHECK-NEXT: and w8, w2, #0xff -; CHECK-NEXT: and w10, w3, #0xff -; CHECK-NEXT: cmp w9, w0, uxtb -; CHECK-NEXT: ccmp w10, w8, #0, eq -; CHECK-NEXT: and w8, w4, #0xff -; CHECK-NEXT: and w9, w5, #0xff -; CHECK-NEXT: ccmp w9, w8, #0, eq +; CHECK-NEXT: eor w8, w1, w0 +; CHECK-NEXT: eor w9, w3, w2 +; CHECK-NEXT: eor w10, w5, w4 +; CHECK-NEXT: orr w8, w8, w9 +; CHECK-NEXT: orr w8, w8, w10 +; CHECK-NEXT: tst w8, #0xff ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %xor0 = xor i8 %b0, %a0 @@ -471,14 +470,12 @@ define i1 @bcmp_i16(i16 %a0, i16 %b0, i16 %a1, i16 %b1, i16 %a2, i16 %b2) { ; CHECK-LABEL: bcmp_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: and w9, w1, #0xffff -; CHECK-NEXT: and w8, w2, #0xffff -; CHECK-NEXT: and w10, w3, #0xffff -; CHECK-NEXT: cmp w9, w0, uxth -; CHECK-NEXT: ccmp w10, w8, #0, eq -; CHECK-NEXT: and w8, w4, #0xffff -; CHECK-NEXT: and w9, w5, #0xffff -; CHECK-NEXT: ccmp w9, w8, #0, eq +; CHECK-NEXT: eor w8, w1, w0 +; CHECK-NEXT: eor w9, w3, w2 +; CHECK-NEXT: eor w10, w5, w4 +; CHECK-NEXT: orr w8, w8, w9 +; CHECK-NEXT: orr w8, w8, w10 +; CHECK-NEXT: tst w8, #0xffff ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %xor0 = xor i16 %b0, %a0 @@ -496,13 +493,14 @@ ; CHECK-NEXT: cmp x2, x0 ; CHECK-NEXT: ccmp x3, x1, #0, eq ; CHECK-NEXT: ldp x9, x8, [sp] -; CHECK-NEXT: ccmp x6, x4, #0, eq -; CHECK-NEXT: ldp x10, x11, [sp, #16] +; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: cmp x6, x4 +; CHECK-NEXT: ldp x11, x12, [sp, #16] ; CHECK-NEXT: ccmp x7, x5, #0, eq -; CHECK-NEXT: cset w12, ne -; CHECK-NEXT: cmp x10, x9 -; CHECK-NEXT: ccmp x11, x8, #0, eq -; CHECK-NEXT: csinc w0, w12, wzr, eq +; CHECK-NEXT: csinc w10, w10, wzr, eq +; CHECK-NEXT: cmp x11, x9 +; CHECK-NEXT: ccmp x12, x8, #0, eq +; CHECK-NEXT: csinc w0, w10, wzr, eq ; CHECK-NEXT: ret %xor0 = xor i128 %b0, %a0 %xor1 = xor i128 %b1, %a1 @@ -516,15 +514,12 @@ define i1 
@bcmp_i42(i42 %a0, i42 %b0, i42 %a1, i42 %b1, i42 %a2, i42 %b2) { ; CHECK-LABEL: bcmp_i42: ; CHECK: // %bb.0: -; CHECK-NEXT: and x9, x0, #0x3ffffffffff -; CHECK-NEXT: and x10, x1, #0x3ffffffffff -; CHECK-NEXT: and x8, x2, #0x3ffffffffff -; CHECK-NEXT: and x11, x3, #0x3ffffffffff -; CHECK-NEXT: cmp x10, x9 -; CHECK-NEXT: and x9, x5, #0x3ffffffffff -; CHECK-NEXT: ccmp x11, x8, #0, eq -; CHECK-NEXT: and x8, x4, #0x3ffffffffff -; CHECK-NEXT: ccmp x9, x8, #0, eq +; CHECK-NEXT: eor x8, x1, x0 +; CHECK-NEXT: eor x9, x3, x2 +; CHECK-NEXT: eor x10, x5, x4 +; CHECK-NEXT: orr x8, x8, x9 +; CHECK-NEXT: orr x8, x8, x10 +; CHECK-NEXT: tst x8, #0x3ffffffffff ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %xor0 = xor i42 %b0, %a0 diff --git a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll --- a/llvm/test/CodeGen/AArch64/bfis-in-loop.ll +++ b/llvm/test/CodeGen/AArch64/bfis-in-loop.ll @@ -22,7 +22,7 @@ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrh w10, [x9, #72] ; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: ubfx x11, x10, #8, #24 +; CHECK-NEXT: lsr w11, w10, #8 ; CHECK-NEXT: cset w12, ne ; CHECK-NEXT: csel w8, w8, w11, eq ; CHECK-NEXT: ldr x11, [x9, #8] @@ -90,7 +90,7 @@ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrh w10, [x9, #72] ; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: ubfx x11, x10, #8, #24 +; CHECK-NEXT: lsr w11, w10, #8 ; CHECK-NEXT: cset w12, ne ; CHECK-NEXT: csel w8, w8, w11, eq ; CHECK-NEXT: ldr x11, [x9, #8] diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll --- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll +++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll @@ -267,9 +267,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xff ; CHECK-NEXT: lsl w8, w8, #8 -; CHECK-NEXT: mov w9, w8 -; CHECK-NEXT: bfxil w9, w0, #0, #8 -; CHECK-NEXT: orr w0, w8, w9, lsl #16 +; CHECK-NEXT: orr w8, w8, w0, lsl #16 +; CHECK-NEXT: bfxil w8, w0, #0, #8 +; CHECK-NEXT: lsl w0, w8, #8 ; CHECK-NEXT: ret %conv = zext i8 %a to i32 ; 0 0 0 A %shl = shl i32 %b, 8 ; B2 B1 B0 0 diff --git a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll --- a/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-to-extract-subvec-crash.ll @@ -9,16 +9,18 @@ ; CHECK: // %bb.0: // %bb ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: bfi x10, x0, #1, #3 +; CHECK-NEXT: umov w10, v0.h[0] ; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: dup v0.4h, v0.h[0] ; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: dup v1.8h, w9 -; CHECK-NEXT: str q0, [sp] -; CHECK-NEXT: ld1 { v1.h }[1], [x10] -; CHECK-NEXT: str q1, [x8] +; CHECK-NEXT: bfi x9, x8, #1, #3 +; CHECK-NEXT: dup v2.4h, w10 +; CHECK-NEXT: str q1, [sp] +; CHECK-NEXT: ld1 { v2.h }[1], [x9] +; CHECK-NEXT: str d0, [x8, #8] +; CHECK-NEXT: str d2, [x8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret bb: diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll --- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll @@ -22,11 +22,11 @@ define <16 x i8> @test2(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: 
test2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ld1r { v1.8b }, [x1] -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: dup v0.8b, w8 -; CHECK-NEXT: mov v1.b[7], w8 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: ld1r { v1.16b }, [x1] +; CHECK-NEXT: ld1r { v0.16b }, [x0] +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b ; CHECK-NEXT: ret entry: %0 = load i8, ptr %a, align 1 @@ -42,9 +42,9 @@ define <16 x i8> @test3(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ld1r { v0.8b }, [x0] -; CHECK-NEXT: ld1r { v1.8b }, [x1] -; CHECK-NEXT: zip1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ld1r { v0.16b }, [x1] +; CHECK-NEXT: ld1r { v1.16b }, [x0] +; CHECK-NEXT: zip1 v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret entry: %0 = load i8, ptr %a, align 1 @@ -209,12 +209,12 @@ define <4 x i32> @test12(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test12: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ld1r { v0.2s }, [x0] ; CHECK-NEXT: ldr w8, [x1] -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: mov v1.s[0], w8 +; CHECK-NEXT: ld1r { v0.4s }, [x0] +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s +; CHECK-NEXT: zip1 v0.4s, v1.4s, v0.4s ; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret entry: %0 = load i32, ptr %a, align 1 diff --git a/llvm/test/CodeGen/AArch64/cmp-bool.ll b/llvm/test/CodeGen/AArch64/cmp-bool.ll --- a/llvm/test/CodeGen/AArch64/cmp-bool.ll +++ b/llvm/test/CodeGen/AArch64/cmp-bool.ll @@ -25,8 +25,9 @@ define void @bool_ne(i1 zeroext %a, i1 zeroext %b, ptr nocapture %c) nounwind { ; CHECK-LABEL: bool_ne: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w0, w1 -; CHECK-NEXT: b.eq .LBB1_2 +; CHECK-NEXT: eor w8, w0, w1 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: b.ne .LBB1_2 ; CHECK-NEXT: // %bb.1: // %if.then ; CHECK-NEXT: br x2 ; CHECK-NEXT: .LBB1_2: // %if.end diff --git a/llvm/test/CodeGen/AArch64/cmp-const-max.ll b/llvm/test/CodeGen/AArch64/cmp-const-max.ll --- a/llvm/test/CodeGen/AArch64/cmp-const-max.ll +++ b/llvm/test/CodeGen/AArch64/cmp-const-max.ll @@ -1,11 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -verify-machineinstrs -aarch64-enable-atomic-cfg-tidy=0 < %s -mtriple=aarch64-none-eabihf -fast-isel=false | FileCheck %s define i32 @ule_64_max(i64 %p) { -entry: ; CHECK-LABEL: ule_64_max: -; CHECK: cmn x0, #1 -; CHECK: b.hi [[RET_ZERO:.LBB[0-9]+_[0-9]+]] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cbnz wzr, .LBB0_2 +; CHECK-NEXT: // %bb.1: // %ret_one +; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_2: // %ret_zero +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +entry: %cmp = icmp ule i64 %p, 18446744073709551615 ; 0xffffffffffffffff br i1 %cmp, label %ret_one, label %ret_zero @@ -13,16 +20,21 @@ ret i32 1 ret_zero: -; CHECK: [[RET_ZERO]]: -; CHECK-NEXT: mov w0, wzr ret i32 0 } define i32 @ugt_64_max(i64 %p) { -entry: ; CHECK-LABEL: ugt_64_max: -; CHECK: cmn x0, #1 -; CHECK: b.ls [[RET_ZERO:.LBB[0-9]+_[0-9]+]] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: cbnz w8, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %ret_one +; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_2: // %ret_zero +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +entry: %cmp = icmp ugt i64 %p, 18446744073709551615 ; 0xffffffffffffffff 
br i1 %cmp, label %ret_one, label %ret_zero @@ -30,7 +42,5 @@ ret i32 1 ret_zero: -; CHECK: [[RET_ZERO]]: -; CHECK-NEXT: mov w0, wzr ret i32 0 } diff --git a/llvm/test/CodeGen/AArch64/combine-andintoload.ll b/llvm/test/CodeGen/AArch64/combine-andintoload.ll --- a/llvm/test/CodeGen/AArch64/combine-andintoload.ll +++ b/llvm/test/CodeGen/AArch64/combine-andintoload.ll @@ -232,15 +232,15 @@ ; CHECK-LABEL: load8_and16_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: and w8, w1, w8 -; CHECK-NEXT: and x0, x8, #0xff +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: and x0, x1, x8 ; CHECK-NEXT: ret ; ; CHECKBE-LABEL: load8_and16_zext: ; CHECKBE: // %bb.0: ; CHECKBE-NEXT: ldrb w8, [x0] -; CHECKBE-NEXT: and w8, w1, w8 -; CHECKBE-NEXT: and x0, x8, #0xff +; CHECKBE-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECKBE-NEXT: and x0, x1, x8 ; CHECKBE-NEXT: ret %x = load i8, ptr %p, align 4 %xz = zext i8 %x to i64 @@ -415,10 +415,10 @@ ; CHECK-NEXT: ldrb w8, [x0, x2] ; CHECK-NEXT: and w10, w3, #0x7 ; CHECK-NEXT: ldrb w9, [x1, x2] -; CHECK-NEXT: mov w11, #8 +; CHECK-NEXT: mov w11, #8 // =0x8 ; CHECK-NEXT: sub w10, w11, w10 ; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: mov w9, #5 +; CHECK-NEXT: mov w9, #5 // =0x5 ; CHECK-NEXT: lsr w8, w8, w10 ; CHECK-NEXT: tst w8, w9 ; CHECK-NEXT: cset w0, eq @@ -429,10 +429,10 @@ ; CHECKBE-NEXT: ldrb w8, [x0, x2] ; CHECKBE-NEXT: and w10, w3, #0x7 ; CHECKBE-NEXT: ldrb w9, [x1, x2] -; CHECKBE-NEXT: mov w11, #8 +; CHECKBE-NEXT: mov w11, #8 // =0x8 ; CHECKBE-NEXT: sub w10, w11, w10 ; CHECKBE-NEXT: eor w8, w9, w8 -; CHECKBE-NEXT: mov w9, #5 +; CHECKBE-NEXT: mov w9, #5 // =0x5 ; CHECKBE-NEXT: lsr w8, w8, w10 ; CHECKBE-NEXT: tst w8, w9 ; CHECKBE-NEXT: cset w0, eq diff --git a/llvm/test/CodeGen/AArch64/combine-mul.ll b/llvm/test/CodeGen/AArch64/combine-mul.ll --- a/llvm/test/CodeGen/AArch64/combine-mul.ll +++ b/llvm/test/CodeGen/AArch64/combine-mul.ll @@ -66,7 +66,7 @@ define i8 @one_demanded_bit(i8 %x) { ; CHECK-LABEL: one_demanded_bit: ; CHECK: // %bb.0: -; CHECK-NEXT: lsl w8, w0, #6 +; CHECK-NEXT: neg w8, w0, lsl #6 ; CHECK-NEXT: orr w0, w8, #0xffffffbf ; CHECK-NEXT: ret %m = mul i8 %x, 192 ; 0b1100_0000 @@ -77,7 +77,7 @@ define <2 x i64> @one_demanded_bit_splat(<2 x i64> %x) { ; CHECK-LABEL: one_demanded_bit_splat: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w8, #32 // =0x20 ; CHECK-NEXT: shl v0.2d, v0.2d, #5 ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -131,7 +131,7 @@ define <2 x i64> @squared_demanded_2_low_bits_splat(<2 x i64> %x) { ; CHECK-LABEL: squared_demanded_2_low_bits_splat: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-2 +; CHECK-NEXT: mov x8, #-2 // =0xfffffffffffffffe ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -201,93 +201,85 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) { ; CHECK-LABEL: abp90c12: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr s21, [sp, #32] -; CHECK-NEXT: add x9, sp, #48 -; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: ldr s23, [sp, #40] -; CHECK-NEXT: add x11, sp, #56 +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: 
// kill: def $s4 killed $s4 def $q4 +; CHECK-NEXT: add x10, sp, #48 +; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: ldr s2, [sp] -; CHECK-NEXT: add x10, sp, #16 -; CHECK-NEXT: ld1 { v21.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ld1 { v23.s }[1], [x11] +; CHECK-NEXT: ldr s2, [sp, #32] +; CHECK-NEXT: ldr s23, [sp, #8] +; CHECK-NEXT: add x11, sp, #24 +; CHECK-NEXT: ldr s21, [sp, #40] ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-NEXT: ldr s22, [sp, #96] -; CHECK-NEXT: add x11, sp, #24 -; CHECK-NEXT: ld1 { v2.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #72 -; CHECK-NEXT: mov v1.s[1], v3.s[0] -; CHECK-NEXT: ld1 { v21.s }[2], [x9] -; CHECK-NEXT: ldr s24, [sp, #8] -; CHECK-NEXT: add x9, sp, #112 -; CHECK-NEXT: ld1 { v23.s }[2], [x10] -; CHECK-NEXT: add x10, sp, #80 ; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-NEXT: ldr s18, [sp, #128] ; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 -; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 -; CHECK-NEXT: mov v1.s[2], v5.s[0] +; CHECK-NEXT: ld1 { v2.s }[1], [x10] +; CHECK-NEXT: add x10, sp, #56 +; CHECK-NEXT: mov v0.s[2], v4.s[0] +; CHECK-NEXT: ldr s4, [sp] +; CHECK-NEXT: ldr s18, [sp, #128] +; CHECK-NEXT: ld1 { v23.s }[1], [x11] +; CHECK-NEXT: add x11, sp, #144 +; CHECK-NEXT: ld1 { v4.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #64 +; CHECK-NEXT: ldr s22, [sp, #96] +; CHECK-NEXT: mov v1.s[1], v3.s[0] +; CHECK-NEXT: ld1 { v21.s }[1], [x10] +; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: ld1 { v18.s }[1], [x11] +; CHECK-NEXT: add x11, sp, #72 ; CHECK-NEXT: ldr s20, [sp, #104] -; CHECK-NEXT: ld1 { v24.s }[1], [x11] -; CHECK-NEXT: add x11, sp, #88 +; CHECK-NEXT: add x10, sp, #80 ; CHECK-NEXT: ld1 { v22.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #144 -; CHECK-NEXT: ld1 { v21.s }[3], [x10] -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: mov v0.s[2], v4.s[0] -; CHECK-NEXT: ld1 { v23.s }[3], [x11] -; CHECK-NEXT: ld1 { v18.s }[1], [x9] -; CHECK-NEXT: add x11, sp, #152 -; CHECK-NEXT: ld1 { v20.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: add x9, sp, #160 +; CHECK-NEXT: mov v1.s[2], v5.s[0] +; CHECK-NEXT: ld1 { v21.s }[2], [x11] +; CHECK-NEXT: add x11, sp, #120 +; CHECK-NEXT: ldr s16, [sp, #136] +; CHECK-NEXT: ld1 { v18.s }[2], [x9] +; CHECK-NEXT: add x9, sp, #88 +; CHECK-NEXT: ld1 { v2.s }[3], [x10] +; CHECK-NEXT: add x10, sp, #152 +; CHECK-NEXT: ld1 { v20.s }[1], [x11] +; CHECK-NEXT: add x11, sp, #168 ; CHECK-NEXT: mov v1.s[3], v7.s[0] -; CHECK-NEXT: ldr s17, [sp, #136] +; CHECK-NEXT: ld1 { v21.s }[3], [x9] +; CHECK-NEXT: ld1 { v16.s }[1], [x10] +; CHECK-NEXT: add x10, sp, #176 +; CHECK-NEXT: mov v0.s[3], v6.s[0] +; CHECK-NEXT: ldr s17, [sp, #200] +; CHECK-NEXT: fmul v3.4s, v20.4s, v23.4s ; CHECK-NEXT: ldr s19, [sp, #192] +; CHECK-NEXT: fmul v5.4s, v21.4s, v1.4s ; CHECK-NEXT: add x9, sp, #208 -; CHECK-NEXT: mov v0.s[3], v6.s[0] -; CHECK-NEXT: ld1 { v18.s }[2], [x10] -; CHECK-NEXT: ld1 { v17.s }[1], [x11] -; CHECK-NEXT: add x10, sp, #176 -; CHECK-NEXT: fmul v3.4s, v23.4s, v1.4s -; CHECK-NEXT: ld1 { v19.s }[1], [x9] -; CHECK-NEXT: fmul v4.4s, v20.4s, v24.4s -; CHECK-NEXT: add x9, sp, #168 -; CHECK-NEXT: fmul v1.4s, v21.4s, v1.4s +; CHECK-NEXT: ld1 { v16.s }[2], [x11] +; CHECK-NEXT: add x11, sp, #184 +; CHECK-NEXT: fmul v1.4s, v2.4s, v1.4s ; CHECK-NEXT: ld1 { v18.s }[3], [x10] -; CHECK-NEXT: fmul v5.4s, v22.4s, v24.4s -; CHECK-NEXT: ldr s16, [sp, 
#200] -; CHECK-NEXT: ld1 { v17.s }[2], [x9] -; CHECK-NEXT: add x11, sp, #216 +; CHECK-NEXT: fmul v6.4s, v22.4s, v23.4s +; CHECK-NEXT: add x10, sp, #216 ; CHECK-NEXT: fneg v3.4s, v3.4s -; CHECK-NEXT: add x9, sp, #184 -; CHECK-NEXT: fneg v4.4s, v4.4s -; CHECK-NEXT: fmla v1.4s, v0.4s, v23.4s -; CHECK-NEXT: fmla v5.4s, v2.4s, v20.4s -; CHECK-NEXT: ld1 { v16.s }[1], [x11] -; CHECK-NEXT: ld1 { v17.s }[3], [x9] -; CHECK-NEXT: fmla v3.4s, v0.4s, v21.4s -; CHECK-NEXT: fmla v4.4s, v2.4s, v22.4s +; CHECK-NEXT: ld1 { v19.s }[1], [x9] +; CHECK-NEXT: fneg v5.4s, v5.4s +; CHECK-NEXT: ld1 { v16.s }[3], [x11] +; CHECK-NEXT: fmla v1.4s, v0.4s, v21.4s +; CHECK-NEXT: ld1 { v17.s }[1], [x10] +; CHECK-NEXT: fmla v6.4s, v4.4s, v20.4s +; CHECK-NEXT: fmla v3.4s, v4.4s, v22.4s +; CHECK-NEXT: fmla v5.4s, v0.4s, v2.4s ; CHECK-NEXT: fsub v0.4s, v18.4s, v1.4s -; CHECK-NEXT: fsub v1.4s, v19.4s, v5.4s +; CHECK-NEXT: fsub v1.4s, v19.4s, v6.4s ; CHECK-NEXT: fadd v2.4s, v17.4s, v3.4s -; CHECK-NEXT: fadd v3.4s, v16.4s, v4.4s -; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 -; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 -; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12 -; CHECK-NEXT: zip2 v3.4s, v0.4s, v2.4s -; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 -; CHECK-NEXT: zip1 v0.4s, v0.4s, v2.4s -; CHECK-NEXT: rev64 v4.4s, v4.4s +; CHECK-NEXT: fadd v3.4s, v16.4s, v5.4s +; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: zip2 v2.4s, v0.4s, v3.4s +; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s +; CHECK-NEXT: stp q2, q1, [x8, #16] ; CHECK-NEXT: str q0, [x8] -; CHECK-NEXT: trn2 v4.4s, v4.4s, v5.4s -; CHECK-NEXT: ext v1.16b, v4.16b, v1.16b, #8 -; CHECK-NEXT: mov v3.d[1], v4.d[0] -; CHECK-NEXT: stp q3, q1, [x8, #16] ; CHECK-NEXT: ret entry: %ar = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32> diff --git a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll --- a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll +++ b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll @@ -27,10 +27,7 @@ define i64 @g(ptr %p) { ; CHECK-LABEL: g: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0, #8] -; CHECK-NEXT: add x9, x8, x8 -; CHECK-NEXT: add x8, x9, x8 -; CHECK-NEXT: sub x0, x8, x8 +; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: ret %vec = load <2 x i64>, ptr %p, align 1 %elt = extractelement <2 x i64> %vec, i32 1 diff --git a/llvm/test/CodeGen/AArch64/dag-combine-mul-shl.ll b/llvm/test/CodeGen/AArch64/dag-combine-mul-shl.ll --- a/llvm/test/CodeGen/AArch64/dag-combine-mul-shl.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-mul-shl.ll @@ -17,9 +17,9 @@ define <16 x i8> @fn2_vector(<16 x i8> %arg) { ; CHECK-LABEL: fn2_vector: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov v1.b[1], v0.b[1] +; CHECK-NEXT: shl v0.16b, v1.16b, #7 ; CHECK-NEXT: ret entry: %mul = mul <16 x i8> %arg, @@ -43,9 +43,9 @@ define <16 x i8> @fn2_vector_undef(<16 x i8> %arg) { ; CHECK-LABEL: fn2_vector_undef: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov v1.b[1], v0.b[1] +; CHECK-NEXT: shl v0.16b, v1.16b, #7 ; CHECK-NEXT: ret entry: %mul = mul <16 x i8> %arg, @@ -56,7 +56,7 @@ define i32 @fn1_scalar(i32 %arg) { ; CHECK-LABEL: 
fn1_scalar: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #1664 +; CHECK-NEXT: mov w8, #1664 // =0x680 ; CHECK-NEXT: mul w0, w0, w8 ; CHECK-NEXT: ret entry: @@ -68,7 +68,7 @@ define i32 @fn2_scalar(i32 %arg) { ; CHECK-LABEL: fn2_scalar: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #1664 +; CHECK-NEXT: mov w8, #1664 // =0x680 ; CHECK-NEXT: mul w0, w0, w8 ; CHECK-NEXT: ret entry: @@ -102,7 +102,7 @@ define i32 @fn1_scalar_opaque(i32 %arg) { ; CHECK-LABEL: fn1_scalar_opaque: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #13 +; CHECK-NEXT: mov w8, #13 // =0xd ; CHECK-NEXT: mul w8, w0, w8 ; CHECK-NEXT: lsl w0, w8, #7 ; CHECK-NEXT: ret @@ -116,7 +116,7 @@ define i32 @fn2_scalar_opaque(i32 %arg) { ; CHECK-LABEL: fn2_scalar_opaque: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #13 +; CHECK-NEXT: mov w8, #13 // =0xd ; CHECK-NEXT: mul w8, w0, w8 ; CHECK-NEXT: lsl w0, w8, #7 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/dag-combine-select.ll b/llvm/test/CodeGen/AArch64/dag-combine-select.ll --- a/llvm/test/CodeGen/AArch64/dag-combine-select.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-select.ll @@ -7,20 +7,13 @@ ; Ensure that we transform select(C0, x, select(C1, x, y)) towards ; select(C0 | C1, x, y) so we can use CMP;CCMP for the implementation. define i32 @test0(i32 %v0, i32 %v1, i32 %v2) { -; SDISEL-LABEL: test0: -; SDISEL: // %bb.0: -; SDISEL-NEXT: cmp w0, #7 -; SDISEL-NEXT: ccmp w1, #0, #0, ne -; SDISEL-NEXT: csel w0, w1, w2, gt -; SDISEL-NEXT: ret -; -; GISEL-LABEL: test0: -; GISEL: // %bb.0: -; GISEL-NEXT: cmp w0, #7 -; GISEL-NEXT: csel w8, w1, w2, eq -; GISEL-NEXT: cmp w1, #0 -; GISEL-NEXT: csel w0, w1, w8, gt -; GISEL-NEXT: ret +; CHECK-LABEL: test0: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp w0, #7 +; CHECK-NEXT: csel w8, w1, w2, eq +; CHECK-NEXT: cmp w1, #0 +; CHECK-NEXT: csel w0, w1, w8, gt +; CHECK-NEXT: ret %cmp1 = icmp eq i32 %v0, 7 %cmp2 = icmp sgt i32 %v1, 0 %sel0 = select i1 %cmp1, i32 %v1, i32 %v2 @@ -35,12 +28,13 @@ ; SDISEL-LABEL: test1: ; SDISEL: // %bb.0: ; SDISEL-NEXT: cmp w0, #7 -; SDISEL-NEXT: adrp x8, out +; SDISEL-NEXT: mov w8, #42 // =0x2a ; SDISEL-NEXT: csel w9, w1, w2, eq ; SDISEL-NEXT: cmp w9, #13 ; SDISEL-NEXT: csel w9, w1, w2, lo -; SDISEL-NEXT: cmp w0, #42 -; SDISEL-NEXT: csel w10, w1, w9, eq +; SDISEL-NEXT: ccmp w0, w8, #4, hs +; SDISEL-NEXT: adrp x8, out +; SDISEL-NEXT: csel w10, w1, w2, eq ; SDISEL-NEXT: str w9, [x8, :lo12:out] ; SDISEL-NEXT: str w10, [x8, :lo12:out] ; SDISEL-NEXT: ret @@ -73,5 +67,3 @@ store volatile i32 %cond17, ptr @out, align 4 ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll --- a/llvm/test/CodeGen/AArch64/expand-select.ll +++ b/llvm/test/CodeGen/AArch64/expand-select.ll @@ -33,24 +33,24 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0x1 ; CHECK-NEXT: fmov s0, wzr -; CHECK-NEXT: ldp x10, x9, [sp] +; CHECK-NEXT: ldp x11, x10, [sp] ; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: ldr x11, [sp, #16] ; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s ; CHECK-NEXT: dup v1.4s, v0.s[0] +; CHECK-NEXT: fmov x9, d0 ; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: extr x8, x9, x8, #32 ; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: csel x9, x5, x9, ne -; CHECK-NEXT: csel x10, x4, x10, ne -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: csel x8, x2, x6, ne +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: csel x10, x5, x10, ne +; CHECK-NEXT: csel x11, x4, x11, ne +; CHECK-NEXT: tst w9, #0x1 +; CHECK-NEXT: csel x9, x2, x6, ne ; CHECK-NEXT: csel x12, x3, x7, ne -; CHECK-NEXT: stur x10, [x11, #12] -; CHECK-NEXT: str w9, [x11, #20] -; CHECK-NEXT: str x8, [x11] -; CHECK-NEXT: str w12, [x11, #8] +; CHECK-NEXT: stur x11, [x8, #12] +; CHECK-NEXT: str w10, [x8, #20] +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: str w12, [x8, #8] ; CHECK-NEXT: ret %cond = and i32 %In1, 1 %cbool = icmp eq i32 %cond, 0 diff --git a/llvm/test/CodeGen/AArch64/fadd-combines.ll b/llvm/test/CodeGen/AArch64/fadd-combines.ll --- a/llvm/test/CodeGen/AArch64/fadd-combines.ll +++ b/llvm/test/CodeGen/AArch64/fadd-combines.ll @@ -132,8 +132,8 @@ define float @fadd_const_multiuse_fmf(float %x) { ; CHECK-LABEL: fadd_const_multiuse_fmf: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #1109917696 -; CHECK-NEXT: mov w9, #1114374144 +; CHECK-NEXT: mov w8, #1109917696 // =0x42280000 +; CHECK-NEXT: mov w9, #1114374144 // =0x426c0000 ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fadd s1, s0, s1 @@ -150,8 +150,8 @@ define float @fadd_const_multiuse_attr(float %x) { ; CHECK-LABEL: fadd_const_multiuse_attr: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #1109917696 -; CHECK-NEXT: mov w9, #1114374144 +; CHECK-NEXT: mov w8, #1109917696 // =0x42280000 +; CHECK-NEXT: mov w9, #1114374144 // =0x426c0000 ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fadd s1, s0, s1 @@ -245,11 +245,11 @@ define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x double> %x3, <2 x double> %x4, <2 x double> %x5, <2 x double> %x6, <2 x double> %x7, <2 x double> %x8) nounwind { ; CHECK-LABEL: fadd_fma_fmul_3: ; CHECK: // %bb.0: -; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d -; CHECK-NEXT: fmla v2.2d, v1.2d, v0.2d -; CHECK-NEXT: fmla v2.2d, v7.2d, v6.2d -; CHECK-NEXT: fmla v2.2d, v5.2d, v4.2d -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: fmul v6.2d, v6.2d, v7.2d +; CHECK-NEXT: fmla v6.2d, v5.2d, v4.2d +; CHECK-NEXT: fmla v6.2d, v3.2d, v2.2d +; CHECK-NEXT: fmla v6.2d, v1.2d, v0.2d +; CHECK-NEXT: mov v0.16b, v6.16b ; CHECK-NEXT: ret %m1 = fmul fast <2 x double> %x1, %x2 %m2 = fmul fast <2 x double> %x3, %x4 diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll --- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll @@ -4,13 +4,9 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec) { ; CHECK-LABEL: vector_deinterleave_v2f16_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def 
$d0 killed $d0 def $q0 -; CHECK-NEXT: dup v2.2s, v0.s[1] -; CHECK-NEXT: mov v1.16b, v2.16b -; CHECK-NEXT: mov v1.h[0], v0.h[1] -; CHECK-NEXT: mov v0.h[1], v2.h[0] -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h +; CHECK-NEXT: uzp2 v1.4h, v0.4h, v0.4h +; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret %retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec) ret {<2 x half>, <2 x half>} %retval diff --git a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll --- a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll +++ b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll @@ -25,9 +25,9 @@ define i64 @f2() { ; CHECK-LABEL: f2: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, x1 -; CHECK-NEXT: add x8, x8, :lo12:x1 -; CHECK-NEXT: ldr x0, [x8, #24] +; CHECK-NEXT: adrp x8, x1+16 +; CHECK-NEXT: add x8, x8, :lo12:x1+16 +; CHECK-NEXT: ldr x0, [x8, #8] ; CHECK-NEXT: ret ; ; GISEL-LABEL: f2: @@ -100,7 +100,7 @@ define i64 @f6() { ; CHECK-LABEL: f6: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #1048576 +; CHECK-NEXT: mov w8, #1048576 // =0x100000 ; CHECK-NEXT: adrp x9, x2 ; CHECK-NEXT: add x9, x9, :lo12:x2 ; CHECK-NEXT: ldr x0, [x9, x8] @@ -108,7 +108,7 @@ ; ; GISEL-LABEL: f6: ; GISEL: // %bb.0: -; GISEL-NEXT: mov w8, #1048576 +; GISEL-NEXT: mov w8, #1048576 // =0x100000 ; GISEL-NEXT: adrp x9, x2 ; GISEL-NEXT: add x9, x9, :lo12:x2 ; GISEL-NEXT: ldr x0, [x9, x8] diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll @@ -400,8 +400,8 @@ ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: lsl x9, x8, #5 ; CHECK-NEXT: add x8, x8, #1 -; CHECK-NEXT: add x10, x0, x9 -; CHECK-NEXT: add x11, x1, x9 +; CHECK-NEXT: add x10, x1, x9 +; CHECK-NEXT: add x11, x0, x9 ; CHECK-NEXT: add x9, x2, x9 ; CHECK-NEXT: cmp x8, #1000 ; CHECK-NEXT: ldp q0, q1, [x10] @@ -412,7 +412,7 @@ ; CHECK-NEXT: uzp1.8h v0, v0, v1 ; CHECK-NEXT: fcvtzu.4s v3, v3 ; CHECK-NEXT: uzp1.8h v1, v2, v3 -; CHECK-NEXT: stp q0, q1, [x9] +; CHECK-NEXT: stp q1, q0, [x9] ; CHECK-NEXT: b.eq LBB7_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat.ll b/llvm/test/CodeGen/AArch64/fpclamptosat.ll --- a/llvm/test/CodeGen/AArch64/fpclamptosat.ll +++ b/llvm/test/CodeGen/AArch64/fpclamptosat.ll @@ -35,7 +35,12 @@ define i32 @ustest_f64i32(double %x) { ; CHECK-LABEL: ustest_f64i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtzu w0, d0 +; CHECK-NEXT: fcvtzs x8, d0 +; CHECK-NEXT: mov w9, #-1 // =0xffffffff +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: csel x8, x8, x9, lt +; CHECK-NEXT: asr x9, x8, #63 +; CHECK-NEXT: bic w0, w8, w9 ; CHECK-NEXT: ret entry: %conv = fptosi double %x to i64 @@ -78,7 +83,12 @@ define i32 @ustest_f32i32(float %x) { ; CHECK-LABEL: ustest_f32i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtzu w0, s0 +; CHECK-NEXT: fcvtzs x8, s0 +; CHECK-NEXT: mov w9, #-1 // =0xffffffff +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: csel x8, x8, x9, lt +; CHECK-NEXT: asr x9, x8, #63 +; CHECK-NEXT: bic w0, w8, w9 ; CHECK-NEXT: ret entry: %conv = fptosi float %x to i64 @@ -134,12 +144,22 @@ ; CHECK-CVT-LABEL: ustest_f16i32: ; CHECK-CVT: // %bb.0: // %entry ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: fcvtzu w0, s0 +; CHECK-CVT-NEXT: mov w9, #-1 // =0xffffffff +; 
CHECK-CVT-NEXT: fcvtzs x8, s0 +; CHECK-CVT-NEXT: cmp x8, x9 +; CHECK-CVT-NEXT: csel x8, x8, x9, lt +; CHECK-CVT-NEXT: asr x9, x8, #63 +; CHECK-CVT-NEXT: bic w0, w8, w9 ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: ustest_f16i32: ; CHECK-FP16: // %bb.0: // %entry -; CHECK-FP16-NEXT: fcvtzu w0, h0 +; CHECK-FP16-NEXT: fcvtzs x8, h0 +; CHECK-FP16-NEXT: mov w9, #-1 // =0xffffffff +; CHECK-FP16-NEXT: cmp x8, x9 +; CHECK-FP16-NEXT: csel x8, x8, x9, lt +; CHECK-FP16-NEXT: asr x9, x8, #63 +; CHECK-FP16-NEXT: bic w0, w8, w9 ; CHECK-FP16-NEXT: ret entry: %conv = fptosi half %x to i64 @@ -396,11 +416,9 @@ ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: bl __fixdfti ; CHECK-NEXT: cmp x1, #1 -; CHECK-NEXT: csel x8, x0, xzr, lt -; CHECK-NEXT: csinc x9, x1, xzr, lt -; CHECK-NEXT: cmp xzr, x8 -; CHECK-NEXT: ngcs xzr, x9 -; CHECK-NEXT: csel x0, x8, xzr, lt +; CHECK-NEXT: csinc x8, x1, xzr, lt +; CHECK-NEXT: csel x9, x0, xzr, lt +; CHECK-NEXT: bic x0, x9, x8, asr #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -455,11 +473,9 @@ ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: cmp x1, #1 -; CHECK-NEXT: csel x8, x0, xzr, lt -; CHECK-NEXT: csinc x9, x1, xzr, lt -; CHECK-NEXT: cmp xzr, x8 -; CHECK-NEXT: ngcs xzr, x9 -; CHECK-NEXT: csel x0, x8, xzr, lt +; CHECK-NEXT: csinc x8, x1, xzr, lt +; CHECK-NEXT: csel x9, x0, xzr, lt +; CHECK-NEXT: bic x0, x9, x8, asr #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -520,11 +536,9 @@ ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: bl __fixhfti ; CHECK-NEXT: cmp x1, #1 -; CHECK-NEXT: csel x8, x0, xzr, lt -; CHECK-NEXT: csinc x9, x1, xzr, lt -; CHECK-NEXT: cmp xzr, x8 -; CHECK-NEXT: ngcs xzr, x9 -; CHECK-NEXT: csel x0, x8, xzr, lt +; CHECK-NEXT: csinc x8, x1, xzr, lt +; CHECK-NEXT: csel x9, x0, xzr, lt +; CHECK-NEXT: bic x0, x9, x8, asr #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -211,11 +211,18 @@ define <5 x i32> @test_signed_v5f64_v5i32(<5 x double> %f) { ; CHECK-LABEL: test_signed_v5f64_v5i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs w0, d0 -; CHECK-NEXT: fcvtzs w1, d1 -; CHECK-NEXT: fcvtzs w2, d2 +; CHECK-NEXT: fcvtzs w8, d0 +; CHECK-NEXT: fcvtzs w9, d1 ; CHECK-NEXT: fcvtzs w3, d3 ; CHECK-NEXT: fcvtzs w4, d4 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: fcvtzs w8, d2 +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: mov v0.s[2], w8 +; CHECK-NEXT: mov v0.s[3], w3 +; CHECK-NEXT: mov w1, v0.s[1] +; CHECK-NEXT: mov w2, v0.s[2] +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %x = call <5 x i32> @llvm.fptosi.sat.v5f64.v5i32(<5 x double> %f) ret <5 x i32> %x @@ -224,12 +231,22 @@ define <6 x i32> @test_signed_v6f64_v6i32(<6 x double> %f) { ; CHECK-LABEL: test_signed_v6f64_v6i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs w0, d0 -; CHECK-NEXT: fcvtzs w1, d1 -; CHECK-NEXT: fcvtzs w2, d2 +; CHECK-NEXT: fcvtzs w9, d0 +; CHECK-NEXT: fcvtzs w10, d1 +; CHECK-NEXT: fcvtzs w8, d4 ; CHECK-NEXT: fcvtzs w3, d3 -; CHECK-NEXT: fcvtzs w4, d4 ; CHECK-NEXT: fcvtzs w5, d5 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: fcvtzs w9, d2 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov v0.s[1], w10 +; CHECK-NEXT: mov v1.s[1], w5 +; CHECK-NEXT: mov v0.s[2], w9 +; CHECK-NEXT: fmov w4, s1 +; CHECK-NEXT: mov v0.s[3], w3 +; CHECK-NEXT: mov w1, v0.s[1] +; 
CHECK-NEXT: mov w2, v0.s[2] +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %x = call <6 x i32> @llvm.fptosi.sat.v6f64.v6i32(<6 x double> %f) ret <6 x i32> %x diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -211,11 +211,18 @@ define <5 x i32> @test_unsigned_v5f64_v5i32(<5 x double> %f) { ; CHECK-LABEL: test_unsigned_v5f64_v5i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzu w0, d0 -; CHECK-NEXT: fcvtzu w1, d1 -; CHECK-NEXT: fcvtzu w2, d2 +; CHECK-NEXT: fcvtzu w8, d0 +; CHECK-NEXT: fcvtzu w9, d1 ; CHECK-NEXT: fcvtzu w3, d3 ; CHECK-NEXT: fcvtzu w4, d4 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: fcvtzu w8, d2 +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: mov v0.s[2], w8 +; CHECK-NEXT: mov v0.s[3], w3 +; CHECK-NEXT: mov w1, v0.s[1] +; CHECK-NEXT: mov w2, v0.s[2] +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %x = call <5 x i32> @llvm.fptoui.sat.v5f64.v5i32(<5 x double> %f) ret <5 x i32> %x @@ -224,12 +231,22 @@ define <6 x i32> @test_unsigned_v6f64_v6i32(<6 x double> %f) { ; CHECK-LABEL: test_unsigned_v6f64_v6i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzu w0, d0 -; CHECK-NEXT: fcvtzu w1, d1 -; CHECK-NEXT: fcvtzu w2, d2 +; CHECK-NEXT: fcvtzu w9, d0 +; CHECK-NEXT: fcvtzu w10, d1 +; CHECK-NEXT: fcvtzu w8, d4 ; CHECK-NEXT: fcvtzu w3, d3 -; CHECK-NEXT: fcvtzu w4, d4 ; CHECK-NEXT: fcvtzu w5, d5 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: fcvtzu w9, d2 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov v0.s[1], w10 +; CHECK-NEXT: mov v1.s[1], w5 +; CHECK-NEXT: mov v0.s[2], w9 +; CHECK-NEXT: fmov w4, s1 +; CHECK-NEXT: mov v0.s[3], w3 +; CHECK-NEXT: mov w1, v0.s[1] +; CHECK-NEXT: mov w2, v0.s[2] +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %x = call <6 x i32> @llvm.fptoui.sat.v6f64.v6i32(<6 x double> %f) ret <6 x i32> %x @@ -691,7 +708,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: mov s1, v0.s[1] ; CHECK-NEXT: fcvtzu x9, s0 -; CHECK-NEXT: mov x10, #1125899906842623 +; CHECK-NEXT: mov x10, #1125899906842623 // =0x3ffffffffffff ; CHECK-NEXT: fcvtzu x8, s1 ; CHECK-NEXT: cmp x8, x10 ; CHECK-NEXT: csel x8, x8, x10, lo @@ -737,9 +754,9 @@ ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: mov w8, #1904214015 +; CHECK-NEXT: mov w8, #1904214015 // =0x717fffff ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: mov x21, #68719476735 +; CHECK-NEXT: mov x21, #68719476735 // =0xfffffffff ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: fmov s9, w8 @@ -788,7 +805,7 @@ ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: mov w8, #2139095039 +; CHECK-NEXT: mov w8, #2139095039 // =0x7f7fffff ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 @@ -904,7 +921,7 @@ ; CHECK-LABEL: test_unsigned_v4f32_v4i50: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: mov x8, #1125899906842623 +; CHECK-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff ; CHECK-NEXT: mov s3, v0.s[1] ; CHECK-NEXT: fcvtzu x11, s0 ; CHECK-NEXT: mov s2, v1.s[1] @@ -967,10 +984,10 @@ ; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: mov w8, #1904214015 +; CHECK-NEXT: mov w8, #1904214015 // =0x717fffff ; 
CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: mov x25, #68719476735 +; CHECK-NEXT: mov x25, #68719476735 // =0xfffffffff ; CHECK-NEXT: fmov s9, w8 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: csel x8, xzr, x0, lt @@ -1050,7 +1067,7 @@ ; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: mov w8, #2139095039 +; CHECK-NEXT: mov w8, #2139095039 // =0x7f7fffff ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: fmov s9, w8 @@ -1146,7 +1163,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: fcvtzu w10, d0 -; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov w8, #255 // =0xff ; CHECK-NEXT: fcvtzu w9, d1 ; CHECK-NEXT: cmp w9, #255 ; CHECK-NEXT: csel w9, w9, w8, lo @@ -1165,7 +1182,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: fcvtzu w9, d0 -; CHECK-NEXT: mov w10, #8191 +; CHECK-NEXT: mov w10, #8191 // =0x1fff ; CHECK-NEXT: fcvtzu w8, d1 ; CHECK-NEXT: cmp w8, w10 ; CHECK-NEXT: csel w8, w8, w10, lo @@ -1184,7 +1201,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: fcvtzu w9, d0 -; CHECK-NEXT: mov w10, #65535 +; CHECK-NEXT: mov w10, #65535 // =0xffff ; CHECK-NEXT: fcvtzu w8, d1 ; CHECK-NEXT: cmp w8, w10 ; CHECK-NEXT: csel w8, w8, w10, lo @@ -1203,7 +1220,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: fcvtzu w9, d0 -; CHECK-NEXT: mov w10, #524287 +; CHECK-NEXT: mov w10, #524287 // =0x7ffff ; CHECK-NEXT: fcvtzu w8, d1 ; CHECK-NEXT: cmp w8, w10 ; CHECK-NEXT: csel w8, w8, w10, lo @@ -1236,7 +1253,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov d1, v0.d[1] ; CHECK-NEXT: fcvtzu x9, d0 -; CHECK-NEXT: mov x10, #1125899906842623 +; CHECK-NEXT: mov x10, #1125899906842623 // =0x3ffffffffffff ; CHECK-NEXT: fcvtzu x8, d1 ; CHECK-NEXT: cmp x8, x10 ; CHECK-NEXT: csel x8, x8, x10, lo @@ -1276,9 +1293,9 @@ ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: fmov d0, d8 ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: mov x8, #5057542381537067007 +; CHECK-NEXT: mov x8, #5057542381537067007 // =0x462fffffffffffff ; CHECK-NEXT: fcmp d8, #0.0 -; CHECK-NEXT: mov x21, #68719476735 +; CHECK-NEXT: mov x21, #68719476735 // =0xfffffffff ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: fmov d9, x8 @@ -1326,7 +1343,7 @@ ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: fmov d0, d8 ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: mov x8, #5183643171103440895 +; CHECK-NEXT: mov x8, #5183643171103440895 // =0x47efffffffffffff ; CHECK-NEXT: fcmp d8, #0.0 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -1478,7 +1495,7 @@ ; CHECK-CVT-NEXT: mov h2, v0.h[2] ; CHECK-CVT-NEXT: mov h3, v0.h[3] ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: mov x8, #1125899906842623 +; CHECK-CVT-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 @@ -1503,7 +1520,7 @@ ; CHECK-FP16-NEXT: mov h2, v0.h[2] ; CHECK-FP16-NEXT: mov h3, v0.h[3] ; CHECK-FP16-NEXT: fcvtzu x9, h0 -; CHECK-FP16-NEXT: mov x8, #1125899906842623 +; CHECK-FP16-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff ; CHECK-FP16-NEXT: fcvtzu x10, h1 ; CHECK-FP16-NEXT: fcvtzu x11, h2 ; CHECK-FP16-NEXT: cmp x9, x8 @@ -1587,9 +1604,9 @@ ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr 
q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov w8, #1904214015 +; CHECK-NEXT: mov w8, #1904214015 // =0x717fffff ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: mov x25, #68719476735 +; CHECK-NEXT: mov x25, #68719476735 // =0xfffffffff ; CHECK-NEXT: mov h0, v0.h[1] ; CHECK-NEXT: fmov s9, w8 ; CHECK-NEXT: csel x8, xzr, x0, lt @@ -1673,7 +1690,7 @@ ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov w8, #2139095039 +; CHECK-NEXT: mov w8, #2139095039 // =0x7f7fffff ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: mov h0, v0.h[2] ; CHECK-NEXT: fmov s9, w8 @@ -1809,7 +1826,7 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: mov w8, #255 +; CHECK-CVT-NEXT: mov w8, #255 // =0xff ; CHECK-CVT-NEXT: mov s2, v1.s[1] ; CHECK-CVT-NEXT: mov s3, v1.s[2] ; CHECK-CVT-NEXT: mov s4, v1.s[3] @@ -1866,7 +1883,7 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: mov w8, #8191 +; CHECK-CVT-NEXT: mov w8, #8191 // =0x1fff ; CHECK-CVT-NEXT: mov s2, v1.s[1] ; CHECK-CVT-NEXT: mov s3, v1.s[2] ; CHECK-CVT-NEXT: mov s4, v1.s[3] @@ -1923,7 +1940,7 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: mov w8, #65535 +; CHECK-CVT-NEXT: mov w8, #65535 // =0xffff ; CHECK-CVT-NEXT: mov s2, v1.s[1] ; CHECK-CVT-NEXT: mov s3, v1.s[2] ; CHECK-CVT-NEXT: mov s4, v1.s[3] @@ -2012,7 +2029,7 @@ ; CHECK-CVT-LABEL: test_unsigned_v8f16_v8i50: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-CVT-NEXT: mov x8, #1125899906842623 +; CHECK-CVT-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff ; CHECK-CVT-NEXT: mov h2, v0.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[2] ; CHECK-CVT-NEXT: mov h5, v0.h[3] @@ -2056,7 +2073,7 @@ ; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i50: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-FP16-NEXT: mov x8, #1125899906842623 +; CHECK-FP16-NEXT: mov x8, #1125899906842623 // =0x3ffffffffffff ; CHECK-FP16-NEXT: mov h2, v0.h[1] ; CHECK-FP16-NEXT: mov h3, v0.h[2] ; CHECK-FP16-NEXT: mov h5, v0.h[3] @@ -2193,9 +2210,9 @@ ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: mov w8, #1904214015 +; CHECK-NEXT: mov w8, #1904214015 // =0x717fffff ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: mov x23, #68719476735 +; CHECK-NEXT: mov x23, #68719476735 // =0xfffffffff ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: fmov s9, w8 ; CHECK-NEXT: csel x8, xzr, x0, lt @@ -2357,7 +2374,7 @@ ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: mov w8, #2139095039 +; CHECK-NEXT: mov w8, #2139095039 // =0x7f7fffff ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: mov h0, v0.h[1] ; CHECK-NEXT: fmov s9, w8 @@ -2559,7 +2576,7 @@ ; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h ; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h ; CHECK-CVT-NEXT: fcvtl2 v5.4s, v0.8h -; CHECK-CVT-NEXT: mov w8, #255 +; CHECK-CVT-NEXT: mov w8, #255 // =0xff ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ; CHECK-CVT-NEXT: mov s3, v2.s[1] ; CHECK-CVT-NEXT: mov s4, v2.s[2] @@ -2661,7 +2678,7 @@ ; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ; CHECK-CVT-NEXT: fcvtl2 v5.4s, v1.8h -; CHECK-CVT-NEXT: mov w8, #65535 +; CHECK-CVT-NEXT: mov w8, #65535 // =0xffff ; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h ; CHECK-CVT-NEXT: mov 
s3, v2.s[1] ; CHECK-CVT-NEXT: mov s4, v2.s[2] @@ -2758,7 +2775,7 @@ ; CHECK-NEXT: mov d4, v3.d[1] ; CHECK-NEXT: fcvtzu w10, d3 ; CHECK-NEXT: mov d3, v2.d[1] -; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov w8, #255 // =0xff ; CHECK-NEXT: fcvtzu w12, d2 ; CHECK-NEXT: fcvtzu w13, d1 ; CHECK-NEXT: fcvtzu w9, d4 @@ -2806,7 +2823,7 @@ ; CHECK-NEXT: mov d16, v0.d[1] ; CHECK-NEXT: fcvtzu w10, d0 ; CHECK-NEXT: mov d0, v1.d[1] -; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov w8, #255 // =0xff ; CHECK-NEXT: fcvtzu w12, d1 ; CHECK-NEXT: mov d1, v2.d[1] ; CHECK-NEXT: fcvtzu w9, d16 @@ -2910,7 +2927,7 @@ ; CHECK-NEXT: mov d4, v3.d[1] ; CHECK-NEXT: fcvtzu w10, d3 ; CHECK-NEXT: mov d3, v2.d[1] -; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov w8, #65535 // =0xffff ; CHECK-NEXT: fcvtzu w12, d2 ; CHECK-NEXT: fcvtzu w13, d1 ; CHECK-NEXT: fcvtzu w9, d4 @@ -2958,7 +2975,7 @@ ; CHECK-NEXT: mov d16, v3.d[1] ; CHECK-NEXT: fcvtzu w9, d3 ; CHECK-NEXT: mov d3, v2.d[1] -; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov w8, #65535 // =0xffff ; CHECK-NEXT: fcvtzu w10, d2 ; CHECK-NEXT: mov d2, v1.d[1] ; CHECK-NEXT: fcvtzu w11, d1 diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll --- a/llvm/test/CodeGen/AArch64/funnel-shift.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll @@ -19,12 +19,12 @@ define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: fshl_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsr w9, w1, #1 -; CHECK-NEXT: lsl w10, w0, w2 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsr w10, w1, #1 +; CHECK-NEXT: lsr w9, w10, w9 +; CHECK-NEXT: lsl w8, w0, w8 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) ret i32 %f @@ -46,7 +46,8 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; CHECK-LABEL: fshl_i128: ; CHECK: // %bb.0: -; CHECK-NEXT: tst x4, #0x40 +; CHECK-NEXT: ubfx x8, x4, #6, #1 +; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: mvn w8, w4 ; CHECK-NEXT: csel x9, x2, x3, ne ; CHECK-NEXT: csel x10, x3, x0, ne @@ -69,14 +70,14 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) { ; CHECK-LABEL: fshl_i37: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x9, #46053 +; CHECK-NEXT: mov x9, #46053 // =0xb3e5 ; CHECK-NEXT: and x8, x2, #0x1fffffffff ; CHECK-NEXT: movk x9, #12398, lsl #16 ; CHECK-NEXT: ubfiz x10, x1, #26, #37 ; CHECK-NEXT: movk x9, #15941, lsl #32 ; CHECK-NEXT: movk x9, #1771, lsl #48 ; CHECK-NEXT: umulh x8, x8, x9 -; CHECK-NEXT: mov w9, #37 +; CHECK-NEXT: mov w9, #37 // =0x25 ; CHECK-NEXT: msub w8, w8, w9, w2 ; CHECK-NEXT: mvn w9, w8 ; CHECK-NEXT: lsl x8, x0, x8 @@ -93,7 +94,7 @@ define i7 @fshl_i7_const_fold() { ; CHECK-LABEL: fshl_i7_const_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #67 +; CHECK-NEXT: mov w0, #67 // =0x43 ; CHECK-NEXT: ret %f = call i7 @llvm.fshl.i7(i7 112, i7 127, i7 2) ret i7 %f @@ -102,7 +103,7 @@ define i8 @fshl_i8_const_fold_overshift_1() { ; CHECK-LABEL: fshl_i8_const_fold_overshift_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #128 +; CHECK-NEXT: mov w0, #128 // =0x80 ; CHECK-NEXT: ret %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 15) ret i8 %f @@ -111,7 +112,7 @@ define i8 @fshl_i8_const_fold_overshift_2() { ; CHECK-LABEL: fshl_i8_const_fold_overshift_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #120 +; CHECK-NEXT: mov w0, #120 // =0x78 ; CHECK-NEXT: ret %f = call i8 @llvm.fshl.i8(i8 15, i8 15, i8 11) ret i8 %f @@ -164,7 +165,7 @@ 
define i8 @fshl_i8_const_fold() { ; CHECK-LABEL: fshl_i8_const_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #128 +; CHECK-NEXT: mov w0, #128 // =0x80 ; CHECK-NEXT: ret %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7) ret i8 %f @@ -177,12 +178,12 @@ define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) { ; CHECK-LABEL: fshr_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsl w9, w0, #1 -; CHECK-NEXT: lsr w10, w1, w2 -; CHECK-NEXT: lsl w8, w9, w8 -; CHECK-NEXT: orr w0, w8, w10 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsl w10, w0, #1 +; CHECK-NEXT: lsr w8, w1, w8 +; CHECK-NEXT: lsl w9, w10, w9 +; CHECK-NEXT: orr w0, w9, w8 ; CHECK-NEXT: ret %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) ret i32 %f @@ -206,7 +207,7 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) { ; CHECK-LABEL: fshr_i37: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x9, #46053 +; CHECK-NEXT: mov x9, #46053 // =0xb3e5 ; CHECK-NEXT: and x8, x2, #0x1fffffffff ; CHECK-NEXT: movk x9, #12398, lsl #16 ; CHECK-NEXT: lsl x10, x1, #27 @@ -214,7 +215,7 @@ ; CHECK-NEXT: lsl x11, x0, #1 ; CHECK-NEXT: movk x9, #1771, lsl #48 ; CHECK-NEXT: umulh x8, x8, x9 -; CHECK-NEXT: mov w9, #37 +; CHECK-NEXT: mov w9, #37 // =0x25 ; CHECK-NEXT: msub w8, w8, w9, w2 ; CHECK-NEXT: add w8, w8, #27 ; CHECK-NEXT: mvn w9, w8 @@ -232,7 +233,7 @@ define i7 @fshr_i7_const_fold() { ; CHECK-LABEL: fshr_i7_const_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #31 +; CHECK-NEXT: mov w0, #31 // =0x1f ; CHECK-NEXT: ret %f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2) ret i7 %f @@ -241,7 +242,7 @@ define i8 @fshr_i8_const_fold_overshift_1() { ; CHECK-LABEL: fshr_i8_const_fold_overshift_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #254 +; CHECK-NEXT: mov w0, #254 // =0xfe ; CHECK-NEXT: ret %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 15) ret i8 %f @@ -250,7 +251,7 @@ define i8 @fshr_i8_const_fold_overshift_2() { ; CHECK-LABEL: fshr_i8_const_fold_overshift_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #225 +; CHECK-NEXT: mov w0, #225 // =0xe1 ; CHECK-NEXT: ret %f = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11) ret i8 %f @@ -259,7 +260,7 @@ define i8 @fshr_i8_const_fold_overshift_3() { ; CHECK-LABEL: fshr_i8_const_fold_overshift_3: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #255 +; CHECK-NEXT: mov w0, #255 // =0xff ; CHECK-NEXT: ret %f = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8) ret i8 %f @@ -303,7 +304,7 @@ define i8 @fshr_i8_const_fold() { ; CHECK-LABEL: fshr_i8_const_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #254 +; CHECK-NEXT: mov w0, #254 // =0xfe ; CHECK-NEXT: ret %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7) ret i8 %f @@ -472,12 +473,12 @@ define i32 @or_shl_fshl_simplify(i32 %x, i32 %y, i32 %s) { ; CHECK-LABEL: or_shl_fshl_simplify: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsr w9, w0, #1 -; CHECK-NEXT: lsl w10, w1, w2 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsr w10, w0, #1 +; CHECK-NEXT: lsr w9, w10, w9 +; CHECK-NEXT: lsl w8, w1, w8 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret %shy = shl i32 %y, %s %fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s) @@ -488,12 +489,12 @@ define i32 @or_lshr_fshr_simplify(i32 %x, i32 %y, i32 %s) { ; CHECK-LABEL: or_lshr_fshr_simplify: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsl w9, w0, #1 -; CHECK-NEXT: lsr w10, w1, w2 
-; CHECK-NEXT: lsl w8, w9, w8 -; CHECK-NEXT: orr w0, w8, w10 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsl w10, w0, #1 +; CHECK-NEXT: lsr w8, w1, w8 +; CHECK-NEXT: lsl w9, w10, w9 +; CHECK-NEXT: orr w0, w9, w8 ; CHECK-NEXT: ret %shy = lshr i32 %y, %s %fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s) diff --git a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll --- a/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/AArch64/icmp-shift-opt.ll @@ -12,7 +12,8 @@ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds x0, x0, #1 ; CHECK-NEXT: cinc x1, x1, hs -; CHECK-NEXT: orr x8, x1, x0, lsr #60 +; CHECK-NEXT: extr x8, x1, x0, #60 +; CHECK-NEXT: orr x8, x8, x1, lsr #60 ; CHECK-NEXT: cbnz x8, .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret @@ -31,7 +32,8 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_eq_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x1, x0, lsr #17 +; CHECK-NEXT: extr x8, x1, x0, #17 +; CHECK-NEXT: orr x8, x8, x1, lsr #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -43,7 +45,8 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_srl_ne_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x1, x0, lsr #17 +; CHECK-NEXT: extr x8, x1, x0, #17 +; CHECK-NEXT: orr x8, x8, x1, lsr #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -55,7 +58,8 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_eq_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1, lsl #17 +; CHECK-NEXT: extr x8, x1, x0, #47 +; CHECK-NEXT: orr x8, x8, x0, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -67,7 +71,8 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_ne_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1, lsl #17 +; CHECK-NEXT: extr x8, x1, x0, #47 +; CHECK-NEXT: orr x8, x8, x0, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -101,7 +106,8 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: opt_setcc_expanded_shl_correct_shifts: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x1, x0, lsl #17 +; CHECK-NEXT: extr x8, x0, x1, #47 +; CHECK-NEXT: orr x8, x8, x1, lsl #17 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -136,12 +142,12 @@ define i1 @opt_setcc_shl_ne_zero_i256(i256 %a) nounwind { ; CHECK-LABEL: opt_setcc_shl_ne_zero_i256: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x2, x0 -; CHECK-NEXT: extr x9, x3, x2, #47 +; CHECK-NEXT: extr x8, x3, x2, #47 +; CHECK-NEXT: extr x9, x2, x1, #47 ; CHECK-NEXT: extr x10, x1, x0, #47 -; CHECK-NEXT: extr x8, x8, x1, #47 -; CHECK-NEXT: orr x9, x10, x9 -; CHECK-NEXT: orr x8, x8, x9 +; CHECK-NEXT: orr x9, x9, x0, lsl #17 +; CHECK-NEXT: orr x8, x10, x8 +; CHECK-NEXT: orr x8, x9, x8 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll --- a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll +++ b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll @@ -65,8 +65,6 @@ ; CHECK-NEXT: uzp1 v1.16b, v3.16b, v1.16b ; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: orn v0.16b, v0.16b, v1.16b -; CHECK-NEXT: shl v0.16b, v0.16b, #7 -; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; 
CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: bic w0, w9, w8 diff --git a/llvm/test/CodeGen/AArch64/insertshuffleload.ll b/llvm/test/CodeGen/AArch64/insertshuffleload.ll --- a/llvm/test/CodeGen/AArch64/insertshuffleload.ll +++ b/llvm/test/CodeGen/AArch64/insertshuffleload.ll @@ -30,8 +30,11 @@ define <8 x i16> @inserti8_first_sext(ptr %p) { ; CHECK-LABEL: inserti8_first_sext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ldrsb w8, [x0] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14 +; CHECK-NEXT: mov v0.h[0], w8 ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q @@ -46,8 +49,11 @@ define <8 x i16> @inserti8_last_sext(ptr %p) { ; CHECK-LABEL: inserti8_last_sext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldrsb w8, [x0, #8] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #2 +; CHECK-NEXT: mov v0.h[7], w8 ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 8 %l1 = load <8 x i8>, ptr %p @@ -62,8 +68,11 @@ define <8 x i16> @inserti8_first_zext(ptr %p) { ; CHECK-LABEL: inserti8_first_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ldrb w8, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #14 +; CHECK-NEXT: mov v0.h[0], w8 ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q @@ -78,8 +87,11 @@ define <8 x i16> @inserti8_last_zext(ptr %p) { ; CHECK-LABEL: inserti8_last_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldrb w8, [x0, #8] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #2 +; CHECK-NEXT: mov v0.h[7], w8 ; CHECK-NEXT: ret %q = getelementptr inbounds i8, ptr %p, i32 8 %l1 = load <8 x i8>, ptr %p diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll --- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll +++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -350,8 +350,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-8 -; CHECK-NEXT: mov x8, #-1 -; CHECK-NEXT: mov w9, #16 +; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl] ; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl] @@ -362,10 +362,10 @@ ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1w { z7.s }, p0, [sp, #7, mul vl] -; CHECK-NEXT: st1w { z4.s }, p0, [sp, #4, mul vl] +; CHECK-NEXT: st1w { z6.s }, p0, [sp, #6, mul vl] ; CHECK-NEXT: add x10, x9, x8, lsl #2 ; CHECK-NEXT: st1w { z5.s }, p0, [sp, #5, mul vl] -; CHECK-NEXT: st1w { z6.s }, p0, [sp, #6, mul vl] +; CHECK-NEXT: st1w { z4.s }, p0, [sp, #4, mul vl] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9, x8, lsl #2] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x10, #1, mul vl] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x10, #2, mul vl] @@ -455,7 +455,7 @@ ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #17 -; CHECK-NEXT: mov w10, #17 +; CHECK-NEXT: mov w10, #17 // =0x11 ; CHECK-NEXT: csel x9, x9, x10, lo ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: addvl x8, x8, #1 @@ -500,7 +500,7 @@ ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #18 -; CHECK-NEXT: mov w10, #18 +; CHECK-NEXT: mov w10, #18 // =0x12 ; CHECK-NEXT: csel x9, x9, x10, lo ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: addvl x8, x8, #1 @@ -611,7 +611,7 @@ ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #18 -; CHECK-NEXT: mov w10, #18 +; CHECK-NEXT: mov w10, #18 // =0x12 ; CHECK-NEXT: csel x9, x9, x10, lo ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: addvl x8, x8, #1 @@ -779,7 +779,7 @@ ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-8 +; CHECK-NEXT: mov x9, #-8 // =0xfffffffffffffff8 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] @@ -805,7 +805,7 @@ ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: rdvl x8, #4 ; CHECK-NEXT: cmp x8, #68 -; CHECK-NEXT: mov w9, #68 +; CHECK-NEXT: mov w9, #68 // =0x44 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: addvl x9, x10, #4 @@ -815,9 +815,9 @@ ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: st1w { z7.s }, p0, [sp, #7, mul vl] -; CHECK-NEXT: st1w { z4.s }, p0, [sp, #4, mul vl] -; CHECK-NEXT: st1w { z5.s }, p0, [sp, #5, mul vl] ; CHECK-NEXT: st1w { z6.s }, p0, [sp, #6, mul vl] +; CHECK-NEXT: st1w { z5.s }, p0, [sp, #5, mul vl] +; CHECK-NEXT: st1w { z4.s }, p0, [sp, #4, mul vl] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8, #2, mul vl] diff --git a/llvm/test/CodeGen/AArch64/neon-abd.ll b/llvm/test/CodeGen/AArch64/neon-abd.ll --- a/llvm/test/CodeGen/AArch64/neon-abd.ll +++ b/llvm/test/CodeGen/AArch64/neon-abd.ll @@ -53,7 +53,8 @@ ; CHECK-NEXT: shl v1.4h, v1.4h, #8 ; CHECK-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-NEXT: sshr v1.4h, v1.4h, #8 -; CHECK-NEXT: sabd v0.4h, v0.4h, v1.4h +; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: abs v0.4h, v0.4h ; CHECK-NEXT: ret %a.sext = sext <4 x i8> %a to <4 x i16> %b.sext = sext <4 x i8> %b to <4 x i16> @@ -107,7 +108,8 @@ ; CHECK-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-NEXT: sshr v0.2s, v0.2s, #16 ; CHECK-NEXT: sshr v1.2s, v1.2s, #16 -; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: abs v0.2s, v0.2s ; 
CHECK-NEXT: ret %a.sext = sext <2 x i16> %a to <2 x i32> %b.sext = sext <2 x i16> %b to <2 x i32> @@ -234,7 +236,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-NEXT: uabd v0.4h, v0.4h, v1.4h +; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: abs v0.4h, v0.4h ; CHECK-NEXT: ret %a.zext = zext <4 x i8> %a to <4 x i16> %b.zext = zext <4 x i8> %b to <4 x i16> @@ -287,7 +290,8 @@ ; CHECK-NEXT: movi d2, #0x00ffff0000ffff ; CHECK-NEXT: and v0.8b, v0.8b, v2.8b ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b -; CHECK-NEXT: uabd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: abs v0.2s, v0.2s ; CHECK-NEXT: ret %a.zext = zext <2 x i16> %a to <2 x i32> %b.zext = zext <2 x i16> %b to <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -661,8 +661,10 @@ define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: bsl2xi32_const: ; CHECK: // %bb.0: -; CHECK-NEXT: movi d2, #0x000000ffffffff -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.s[1], v1.s[1] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp1 = and <2 x i32> %a, < i32 -1, i32 0 > %tmp2 = and <2 x i32> %b, < i32 0, i32 -1 > @@ -686,8 +688,10 @@ define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) { ; CHECK-LABEL: bsl1xi64_const: ; CHECK: // %bb.0: -; CHECK-NEXT: movi d2, #0xffffffffffffff00 -; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.b[0], v1.b[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp1 = and <1 x i64> %a, < i64 -256 > %tmp2 = and <1 x i64> %b, < i64 255 > @@ -722,9 +726,7 @@ define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: bsl2xi64_const: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI75_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI75_0] -; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: mov v0.d[1], v1.d[1] ; CHECK-NEXT: ret %tmp1 = and <2 x i64> %a, < i64 -1, i64 0 > %tmp2 = and <2 x i64> %b, < i64 0, i64 -1 > diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -130,15 +130,17 @@ define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v5i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_0] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: umull2 v3.4s, v1.8h, v2.8h -; CHECK-NEXT: mov v0.s[0], v3.s[0] -; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h -; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: and v2.16b, v2.16b, v3.16b +; CHECK-NEXT: mov v2.s[3], wzr +; CHECK-NEXT: umlal v2.4s, v0.4h, v1.4h +; CHECK-NEXT: addv s0, v2.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 ; 
CHECK-NEXT: ret @@ -156,12 +158,14 @@ define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) { ; CHECK-LABEL: test_udot_v5i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 -; CHECK-NEXT: mov v0.s[0], v2.s[0] -; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: mov v1.s[3], wzr +; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -174,15 +178,17 @@ define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v5i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_0] ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: smull2 v3.4s, v1.8h, v2.8h -; CHECK-NEXT: mov v0.s[0], v3.s[0] -; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h -; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: and v2.16b, v2.16b, v3.16b +; CHECK-NEXT: mov v2.s[3], wzr +; CHECK-NEXT: smlal v2.4s, v0.4h, v1.4h +; CHECK-NEXT: addv s0, v2.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 ; CHECK-NEXT: ret @@ -200,19 +206,21 @@ define i32 @test_sdot_v5i8_double(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) { ; CHECK-LABEL: test_sdot_v5i8_double: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, .LCPI8_0 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI8_0] ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: smull2 v5.4s, v0.8h, v1.8h -; CHECK-NEXT: movi v6.2d, #0000000000000000 -; CHECK-NEXT: smull2 v7.4s, v2.8h, v3.8h -; CHECK-NEXT: mov v6.s[0], v5.s[0] -; CHECK-NEXT: mov v4.s[0], v7.s[0] -; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h +; CHECK-NEXT: smull2 v6.4s, v2.8h, v3.8h +; CHECK-NEXT: and v5.16b, v5.16b, v4.16b +; CHECK-NEXT: and v4.16b, v6.16b, v4.16b +; CHECK-NEXT: mov v5.s[3], wzr +; CHECK-NEXT: mov v4.s[3], wzr +; CHECK-NEXT: smlal v5.4s, v0.4h, v1.4h ; CHECK-NEXT: smlal v4.4s, v2.4h, v3.4h -; CHECK-NEXT: add v0.4s, v6.4s, v4.4s +; CHECK-NEXT: add v0.4s, v5.4s, v4.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -232,14 +240,16 @@ define i32 @test_sdot_v5i8_double_nomla(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) { ; CHECK-LABEL: test_sdot_v5i8_double_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: adrp x8, .LCPI9_0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0 -; CHECK-NEXT: sshll2 v5.4s, v2.8h, #0 -; CHECK-NEXT: mov v3.s[0], v4.s[0] -; CHECK-NEXT: mov v1.s[0], v5.s[0] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: sshll2 v4.4s, v2.8h, #0 +; CHECK-NEXT: and v3.16b, v3.16b, v1.16b +; CHECK-NEXT: and v1.16b, v4.16b, v1.16b +; 
CHECK-NEXT: mov v3.s[3], wzr +; CHECK-NEXT: mov v1.s[3], wzr ; CHECK-NEXT: saddw v0.4s, v3.4s, v0.4h ; CHECK-NEXT: saddw v1.4s, v1.4s, v2.4h ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s @@ -998,27 +1008,29 @@ define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q1, q4, [x1] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ushll2 v7.8h, v4.16b, #0 -; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: adrp x8, .LCPI31_0 ; CHECK-NEXT: ushll2 v6.8h, v3.16b, #0 ; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ushll2 v7.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: ushll v5.8h, v2.8b, #0 ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: umull2 v16.4s, v3.8h, v1.8h ; CHECK-NEXT: umull v2.4s, v7.4h, v2.4h -; CHECK-NEXT: ushll v7.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-NEXT: umull2 v16.4s, v7.8h, v3.8h -; CHECK-NEXT: mov v0.s[0], v2.s[0] -; CHECK-NEXT: umull v2.4s, v7.4h, v3.4h -; CHECK-NEXT: umlal2 v16.4s, v4.8h, v5.8h -; CHECK-NEXT: umlal v0.4s, v1.4h, v6.4h -; CHECK-NEXT: umlal v2.4s, v4.4h, v5.4h -; CHECK-NEXT: umlal2 v16.4s, v1.8h, v6.8h -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v0.4s, v0.4s, v16.4s +; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI31_0] +; CHECK-NEXT: umull v1.4s, v3.4h, v1.4h +; CHECK-NEXT: umlal2 v16.4s, v0.8h, v5.8h +; CHECK-NEXT: and v2.16b, v2.16b, v7.16b +; CHECK-NEXT: mov v2.s[3], wzr +; CHECK-NEXT: umlal v1.4s, v0.4h, v5.4h +; CHECK-NEXT: umlal2 v16.4s, v6.8h, v4.8h +; CHECK-NEXT: umlal v2.4s, v6.4h, v4.4h +; CHECK-NEXT: add v0.4s, v1.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -1037,20 +1049,22 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { ; CHECK-LABEL: test_udot_v25i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: adrp x8, .LCPI32_0 +; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI32_0] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v4.8h, v2.16b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-NEXT: uaddl2 v5.4s, v1.8h, v2.8h -; CHECK-NEXT: mov v0.s[0], v4.s[0] -; CHECK-NEXT: uaddl v1.4s, v1.4h, v2.4h -; CHECK-NEXT: uaddw2 v2.4s, v5.4s, v3.8h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v3.4h -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: and v2.16b, v4.16b, v2.16b +; CHECK-NEXT: uaddl2 v4.4s, v0.8h, v1.8h +; CHECK-NEXT: mov v2.s[3], wzr +; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: uaddw2 v1.4s, v4.4s, v3.8h +; CHECK-NEXT: uaddw v2.4s, v2.4s, v3.4h +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -1063,27 +1077,29 @@ define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q1, q4, [x1] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; 
CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: sshll2 v7.8h, v4.16b, #0 -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: adrp x8, .LCPI33_0 ; CHECK-NEXT: sshll2 v6.8h, v3.16b, #0 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: sshll2 v7.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v5.8h, v2.8b, #0 ; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: smull2 v16.4s, v3.8h, v1.8h ; CHECK-NEXT: smull v2.4s, v7.4h, v2.4h -; CHECK-NEXT: sshll v7.8h, v1.8b, #0 -; CHECK-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-NEXT: smull2 v16.4s, v7.8h, v3.8h -; CHECK-NEXT: mov v0.s[0], v2.s[0] -; CHECK-NEXT: smull v2.4s, v7.4h, v3.4h -; CHECK-NEXT: smlal2 v16.4s, v4.8h, v5.8h -; CHECK-NEXT: smlal v0.4s, v1.4h, v6.4h -; CHECK-NEXT: smlal v2.4s, v4.4h, v5.4h -; CHECK-NEXT: smlal2 v16.4s, v1.8h, v6.8h -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v0.4s, v0.4s, v16.4s +; CHECK-NEXT: ldr q7, [x8, :lo12:.LCPI33_0] +; CHECK-NEXT: smull v1.4s, v3.4h, v1.4h +; CHECK-NEXT: smlal2 v16.4s, v0.8h, v5.8h +; CHECK-NEXT: and v2.16b, v2.16b, v7.16b +; CHECK-NEXT: mov v2.s[3], wzr +; CHECK-NEXT: smlal v1.4s, v0.4h, v5.4h +; CHECK-NEXT: smlal2 v16.4s, v6.8h, v4.8h +; CHECK-NEXT: smlal v2.4s, v6.4h, v4.4h +; CHECK-NEXT: add v0.4s, v1.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -1109,216 +1125,218 @@ ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b0, [sp, #16] ; CHECK-NEXT: add x9, sp, #24 -; CHECK-NEXT: add x10, sp, #40 -; CHECK-NEXT: add x11, sp, #128 +; CHECK-NEXT: ldr b6, [sp, #280] +; CHECK-NEXT: add x10, sp, #288 ; CHECK-NEXT: ld1 { v2.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #96 ; CHECK-NEXT: ld1 { v0.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: ldr b17, [sp, #152] -; CHECK-NEXT: fmov s4, w0 -; CHECK-NEXT: ldr b6, [sp, #280] +; CHECK-NEXT: ld1 { v6.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ldr b3, [sp, #152] ; CHECK-NEXT: add x12, sp, #224 ; CHECK-NEXT: ld1 { v2.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #104 ; CHECK-NEXT: ld1 { v0.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: ld1 { v3.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #120 ; CHECK-NEXT: ldr b1, [sp, #216] -; CHECK-NEXT: mov v4.b[1], w1 -; CHECK-NEXT: ldr b3, [sp, #480] +; CHECK-NEXT: fmov s7, w0 ; CHECK-NEXT: ld1 { v2.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #120 -; CHECK-NEXT: ld1 { v0.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #48 +; CHECK-NEXT: add x8, sp, #296 +; CHECK-NEXT: ldr b4, [sp, #480] +; CHECK-NEXT: add x11, sp, #136 ; CHECK-NEXT: ld1 { v1.b }[1], [x12] -; CHECK-NEXT: mov v4.b[2], w2 -; CHECK-NEXT: ldr b18, [sp, #352] +; CHECK-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #40 ; CHECK-NEXT: ld1 { v2.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #304 +; CHECK-NEXT: mov v7.b[1], w1 +; CHECK-NEXT: ldr b16, [sp, #352] +; CHECK-NEXT: ld1 { v0.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #128 +; CHECK-NEXT: ld1 { v6.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #48 +; CHECK-NEXT: ld1 { v2.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #312 +; CHECK-NEXT: ldr b17, [sp, #552] +; CHECK-NEXT: ld1 { v0.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #56 -; CHECK-NEXT: ld1 { v0.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #288 +; CHECK-NEXT: ld1 { v6.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #64 +; CHECK-NEXT: ld1 { v2.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #320 +; CHECK-NEXT: 
mov v7.b[2], w2 ; CHECK-NEXT: ldr b20, [sp, #680] -; CHECK-NEXT: mov v4.b[3], w3 -; CHECK-NEXT: ldr b5, [sp, #144] -; CHECK-NEXT: ld1 { v2.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #160 ; CHECK-NEXT: ld1 { v0.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ld1 { v6.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #296 -; CHECK-NEXT: ld1 { v17.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #136 -; CHECK-NEXT: ld1 { v2.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #320 -; CHECK-NEXT: ld1 { v0.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #72 -; CHECK-NEXT: ld1 { v6.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #304 -; CHECK-NEXT: mov v4.b[4], w4 -; CHECK-NEXT: ld1 { v2.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #168 -; CHECK-NEXT: ld1 { v0.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #232 -; CHECK-NEXT: ld1 { v6.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #312 -; CHECK-NEXT: ld1 { v17.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #488 -; CHECK-NEXT: mov v4.b[5], w5 -; CHECK-NEXT: ld1 { v6.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #240 -; CHECK-NEXT: ld1 { v3.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #496 -; CHECK-NEXT: ld1 { v17.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #184 -; CHECK-NEXT: ld1 { v1.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #248 -; CHECK-NEXT: mov v4.b[6], w6 -; CHECK-NEXT: ld1 { v6.b }[5], [x11] +; CHECK-NEXT: add x9, sp, #168 +; CHECK-NEXT: ld1 { v6.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #232 +; CHECK-NEXT: ld1 { v2.b }[7], [x11] +; CHECK-NEXT: add x11, sp, #72 ; CHECK-NEXT: ld1 { v3.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #504 -; CHECK-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #192 -; CHECK-NEXT: ld1 { v1.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #256 -; CHECK-NEXT: add x11, sp, #328 +; CHECK-NEXT: add x9, sp, #176 +; CHECK-NEXT: ld1 { v0.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #328 +; CHECK-NEXT: ld1 { v1.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #240 +; CHECK-NEXT: mov v7.b[3], w3 +; CHECK-NEXT: ldr b19, [sp, #344] ; CHECK-NEXT: ld1 { v3.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #512 -; CHECK-NEXT: ld1 { v17.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #200 -; CHECK-NEXT: ld1 { v1.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #264 -; CHECK-NEXT: mov v4.b[7], w7 -; CHECK-NEXT: ld1 { v6.b }[6], [x11] +; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: ld1 { v6.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #336 +; CHECK-NEXT: ld1 { v1.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #248 +; CHECK-NEXT: mov v7.b[4], w4 +; CHECK-NEXT: ld1 { v0.b }[7], [x11] ; CHECK-NEXT: ld1 { v3.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #520 -; CHECK-NEXT: ld1 { v17.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #208 -; CHECK-NEXT: ld1 { v1.b }[6], [x10] -; CHECK-NEXT: add x11, sp, #336 -; CHECK-NEXT: add x10, sp, #272 +; CHECK-NEXT: add x9, sp, #192 +; CHECK-NEXT: ld1 { v6.b }[7], [x10] +; CHECK-NEXT: add x10, sp, #488 +; CHECK-NEXT: ld1 { v1.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #256 +; CHECK-NEXT: mov v7.b[5], w5 +; CHECK-NEXT: add x11, sp, #264 ; CHECK-NEXT: ld1 { v3.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #536 -; CHECK-NEXT: ld1 { v17.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #528 -; CHECK-NEXT: sshll v19.8h, v4.8b, #0 -; CHECK-NEXT: ldr b4, [sp, #416] -; CHECK-NEXT: ld1 { v6.b }[7], [x11] -; CHECK-NEXT: add x11, sp, #688 -; CHECK-NEXT: ld1 { v3.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #424 -; CHECK-NEXT: ld1 { v1.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #360 -; CHECK-NEXT: sshll v7.8h, v2.8b, #0 -; CHECK-NEXT: ldr b2, [sp, #344] -; CHECK-NEXT: ld1 { v4.b }[1], [x8] -; 
CHECK-NEXT: add x8, sp, #432 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: ld1 { v18.b }[1], [x10] -; CHECK-NEXT: sshll v16.8h, v6.8b, #0 -; CHECK-NEXT: ld1 { v3.b }[7], [x9] -; CHECK-NEXT: sshll v6.8h, v2.8b, #0 -; CHECK-NEXT: add x9, sp, #560 -; CHECK-NEXT: smull v2.4s, v19.4h, v17.4h -; CHECK-NEXT: ld1 { v4.b }[2], [x8] -; CHECK-NEXT: smull2 v17.4s, v19.8h, v17.8h -; CHECK-NEXT: ldr b19, [sp, #552] -; CHECK-NEXT: add x8, sp, #368 -; CHECK-NEXT: add x10, sp, #440 -; CHECK-NEXT: ld1 { v20.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #696 -; CHECK-NEXT: ld1 { v19.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #376 -; CHECK-NEXT: ld1 { v18.b }[2], [x8] +; CHECK-NEXT: add x9, sp, #200 +; CHECK-NEXT: ld1 { v4.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #496 +; CHECK-NEXT: ld1 { v1.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #208 +; CHECK-NEXT: mov v7.b[6], w6 +; CHECK-NEXT: ldr b5, [sp, #144] +; CHECK-NEXT: ld1 { v3.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #360 +; CHECK-NEXT: ld1 { v4.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #560 +; CHECK-NEXT: ld1 { v1.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #528 +; CHECK-NEXT: ld1 { v16.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #368 +; CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #504 +; CHECK-NEXT: ld1 { v17.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #512 +; CHECK-NEXT: mov v7.b[7], w7 +; CHECK-NEXT: ld1 { v4.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #568 -; CHECK-NEXT: ld1 { v4.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #448 -; CHECK-NEXT: ld1 { v20.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #704 -; CHECK-NEXT: ld1 { v19.b }[2], [x8] +; CHECK-NEXT: ld1 { v16.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #376 +; CHECK-NEXT: sshll v18.8h, v3.8b, #0 +; CHECK-NEXT: ld1 { v17.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #576 -; CHECK-NEXT: ld1 { v18.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #384 -; CHECK-NEXT: smlal v2.4s, v7.4h, v16.4h ; CHECK-NEXT: ld1 { v4.b }[4], [x10] -; CHECK-NEXT: smlal2 v17.4s, v7.8h, v16.8h -; CHECK-NEXT: ldr b7, [sp, #616] -; CHECK-NEXT: ld1 { v19.b }[3], [x8] +; CHECK-NEXT: add x10, sp, #520 +; CHECK-NEXT: ld1 { v16.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #384 +; CHECK-NEXT: sshll v7.8h, v7.8b, #0 +; CHECK-NEXT: ld1 { v17.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #584 -; CHECK-NEXT: ld1 { v18.b }[4], [x9] +; CHECK-NEXT: ld1 { v4.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #688 +; CHECK-NEXT: ld1 { v16.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #392 -; CHECK-NEXT: add x10, sp, #456 -; CHECK-NEXT: ld1 { v20.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #400 -; CHECK-NEXT: ld1 { v19.b }[4], [x8] +; CHECK-NEXT: smull2 v3.4s, v7.8h, v18.8h +; CHECK-NEXT: ld1 { v17.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #592 -; CHECK-NEXT: ld1 { v18.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #624 -; CHECK-NEXT: ld1 { v4.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #632 -; CHECK-NEXT: ld1 { v19.b }[5], [x8] +; CHECK-NEXT: ld1 { v20.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #696 +; CHECK-NEXT: ld1 { v16.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #400 +; CHECK-NEXT: smull v7.4s, v7.4h, v18.4h +; CHECK-NEXT: ld1 { v4.b }[6], [x11] +; CHECK-NEXT: ld1 { v17.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #600 +; CHECK-NEXT: ld1 { v20.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #704 +; CHECK-NEXT: ld1 { v16.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #408 +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEXT: ldr b18, [sp, #416] +; CHECK-NEXT: ld1 { v17.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #608 +; 
CHECK-NEXT: ld1 { v20.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #712 +; CHECK-NEXT: ld1 { v16.b }[7], [x9] +; CHECK-NEXT: add x11, sp, #424 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: add x9, sp, #720 +; CHECK-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #432 ; CHECK-NEXT: ld1 { v20.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #720 -; CHECK-NEXT: ld1 { v18.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #408 -; CHECK-NEXT: ld1 { v7.b }[2], [x9] +; CHECK-NEXT: add x10, sp, #624 +; CHECK-NEXT: smlal v7.4s, v2.4h, v6.4h +; CHECK-NEXT: ld1 { v18.b }[1], [x11] +; CHECK-NEXT: smlal2 v3.4s, v2.8h, v6.8h +; CHECK-NEXT: add x11, sp, #536 +; CHECK-NEXT: sshll v2.8h, v16.8b, #0 +; CHECK-NEXT: sshll v6.8h, v17.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #728 +; CHECK-NEXT: ld1 { v18.b }[2], [x8] +; CHECK-NEXT: smull2 v16.4s, v2.8h, v6.8h +; CHECK-NEXT: add x8, sp, #440 +; CHECK-NEXT: smull v2.4s, v2.4h, v6.4h +; CHECK-NEXT: ldr b6, [sp, #616] +; CHECK-NEXT: ld1 { v20.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #736 +; CHECK-NEXT: ld1 { v18.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #448 +; CHECK-NEXT: ld1 { v6.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #632 +; CHECK-NEXT: ld1 { v4.b }[7], [x11] +; CHECK-NEXT: add x11, sp, #272 +; CHECK-NEXT: ld1 { v20.b }[7], [x9] ; CHECK-NEXT: add x9, sp, #640 -; CHECK-NEXT: ld1 { v19.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #608 -; CHECK-NEXT: ld1 { v20.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #728 -; CHECK-NEXT: ld1 { v18.b }[7], [x11] -; CHECK-NEXT: add x11, sp, #464 -; CHECK-NEXT: ld1 { v7.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #664 -; CHECK-NEXT: ld1 { v19.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #648 -; CHECK-NEXT: ld1 { v20.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #736 -; CHECK-NEXT: sshll v16.8h, v18.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[6], [x11] -; CHECK-NEXT: ld1 { v7.b }[4], [x8] +; CHECK-NEXT: ld1 { v18.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #456 +; CHECK-NEXT: ld1 { v6.b }[2], [x10] +; CHECK-NEXT: adrp x10, .LCPI34_0 +; CHECK-NEXT: sshll v17.8h, v19.8b, #0 +; CHECK-NEXT: ld1 { v1.b }[7], [x11] +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v18.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #656 -; CHECK-NEXT: sshll v18.8h, v19.8b, #0 -; CHECK-NEXT: ld1 { v20.b }[7], [x10] -; CHECK-NEXT: smull v19.4s, v16.4h, v18.4h -; CHECK-NEXT: ld1 { v7.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #472 -; CHECK-NEXT: smull2 v16.4s, v16.8h, v18.8h -; CHECK-NEXT: ldr b18, [sp, #544] -; CHECK-NEXT: smull v5.4s, v5.4h, v6.4h -; CHECK-NEXT: ldr b6, [sp, #744] -; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[7], [x8] -; CHECK-NEXT: sshll v20.8h, v20.8b, #0 -; CHECK-NEXT: ld1 { v7.b }[6], [x9] -; CHECK-NEXT: sshll v18.8h, v18.8b, #0 -; CHECK-NEXT: add x9, sp, #672 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: smlal v19.4s, v3.4h, v20.4h -; CHECK-NEXT: smlal2 v16.4s, v3.8h, v20.8h -; CHECK-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-NEXT: movi v3.2d, #0000000000000000 -; CHECK-NEXT: smull v6.4s, v18.4h, v6.4h -; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v6.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #648 +; CHECK-NEXT: sshll v19.8h, v20.8b, #0 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: smlal v2.4s, v4.4h, v19.4h +; CHECK-NEXT: ld1 { v6.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #464 +; CHECK-NEXT: smlal2 v16.4s, v4.8h, v19.8h +; CHECK-NEXT: ldr b4, [sp, #544] +; CHECK-NEXT: ldr b19, [sp, #744] +; CHECK-NEXT: ld1 { v18.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #472 +; CHECK-NEXT: ld1 { v6.b }[5], [x8] +; 
CHECK-NEXT: add x8, sp, #664 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: sshll v19.8h, v19.8b, #0 +; CHECK-NEXT: smull v5.4s, v5.4h, v17.4h +; CHECK-NEXT: ldr q17, [x10, :lo12:.LCPI34_0] +; CHECK-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #672 +; CHECK-NEXT: smull v4.4s, v4.4h, v19.4h +; CHECK-NEXT: ld1 { v18.b }[7], [x9] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: mov v3.s[0], v5.s[0] +; CHECK-NEXT: and v5.16b, v5.16b, v17.16b +; CHECK-NEXT: ld1 { v6.b }[7], [x8] +; CHECK-NEXT: and v4.16b, v4.16b, v17.16b +; CHECK-NEXT: mov v5.s[3], wzr +; CHECK-NEXT: mov v4.s[3], wzr ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: mov v18.s[0], v6.s[0] -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-NEXT: sshll v5.8h, v7.8b, #0 -; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h -; CHECK-NEXT: smlal v18.4s, v4.4h, v5.4h -; CHECK-NEXT: smlal2 v17.4s, v0.8h, v1.8h -; CHECK-NEXT: smlal2 v16.4s, v4.8h, v5.8h -; CHECK-NEXT: add v0.4s, v2.4s, v3.4s -; CHECK-NEXT: add v1.4s, v19.4s, v18.4s -; CHECK-NEXT: add v0.4s, v0.4s, v17.4s +; CHECK-NEXT: sshll v17.8h, v18.8b, #0 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: smlal v5.4s, v0.4h, v1.4h +; CHECK-NEXT: smlal v4.4s, v17.4h, v6.4h +; CHECK-NEXT: smlal2 v3.4s, v0.8h, v1.8h +; CHECK-NEXT: smlal2 v16.4s, v17.8h, v6.8h +; CHECK-NEXT: add v0.4s, v7.4s, v5.4s +; CHECK-NEXT: add v1.4s, v2.4s, v4.4s +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s ; CHECK-NEXT: add v1.4s, v1.4s, v16.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s @@ -1348,112 +1366,114 @@ ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b2, [sp, #16] ; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: add x10, sp, #112 ; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: ldr b3, [sp, #480] ; CHECK-NEXT: ld1 { v0.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #96 ; CHECK-NEXT: ld1 { v2.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: mov v1.b[1], w1 -; CHECK-NEXT: add x10, sp, #488 -; CHECK-NEXT: add x11, sp, #496 -; CHECK-NEXT: ldr b4, [sp, #352] +; CHECK-NEXT: add x11, sp, #120 +; CHECK-NEXT: ldr b3, [sp, #480] +; CHECK-NEXT: add x12, sp, #488 +; CHECK-NEXT: ldr b5, [sp, #352] ; CHECK-NEXT: ld1 { v0.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #104 ; CHECK-NEXT: ld1 { v2.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #40 -; CHECK-NEXT: ld1 { v3.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #48 -; CHECK-NEXT: mov v1.b[2], w2 +; CHECK-NEXT: mov v1.b[1], w1 +; CHECK-NEXT: ld1 { v3.b }[1], [x12] +; CHECK-NEXT: add x12, sp, #496 ; CHECK-NEXT: ldr b6, [sp, #416] ; CHECK-NEXT: ld1 { v0.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #112 +; CHECK-NEXT: add x8, sp, #48 ; CHECK-NEXT: ld1 { v2.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: ld1 { v3.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #56 +; CHECK-NEXT: mov v1.b[2], w2 +; CHECK-NEXT: ld1 { v3.b }[2], [x12] +; CHECK-NEXT: add x12, sp, #424 +; CHECK-NEXT: ldr b4, [sp, #144] +; CHECK-NEXT: ld1 { v0.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #56 +; CHECK-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #136 ; CHECK-NEXT: mov v1.b[3], w3 -; CHECK-NEXT: add x12, sp, #504 -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #120 -; CHECK-NEXT: ld1 { v2.b }[4], [x10] +; CHECK-NEXT: ld1 { v6.b }[1], [x12] +; CHECK-NEXT: ldr b17, [sp, #544] +; CHECK-NEXT: ld1 { v0.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #72 +; CHECK-NEXT: ld1 { v2.b }[5], [x10] ; CHECK-NEXT: add x10, sp, #64 -; CHECK-NEXT: ldr b5, [sp, #144] ; CHECK-NEXT: mov v1.b[4], w4 -; CHECK-NEXT: ld1 { v3.b }[3], [x12] -; CHECK-NEXT: ld1 { v0.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #136 
-; CHECK-NEXT: ld1 { v2.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #360 -; CHECK-NEXT: add x12, sp, #72 -; CHECK-NEXT: mov v1.b[5], w5 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 ; CHECK-NEXT: ld1 { v0.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #424 -; CHECK-NEXT: ld1 { v4.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #512 +; CHECK-NEXT: add x9, sp, #504 ; CHECK-NEXT: ld1 { v2.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #368 -; CHECK-NEXT: ld1 { v6.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #376 +; CHECK-NEXT: add x10, sp, #360 +; CHECK-NEXT: mov v1.b[5], w5 +; CHECK-NEXT: ld1 { v3.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #432 +; CHECK-NEXT: ld1 { v5.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #512 ; CHECK-NEXT: ld1 { v0.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #432 -; CHECK-NEXT: ld1 { v4.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #520 +; CHECK-NEXT: add x8, sp, #368 +; CHECK-NEXT: ld1 { v6.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #440 ; CHECK-NEXT: mov v1.b[6], w6 -; CHECK-NEXT: ld1 { v2.b }[7], [x12] -; CHECK-NEXT: ld1 { v6.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #440 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: ld1 { v3.b }[4], [x11] -; CHECK-NEXT: ld1 { v4.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #384 -; CHECK-NEXT: movi v7.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v6.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #448 +; CHECK-NEXT: ld1 { v3.b }[4], [x10] +; CHECK-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #376 +; CHECK-NEXT: add x10, sp, #520 +; CHECK-NEXT: ld1 { v2.b }[7], [x11] +; CHECK-NEXT: ld1 { v6.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #448 ; CHECK-NEXT: mov v1.b[7], w7 +; CHECK-NEXT: adrp x11, .LCPI35_0 +; CHECK-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #384 ; CHECK-NEXT: ld1 { v3.b }[5], [x10] -; CHECK-NEXT: sshll v5.4s, v5.4h, #0 -; CHECK-NEXT: ld1 { v4.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #392 ; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: ld1 { v6.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #456 -; CHECK-NEXT: mov v7.s[0], v5.s[0] +; CHECK-NEXT: ld1 { v6.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #456 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[5], [x9] -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: add x9, sp, #400 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: ld1 { v6.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #464 +; CHECK-NEXT: ldr q7, [x11, :lo12:.LCPI35_0] +; CHECK-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #392 ; CHECK-NEXT: ld1 { v3.b }[6], [x10] -; CHECK-NEXT: saddw v5.4s, v7.4s, v2.4h -; CHECK-NEXT: ld1 { v4.b }[6], [x9] -; CHECK-NEXT: saddl v7.4s, v1.4h, v0.4h ; CHECK-NEXT: add x10, sp, #536 -; CHECK-NEXT: add x9, sp, #408 -; CHECK-NEXT: ld1 { v6.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #472 -; CHECK-NEXT: add v5.4s, v7.4s, v5.4s -; CHECK-NEXT: ldr b7, [sp, #544] -; CHECK-NEXT: saddl2 v0.4s, v1.8h, v0.8h +; CHECK-NEXT: ld1 { v6.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #464 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #400 +; CHECK-NEXT: saddl v16.4s, v1.4h, v0.4h ; CHECK-NEXT: ld1 { v3.b }[7], [x10] -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v4.b }[7], [x9] -; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v6.b }[7], [x8] -; CHECK-NEXT: saddw2 v0.4s, v0.4s, v2.8h -; CHECK-NEXT: sshll v7.4s, v7.4h, #0 +; CHECK-NEXT: ld1 { v6.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #472 +; CHECK-NEXT: saddl2 v0.4s, v1.8h, v0.8h +; CHECK-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #408 +; CHECK-NEXT: sshll v1.8h, v17.8b, #0 +; CHECK-NEXT: sshll 
v4.4s, v4.4h, #0 +; CHECK-NEXT: ld1 { v6.b }[7], [x9] +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-NEXT: and v4.16b, v4.16b, v7.16b +; CHECK-NEXT: and v1.16b, v1.16b, v7.16b +; CHECK-NEXT: mov v4.s[3], wzr +; CHECK-NEXT: mov v1.s[3], wzr ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: mov v1.s[0], v7.s[0] -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: saddl v7.4s, v4.4h, v3.4h -; CHECK-NEXT: saddl2 v3.4s, v4.8h, v3.8h +; CHECK-NEXT: saddl v7.4s, v5.4h, v3.4h +; CHECK-NEXT: saddl2 v3.4s, v5.8h, v3.8h +; CHECK-NEXT: saddw v4.4s, v4.4s, v2.4h ; CHECK-NEXT: saddw v1.4s, v1.4s, v6.4h -; CHECK-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-NEXT: saddw2 v0.4s, v0.4s, v2.8h ; CHECK-NEXT: saddw2 v2.4s, v3.4s, v6.8h +; CHECK-NEXT: add v4.4s, v16.4s, v4.4s ; CHECK-NEXT: add v1.4s, v7.4s, v1.4s +; CHECK-NEXT: add v0.4s, v4.4s, v0.4s ; CHECK-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s @@ -1587,33 +1607,35 @@ define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b1, [x1, #32] -; CHECK-NEXT: ldr b2, [x0, #32] -; CHECK-NEXT: ldp q3, q4, [x0] +; CHECK-NEXT: ldr b0, [x1, #32] +; CHECK-NEXT: adrp x8, .LCPI41_0 +; CHECK-NEXT: ldr b1, [x0, #32] +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h -; CHECK-NEXT: ushll v16.8h, v3.8b, #0 -; CHECK-NEXT: ldp q5, q6, [x1] -; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0 -; CHECK-NEXT: mov v0.s[0], v1.s[0] -; CHECK-NEXT: ushll2 v7.8h, v4.16b, #0 -; CHECK-NEXT: ushll2 v2.8h, v5.16b, #0 -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 -; CHECK-NEXT: umull2 v18.4s, v2.8h, v3.8h -; CHECK-NEXT: umull2 v1.4s, v5.8h, v16.8h +; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ushll v7.8h, v2.8b, #0 +; CHECK-NEXT: ldp q4, q5, [x1] +; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: ushll2 v6.8h, v3.16b, #0 +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: ushll2 v1.8h, v4.16b, #0 ; CHECK-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-NEXT: ushll2 v17.8h, v6.16b, #0 -; CHECK-NEXT: ushll v6.8h, v6.8b, #0 -; CHECK-NEXT: umull v2.4s, v2.4h, v3.4h -; CHECK-NEXT: umlal2 v18.4s, v17.8h, v7.8h -; CHECK-NEXT: umlal2 v1.4s, v6.8h, v4.8h -; CHECK-NEXT: umlal v0.4s, v5.4h, v16.4h -; CHECK-NEXT: umlal v2.4s, v17.4h, v7.4h -; CHECK-NEXT: add v1.4s, v1.4s, v18.4s -; CHECK-NEXT: umlal v0.4s, v6.4h, v4.4h -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI41_0] +; CHECK-NEXT: umull2 v18.4s, v1.8h, v2.8h +; CHECK-NEXT: ushll2 v16.8h, v5.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v17.16b +; CHECK-NEXT: umull2 v17.4s, v4.8h, v7.8h +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: umlal2 v18.4s, v16.8h, v6.8h +; CHECK-NEXT: umlal2 v17.4s, v5.8h, v3.8h +; CHECK-NEXT: umlal v0.4s, v4.4h, v7.4h +; CHECK-NEXT: umlal v1.4s, v16.4h, v6.4h +; CHECK-NEXT: add v2.4s, v17.4s, v18.4s +; CHECK-NEXT: umlal v0.4s, v5.4h, v3.4h +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1633,16 +1655,18 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture 
readonly %a1) { ; CHECK-LABEL: test_udot_v33i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b1, [x0, #32] -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll2 v5.8h, v3.16b, #0 -; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: ldr b0, [x0, #32] +; CHECK-NEXT: adrp x8, .LCPI42_0 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll2 v5.8h, v1.16b, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI42_0] ; CHECK-NEXT: ushll v4.8h, v2.8b, #0 ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-NEXT: ushll v1.8h, v3.8b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: uaddl2 v3.4s, v5.8h, v2.8h ; CHECK-NEXT: uaddl2 v6.4s, v1.8h, v4.8h ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h @@ -1663,33 +1687,35 @@ define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b1, [x1, #32] -; CHECK-NEXT: ldr b2, [x0, #32] -; CHECK-NEXT: ldp q3, q4, [x0] +; CHECK-NEXT: ldr b0, [x1, #32] +; CHECK-NEXT: adrp x8, .LCPI43_0 +; CHECK-NEXT: ldr b1, [x0, #32] +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h -; CHECK-NEXT: sshll v16.8h, v3.8b, #0 -; CHECK-NEXT: ldp q5, q6, [x1] -; CHECK-NEXT: sshll2 v3.8h, v3.16b, #0 -; CHECK-NEXT: mov v0.s[0], v1.s[0] -; CHECK-NEXT: sshll2 v7.8h, v4.16b, #0 -; CHECK-NEXT: sshll2 v2.8h, v5.16b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: smull2 v18.4s, v2.8h, v3.8h -; CHECK-NEXT: smull2 v1.4s, v5.8h, v16.8h +; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-NEXT: sshll v7.8h, v2.8b, #0 +; CHECK-NEXT: ldp q4, q5, [x1] +; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: sshll2 v6.8h, v3.16b, #0 +; CHECK-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-NEXT: sshll2 v1.8h, v4.16b, #0 ; CHECK-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-NEXT: sshll2 v17.8h, v6.16b, #0 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: smull v2.4s, v2.4h, v3.4h -; CHECK-NEXT: smlal2 v18.4s, v17.8h, v7.8h -; CHECK-NEXT: smlal2 v1.4s, v6.8h, v4.8h -; CHECK-NEXT: smlal v0.4s, v5.4h, v16.4h -; CHECK-NEXT: smlal v2.4s, v17.4h, v7.4h -; CHECK-NEXT: add v1.4s, v1.4s, v18.4s -; CHECK-NEXT: smlal v0.4s, v6.4h, v4.4h -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI43_0] +; CHECK-NEXT: smull2 v18.4s, v1.8h, v2.8h +; CHECK-NEXT: sshll2 v16.8h, v5.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v17.16b +; CHECK-NEXT: smull2 v17.4s, v4.8h, v7.8h +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: smlal2 v18.4s, v16.8h, v6.8h +; CHECK-NEXT: smlal2 v17.4s, v5.8h, v3.8h +; CHECK-NEXT: smlal v0.4s, v4.4h, v7.4h +; CHECK-NEXT: smlal v1.4s, v16.4h, v6.4h +; CHECK-NEXT: add v2.4s, v17.4s, v18.4s +; CHECK-NEXT: smlal v0.4s, v5.4h, v3.4h +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1712,291 +1738,293 @@ ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldr b0, [sp, #80] +; CHECK-NEXT: ldr b3, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 -; CHECK-NEXT: ldr b1, [sp, #144] -; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: ldr b3, [sp, #16] -; CHECK-NEXT: add x10, sp, #104 -; CHECK-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #152 -; CHECK-NEXT: ldr b4, [sp, #344] -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: ldr b6, [sp, #216] -; CHECK-NEXT: add x11, sp, #136 -; CHECK-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #160 -; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: ldr b0, [sp, #144] +; CHECK-NEXT: add x9, sp, #152 +; CHECK-NEXT: ldr b1, [sp, #16] +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ld1 { v3.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #96 +; CHECK-NEXT: ld1 { v0.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #24 -; CHECK-NEXT: mov v2.b[1], w1 -; CHECK-NEXT: ldr b17, [sp, #280] -; CHECK-NEXT: ldr b7, [sp, #408] -; CHECK-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #168 -; CHECK-NEXT: ld1 { v3.b }[1], [x9] +; CHECK-NEXT: ldr b2, [sp, #344] +; CHECK-NEXT: fmov s6, w0 +; CHECK-NEXT: ldr b16, [sp, #280] +; CHECK-NEXT: add x11, sp, #200 +; CHECK-NEXT: ld1 { v3.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #104 +; CHECK-NEXT: ld1 { v1.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: ld1 { v0.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #112 -; CHECK-NEXT: mov v2.b[2], w2 -; CHECK-NEXT: ldr b5, [sp, #208] -; CHECK-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: ld1 { v3.b }[2], [x9] +; CHECK-NEXT: ld1 { v0.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #168 +; CHECK-NEXT: ldr b7, [sp, #408] +; CHECK-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #112 +; CHECK-NEXT: ld1 { v1.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #40 +; CHECK-NEXT: ld1 { v0.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #176 +; CHECK-NEXT: mov v6.b[1], w1 +; CHECK-NEXT: ldr b4, [sp, #208] +; CHECK-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #120 +; CHECK-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #48 ; CHECK-NEXT: ld1 { v0.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: mov v2.b[3], w3 -; CHECK-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #184 -; CHECK-NEXT: ld1 { v3.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: ld1 { v0.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #48 -; CHECK-NEXT: mov v2.b[4], w4 -; CHECK-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #192 -; CHECK-NEXT: ld1 { v3.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #360 -; CHECK-NEXT: ld1 { v0.b }[6], [x9] +; CHECK-NEXT: add x10, sp, #184 +; CHECK-NEXT: mov v6.b[2], w2 +; CHECK-NEXT: ld1 { v3.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #128 +; CHECK-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #56 -; CHECK-NEXT: mov v2.b[5], w5 -; CHECK-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #200 -; CHECK-NEXT: ld1 { v3.b }[5], [x9] +; CHECK-NEXT: ld1 { v0.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #192 +; CHECK-NEXT: mov v6.b[3], w3 +; CHECK-NEXT: ld1 { v3.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #136 +; CHECK-NEXT: ld1 { v1.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #64 +; CHECK-NEXT: ld1 { v0.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #352 +; CHECK-NEXT: mov v6.b[4], w4 +; CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #72 +; CHECK-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #360 +; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #288 ; CHECK-NEXT: ld1 { v0.b }[7], 
[x11] -; CHECK-NEXT: add x11, sp, #232 -; CHECK-NEXT: mov v2.b[6], w6 +; CHECK-NEXT: add x11, sp, #368 +; CHECK-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-NEXT: ldr b3, [sp, #216] ; CHECK-NEXT: ld1 { v1.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #352 -; CHECK-NEXT: ld1 { v3.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #72 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #224 -; CHECK-NEXT: mov v2.b[7], w7 -; CHECK-NEXT: ld1 { v3.b }[7], [x9] +; CHECK-NEXT: ld1 { v2.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #416 -; CHECK-NEXT: ld1 { v6.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #288 -; CHECK-NEXT: ld1 { v4.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #368 +; CHECK-NEXT: ld1 { v16.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #296 +; CHECK-NEXT: ld1 { v3.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #232 ; CHECK-NEXT: ld1 { v7.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #424 -; CHECK-NEXT: ld1 { v17.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #296 -; CHECK-NEXT: ld1 { v6.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #240 -; CHECK-NEXT: ld1 { v4.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #376 +; CHECK-NEXT: ld1 { v2.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #376 +; CHECK-NEXT: ld1 { v16.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #304 +; CHECK-NEXT: ld1 { v3.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #240 ; CHECK-NEXT: ld1 { v7.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #432 -; CHECK-NEXT: ld1 { v17.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #304 -; CHECK-NEXT: ld1 { v6.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #248 -; CHECK-NEXT: ld1 { v4.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #384 +; CHECK-NEXT: ld1 { v2.b }[4], [x11] +; CHECK-NEXT: add x11, sp, #384 +; CHECK-NEXT: ld1 { v16.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #312 +; CHECK-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #248 ; CHECK-NEXT: ld1 { v7.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #440 -; CHECK-NEXT: ld1 { v17.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ld1 { v6.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #256 -; CHECK-NEXT: ld1 { v4.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #392 +; CHECK-NEXT: mov v6.b[5], w5 +; CHECK-NEXT: ld1 { v2.b }[5], [x11] +; CHECK-NEXT: ld1 { v16.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #320 +; CHECK-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #256 ; CHECK-NEXT: ld1 { v7.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #448 -; CHECK-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #320 -; CHECK-NEXT: ld1 { v6.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #264 -; CHECK-NEXT: sshll v19.8h, v2.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[6], [x10] +; CHECK-NEXT: sshll v17.8h, v0.8b, #0 +; CHECK-NEXT: ldr b0, [sp, #472] +; CHECK-NEXT: ld1 { v16.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #328 +; CHECK-NEXT: ld1 { v3.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #264 ; CHECK-NEXT: ld1 { v7.b }[5], [x9] ; CHECK-NEXT: add x9, sp, #456 -; CHECK-NEXT: ld1 { v17.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #328 -; CHECK-NEXT: ld1 { v6.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #272 -; CHECK-NEXT: sshll v2.8h, v1.8b, #0 -; CHECK-NEXT: ldr b1, [sp, #608] +; CHECK-NEXT: add x11, sp, #392 +; CHECK-NEXT: ld1 { v16.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #680 +; CHECK-NEXT: ld1 { v3.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #272 ; CHECK-NEXT: ld1 { v7.b }[6], [x9] ; CHECK-NEXT: add x9, sp, #464 -; CHECK-NEXT: ld1 { v17.b }[6], [x8] +; CHECK-NEXT: sshll v20.8h, v1.8b, #0 +; CHECK-NEXT: ld1 { v2.b }[6], [x11] +; CHECK-NEXT: sshll v1.8h, v4.8b, #0 +; CHECK-NEXT: add x11, sp, #400 +; CHECK-NEXT: ld1 { v3.b }[7], [x8] ; 
CHECK-NEXT: add x8, sp, #336 -; CHECK-NEXT: ld1 { v6.b }[7], [x11] -; CHECK-NEXT: add x10, sp, #400 -; CHECK-NEXT: sshll v16.8h, v3.8b, #0 -; CHECK-NEXT: add x11, sp, #648 ; CHECK-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #624 -; CHECK-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-NEXT: adrp x9, .LCPI44_0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ld1 { v2.b }[7], [x11] +; CHECK-NEXT: ld1 { v16.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #616 -; CHECK-NEXT: sshll v21.8h, v6.8b, #0 -; CHECK-NEXT: ldr b6, [sp, #472] -; CHECK-NEXT: ld1 { v4.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #552 -; CHECK-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #488 -; CHECK-NEXT: sshll v18.8h, v17.8b, #0 -; CHECK-NEXT: ldr b17, [sp, #480] +; CHECK-NEXT: mov v6.b[6], w6 +; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI44_0] +; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h +; CHECK-NEXT: add x9, sp, #488 +; CHECK-NEXT: sshll v21.8h, v3.8b, #0 +; CHECK-NEXT: add x11, sp, #632 +; CHECK-NEXT: sshll v3.8h, v16.8b, #0 +; CHECK-NEXT: ldr b16, [sp, #544] +; CHECK-NEXT: mov v6.b[7], w7 +; CHECK-NEXT: smull v1.4s, v20.4h, v3.4h +; CHECK-NEXT: smull2 v3.4s, v20.8h, v3.8h +; CHECK-NEXT: ldr b20, [sp, #608] +; CHECK-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-NEXT: sshll v19.8h, v7.8b, #0 +; CHECK-NEXT: ldr b7, [sp, #672] +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: ld1 { v20.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #624 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: sshll v3.8h, v4.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #496 -; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #632 -; CHECK-NEXT: sshll v4.8h, v7.8b, #0 -; CHECK-NEXT: smull v20.4s, v5.4h, v6.4h -; CHECK-NEXT: movi v7.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v17.b }[2], [x8] -; CHECK-NEXT: smull v5.4s, v16.4h, v18.4h -; CHECK-NEXT: ld1 { v1.b }[3], [x9] -; CHECK-NEXT: smull2 v16.4s, v16.8h, v18.8h -; CHECK-NEXT: ldr b18, [sp, #544] -; CHECK-NEXT: add x8, sp, #504 -; CHECK-NEXT: add x9, sp, #640 -; CHECK-NEXT: mov v7.s[0], v20.s[0] -; CHECK-NEXT: ldr b20, [sp, #672] -; CHECK-NEXT: ld1 { v18.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #680 -; CHECK-NEXT: ld1 { v17.b }[3], [x8] +; CHECK-NEXT: ld1 { v7.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #552 +; CHECK-NEXT: sshll v18.8h, v2.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[2], [x8] +; CHECK-NEXT: smull2 v2.4s, v6.8h, v21.8h +; CHECK-NEXT: add x8, sp, #688 +; CHECK-NEXT: smlal v0.4s, v6.4h, v21.4h +; CHECK-NEXT: ldr b6, [sp, #480] +; CHECK-NEXT: ld1 { v16.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #696 +; CHECK-NEXT: ld1 { v7.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #560 -; CHECK-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-NEXT: ld1 { v6.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #496 +; CHECK-NEXT: ld1 { v20.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #640 +; CHECK-NEXT: ld1 { v16.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #568 +; CHECK-NEXT: ld1 { v7.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #704 +; CHECK-NEXT: ld1 { v6.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #504 +; CHECK-NEXT: ld1 { v20.b }[4], [x11] +; CHECK-NEXT: add x11, sp, #648 +; CHECK-NEXT: ld1 { v16.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #576 +; CHECK-NEXT: ld1 { v7.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #712 +; CHECK-NEXT: ld1 { v6.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #512 -; CHECK-NEXT: ld1 { v20.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #520 -; CHECK-NEXT: ld1 { v18.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #688 -; CHECK-NEXT: ld1 { v17.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #568 -; CHECK-NEXT: smull2 v6.4s, v19.8h, v21.8h 
-; CHECK-NEXT: ld1 { v1.b }[5], [x11] -; CHECK-NEXT: ld1 { v20.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #696 -; CHECK-NEXT: ld1 { v18.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #576 -; CHECK-NEXT: ld1 { v17.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: smlal v7.4s, v19.4h, v21.4h -; CHECK-NEXT: ldr b19, [sp, #872] -; CHECK-NEXT: ld1 { v20.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #704 -; CHECK-NEXT: ld1 { v18.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #584 -; CHECK-NEXT: ld1 { v17.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #536 -; CHECK-NEXT: ldr b21, [sp, #936] +; CHECK-NEXT: ld1 { v20.b }[5], [x11] ; CHECK-NEXT: add x11, sp, #656 -; CHECK-NEXT: ld1 { v20.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #712 -; CHECK-NEXT: ld1 { v18.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #592 -; CHECK-NEXT: ld1 { v17.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #880 -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v1.b }[6], [x11] -; CHECK-NEXT: ld1 { v20.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #600 -; CHECK-NEXT: ld1 { v18.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #720 -; CHECK-NEXT: ld1 { v19.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #944 -; CHECK-NEXT: smlal2 v6.4s, v0.8h, v3.8h +; CHECK-NEXT: ld1 { v16.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #584 +; CHECK-NEXT: ld1 { v7.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #720 +; CHECK-NEXT: ld1 { v6.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #520 +; CHECK-NEXT: ld1 { v20.b }[6], [x11] ; CHECK-NEXT: add x11, sp, #664 -; CHECK-NEXT: ld1 { v20.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #888 -; CHECK-NEXT: ld1 { v18.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #728 -; CHECK-NEXT: ld1 { v21.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #752 -; CHECK-NEXT: ld1 { v19.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #952 -; CHECK-NEXT: ld1 { v20.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #896 -; CHECK-NEXT: smlal v7.4s, v0.4h, v3.4h -; CHECK-NEXT: ldr b0, [sp, #744] -; CHECK-NEXT: ld1 { v21.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #904 +; CHECK-NEXT: ld1 { v16.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #592 +; CHECK-NEXT: smlal2 v3.4s, v17.8h, v19.8h +; CHECK-NEXT: ld1 { v7.b }[6], [x10] +; CHECK-NEXT: ld1 { v6.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #528 +; CHECK-NEXT: smlal2 v2.4s, v5.8h, v18.8h +; CHECK-NEXT: ld1 { v20.b }[7], [x11] +; CHECK-NEXT: smlal v1.4s, v17.4h, v19.4h +; CHECK-NEXT: ldr b17, [sp, #736] +; CHECK-NEXT: smlal v0.4s, v5.4h, v18.4h +; CHECK-NEXT: ldr b18, [sp, #1000] +; CHECK-NEXT: ld1 { v16.b }[6], [x8] +; CHECK-NEXT: add x10, sp, #728 +; CHECK-NEXT: ld1 { v6.b }[6], [x9] +; CHECK-NEXT: add x8, sp, #600 +; CHECK-NEXT: add x9, sp, #536 +; CHECK-NEXT: ldr b19, [sp, #744] +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 +; CHECK-NEXT: ld1 { v7.b }[7], [x10] +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: ld1 { v16.b }[7], [x8] +; CHECK-NEXT: sshll v5.8h, v20.8b, #0 +; CHECK-NEXT: add x8, sp, #752 +; CHECK-NEXT: smull v17.4s, v17.4h, v18.4h +; CHECK-NEXT: ldr b20, [sp, #808] +; CHECK-NEXT: add x10, sp, #816 +; CHECK-NEXT: ld1 { v6.b }[7], [x9] +; CHECK-NEXT: ldr b18, [sp, #872] +; CHECK-NEXT: add x9, sp, #880 +; CHECK-NEXT: ld1 { v19.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #760 +; CHECK-NEXT: ld1 { v20.b }[1], [x10] +; CHECK-NEXT: add x11, sp, #944 +; CHECK-NEXT: and v4.16b, v17.16b, v4.16b +; CHECK-NEXT: ldr b17, [sp, #936] +; CHECK-NEXT: ld1 { v18.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #824 +; CHECK-NEXT: ld1 { v19.b }[2], [x8] +; CHECK-NEXT: add x10, sp, #888 +; CHECK-NEXT: ld1 { v17.b }[1], [x11] +; CHECK-NEXT: add x8, sp, #768 +; CHECK-NEXT: ld1 { v20.b }[2], [x9] +; 
CHECK-NEXT: add x11, sp, #952 +; CHECK-NEXT: add x9, sp, #832 +; CHECK-NEXT: ld1 { v18.b }[2], [x10] ; CHECK-NEXT: ld1 { v19.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #960 -; CHECK-NEXT: ld1 { v0.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #760 -; CHECK-NEXT: ld1 { v1.b }[7], [x11] -; CHECK-NEXT: add x11, sp, #816 -; CHECK-NEXT: ld1 { v21.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #968 -; CHECK-NEXT: ldr b3, [sp, #808] -; CHECK-NEXT: ld1 { v19.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #912 -; CHECK-NEXT: ld1 { v0.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #768 -; CHECK-NEXT: ld1 { v3.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #824 -; CHECK-NEXT: ld1 { v21.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #976 -; CHECK-NEXT: ld1 { v19.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #920 -; CHECK-NEXT: ld1 { v0.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #776 -; CHECK-NEXT: ld1 { v3.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #832 -; CHECK-NEXT: ld1 { v21.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #984 -; CHECK-NEXT: ld1 { v19.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #928 -; CHECK-NEXT: ld1 { v0.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #848 -; CHECK-NEXT: ld1 { v3.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #840 -; CHECK-NEXT: ld1 { v21.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #992 -; CHECK-NEXT: ld1 { v19.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #784 -; CHECK-NEXT: smlal2 v16.4s, v2.8h, v4.8h -; CHECK-NEXT: ld1 { v3.b }[4], [x11] -; CHECK-NEXT: ld1 { v21.b }[7], [x8] +; CHECK-NEXT: add x10, sp, #896 +; CHECK-NEXT: ld1 { v17.b }[2], [x11] +; CHECK-NEXT: add x8, sp, #776 +; CHECK-NEXT: ld1 { v20.b }[3], [x9] +; CHECK-NEXT: add x11, sp, #960 +; CHECK-NEXT: add x9, sp, #840 +; CHECK-NEXT: ld1 { v18.b }[3], [x10] +; CHECK-NEXT: ld1 { v19.b }[4], [x8] +; CHECK-NEXT: add x10, sp, #904 +; CHECK-NEXT: ld1 { v17.b }[3], [x11] +; CHECK-NEXT: add x8, sp, #784 +; CHECK-NEXT: ld1 { v20.b }[4], [x9] +; CHECK-NEXT: add x11, sp, #968 +; CHECK-NEXT: add x9, sp, #848 +; CHECK-NEXT: ld1 { v18.b }[4], [x10] +; CHECK-NEXT: ld1 { v19.b }[5], [x8] +; CHECK-NEXT: add x10, sp, #912 +; CHECK-NEXT: ld1 { v17.b }[4], [x11] ; CHECK-NEXT: add x8, sp, #792 -; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: ld1 { v20.b }[5], [x9] +; CHECK-NEXT: add x11, sp, #976 ; CHECK-NEXT: add x9, sp, #856 -; CHECK-NEXT: smlal v5.4s, v2.4h, v4.4h -; CHECK-NEXT: ldr b2, [sp, #736] -; CHECK-NEXT: sshll v4.8h, v20.8b, #0 -; CHECK-NEXT: ldr b20, [sp, #1000] -; CHECK-NEXT: ld1 { v3.b }[5], [x10] -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: ld1 { v0.b }[6], [x8] -; CHECK-NEXT: sshll v20.8h, v20.8b, #0 +; CHECK-NEXT: ld1 { v18.b }[5], [x10] +; CHECK-NEXT: ld1 { v19.b }[6], [x8] +; CHECK-NEXT: add x10, sp, #920 +; CHECK-NEXT: ld1 { v17.b }[5], [x11] ; CHECK-NEXT: add x8, sp, #800 -; CHECK-NEXT: sshll v21.8h, v21.8b, #0 -; CHECK-NEXT: smull v2.4s, v2.4h, v20.4h -; CHECK-NEXT: ld1 { v3.b }[6], [x9] -; CHECK-NEXT: smull v20.4s, v4.4h, v21.4h -; CHECK-NEXT: ld1 { v0.b }[7], [x8] -; CHECK-NEXT: smull2 v4.4s, v4.8h, v21.8h +; CHECK-NEXT: ld1 { v20.b }[6], [x9] +; CHECK-NEXT: add x11, sp, #984 ; CHECK-NEXT: add x9, sp, #864 -; CHECK-NEXT: movi v21.2d, #0000000000000000 -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: ld1 { v3.b }[7], [x9] +; CHECK-NEXT: ld1 { v18.b }[6], [x10] +; CHECK-NEXT: ld1 { v19.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #928 +; CHECK-NEXT: ld1 { v17.b }[6], [x11] +; CHECK-NEXT: add x10, sp, #992 +; CHECK-NEXT: ld1 { v20.b }[7], [x9] +; CHECK-NEXT: mov v4.s[3], wzr +; CHECK-NEXT: ld1 { v18.b }[7], [x8] +; CHECK-NEXT: 
sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: sshll v16.8h, v16.8b, #0 +; CHECK-NEXT: ld1 { v17.b }[7], [x10] ; CHECK-NEXT: sshll v19.8h, v19.8b, #0 -; CHECK-NEXT: mov v21.s[0], v2.s[0] -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: smull2 v2.4s, v1.8h, v19.8h +; CHECK-NEXT: sshll v20.8h, v20.8b, #0 +; CHECK-NEXT: smlal v4.4s, v6.4h, v19.4h +; CHECK-NEXT: smull2 v6.4s, v6.8h, v19.8h +; CHECK-NEXT: smull2 v19.4s, v16.8h, v20.8h +; CHECK-NEXT: smull v16.4s, v16.4h, v20.4h +; CHECK-NEXT: sshll v7.8h, v7.8b, #0 +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 ; CHECK-NEXT: sshll v18.8h, v18.8b, #0 -; CHECK-NEXT: smlal v21.4s, v17.4h, v0.4h -; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: smlal2 v2.4s, v17.8h, v0.8h -; CHECK-NEXT: smlal2 v4.4s, v18.8h, v3.8h -; CHECK-NEXT: smlal v20.4s, v18.4h, v3.4h -; CHECK-NEXT: smlal v21.4s, v1.4h, v19.4h -; CHECK-NEXT: add v0.4s, v6.4s, v16.4s -; CHECK-NEXT: add v1.4s, v7.4s, v5.4s -; CHECK-NEXT: add v2.4s, v2.4s, v4.4s -; CHECK-NEXT: add v3.4s, v21.4s, v20.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: add v1.4s, v3.4s, v2.4s +; CHECK-NEXT: smlal2 v19.4s, v7.8h, v17.8h +; CHECK-NEXT: smlal2 v6.4s, v5.8h, v18.8h +; CHECK-NEXT: smlal v16.4s, v7.4h, v17.4h +; CHECK-NEXT: smlal v4.4s, v5.4h, v18.4h +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v1.4s, v6.4s, v19.4s +; CHECK-NEXT: add v3.4s, v4.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -2024,151 +2052,153 @@ ; CHECK-NEXT: ldr b0, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b2, [sp, #144] -; CHECK-NEXT: add x9, sp, #152 +; CHECK-NEXT: add x9, sp, #96 ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: ldr b4, [sp, #16] ; CHECK-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #96 -; CHECK-NEXT: add x10, sp, #104 -; CHECK-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-NEXT: mov v3.b[1], w1 -; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: add x11, sp, #128 +; CHECK-NEXT: add x8, sp, #152 +; CHECK-NEXT: add x10, sp, #24 ; CHECK-NEXT: ldr b1, [sp, #208] -; CHECK-NEXT: ld1 { v0.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #24 -; CHECK-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #168 -; CHECK-NEXT: mov v3.b[2], w2 -; CHECK-NEXT: ld1 { v4.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #112 -; CHECK-NEXT: ld1 { v0.b }[3], [x10] +; CHECK-NEXT: mov v3.b[1], w1 +; CHECK-NEXT: ldr b17, [sp, #544] +; CHECK-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #160 +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #104 +; CHECK-NEXT: ld1 { v4.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #32 -; CHECK-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #176 -; CHECK-NEXT: mov v3.b[3], w3 +; CHECK-NEXT: mov v3.b[2], w2 +; CHECK-NEXT: add x11, sp, #632 +; CHECK-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #168 +; CHECK-NEXT: ld1 { v0.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #112 ; CHECK-NEXT: ld1 { v4.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #40 -; CHECK-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: add x10, sp, #40 +; CHECK-NEXT: mov v3.b[3], w3 +; CHECK-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #176 +; CHECK-NEXT: ld1 { v0.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #120 +; CHECK-NEXT: ld1 { v4.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #48 ; CHECK-NEXT: mov v3.b[4], w4 -; CHECK-NEXT: ld1 { v4.b }[3], [x8] -; 
CHECK-NEXT: add x8, sp, #48 -; CHECK-NEXT: ld1 { v0.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #192 +; CHECK-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #184 +; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #128 +; CHECK-NEXT: ld1 { v4.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: mov v3.b[5], w5 -; CHECK-NEXT: ld1 { v4.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #56 -; CHECK-NEXT: ld1 { v0.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #200 +; CHECK-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #192 +; CHECK-NEXT: ld1 { v0.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #136 +; CHECK-NEXT: ld1 { v4.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #64 ; CHECK-NEXT: mov v3.b[6], w6 -; CHECK-NEXT: ld1 { v4.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #64 -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: ld1 { v0.b }[7], [x10] -; CHECK-NEXT: ld1 { v2.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #552 -; CHECK-NEXT: mov v3.b[7], w7 -; CHECK-NEXT: add x10, sp, #680 -; CHECK-NEXT: ld1 { v4.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #72 -; CHECK-NEXT: movi v6.2d, #0000000000000000 -; CHECK-NEXT: sshll v5.4s, v1.4h, #0 -; CHECK-NEXT: ldr b1, [sp, #608] -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: ld1 { v2.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #200 +; CHECK-NEXT: ld1 { v0.b }[7], [x9] +; CHECK-NEXT: add x9, sp, #72 +; CHECK-NEXT: ld1 { v4.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #552 +; CHECK-NEXT: sshll v5.8h, v1.8b, #0 +; CHECK-NEXT: ld1 { v2.b }[7], [x8] +; CHECK-NEXT: adrp x8, .LCPI45_0 +; CHECK-NEXT: sshll v1.8h, v0.8b, #0 +; CHECK-NEXT: ld1 { v17.b }[1], [x10] +; CHECK-NEXT: sshll v5.4s, v5.4h, #0 +; CHECK-NEXT: ld1 { v4.b }[7], [x9] +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI45_0] ; CHECK-NEXT: add x8, sp, #616 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: sshll v7.8h, v3.8b, #0 -; CHECK-NEXT: ld1 { v1.b }[1], [x8] +; CHECK-NEXT: sshll v6.8h, v2.8b, #0 +; CHECK-NEXT: ldr b2, [sp, #608] +; CHECK-NEXT: mov v3.b[7], w7 +; CHECK-NEXT: add x9, sp, #680 +; CHECK-NEXT: and v16.16b, v5.16b, v0.16b +; CHECK-NEXT: add x10, sp, #560 +; CHECK-NEXT: mov v16.s[3], wzr +; CHECK-NEXT: ld1 { v2.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #624 -; CHECK-NEXT: sshll v3.8h, v4.8b, #0 -; CHECK-NEXT: mov v6.s[0], v5.s[0] -; CHECK-NEXT: saddl2 v5.4s, v3.8h, v2.8h -; CHECK-NEXT: saddl2 v16.4s, v7.8h, v0.8h -; CHECK-NEXT: ld1 { v1.b }[2], [x8] +; CHECK-NEXT: sshll v7.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v17.b }[2], [x10] +; CHECK-NEXT: sshll v5.8h, v3.8b, #0 +; CHECK-NEXT: add x10, sp, #568 +; CHECK-NEXT: saddl2 v4.4s, v7.8h, v6.8h +; CHECK-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-NEXT: saddl v3.4s, v7.4h, v6.4h +; CHECK-NEXT: ldr b7, [sp, #480] +; CHECK-NEXT: saddw v6.4s, v16.4s, v5.4h ; CHECK-NEXT: add x8, sp, #488 -; CHECK-NEXT: saddw v4.4s, v6.4s, v7.4h -; CHECK-NEXT: ldr b6, [sp, #480] -; CHECK-NEXT: add v5.4s, v16.4s, v5.4s -; CHECK-NEXT: ldr b7, [sp, #544] ; CHECK-NEXT: ldr b16, [sp, #672] -; CHECK-NEXT: ld1 { v6.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #496 -; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #560 -; CHECK-NEXT: ld1 { v16.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #688 -; CHECK-NEXT: ld1 { v1.b }[3], [x11] +; CHECK-NEXT: ld1 { v2.b }[3], [x11] ; CHECK-NEXT: add x11, sp, #640 -; CHECK-NEXT: ld1 { v6.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #504 -; CHECK-NEXT: ld1 { v7.b }[2], [x9] -; CHECK-NEXT: 
add x9, sp, #568 -; CHECK-NEXT: ld1 { v16.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #696 -; CHECK-NEXT: ld1 { v1.b }[4], [x11] +; CHECK-NEXT: ld1 { v7.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #496 +; CHECK-NEXT: ld1 { v16.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #688 +; CHECK-NEXT: ld1 { v17.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #576 +; CHECK-NEXT: ld1 { v2.b }[4], [x11] ; CHECK-NEXT: add x11, sp, #648 -; CHECK-NEXT: ld1 { v6.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #512 -; CHECK-NEXT: ld1 { v7.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #576 -; CHECK-NEXT: ld1 { v16.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #704 -; CHECK-NEXT: ld1 { v1.b }[5], [x11] +; CHECK-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #504 +; CHECK-NEXT: ld1 { v16.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #696 +; CHECK-NEXT: ld1 { v17.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #584 +; CHECK-NEXT: ld1 { v2.b }[5], [x11] ; CHECK-NEXT: add x11, sp, #656 -; CHECK-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-NEXT: ld1 { v7.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #512 +; CHECK-NEXT: ld1 { v16.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #704 +; CHECK-NEXT: ld1 { v17.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #592 +; CHECK-NEXT: saddl2 v5.4s, v5.8h, v1.8h +; CHECK-NEXT: ld1 { v2.b }[6], [x11] +; CHECK-NEXT: ld1 { v7.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #520 -; CHECK-NEXT: ld1 { v7.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #584 -; CHECK-NEXT: ld1 { v16.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: ld1 { v1.b }[6], [x11] +; CHECK-NEXT: ld1 { v16.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #712 +; CHECK-NEXT: saddw v1.4s, v6.4s, v1.4h +; CHECK-NEXT: ldr b6, [sp, #736] +; CHECK-NEXT: ld1 { v17.b }[6], [x10] ; CHECK-NEXT: add x11, sp, #664 -; CHECK-NEXT: ld1 { v6.b }[5], [x8] +; CHECK-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #528 -; CHECK-NEXT: ld1 { v7.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #592 -; CHECK-NEXT: ld1 { v16.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #720 -; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h -; CHECK-NEXT: ldr b3, [sp, #736] -; CHECK-NEXT: ld1 { v6.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #600 -; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h -; CHECK-NEXT: ld1 { v7.b }[6], [x9] -; CHECK-NEXT: ld1 { v16.b }[6], [x10] -; CHECK-NEXT: add x9, sp, #728 +; CHECK-NEXT: ld1 { v16.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #720 ; CHECK-NEXT: add x10, sp, #536 -; CHECK-NEXT: ld1 { v1.b }[7], [x11] -; CHECK-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ld1 { v7.b }[7], [x8] -; CHECK-NEXT: sshll v2.8h, v3.8b, #0 -; CHECK-NEXT: ld1 { v16.b }[7], [x9] -; CHECK-NEXT: ld1 { v6.b }[7], [x10] -; CHECK-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: mov v4.s[0], v2.s[0] +; CHECK-NEXT: ld1 { v2.b }[7], [x11] +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ld1 { v7.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #728 +; CHECK-NEXT: ld1 { v16.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #600 +; CHECK-NEXT: sshll v3.8h, v6.8b, #0 +; CHECK-NEXT: add v4.4s, v5.4s, v4.4s +; CHECK-NEXT: ld1 { v17.b }[7], [x9] +; CHECK-NEXT: ld1 { v16.b }[7], [x8] +; CHECK-NEXT: ld1 { v7.b }[7], [x10] +; CHECK-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEXT: and v0.16b, v3.16b, v0.16b +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: sshll v5.8h, v16.8b, #0 +; CHECK-NEXT: sshll v6.8h, v17.8b, #0 ; CHECK-NEXT: sshll v3.8h, v7.8b, #0 -; CHECK-NEXT: sshll v7.8h, v16.8b, #0 -; CHECK-NEXT: sshll v2.8h, v6.8b, #0 -; CHECK-NEXT: saddl2 v6.4s, v7.8h, v3.8h -; 
CHECK-NEXT: saddl2 v16.4s, v1.8h, v2.8h -; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h -; CHECK-NEXT: saddl v3.4s, v7.4h, v3.4h -; CHECK-NEXT: add v4.4s, v16.4s, v6.4s -; CHECK-NEXT: saddw v1.4s, v2.4s, v1.4h -; CHECK-NEXT: add v2.4s, v3.4s, v4.4s -; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: saddl2 v7.4s, v6.8h, v5.8h +; CHECK-NEXT: saddl2 v16.4s, v3.8h, v2.8h +; CHECK-NEXT: saddw v0.4s, v0.4s, v3.4h +; CHECK-NEXT: saddl v3.4s, v6.4h, v5.4h +; CHECK-NEXT: add v5.4s, v16.4s, v7.4s +; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h +; CHECK-NEXT: add v2.4s, v3.4s, v5.4s +; CHECK-NEXT: add v1.4s, v1.4s, v4.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/neon-sad.ll b/llvm/test/CodeGen/AArch64/neon-sad.ll --- a/llvm/test/CodeGen/AArch64/neon-sad.ll +++ b/llvm/test/CodeGen/AArch64/neon-sad.ll @@ -9,9 +9,20 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: uabdl v2.8h, v1.8b, v0.8b -; CHECK-NEXT: uabal2 v2.8h, v1.16b, v0.16b -; CHECK-NEXT: uaddlv s0, v2.8h +; CHECK-NEXT: usubl v2.8h, v1.8b, v0.8b +; CHECK-NEXT: usubl2 v0.8h, v1.16b, v0.16b +; CHECK-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: abs v0.4s, v0.4s +; CHECK-NEXT: abs v3.4s, v3.4s +; CHECK-NEXT: abs v1.4s, v1.4s +; CHECK-NEXT: abs v2.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: @@ -30,9 +41,20 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: sabdl v2.8h, v1.8b, v0.8b -; CHECK-NEXT: sabal2 v2.8h, v1.16b, v0.16b -; CHECK-NEXT: uaddlv s0, v2.8h +; CHECK-NEXT: ssubl v2.8h, v1.8b, v0.8b +; CHECK-NEXT: ssubl2 v0.8h, v1.16b, v0.16b +; CHECK-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-NEXT: sshll2 v3.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: abs v0.4s, v0.4s +; CHECK-NEXT: abs v3.4s, v3.4s +; CHECK-NEXT: abs v1.4s, v1.4s +; CHECK-NEXT: abs v2.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll --- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll @@ -478,47 +478,50 @@ ; CHECK-NEXT: ldr x10, [x0, #24] ; CHECK-NEXT: and x1, x8, #0x1 ; CHECK-NEXT: ldrb w11, [x0, #32] +; CHECK-NEXT: extr x12, x10, x9, #1 ; CHECK-NEXT: extr x2, x9, x8, #1 +; CHECK-NEXT: extr x8, x11, x10, #2 ; CHECK-NEXT: extr x4, x10, x9, #2 -; CHECK-NEXT: extr x6, x11, x10, #3 -; CHECK-NEXT: ubfx x3, x9, #1, #1 ; CHECK-NEXT: mov.d v0[1], x1 -; CHECK-NEXT: ubfx x5, x10, #2, #1 +; CHECK-NEXT: extr x6, x11, x10, #3 ; CHECK-NEXT: ubfx x7, x11, #3, #1 +; CHECK-NEXT: and x3, x12, #0x1 +; CHECK-NEXT: and x5, x8, #0x1 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_ldnp_v4i65: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldp x10, x9, [x0, #16] -; CHECK-BE-NEXT: ldp x12, x11, [x0] -; 
CHECK-BE-NEXT: ldrb w8, [x0, #32] -; CHECK-BE-NEXT: lsr x13, x10, #56 -; CHECK-BE-NEXT: lsr x14, x12, #56 -; CHECK-BE-NEXT: extr x15, x11, x10, #56 -; CHECK-BE-NEXT: orr x7, x8, x9, lsl #8 -; CHECK-BE-NEXT: extr x8, x10, x9, #56 -; CHECK-BE-NEXT: extr x9, x12, x11, #56 -; CHECK-BE-NEXT: lsr x12, x12, #59 -; CHECK-BE-NEXT: ubfx x10, x10, #57, #1 -; CHECK-BE-NEXT: extr x5, x13, x8, #1 +; CHECK-BE-NEXT: ldp x9, x8, [x0, #8] +; CHECK-BE-NEXT: ldr x10, [x0] +; CHECK-BE-NEXT: ldr x11, [x0, #24] +; CHECK-BE-NEXT: ldrb w13, [x0, #32] +; CHECK-BE-NEXT: extr x12, x9, x8, #56 +; CHECK-BE-NEXT: extr x9, x10, x9, #56 +; CHECK-BE-NEXT: lsr x14, x10, #56 +; CHECK-BE-NEXT: extr x8, x8, x11, #56 +; CHECK-BE-NEXT: lsr x10, x10, #59 +; CHECK-BE-NEXT: orr x7, x13, x11, lsl #8 +; CHECK-BE-NEXT: extr x15, x9, x12, #1 +; CHECK-BE-NEXT: extr x16, x14, x9, #2 +; CHECK-BE-NEXT: and x11, x8, #0x1 ; CHECK-BE-NEXT: extr x1, x14, x9, #3 -; CHECK-BE-NEXT: ubfx x9, x11, #58, #1 -; CHECK-BE-NEXT: fmov d0, x12 -; CHECK-BE-NEXT: and x12, x8, #0x1 -; CHECK-BE-NEXT: lsr x11, x11, #56 -; CHECK-BE-NEXT: fmov d2, x10 -; CHECK-BE-NEXT: fmov d1, x9 -; CHECK-BE-NEXT: extr x3, x11, x15, #2 -; CHECK-BE-NEXT: fmov d3, x12 +; CHECK-BE-NEXT: fmov d0, x10 +; CHECK-BE-NEXT: extr x3, x9, x12, #2 +; CHECK-BE-NEXT: and x13, x15, #0x1 +; CHECK-BE-NEXT: and x10, x16, #0x1 +; CHECK-BE-NEXT: fmov d1, x11 +; CHECK-BE-NEXT: extr x5, x12, x8, #1 ; CHECK-BE-NEXT: mov v0.d[1], x1 +; CHECK-BE-NEXT: fmov d2, x13 +; CHECK-BE-NEXT: fmov d3, x10 +; CHECK-BE-NEXT: mov v1.d[1], x7 ; CHECK-BE-NEXT: mov v2.d[1], x5 -; CHECK-BE-NEXT: mov v1.d[1], x3 -; CHECK-BE-NEXT: mov v3.d[1], x7 +; CHECK-BE-NEXT: mov v3.d[1], x3 ; CHECK-BE-NEXT: fmov x0, d0 +; CHECK-BE-NEXT: fmov x6, d1 ; CHECK-BE-NEXT: fmov x4, d2 -; CHECK-BE-NEXT: fmov x2, d1 -; CHECK-BE-NEXT: fmov x6, d3 +; CHECK-BE-NEXT: fmov x2, d3 ; CHECK-BE-NEXT: ret %lv = load <4 x i65>, ptr %A, align 8, !nontemporal !0 ret <4 x i65> %lv diff --git a/llvm/test/CodeGen/AArch64/nzcv-save.ll b/llvm/test/CodeGen/AArch64/nzcv-save.ll --- a/llvm/test/CodeGen/AArch64/nzcv-save.ll +++ b/llvm/test/CodeGen/AArch64/nzcv-save.ll @@ -12,13 +12,13 @@ ; CHECK-NEXT: ldp x14, x15, [x3, #16] ; CHECK-NEXT: adds x9, x9, x11 ; CHECK-NEXT: adcs x8, x8, x10 -; CHECK-NEXT: adcs x10, x13, x14 -; CHECK-NEXT: adc x11, x12, x15 -; CHECK-NEXT: orr x12, x12, #0x100 +; CHECK-NEXT: orr x10, x12, #0x100 +; CHECK-NEXT: adcs x11, x13, x14 ; CHECK-NEXT: adc x12, x12, x15 +; CHECK-NEXT: adc x10, x10, x15 ; CHECK-NEXT: stp x9, x8, [x0] -; CHECK-NEXT: stp x10, x11, [x0, #16] -; CHECK-NEXT: stp x10, x12, [x1, #16] +; CHECK-NEXT: stp x11, x12, [x0, #16] +; CHECK-NEXT: stp x11, x10, [x1, #16] ; CHECK-NEXT: stp x9, x8, [x1] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/pr61111.ll b/llvm/test/CodeGen/AArch64/pr61111.ll --- a/llvm/test/CodeGen/AArch64/pr61111.ll +++ b/llvm/test/CodeGen/AArch64/pr61111.ll @@ -4,10 +4,11 @@ define i62 @f(i1 %0) { ; CHECK-LABEL: f: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: and x8, x0, #0x1 -; CHECK-NEXT: sub x8, x8, #1 -; CHECK-NEXT: tst x8, #0x3fffffffffffffff +; CHECK-NEXT: and w9, w0, #0x1 +; CHECK-NEXT: mov x8, #4611686018427387903 // =0x3fffffffffffffff +; CHECK-NEXT: neg w9, w9 +; CHECK-NEXT: sxtw x9, w9 +; CHECK-NEXT: bics xzr, x8, x9 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %2 = zext i1 %0 to i59 diff --git a/llvm/test/CodeGen/AArch64/pre-indexed-addrmode-with-constant-offset.ll b/llvm/test/CodeGen/AArch64/pre-indexed-addrmode-with-constant-offset.ll --- 
a/llvm/test/CodeGen/AArch64/pre-indexed-addrmode-with-constant-offset.ll +++ b/llvm/test/CodeGen/AArch64/pre-indexed-addrmode-with-constant-offset.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s ; Reduced test from https://github.com/llvm/llvm-project/issues/60645. @@ -7,8 +8,8 @@ ; CHECK-LABEL: pr60645: ; CHECK: // %bb.0: ; CHECK-NEXT: sub x8, x0, x1, lsl #2 -; CHECK-NEXT: str wzr, [x8, #-32]! -; CHECK-NEXT: stur wzr, [x8, #-8] +; CHECK-NEXT: stur wzr, [x8, #-32] +; CHECK-NEXT: stur wzr, [x8, #-40] ; CHECK-NEXT: ret %t1 = add nuw nsw i64 %t0, 8 %t2 = mul i64 %t1, -4 diff --git a/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll b/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll --- a/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll +++ b/llvm/test/CodeGen/AArch64/pull-negations-after-concat-of-truncates.ll @@ -4,8 +4,11 @@ define <8 x i16> @not_not_trunc_concat(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: not_not_trunc_concat: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: mvn v1.8b, v1.8b +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret %notx = xor <4 x i32> %x, %trnx = trunc <4 x i32> %notx to <4 x i16> @@ -19,10 +22,17 @@ define <16 x i8> @not_not_trunc_concat_chain(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: not_not_trunc_concat_chain: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: xtn v2.4h, v2.4s +; CHECK-NEXT: xtn v3.4h, v3.4s +; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: mvn v1.8b, v1.8b +; CHECK-NEXT: mvn v2.8b, v2.8b +; CHECK-NEXT: mvn v3.8b, v3.8b +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: mov v2.d[1], v3.d[0] ; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: ret %nota = xor <4 x i32> %a, %trna = trunc <4 x i32> %nota to <4 x i16> diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -22,23 +22,24 @@ ; CHECK-NEXT: .cfi_offset b13, -48 ; CHECK-NEXT: .cfi_offset b14, -56 ; CHECK-NEXT: .cfi_offset b15, -64 -; CHECK-NEXT: movi v14.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: adrp x10, B+48 ; CHECK-NEXT: add x10, x10, :lo12:B+48 -; CHECK-NEXT: adrp x11, A -; CHECK-NEXT: add x11, x11, :lo12:A +; CHECK-NEXT: adrp x11, A+128 +; CHECK-NEXT: add x11, x11, :lo12:A+128 +; CHECK-NEXT: // implicit-def: $q0 ; CHECK-NEXT: // implicit-def: $q2 -; CHECK-NEXT: // implicit-def: $q3 -; CHECK-NEXT: // implicit-def: $q15 ; CHECK-NEXT: // implicit-def: $q4 ; CHECK-NEXT: // implicit-def: $q5 ; CHECK-NEXT: // implicit-def: $q6 +; CHECK-NEXT: // implicit-def: $q14 ; CHECK-NEXT: // implicit-def: $q7 ; CHECK-NEXT: // implicit-def: $q16 -; CHECK-NEXT: // implicit-def: $q17 ; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: // implicit-def: $q3 +; CHECK-NEXT: // kill: killed $q3 ; CHECK-NEXT: // implicit-def: $q19 ; CHECK-NEXT: // implicit-def: $q20 ; CHECK-NEXT: // 
implicit-def: $q21 @@ -61,114 +62,128 @@ ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov x12, xzr -; CHECK-NEXT: stp q15, q14, [sp] // 32-byte Folded Spill +; CHECK-NEXT: str q14, [sp] // 16-byte Folded Spill ; CHECK-NEXT: ldr q14, [x8] -; CHECK-NEXT: add x15, x11, x8 -; CHECK-NEXT: ldr q15, [x10], #64 -; CHECK-NEXT: ldr q0, [x12] ; CHECK-NEXT: add x9, x9, #1 +; CHECK-NEXT: mov v3.16b, v0.16b +; CHECK-NEXT: ldr x15, [x11, x8] +; CHECK-NEXT: ldr q0, [x12] ; CHECK-NEXT: ldr x12, [x12] ; CHECK-NEXT: fmov x13, d14 -; CHECK-NEXT: mov x14, v14.d[1] -; CHECK-NEXT: fmov x0, d15 +; CHECK-NEXT: ldr q15, [x10], #64 ; CHECK-NEXT: fmov x16, d0 -; CHECK-NEXT: ldr x15, [x15, #128] ; CHECK-NEXT: mul x17, x13, x12 +; CHECK-NEXT: mov x14, v14.d[1] +; CHECK-NEXT: fmov x2, d15 ; CHECK-NEXT: mov x18, v0.d[1] -; CHECK-NEXT: mul x4, x0, x12 ; CHECK-NEXT: mul x1, x16, x12 -; CHECK-NEXT: mul x3, x14, x12 +; CHECK-NEXT: mul x4, x13, x15 ; CHECK-NEXT: fmov d0, x17 -; CHECK-NEXT: mul x5, x13, x15 +; CHECK-NEXT: mul x5, x2, x12 ; CHECK-NEXT: mov x17, v15.d[1] -; CHECK-NEXT: fmov d15, x4 +; CHECK-NEXT: mul x3, x14, x12 ; CHECK-NEXT: fmov d14, x1 ; CHECK-NEXT: mul x1, x18, x12 -; CHECK-NEXT: mov v0.d[1], x3 -; CHECK-NEXT: mul x3, x16, x15 -; CHECK-NEXT: ldr x2, [x8], #8 +; CHECK-NEXT: mov v17.16b, v16.16b +; CHECK-NEXT: fmov d15, x4 +; CHECK-NEXT: mov v16.16b, v7.16b +; CHECK-NEXT: mul x4, x14, x15 ; CHECK-NEXT: mul x12, x17, x12 +; CHECK-NEXT: mov v0.d[1], x3 +; CHECK-NEXT: mul x3, x2, x15 +; CHECK-NEXT: mov v7.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v5.16b +; CHECK-NEXT: mov v5.16b, v4.16b +; CHECK-NEXT: mov v4.16b, v2.16b +; CHECK-NEXT: mov v2.16b, v1.16b ; CHECK-NEXT: fmov d1, x5 ; CHECK-NEXT: mov v14.d[1], x1 -; CHECK-NEXT: mul x1, x14, x15 +; CHECK-NEXT: mul x1, x16, x15 +; CHECK-NEXT: ldr x0, [x8], #8 +; CHECK-NEXT: mov v1.d[1], x12 +; CHECK-NEXT: mul x12, x18, x15 +; CHECK-NEXT: mul x15, x17, x15 +; CHECK-NEXT: cmp x8, #64 ; CHECK-NEXT: add v12.2d, v12.2d, v0.2d -; CHECK-NEXT: mul x13, x13, x2 ; CHECK-NEXT: fmov d0, x3 -; CHECK-NEXT: mul x3, x0, x15 -; CHECK-NEXT: mov v15.d[1], x12 -; CHECK-NEXT: mul x12, x18, x2 -; CHECK-NEXT: mov v1.d[1], x1 -; CHECK-NEXT: mul x18, x18, x15 -; CHECK-NEXT: mul x16, x16, x2 -; CHECK-NEXT: cmp x8, #64 -; CHECK-NEXT: mul x15, x17, x15 ; CHECK-NEXT: add v13.2d, v13.2d, v14.2d -; CHECK-NEXT: mul x14, x14, x2 +; CHECK-NEXT: mul x13, x13, x0 ; CHECK-NEXT: add v11.2d, v11.2d, v14.2d -; CHECK-NEXT: fmov d14, x3 -; CHECK-NEXT: add v10.2d, v10.2d, v15.2d -; CHECK-NEXT: fmov d15, x13 -; CHECK-NEXT: mov v0.d[1], x18 -; CHECK-NEXT: mul x13, x0, x2 -; CHECK-NEXT: add v29.2d, v29.2d, v1.2d -; CHECK-NEXT: fmov d1, x16 -; CHECK-NEXT: mov v14.d[1], x15 -; CHECK-NEXT: mov v15.d[1], x14 -; CHECK-NEXT: mov v1.d[1], x12 -; CHECK-NEXT: mul x12, x17, x2 -; CHECK-NEXT: add v28.2d, v28.2d, v0.2d +; CHECK-NEXT: mul x14, x14, x0 +; CHECK-NEXT: fmov d14, x1 +; CHECK-NEXT: mul x16, x16, x0 +; CHECK-NEXT: mov v0.d[1], x15 +; CHECK-NEXT: mul x3, x18, x0 +; CHECK-NEXT: add v10.2d, v10.2d, v1.2d +; CHECK-NEXT: mov v14.d[1], x12 +; CHECK-NEXT: mul x12, x17, x0 +; CHECK-NEXT: fmov d1, x13 +; CHECK-NEXT: mul x13, x2, x0 +; CHECK-NEXT: add v27.2d, v27.2d, v0.2d +; CHECK-NEXT: mov v15.d[1], x4 +; CHECK-NEXT: add v28.2d, v28.2d, v14.2d +; CHECK-NEXT: ldp q14, q0, [sp] // 32-byte Folded Reload +; CHECK-NEXT: mov v1.d[1], x14 +; CHECK-NEXT: add v29.2d, v29.2d, v15.2d +; CHECK-NEXT: fmov d15, x16 +; CHECK-NEXT: add v8.2d, v8.2d, v1.2d +; CHECK-NEXT: add 
v0.2d, v0.2d, v1.2d +; CHECK-NEXT: mov v15.d[1], x3 +; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: fmov d0, x13 -; CHECK-NEXT: add v27.2d, v27.2d, v14.2d -; CHECK-NEXT: ldr q14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: add v8.2d, v8.2d, v15.2d +; CHECK-NEXT: add v25.2d, v25.2d, v1.2d +; CHECK-NEXT: add v22.2d, v22.2d, v1.2d ; CHECK-NEXT: mov v0.d[1], x12 -; CHECK-NEXT: add v25.2d, v25.2d, v15.2d -; CHECK-NEXT: add v22.2d, v22.2d, v15.2d +; CHECK-NEXT: add v14.2d, v14.2d, v1.2d +; CHECK-NEXT: add v1.2d, v2.2d, v1.2d +; CHECK-NEXT: mov v2.16b, v4.16b +; CHECK-NEXT: mov v4.16b, v5.16b +; CHECK-NEXT: mov v5.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v7.16b +; CHECK-NEXT: mov v7.16b, v16.16b +; CHECK-NEXT: mov v16.16b, v17.16b +; CHECK-NEXT: add v9.2d, v9.2d, v15.2d +; CHECK-NEXT: add v31.2d, v31.2d, v15.2d +; CHECK-NEXT: add v26.2d, v26.2d, v15.2d +; CHECK-NEXT: add v23.2d, v23.2d, v15.2d +; CHECK-NEXT: add v21.2d, v21.2d, v15.2d +; CHECK-NEXT: add v19.2d, v19.2d, v15.2d ; CHECK-NEXT: add v18.2d, v18.2d, v15.2d +; CHECK-NEXT: add v7.2d, v7.2d, v15.2d ; CHECK-NEXT: add v6.2d, v6.2d, v15.2d -; CHECK-NEXT: add v14.2d, v14.2d, v15.2d -; CHECK-NEXT: ldr q15, [sp] // 16-byte Folded Reload -; CHECK-NEXT: add v9.2d, v9.2d, v1.2d -; CHECK-NEXT: add v31.2d, v31.2d, v1.2d -; CHECK-NEXT: add v26.2d, v26.2d, v1.2d -; CHECK-NEXT: add v23.2d, v23.2d, v1.2d -; CHECK-NEXT: add v21.2d, v21.2d, v1.2d -; CHECK-NEXT: add v19.2d, v19.2d, v1.2d -; CHECK-NEXT: add v17.2d, v17.2d, v1.2d -; CHECK-NEXT: add v7.2d, v7.2d, v1.2d -; CHECK-NEXT: add v5.2d, v5.2d, v1.2d -; CHECK-NEXT: add v15.2d, v15.2d, v1.2d -; CHECK-NEXT: add v3.2d, v3.2d, v1.2d +; CHECK-NEXT: add v4.2d, v4.2d, v15.2d +; CHECK-NEXT: add v2.2d, v2.2d, v15.2d ; CHECK-NEXT: add v30.2d, v30.2d, v0.2d ; CHECK-NEXT: add v24.2d, v24.2d, v0.2d ; CHECK-NEXT: add v20.2d, v20.2d, v0.2d -; CHECK-NEXT: add v16.2d, v16.2d, v0.2d -; CHECK-NEXT: add v4.2d, v4.2d, v0.2d -; CHECK-NEXT: add v2.2d, v2.2d, v0.2d +; CHECK-NEXT: add v16.2d, v17.2d, v0.2d +; CHECK-NEXT: add v5.2d, v5.2d, v0.2d +; CHECK-NEXT: add v0.2d, v3.2d, v0.2d ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C +; CHECK-NEXT: ldr q3, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: stp q13, q12, [x8] ; CHECK-NEXT: stp q11, q10, [x8, #32] ; CHECK-NEXT: stp q9, q8, [x8, #64] -; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: stp q14, q6, [x8, #400] +; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: stp q31, q30, [x8, #96] -; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: stp q29, q28, [x8, #144] +; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: stp q27, q26, [x8, #176] +; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: str q25, [x8, #208] ; CHECK-NEXT: stp q24, q23, [x8, #240] ; CHECK-NEXT: stp q22, q21, [x8, #272] ; CHECK-NEXT: stp q20, q19, [x8, #304] -; CHECK-NEXT: stp q18, q17, [x8, #336] +; CHECK-NEXT: stp q3, q18, [x8, #336] ; CHECK-NEXT: stp q16, q7, [x8, #368] -; CHECK-NEXT: stp q6, q5, [x8, #400] -; CHECK-NEXT: stp q4, q15, [x8, #432] -; CHECK-NEXT: stp q14, q3, [x8, #464] -; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: str q2, [x8, #496] -; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: stp q5, q4, [x8, #432] +; CHECK-NEXT: stp q1, q2, [x8, 
#464] +; CHECK-NEXT: str q0, [x8, #496] ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore b8 diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -91,41 +91,41 @@ ; CHECK-NEXT: ext v6.16b, v6.16b, v6.16b, #4 ; CHECK-NEXT: ext v16.16b, v7.16b, v16.16b, #12 ; CHECK-NEXT: ext v17.16b, v3.16b, v17.16b, #12 -; CHECK-NEXT: mov v3.s[2], v5.s[3] -; CHECK-NEXT: mov v7.s[2], v2.s[3] -; CHECK-NEXT: mov v0.s[2], v2.s[1] -; CHECK-NEXT: uzp2 v4.4s, v4.4s, v18.4s -; CHECK-NEXT: sub v20.4s, v3.4s, v17.4s -; CHECK-NEXT: sub v21.4s, v7.4s, v16.4s ; CHECK-NEXT: mov v3.s[1], v5.s[2] ; CHECK-NEXT: mov v7.s[1], v2.s[2] +; CHECK-NEXT: mov v0.s[1], v2.s[0] +; CHECK-NEXT: uzp2 v4.4s, v4.4s, v18.4s +; CHECK-NEXT: add v20.4s, v3.4s, v17.4s +; CHECK-NEXT: add v21.4s, v7.4s, v16.4s +; CHECK-NEXT: mov v3.s[2], v5.s[3] +; CHECK-NEXT: mov v7.s[2], v2.s[3] ; CHECK-NEXT: sub v18.4s, v1.4s, v6.4s ; CHECK-NEXT: mov v6.s[0], v5.s[1] -; CHECK-NEXT: sub v19.4s, v0.4s, v4.4s -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: add v2.4s, v3.4s, v17.4s -; CHECK-NEXT: add v3.4s, v7.4s, v16.4s +; CHECK-NEXT: add v19.4s, v0.4s, v4.4s +; CHECK-NEXT: mov v0.s[2], v2.s[1] +; CHECK-NEXT: sub v2.4s, v3.4s, v17.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v16.4s ; CHECK-NEXT: add v1.4s, v1.4s, v6.4s -; CHECK-NEXT: mov v3.d[1], v21.d[1] -; CHECK-NEXT: mov v2.d[1], v20.d[1] -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: mov v21.d[1], v3.d[1] +; CHECK-NEXT: mov v20.d[1], v2.d[1] +; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s ; CHECK-NEXT: mov v1.d[1], v18.d[1] -; CHECK-NEXT: mov v0.d[1], v19.d[1] -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 -; CHECK-NEXT: cmlt v4.8h, v1.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v2.4s, v7.4s, v2.4s -; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: eor v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: mov v19.d[1], v0.d[1] +; CHECK-NEXT: cmlt v3.8h, v21.8h, #0 +; CHECK-NEXT: cmlt v4.8h, v20.8h, #0 +; CHECK-NEXT: cmlt v0.8h, v1.8h, #0 +; CHECK-NEXT: add v5.4s, v3.4s, v21.4s +; CHECK-NEXT: add v6.4s, v4.4s, v20.4s +; CHECK-NEXT: cmlt v2.8h, v19.8h, #0 +; CHECK-NEXT: add v1.4s, v0.4s, v1.4s +; CHECK-NEXT: eor v4.16b, v6.16b, v4.16b +; CHECK-NEXT: eor v3.16b, v5.16b, v3.16b +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: add v7.4s, v2.4s, v19.4s +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-NEXT: eor v1.16b, v7.16b, v2.16b +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 diff --git a/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll b/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll --- a/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll +++ b/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll @@ -25,7 +25,7 @@ ; CHECK-NEXT: sub x0, x29, #1 ; CHECK-NEXT: bl _bar ; CHECK-NEXT: ldurb w8, [x29, #-1] -; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: add w8, w8, #1 ; CHECK-NEXT: and x0, x8, #0xff ; CHECK-NEXT: sturb w8, [x29, #-1] ; CHECK-NEXT: ldp x29, x30, 
[sp, #16] ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/rotate-extract.ll b/llvm/test/CodeGen/AArch64/rotate-extract.ll --- a/llvm/test/CodeGen/AArch64/rotate-extract.ll +++ b/llvm/test/CodeGen/AArch64/rotate-extract.ll @@ -24,8 +24,8 @@ define i32 @ror_extract_shrl(i32 %i) nounwind { ; CHECK-LABEL: ror_extract_shrl: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #3 -; CHECK-NEXT: ror w0, w8, #4 +; CHECK-NEXT: ror w8, w0, #7 +; CHECK-NEXT: and w0, w8, #0xf1ffffff ; CHECK-NEXT: ret %lhs_div = lshr i32 %i, 7 %rhs_div = lshr i32 %i, 3 @@ -50,11 +50,11 @@ define i64 @ror_extract_udiv(i64 %i) nounwind { ; CHECK-LABEL: ror_extract_udiv: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-6148914691236517206 +; CHECK-NEXT: mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa ; CHECK-NEXT: movk x8, #43691 ; CHECK-NEXT: umulh x8, x0, x8 -; CHECK-NEXT: lsr x8, x8, #1 -; CHECK-NEXT: ror x0, x8, #4 +; CHECK-NEXT: ror x8, x8, #5 +; CHECK-NEXT: and x0, x8, #0xf7ffffffffffffff ; CHECK-NEXT: ret %lhs_div = udiv i64 %i, 3 %rhs_div = udiv i64 %i, 48 @@ -66,9 +66,10 @@ define i64 @ror_extract_mul_with_mask(i64 %i) nounwind { ; CHECK-LABEL: ror_extract_mul_with_mask: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, x0, lsl #3 -; CHECK-NEXT: ror x8, x8, #57 -; CHECK-NEXT: and x0, x8, #0xff +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: add x9, x0, x0, lsl #3 +; CHECK-NEXT: lsr x0, x9, #57 +; CHECK-NEXT: bfi x0, x8, #7, #1 ; CHECK-NEXT: ret %lhs_mul = mul i64 %i, 1152 %rhs_mul = mul i64 %i, 9 @@ -127,15 +128,15 @@ define i32 @no_extract_udiv(i32 %i) nounwind { ; CHECK-LABEL: no_extract_udiv: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #33437 -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: movk w8, #21399, lsl #16 -; CHECK-NEXT: movk w9, #43690, lsl #16 +; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: mov w9, #33437 // =0x829d +; CHECK-NEXT: movk w8, #43690, lsl #16 +; CHECK-NEXT: movk w9, #21399, lsl #16 ; CHECK-NEXT: umull x8, w0, w8 ; CHECK-NEXT: umull x9, w0, w9 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: lsr x9, x9, #33 -; CHECK-NEXT: extr w0, w9, w8, #4 +; CHECK-NEXT: lsr x8, x8, #33 +; CHECK-NEXT: lsr x9, x9, #36 +; CHECK-NEXT: orr w0, w9, w8, lsl #28 ; CHECK-NEXT: ret %lhs_div = udiv i32 %i, 3 %rhs_div = udiv i32 %i, 49 diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -133,15 +133,17 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-LABEL: v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1 { v0.b }[0], [x1] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: ld1 { v1.b }[0], [x0] -; CHECK-NEXT: add x9, x0, #1 -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: ldrsb w8, [x1] +; CHECK-NEXT: ldrsb w9, [x0] +; CHECK-NEXT: ldrsb w10, [x1, #1] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: ldrsb w9, [x0, #1] +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: shl v1.2s, v1.2s, #24 -; CHECK-NEXT: sqadd v0.2s, v1.2s, v0.2s +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 @@ -173,15 +175,17 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-LABEL: v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1 { v0.h }[0], [x1] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: ld1 { v1.h }[0], [x0] -; CHECK-NEXT: add x9, x0, #2 
-; CHECK-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: ldrsh w8, [x1] +; CHECK-NEXT: ldrsh w9, [x0] +; CHECK-NEXT: ldrsh w10, [x1, #2] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: ldrsh w9, [x0, #2] +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: sqadd v0.2s, v1.2s, v0.2s +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: unsigned_sat_constant_i8_using_min: ; CHECK: // %bb.0: ; CHECK-NEXT: and w9, w0, #0xff -; CHECK-NEXT: mov w8, #-43 +; CHECK-NEXT: mov w8, #-43 // =0xffffffd5 ; CHECK-NEXT: cmp w9, #213 ; CHECK-NEXT: csel w8, w0, w8, lo ; CHECK-NEXT: add w0, w8, #42 @@ -26,7 +26,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xff ; CHECK-NEXT: add w8, w8, #42 -; CHECK-NEXT: tst w8, #0x100 +; CHECK-NEXT: lsr w9, w8, #8 +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csinv w0, w8, wzr, eq ; CHECK-NEXT: ret %a = add i8 %x, 42 @@ -52,9 +53,9 @@ define i16 @unsigned_sat_constant_i16_using_min(i16 %x) { ; CHECK-LABEL: unsigned_sat_constant_i16_using_min: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65493 +; CHECK-NEXT: mov w8, #65493 // =0xffd5 ; CHECK-NEXT: cmp w8, w0, uxth -; CHECK-NEXT: mov w8, #-43 +; CHECK-NEXT: mov w8, #-43 // =0xffffffd5 ; CHECK-NEXT: csel w8, w0, w8, hi ; CHECK-NEXT: add w0, w8, #42 ; CHECK-NEXT: ret @@ -69,7 +70,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xffff ; CHECK-NEXT: add w8, w8, #42 -; CHECK-NEXT: tst w8, #0x10000 +; CHECK-NEXT: lsr w9, w8, #16 +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csinv w0, w8, wzr, eq ; CHECK-NEXT: ret %a = add i16 %x, 42 @@ -81,7 +83,7 @@ define i16 @unsigned_sat_constant_i16_using_cmp_notval(i16 %x) { ; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65493 +; CHECK-NEXT: mov w8, #65493 // =0xffd5 ; CHECK-NEXT: add w9, w0, #42 ; CHECK-NEXT: cmp w8, w0, uxth ; CHECK-NEXT: csinv w0, w9, wzr, hs @@ -95,7 +97,7 @@ define i32 @unsigned_sat_constant_i32_using_min(i32 %x) { ; CHECK-LABEL: unsigned_sat_constant_i32_using_min: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-43 +; CHECK-NEXT: mov w8, #-43 // =0xffffffd5 ; CHECK-NEXT: cmn w0, #43 ; CHECK-NEXT: csel w8, w0, w8, lo ; CHECK-NEXT: add w0, w8, #42 @@ -133,7 +135,7 @@ define i64 @unsigned_sat_constant_i64_using_min(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_using_min: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-43 +; CHECK-NEXT: mov x8, #-43 // =0xffffffffffffffd5 ; CHECK-NEXT: cmn x0, #43 ; CHECK-NEXT: csel x8, x0, x8, lo ; CHECK-NEXT: add x0, x8, #42 @@ -189,7 +191,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xff ; CHECK-NEXT: add w8, w8, w1, uxtb -; CHECK-NEXT: tst w8, #0x100 +; CHECK-NEXT: lsr w9, w8, #8 +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csinv w0, w8, wzr, eq ; CHECK-NEXT: ret %a = add i8 %x, %y @@ -204,7 +207,8 @@ ; CHECK-NEXT: and w8, w1, #0xff ; CHECK-NEXT: add w9, w0, w1 ; CHECK-NEXT: add w8, w8, w0, uxtb -; CHECK-NEXT: tst w8, #0x100 +; CHECK-NEXT: lsr w8, w8, #8 +; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: csinv w0, w9, wzr, eq ; CHECK-NEXT: ret %noty = xor i8 %y, -1 @@ -235,7 +239,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, 
#0xffff ; CHECK-NEXT: add w8, w8, w1, uxth -; CHECK-NEXT: tst w8, #0x10000 +; CHECK-NEXT: lsr w9, w8, #16 +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csinv w0, w8, wzr, eq ; CHECK-NEXT: ret %a = add i16 %x, %y @@ -250,7 +255,8 @@ ; CHECK-NEXT: and w8, w1, #0xffff ; CHECK-NEXT: add w9, w0, w1 ; CHECK-NEXT: add w8, w8, w0, uxth -; CHECK-NEXT: tst w8, #0x10000 +; CHECK-NEXT: lsr w8, w8, #16 +; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: csinv w0, w9, wzr, eq ; CHECK-NEXT: ret %noty = xor i16 %y, -1 @@ -459,9 +465,9 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) { ; CHECK-LABEL: unsigned_sat_constant_v2i64_using_min: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-43 +; CHECK-NEXT: mov x8, #-43 // =0xffffffffffffffd5 ; CHECK-NEXT: dup v1.2d, x8 -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: dup v1.2d, x8 @@ -476,7 +482,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) { ; CHECK-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: uqadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret @@ -489,7 +495,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) { ; CHECK-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: uqadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/select_fmf.ll b/llvm/test/CodeGen/AArch64/select_fmf.ll --- a/llvm/test/CodeGen/AArch64/select_fmf.ll +++ b/llvm/test/CodeGen/AArch64/select_fmf.ll @@ -7,11 +7,12 @@ define float @select_select_fold_select_and(float %w, float %x, float %y, float %z) { ; CHECK-LABEL: select_select_fold_select_and: ; CHECK: // %bb.0: -; CHECK-NEXT: fminnm s5, s1, s2 +; CHECK-NEXT: fmaxnm s5, s0, s3 +; CHECK-NEXT: fminnm s6, s1, s2 ; CHECK-NEXT: fcmp s1, s2 -; CHECK-NEXT: fmaxnm s1, s0, s3 ; CHECK-NEXT: fmov s4, #0.50000000 -; CHECK-NEXT: fccmp s5, s0, #4, lt +; CHECK-NEXT: fcsel s1, s5, s0, lt +; CHECK-NEXT: fcmp s6, s0 ; CHECK-NEXT: fcsel s2, s1, s0, gt ; CHECK-NEXT: fadd s1, s0, s4 ; CHECK-NEXT: fadd s4, s1, s2 @@ -22,8 +23,8 @@ ; CHECK-NEXT: fadd s0, s2, s0 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_2: // %if.end.i159.i.i -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #13107 +; CHECK-NEXT: mov w8, #52429 // =0xcccd +; CHECK-NEXT: mov w9, #13107 // =0x3333 ; CHECK-NEXT: movk w8, #48844, lsl #16 ; CHECK-NEXT: movk w9, #48819, lsl #16 ; CHECK-NEXT: fcmp s1, #0.0 @@ -65,11 +66,12 @@ define float @select_select_fold_select_or(float %w, float %x, float %y, float %z) { ; CHECK-LABEL: select_select_fold_select_or: ; CHECK: // %bb.0: -; CHECK-NEXT: fminnm s5, s1, s2 +; CHECK-NEXT: fmaxnm s5, s0, s3 +; CHECK-NEXT: fminnm s6, s1, s2 ; CHECK-NEXT: fcmp s1, s2 -; CHECK-NEXT: fmaxnm s1, s0, s3 ; CHECK-NEXT: fmov s4, #0.50000000 -; CHECK-NEXT: fccmp s5, s0, #0, ge +; CHECK-NEXT: fcsel s1, s0, s5, lt +; CHECK-NEXT: fcmp s6, s0 ; CHECK-NEXT: fcsel s2, s0, s1, gt ; CHECK-NEXT: fadd s1, s0, s4 ; CHECK-NEXT: fadd s4, s1, s2 @@ -80,8 +82,8 @@ ; CHECK-NEXT: fadd s0, s2, s0 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_2: // %if.end.i159.i.i -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #13107 +; CHECK-NEXT: mov w8, #52429 // =0xcccd +; CHECK-NEXT: mov w9, #13107 // =0x3333 ; CHECK-NEXT: movk w8, #48844, lsl #16 ; CHECK-NEXT: movk w9, 
#48819, lsl #16 ; CHECK-NEXT: fcmp s1, #0.0 diff --git a/llvm/test/CodeGen/AArch64/setcc-fsh.ll b/llvm/test/CodeGen/AArch64/setcc-fsh.ll --- a/llvm/test/CodeGen/AArch64/setcc-fsh.ll +++ b/llvm/test/CodeGen/AArch64/setcc-fsh.ll @@ -63,7 +63,9 @@ define i1 @fshr_or_eq_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or_eq_0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, w0, w1, lsl #8 +; CHECK-NEXT: lsl w8, w0, #16 +; CHECK-NEXT: orr w9, w0, w1 +; CHECK-NEXT: extr w8, w9, w8, #24 ; CHECK-NEXT: tst w8, #0xffff ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -76,7 +78,9 @@ define i1 @fshr_or_commute_eq_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or_commute_eq_0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, w0, w1, lsl #8 +; CHECK-NEXT: lsl w8, w0, #16 +; CHECK-NEXT: orr w9, w1, w0 +; CHECK-NEXT: extr w8, w9, w8, #24 ; CHECK-NEXT: tst w8, #0xffff ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret @@ -156,7 +160,8 @@ define i1 @fshr_or_ne_0(i64 %x, i64 %y) { ; CHECK-LABEL: fshr_or_ne_0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1, lsl #63 +; CHECK-NEXT: orr w8, w0, w1 +; CHECK-NEXT: extr x8, x8, x0, #1 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -169,7 +174,8 @@ define i1 @fshr_or_commute_ne_0(i64 %x, i64 %y) { ; CHECK-LABEL: fshr_or_commute_ne_0: ; CHECK: // %bb.0: -; CHECK-NEXT: orr x8, x0, x1, lsl #63 +; CHECK-NEXT: orr w8, w1, w0 +; CHECK-NEXT: extr x8, x8, x0, #1 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -182,8 +188,9 @@ define i1 @fshr_or2_ne_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or2_ne_0: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w1, #0xfffc -; CHECK-NEXT: orr w8, w0, w8, lsr #2 +; CHECK-NEXT: orr w8, w0, w1 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: extr w8, w0, w8, #18 ; CHECK-NEXT: tst w8, #0xffff ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret @@ -196,8 +203,9 @@ define i1 @fshr_or2_commute_ne_0(i16 %x, i16 %y) { ; CHECK-LABEL: fshr_or2_commute_ne_0: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w1, #0xfffc -; CHECK-NEXT: orr w8, w0, w8, lsr #2 +; CHECK-NEXT: orr w8, w1, w0 +; CHECK-NEXT: lsl w8, w8, #16 +; CHECK-NEXT: extr w8, w0, w8, #18 ; CHECK-NEXT: tst w8, #0xffff ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/shift-accumulate.ll b/llvm/test/CodeGen/AArch64/shift-accumulate.ll --- a/llvm/test/CodeGen/AArch64/shift-accumulate.ll +++ b/llvm/test/CodeGen/AArch64/shift-accumulate.ll @@ -92,8 +92,8 @@ define <1 x i64> @ssra_v1i64(<2 x i32> %0) { ; CHECK-LABEL: ssra_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr d1, d0, #63 ; CHECK-NEXT: bic v0.2s, #64, lsl #24 +; CHECK-NEXT: ushr d1, d0, #63 ; CHECK-NEXT: ssra d1, d0, #62 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret @@ -108,8 +108,8 @@ define <2 x i64> @ssra_v2i64(<4 x i32> %0) { ; CHECK-LABEL: ssra_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ushr v1.2d, v0.2d, #63 ; CHECK-NEXT: bic v0.4s, #64, lsl #24 +; CHECK-NEXT: ushr v1.2d, v0.2d, #63 ; CHECK-NEXT: ssra v1.2d, v0.2d, #62 ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll --- a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll +++ b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll @@ -61,7 +61,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #32 +; CHECK-NEXT: mov w10, #32 // =0x20 ; CHECK-NEXT: lsl w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -128,7 +128,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: 
mov w10, #64 +; CHECK-NEXT: mov w10, #64 // =0x40 ; CHECK-NEXT: lsl x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -198,7 +198,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #32 +; CHECK-NEXT: mov w10, #32 // =0x20 ; CHECK-NEXT: lsr w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -265,7 +265,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w10, #64 +; CHECK-NEXT: mov w10, #64 // =0x40 ; CHECK-NEXT: lsr x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -335,7 +335,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #32 +; CHECK-NEXT: mov w10, #32 // =0x20 ; CHECK-NEXT: asr w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -402,7 +402,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: neg x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w10, #64 +; CHECK-NEXT: mov w10, #64 // =0x40 ; CHECK-NEXT: asr x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -476,7 +476,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #31 +; CHECK-NEXT: mov w10, #31 // =0x1f ; CHECK-NEXT: lsl w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -543,7 +543,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w10, #63 +; CHECK-NEXT: mov w10, #63 // =0x3f ; CHECK-NEXT: lsl x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -613,7 +613,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #31 +; CHECK-NEXT: mov w10, #31 // =0x1f ; CHECK-NEXT: lsr w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -680,7 +680,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w10, #63 +; CHECK-NEXT: mov w10, #63 // =0x3f ; CHECK-NEXT: lsr x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -750,7 +750,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn w8, w1 ; CHECK-NEXT: ldr w9, [x0] -; CHECK-NEXT: mov w10, #31 +; CHECK-NEXT: mov w10, #31 // =0x1f ; CHECK-NEXT: asr w8, w9, w8 ; CHECK-NEXT: sub w9, w10, w1 ; CHECK-NEXT: str w8, [x0] @@ -817,7 +817,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn x8, x1 ; CHECK-NEXT: ldr x9, [x0] -; CHECK-NEXT: mov w10, #63 +; CHECK-NEXT: mov w10, #63 // =0x3f ; CHECK-NEXT: asr x8, x9, x8 ; CHECK-NEXT: sub x9, x10, x1 ; CHECK-NEXT: str x8, [x0] @@ -1030,7 +1030,7 @@ define i32 @reg32_lshr_by_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b) nounwind { ; CHECK-LABEL: reg32_lshr_by_b_sub_negated_unfolded: ; CHECK: // %bb.0: -; CHECK-NEXT: add w8, w2, w1 +; CHECK-NEXT: add w8, w1, w2 ; CHECK-NEXT: lsr w0, w0, w8 ; CHECK-NEXT: ret %nega = sub i32 0, %a @@ -1042,7 +1042,7 @@ define i64 @reg64_lshr_by_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b) nounwind { ; CHECK-LABEL: reg64_lshr_by_b_sub_negated_unfolded: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x2, x1 +; CHECK-NEXT: add x8, x1, x2 ; CHECK-NEXT: lsr x0, x0, x8 ; CHECK-NEXT: ret %nega = sub i64 0, %a diff --git a/llvm/test/CodeGen/AArch64/shift-by-signext.ll b/llvm/test/CodeGen/AArch64/shift-by-signext.ll --- a/llvm/test/CodeGen/AArch64/shift-by-signext.ll +++ b/llvm/test/CodeGen/AArch64/shift-by-signext.ll @@ -80,12 +80,12 @@ define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind { ; CHECK-LABEL: n6_fshl: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def 
$x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsr w9, w1, #1 -; CHECK-NEXT: lsl w10, w0, w2 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsr w10, w1, #1 +; CHECK-NEXT: lsr w9, w10, w9 +; CHECK-NEXT: lsl w8, w0, w8 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret %shamt_wide = sext i8 %shamt to i32 %r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %shamt_wide) @@ -94,12 +94,12 @@ define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind { ; CHECK-LABEL: n7_fshr: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w8, w2 -; CHECK-NEXT: lsl w9, w0, #1 -; CHECK-NEXT: lsr w10, w1, w2 -; CHECK-NEXT: lsl w8, w9, w8 -; CHECK-NEXT: orr w0, w8, w10 +; CHECK-NEXT: mov w8, w2 +; CHECK-NEXT: mvn w9, w2 +; CHECK-NEXT: lsl w10, w0, #1 +; CHECK-NEXT: lsr w8, w1, w8 +; CHECK-NEXT: lsl w9, w10, w9 +; CHECK-NEXT: orr w0, w9, w8 ; CHECK-NEXT: ret %shamt_wide = sext i8 %shamt to i32 %r = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt_wide) diff --git a/llvm/test/CodeGen/AArch64/shiftregister-from-and.ll b/llvm/test/CodeGen/AArch64/shiftregister-from-and.ll --- a/llvm/test/CodeGen/AArch64/shiftregister-from-and.ll +++ b/llvm/test/CodeGen/AArch64/shiftregister-from-and.ll @@ -21,7 +21,7 @@ define i64 @bic_shiftedreg_from_and(i64 %a, i64 %b) { ; CHECK-LABEL: bic_shiftedreg_from_and: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #16777215 +; CHECK-NEXT: mov w8, #16777215 // =0xffffff ; CHECK-NEXT: orn x8, x8, x0, asr #23 ; CHECK-NEXT: and x0, x1, x8 ; CHECK-NEXT: ret @@ -37,8 +37,9 @@ define i64 @eon_shiftedreg_from_and(i64 %a, i64 %b) { ; CHECK-LABEL: eon_shiftedreg_from_and: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #17 -; CHECK-NEXT: eon x0, x1, x8, lsl #53 +; CHECK-NEXT: mov x8, #9007199254740991 // =0x1fffffffffffff +; CHECK-NEXT: orn x8, x8, x0, lsl #36 +; CHECK-NEXT: eor x0, x1, x8 ; CHECK-NEXT: ret %shl = shl i64 %a, 36 %and = and i64 %shl, -9007199254740992 @@ -67,7 +68,7 @@ define i64 @mvn_shiftedreg_from_and(i64 %a) { ; CHECK-LABEL: mvn_shiftedreg_from_and: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #9007199254740991 +; CHECK-NEXT: mov x8, #9007199254740991 // =0x1fffffffffffff ; CHECK-NEXT: orn x0, x8, x0, lsl #36 ; CHECK-NEXT: ret %shl = shl i64 %a, 36 @@ -205,7 +206,7 @@ define i32 @shiftedreg_from_and_negative_andc1(i32 %a, i32 %b) { ; CHECK-LABEL: shiftedreg_from_and_negative_andc1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #26215 +; CHECK-NEXT: mov w8, #26215 // =0x6667 ; CHECK-NEXT: movk w8, #65510, lsl #16 ; CHECK-NEXT: and w8, w8, w0, asr #23 ; CHECK-NEXT: add w0, w8, w1 @@ -221,7 +222,7 @@ define i32 @shiftedreg_from_and_negative_andc2(i32 %a, i32 %b) { ; CHECK-LABEL: shiftedreg_from_and_negative_andc2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-285212672 +; CHECK-NEXT: mov w8, #-285212672 // =0xef000000 ; CHECK-NEXT: and w8, w8, w0, asr #23 ; CHECK-NEXT: add w0, w8, w1 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll --- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll +++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll @@ -182,10 +182,10 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] ; CHECK-NEXT: adrp x8, .LCPI3_2 ; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI3_1] -; CHECK-NEXT: tbl v1.16b, { v0.16b }, v1.16b -; CHECK-NEXT: tbl v0.16b, { v2.16b }, v3.16b -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_2] -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-NEXT: tbl v2.16b, { v2.16b }, v1.16b +; CHECK-NEXT: tbl 
v1.16b, { v0.16b }, v3.16b +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b ; CHECK-NEXT: ret %x = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %y = shufflevector <16 x i8> %c, <16 x i8> %d, <8 x i32> @@ -429,13 +429,13 @@ define <16 x i8> @shuffle4_v4i32_trunc(<4 x i32> %ae, <4 x i32> %be, <4 x i32> %ce, <4 x i32> %de) { ; CHECK-LABEL: shuffle4_v4i32_trunc: ; CHECK: // %bb.0: +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: xtn v4.4h, v0.4s -; CHECK-NEXT: xtn v5.4h, v1.4s -; CHECK-NEXT: xtn v6.4h, v2.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: xtn v7.4h, v3.4s -; CHECK-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: xtn v3.8b, v0.8h +; CHECK-NEXT: xtn v4.8b, v2.8h +; CHECK-NEXT: tbl v0.16b, { v3.16b, v4.16b }, v1.16b ; CHECK-NEXT: ret %a = trunc <4 x i32> %ae to <4 x i8> %b = trunc <4 x i32> %be to <4 x i8> @@ -559,19 +559,17 @@ define <8 x i8> @insert4_v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) { ; CHECK-LABEL: insert4_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: adrp x9, .LCPI14_1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v4.16b, v3.16b +; CHECK-NEXT: dup v4.8b, v0.b[4] ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v0.d[1], v2.d[0] -; CHECK-NEXT: mov v3.16b, v1.16b -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_1] -; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b -; CHECK-NEXT: tbl v1.16b, { v3.16b, v4.16b }, v2.16b -; CHECK-NEXT: trn1 v0.4h, v1.4h, v0.4h -; CHECK-NEXT: trn2 v0.4h, v0.4h, v1.4h +; CHECK-NEXT: mov v4.b[1], v2.b[0] +; CHECK-NEXT: mov v4.b[2], v1.b[15] +; CHECK-NEXT: mov v4.b[3], v3.b[11] +; CHECK-NEXT: mov v4.b[4], v2.b[6] +; CHECK-NEXT: mov v4.b[5], v0.b[3] +; CHECK-NEXT: mov v4.b[6], v3.b[8] +; CHECK-NEXT: mov v4.b[7], v1.b[12] +; CHECK-NEXT: fmov d0, d4 ; CHECK-NEXT: ret %e1 = extractelement <8 x i8> %a, i32 4 %e2 = extractelement <8 x i8> %c, i32 0 @@ -629,17 +627,25 @@ define <16 x i8> @insert4_v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c, <16 x i8> %d) { ; CHECK-LABEL: insert4_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q31_q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v4.8b, v0.b[4] ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v4.16b, v3.16b -; CHECK-NEXT: mov v3.16b, v1.16b -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: adrp x8, .LCPI15_1 -; CHECK-NEXT: mov v0.d[1], v2.d[0] -; CHECK-NEXT: tbl v31.16b, { v3.16b, v4.16b }, v5.16b -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1] -; CHECK-NEXT: tbl v0.16b, { v31.16b, v0.16b }, v1.16b +; CHECK-NEXT: mov v4.b[1], v2.b[0] +; CHECK-NEXT: mov v4.b[2], v1.b[15] +; CHECK-NEXT: mov v4.b[3], v3.b[11] +; CHECK-NEXT: mov v4.b[4], v2.b[6] +; CHECK-NEXT: mov v4.b[5], v0.b[3] +; CHECK-NEXT: mov v4.b[6], v3.b[8] +; CHECK-NEXT: mov v4.b[7], v1.b[12] +; CHECK-NEXT: mov v4.b[8], v0.b[4] +; CHECK-NEXT: mov v4.b[9], v2.b[0] +; CHECK-NEXT: mov v4.b[10], v1.b[15] +; CHECK-NEXT: mov v4.b[11], v3.b[11] +; CHECK-NEXT: mov v4.b[12], v2.b[6] +; CHECK-NEXT: mov v4.b[13], v0.b[3] +; CHECK-NEXT: mov v4.b[14], v3.b[8] +; CHECK-NEXT: mov v4.b[15], v1.b[12] +; CHECK-NEXT: mov v0.16b, v4.16b ; CHECK-NEXT: ret %e1 = extractelement <8 x i8> %a, i32 4 %e2 = extractelement <8 x i8> 
%c, i32 0 @@ -698,35 +704,52 @@ ; CHECK-LABEL: test: ; CHECK: // %bb.0: ; CHECK-NEXT: frintm v0.2d, v0.2d -; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: frintm v4.2d, v4.2d ; CHECK-NEXT: frintm v1.2d, v1.2d -; CHECK-NEXT: frintm v5.2d, v5.2d ; CHECK-NEXT: frintm v2.2d, v2.2d -; CHECK-NEXT: frintm v6.2d, v6.2d ; CHECK-NEXT: frintm v3.2d, v3.2d -; CHECK-NEXT: frintm v7.2d, v7.2d ; CHECK-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-NEXT: fcvtzs v4.2d, v4.2d ; CHECK-NEXT: fcvtzs v1.2d, v1.2d +; CHECK-NEXT: frintm v4.2d, v4.2d +; CHECK-NEXT: frintm v5.2d, v5.2d +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: xtn v16.2s, v1.2d +; CHECK-NEXT: fcvtzs v1.2d, v2.2d +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fcvtzs v2.2d, v3.2d +; CHECK-NEXT: mov w9, v16.s[1] +; CHECK-NEXT: fcvtzs v4.2d, v4.2d +; CHECK-NEXT: frintm v3.2d, v7.2d ; CHECK-NEXT: fcvtzs v5.2d, v5.2d -; CHECK-NEXT: fcvtzs v2.2d, v2.2d -; CHECK-NEXT: fcvtzs v6.2d, v6.2d +; CHECK-NEXT: xtn v7.2s, v1.2d +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: xtn v2.2s, v2.2d +; CHECK-NEXT: mov w8, v7.s[1] +; CHECK-NEXT: xtn v4.2s, v4.2d +; CHECK-NEXT: mov v0.h[1], v16.h[0] +; CHECK-NEXT: xtn v5.2s, v5.2d +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: mov w9, v2.s[1] +; CHECK-NEXT: frintm v6.2d, v6.2d ; CHECK-NEXT: fcvtzs v3.2d, v3.2d -; CHECK-NEXT: xtn v16.2s, v0.2d -; CHECK-NEXT: fcvtzs v0.2d, v7.2d -; CHECK-NEXT: xtn v20.2s, v4.2d -; CHECK-NEXT: xtn v17.2s, v1.2d -; CHECK-NEXT: xtn v21.2s, v5.2d -; CHECK-NEXT: xtn v18.2s, v2.2d -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: xtn v22.2s, v6.2d -; CHECK-NEXT: xtn v19.2s, v3.2d -; CHECK-NEXT: xtn v23.2s, v0.2d -; CHECK-NEXT: tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v1.16b -; CHECK-NEXT: tbl v1.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v1.16b -; CHECK-NEXT: uzp1 v0.8h, v2.8h, v1.8h -; CHECK-NEXT: uzp2 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: mov v0.h[2], v7.h[0] +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: mov w8, v4.s[1] +; CHECK-NEXT: fcvtzs v6.2d, v6.2d +; CHECK-NEXT: xtn v3.2s, v3.2d +; CHECK-NEXT: mov v0.h[3], v2.h[0] +; CHECK-NEXT: mov v1.h[3], w9 +; CHECK-NEXT: mov w9, v5.s[1] +; CHECK-NEXT: xtn v6.2s, v6.2d +; CHECK-NEXT: mov v0.h[4], v4.h[0] +; CHECK-NEXT: mov v1.h[4], w8 +; CHECK-NEXT: mov w8, v6.s[1] +; CHECK-NEXT: mov v0.h[5], v5.h[0] +; CHECK-NEXT: mov v1.h[5], w9 +; CHECK-NEXT: mov w9, v3.s[1] +; CHECK-NEXT: mov v0.h[6], v6.h[0] +; CHECK-NEXT: mov v1.h[6], w8 +; CHECK-NEXT: mov v0.h[7], v3.h[0] +; CHECK-NEXT: mov v1.h[7], w9 ; CHECK-NEXT: ret %l214 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %l213) %l215 = fptosi <2 x double> %l214 to <2 x i16> diff --git a/llvm/test/CodeGen/AArch64/signbit-shift.ll b/llvm/test/CodeGen/AArch64/signbit-shift.ll --- a/llvm/test/CodeGen/AArch64/signbit-shift.ll +++ b/llvm/test/CodeGen/AArch64/signbit-shift.ll @@ -43,7 +43,7 @@ define i32 @sel_ifpos_tval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifpos_tval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: mov w8, #41 // =0x29 ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, ge ; CHECK-NEXT: ret @@ -66,8 +66,9 @@ define i32 @add_sext_ifpos(i32 %x) { ; CHECK-LABEL: add_sext_ifpos: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #31 -; CHECK-NEXT: add w0, w8, #41 +; CHECK-NEXT: mvn w8, w0 +; CHECK-NEXT: asr w8, w8, #31 +; CHECK-NEXT: add w0, w8, #42 ; CHECK-NEXT: ret %c = icmp sgt i32 %x, -1 %e = sext i1 %c to i32 @@ -92,7 +93,7 @@ define i32 @sel_ifpos_fval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifpos_fval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: mov 
w8, #41 // =0x29 ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, lt ; CHECK-NEXT: ret @@ -128,7 +129,7 @@ define i32 @sel_ifneg_tval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifneg_tval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: mov w8, #41 // =0x29 ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, lt ; CHECK-NEXT: ret @@ -162,7 +163,7 @@ define i32 @sel_ifneg_fval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifneg_fval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: mov w8, #41 // =0x29 ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, ge ; CHECK-NEXT: ret @@ -199,7 +200,7 @@ define i32 @sub_lshr_not(i32 %x) { ; CHECK-LABEL: sub_lshr_not: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: bfxil w8, w0, #31, #1 ; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/signbit-test.ll b/llvm/test/CodeGen/AArch64/signbit-test.ll --- a/llvm/test/CodeGen/AArch64/signbit-test.ll +++ b/llvm/test/CodeGen/AArch64/signbit-test.ll @@ -4,7 +4,7 @@ define i64 @test_clear_mask_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i64_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: csel x0, x8, x0, ge ; CHECK-NEXT: ret @@ -22,9 +22,9 @@ define i64 @test_set_mask_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: test_set_mask_i64_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 -; CHECK-NEXT: tst x0, #0x80000000 -; CHECK-NEXT: csel x0, x8, x0, ne +; CHECK-NEXT: mov w8, #42 // =0x2a +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: csel x0, x8, x0, lt ; CHECK-NEXT: ret entry: %a = and i64 %x, 2147483648 @@ -40,7 +40,7 @@ define i64 @test_clear_mask_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i64_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst x0, #0x8000 ; CHECK-NEXT: csel x0, x8, x0, eq ; CHECK-NEXT: ret @@ -58,7 +58,7 @@ define i64 @test_set_mask_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: test_set_mask_i64_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst x0, #0x8000 ; CHECK-NEXT: csel x0, x8, x0, ne ; CHECK-NEXT: ret @@ -76,7 +76,7 @@ define i64 @test_clear_mask_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i64_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst x0, #0x80 ; CHECK-NEXT: csel x0, x8, x0, eq ; CHECK-NEXT: ret @@ -94,7 +94,7 @@ define i64 @test_set_mask_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: test_set_mask_i64_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst x0, #0x80 ; CHECK-NEXT: csel x0, x8, x0, ne ; CHECK-NEXT: ret @@ -112,7 +112,7 @@ define i32 @test_clear_mask_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i32_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst w0, #0x8000 ; CHECK-NEXT: csel w0, w8, w0, eq ; CHECK-NEXT: ret @@ -130,7 +130,7 @@ define i32 @test_set_mask_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: test_set_mask_i32_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst w0, #0x8000 ; CHECK-NEXT: csel w0, w8, w0, ne ; CHECK-NEXT: ret @@ -148,7 +148,7 @@ define i32 @test_clear_mask_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i32_i8: ; CHECK: // %bb.0: // 
%entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst w0, #0x80 ; CHECK-NEXT: csel w0, w8, w0, eq ; CHECK-NEXT: ret @@ -166,7 +166,7 @@ define i32 @test_set_mask_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: test_set_mask_i32_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst w0, #0x80 ; CHECK-NEXT: csel w0, w8, w0, ne ; CHECK-NEXT: ret @@ -184,7 +184,7 @@ define i16 @test_clear_mask_i16_i8(i16 %x) nounwind { ; CHECK-LABEL: test_clear_mask_i16_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w8, #42 // =0x2a ; CHECK-NEXT: tst w0, #0x80 ; CHECK-NEXT: csel w0, w8, w0, eq ; CHECK-NEXT: ret @@ -202,8 +202,9 @@ define i16 @test_set_mask_i16_i8(i16 %x) nounwind { ; CHECK-LABEL: test_set_mask_i16_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 -; CHECK-NEXT: tst w0, #0x80 +; CHECK-NEXT: ubfx w9, w0, #7, #1 +; CHECK-NEXT: mov w8, #42 // =0x2a +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csel w0, w8, w0, ne ; CHECK-NEXT: ret entry: @@ -220,8 +221,9 @@ define i16 @test_set_mask_i16_i7(i16 %x) nounwind { ; CHECK-LABEL: test_set_mask_i16_i7: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #42 -; CHECK-NEXT: tst w0, #0x40 +; CHECK-NEXT: ubfx w9, w0, #6, #1 +; CHECK-NEXT: mov w8, #42 // =0x2a +; CHECK-NEXT: cmp w9, #0 ; CHECK-NEXT: csel w0, w8, w0, ne ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll --- a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll +++ b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll @@ -1,41 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s define i128 @ldp_single_csdb(ptr %p) speculative_load_hardening { +; CHECK-LABEL: ldp_single_csdb: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x1, [x0] +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and x8, x8, x16 +; CHECK-NEXT: and x1, x1, x16 +; CHECK-NEXT: csdb +; CHECK-NEXT: mov x2, sp +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: and x2, x2, x16 +; CHECK-NEXT: mov sp, x2 +; CHECK-NEXT: ret entry: %0 = load i128, ptr %p, align 16 ret i128 %0 -; CHECK-LABEL: ldp_single_csdb -; CHECK: ldp x8, x1, [x0] -; CHECK-NEXT: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and x8, x8, x16 -; CHECK-NEXT: and x1, x1, x16 -; CHECK-NEXT: csdb -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: mov x0, x8 -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret } define double @ld_double(ptr %p) speculative_load_hardening { +; CHECK-LABEL: ld_double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and x0, x0, x16 +; CHECK-NEXT: csdb +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: and x0, x0, x16 +; CHECK-NEXT: mov sp, x0 +; CHECK-NEXT: ret entry: %0 = load double, ptr %p, align 8 ret double %0 ; Checking that the address laoded from is masked for a floating point load. 
-; CHECK-LABEL: ld_double -; CHECK: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and x0, x0, x16 -; CHECK-NEXT: csdb -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret } define i32 @csdb_emitted_for_subreg_use(ptr %p, i32 %b) speculative_load_hardening { +; CHECK-LABEL: csdb_emitted_for_subreg_use: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and x8, x8, x16 +; CHECK-NEXT: csdb +; CHECK-NEXT: add w9, w1, w8 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: csel w0, w1, w9, eq +; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: and x1, x1, x16 +; CHECK-NEXT: mov sp, x1 +; CHECK-NEXT: ret entry: %X = load i64, ptr %p, align 8 %X_trunc = trunc i64 %X to i32 @@ -44,23 +61,24 @@ %ret = select i1 %iszero, i32 %b, i32 %add ret i32 %ret ; Checking that the address laoded from is masked for a floating point load. -; CHECK-LABEL: csdb_emitted_for_subreg_use -; CHECK: ldr x8, [x0] -; CHECK-NEXT: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and x8, x8, x16 ; csdb instruction must occur before the add instruction with w8 as operand. -; CHECK-NEXT: csdb -; CHECK-NEXT: add w9, w1, w8 -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: csel w0, w1, w9, eq -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret } define i64 @csdb_emitted_for_superreg_use(ptr %p, i64 %b) speculative_load_hardening { +; CHECK-LABEL: csdb_emitted_for_superreg_use: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and w8, w8, w16 +; CHECK-NEXT: csdb +; CHECK-NEXT: add x9, x1, x8 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: csel x0, x1, x9, eq +; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: and x1, x1, x16 +; CHECK-NEXT: mov sp, x1 +; CHECK-NEXT: ret entry: %X = load i32, ptr %p, align 4 %X_ext = zext i32 %X to i64 @@ -69,88 +87,84 @@ %ret = select i1 %iszero, i64 %b, i64 %add ret i64 %ret ; Checking that the address laoded from is masked for a floating point load. -; CHECK-LABEL: csdb_emitted_for_superreg_use -; CHECK: ldr w8, [x0] -; CHECK-NEXT: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and w8, w8, w16 ; csdb instruction must occur before the add instruction with x8 as operand. 
-; CHECK-NEXT: csdb -; CHECK-NEXT: add x9, x1, x8 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel x0, x1, x9, eq -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret } define i64 @no_masking_with_full_control_flow_barriers(i64 %a, i64 %b, ptr %p) speculative_load_hardening { -; CHECK-LABEL: no_masking_with_full_control_flow_barriers -; CHECK: dsb sy -; CHECK: isb +; CHECK-LABEL: no_masking_with_full_control_flow_barriers: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dsb sy +; CHECK-NEXT: isb +; CHECK-NEXT: ldr x8, [x2] +; CHECK-NEXT: mov x17, x0 +; CHECK-NEXT: mov x16, x1 +; CHECK-NEXT: //APP +; CHECK-NEXT: hint #12 +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: add x0, x8, x17 +; CHECK-NEXT: ret entry: %0 = tail call i64 asm "hint #12", "={x17},{x16},0"(i64 %b, i64 %a) %X = load i64, ptr %p, align 8 %ret = add i64 %X, %0 -; CHECK-NOT: csdb -; CHECK-NOT: and -; CHECK: ret ret i64 %ret } define void @f_implicitdef_vector_load(ptr %dst, ptr %src) speculative_load_hardening +; CHECK-LABEL: f_implicitdef_vector_load: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and x1, x1, x16 +; CHECK-NEXT: csdb +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: mov v0.d[1], v0.d[0] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: and x0, x0, x16 +; CHECK-NEXT: mov sp, x0 +; CHECK-NEXT: ret { entry: %0 = load <2 x i32>, ptr %src, align 8 %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> store <4 x i32> %shuffle, ptr %dst, align 4 ret void -; CHECK-LABEL: f_implicitdef_vector_load -; CHECK: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and x1, x1, x16 -; CHECK-NEXT: csdb -; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: mov v0.d[1], v0.d[0] -; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret } define <2 x double> @f_usedefvectorload(ptr %a, ptr %b) speculative_load_hardening { +; CHECK-LABEL: f_usedefvectorload: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: and x1, x1, x16 +; CHECK-NEXT: csdb +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: and x0, x0, x16 +; CHECK-NEXT: mov sp, x0 +; CHECK-NEXT: ret entry: -; CHECK-LABEL: f_usedefvectorload -; CHECK: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: and x1, x1, x16 -; CHECK-NEXT: csdb -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret %0 = load double, ptr %b, align 16 %vld1_lane = insertelement <2 x double> , double %0, i32 0 ret <2 x double> %vld1_lane } define i32 @deadload() speculative_load_hardening uwtable { +; CHECK-LABEL: deadload: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp sp, #0 +; CHECK-NEXT: csetm x16, ne +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr w8, [sp, #12] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: and x0, x0, x16 +; CHECK-NEXT: mov sp, x0 +; CHECK-NEXT: ret entry: -; CHECK-LABEL: deadload -; CHECK: cmp sp, #0 -; CHECK-NEXT: csetm x16, ne -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr w8, [sp, #12] -; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; 
CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 -; CHECK-NEXT: mov sp, [[TMPREG]] -; CHECK-NEXT: ret %a = alloca i32, align 4 %val = load volatile i32, ptr %a, align 4 ret i32 undef diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll --- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -265,12 +265,17 @@ ; CHECK-LABEL: frsqrt: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte s1, s0 +; CHECK-NEXT: fcmp s0, #0.0 ; CHECK-NEXT: fmul s2, s1, s1 ; CHECK-NEXT: frsqrts s2, s0, s2 ; CHECK-NEXT: fmul s1, s1, s2 ; CHECK-NEXT: fmul s2, s1, s1 -; CHECK-NEXT: frsqrts s0, s0, s2 -; CHECK-NEXT: fmul s0, s1, s0 +; CHECK-NEXT: fmul s1, s0, s1 +; CHECK-NEXT: frsqrts s2, s0, s2 +; CHECK-NEXT: fmul s1, s1, s2 +; CHECK-NEXT: fcsel s0, s0, s1, eq +; CHECK-NEXT: fmov s1, #1.00000000 +; CHECK-NEXT: fdiv s0, s1, s0 ; CHECK-NEXT: ret %1 = tail call fast float @llvm.sqrt.f32(float %a) %2 = fdiv fast float 1.000000e+00, %1 @@ -287,13 +292,18 @@ ; ; CHECK-LABEL: f2rsqrt: ; CHECK: // %bb.0: -; CHECK-NEXT: frsqrte v1.2s, v0.2s -; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s -; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s -; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s -; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s -; CHECK-NEXT: frsqrts v0.2s, v0.2s, v2.2s -; CHECK-NEXT: fmul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: frsqrte v2.2s, v0.2s +; CHECK-NEXT: fmov v1.2s, #1.00000000 +; CHECK-NEXT: fmul v3.2s, v2.2s, v2.2s +; CHECK-NEXT: frsqrts v3.2s, v0.2s, v3.2s +; CHECK-NEXT: fmul v2.2s, v2.2s, v3.2s +; CHECK-NEXT: fmul v3.2s, v2.2s, v2.2s +; CHECK-NEXT: fmul v2.2s, v0.2s, v2.2s +; CHECK-NEXT: frsqrts v3.2s, v0.2s, v3.2s +; CHECK-NEXT: fmul v2.2s, v2.2s, v3.2s +; CHECK-NEXT: fcmeq v3.2s, v0.2s, #0.0 +; CHECK-NEXT: bif v0.8b, v2.8b, v3.8b +; CHECK-NEXT: fdiv v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ret %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) %2 = fdiv fast <2 x float> , %1 @@ -310,13 +320,18 @@ ; ; CHECK-LABEL: f4rsqrt: ; CHECK: // %bb.0: -; CHECK-NEXT: frsqrte v1.4s, v0.4s -; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s -; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s -; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s -; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s -; CHECK-NEXT: frsqrts v0.4s, v0.4s, v2.4s -; CHECK-NEXT: fmul v0.4s, v1.4s, v0.4s +; CHECK-NEXT: frsqrte v2.4s, v0.4s +; CHECK-NEXT: fmov v1.4s, #1.00000000 +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s +; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s +; CHECK-NEXT: fmul v2.4s, v0.4s, v2.4s +; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fcmeq v3.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: fdiv v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) %2 = fdiv fast <4 x float> , %1 @@ -335,20 +350,29 @@ ; ; CHECK-LABEL: f8rsqrt: ; CHECK: // %bb.0: -; CHECK-NEXT: frsqrte v2.4s, v0.4s -; CHECK-NEXT: frsqrte v3.4s, v1.4s -; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s -; CHECK-NEXT: frsqrts v4.4s, v0.4s, v4.4s +; CHECK-NEXT: frsqrte v3.4s, v0.4s +; CHECK-NEXT: fmov v2.4s, #1.00000000 +; CHECK-NEXT: frsqrte v4.4s, v1.4s ; CHECK-NEXT: fmul v5.4s, v3.4s, v3.4s -; CHECK-NEXT: frsqrts v5.4s, v1.4s, v5.4s -; CHECK-NEXT: fmul v2.4s, v2.4s, v4.4s -; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s -; CHECK-NEXT: frsqrts v0.4s, v0.4s, v4.4s +; CHECK-NEXT: frsqrts v5.4s, v0.4s, v5.4s +; CHECK-NEXT: fmul v6.4s, v4.4s, v4.4s +; CHECK-NEXT: frsqrts v6.4s, v1.4s, v6.4s +; 
CHECK-NEXT: fmul v3.4s, v3.4s, v5.4s +; CHECK-NEXT: fmul v5.4s, v3.4s, v3.4s +; CHECK-NEXT: fmul v3.4s, v0.4s, v3.4s +; CHECK-NEXT: frsqrts v5.4s, v0.4s, v5.4s +; CHECK-NEXT: fmul v4.4s, v4.4s, v6.4s +; CHECK-NEXT: fmul v6.4s, v4.4s, v4.4s +; CHECK-NEXT: frsqrts v6.4s, v1.4s, v6.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v5.4s -; CHECK-NEXT: fmul v4.4s, v3.4s, v3.4s -; CHECK-NEXT: frsqrts v1.4s, v1.4s, v4.4s -; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s +; CHECK-NEXT: fcmeq v5.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v3.16b, v5.16b +; CHECK-NEXT: fmul v3.4s, v1.4s, v4.4s +; CHECK-NEXT: fcmeq v4.4s, v1.4s, #0.0 +; CHECK-NEXT: fdiv v0.4s, v2.4s, v0.4s +; CHECK-NEXT: fmul v3.4s, v3.4s, v6.4s +; CHECK-NEXT: bif v1.16b, v3.16b, v4.16b +; CHECK-NEXT: fdiv v1.4s, v2.4s, v1.4s ; CHECK-NEXT: ret %1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a) %2 = fdiv fast <8 x float> , %1 @@ -366,6 +390,7 @@ ; CHECK-LABEL: drsqrt: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte d1, d0 +; CHECK-NEXT: fcmp d0, #0.0 ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 @@ -373,8 +398,12 @@ ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 ; CHECK-NEXT: fmul d2, d1, d1 -; CHECK-NEXT: frsqrts d0, d0, d2 -; CHECK-NEXT: fmul d0, d1, d0 +; CHECK-NEXT: fmul d1, d0, d1 +; CHECK-NEXT: frsqrts d2, d0, d2 +; CHECK-NEXT: fmul d1, d1, d2 +; CHECK-NEXT: fcsel d0, d0, d1, eq +; CHECK-NEXT: fmov d1, #1.00000000 +; CHECK-NEXT: fdiv d0, d1, d0 ; CHECK-NEXT: ret %1 = tail call fast double @llvm.sqrt.f64(double %a) %2 = fdiv fast double 1.000000e+00, %1 @@ -391,16 +420,21 @@ ; ; CHECK-LABEL: d2rsqrt: ; CHECK: // %bb.0: -; CHECK-NEXT: frsqrte v1.2d, v0.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d -; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d -; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d -; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d -; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d -; CHECK-NEXT: frsqrts v0.2d, v0.2d, v2.2d -; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d +; CHECK-NEXT: frsqrte v2.2d, v0.2d +; CHECK-NEXT: fmov v1.2d, #1.00000000 +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d +; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d +; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d +; CHECK-NEXT: fmul v2.2d, v0.2d, v2.2d +; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fcmeq v3.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: fdiv v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) %2 = fdiv fast <2 x double> , %1 @@ -419,26 +453,35 @@ ; ; CHECK-LABEL: d4rsqrt: ; CHECK: // %bb.0: -; CHECK-NEXT: frsqrte v2.2d, v0.2d -; CHECK-NEXT: frsqrte v3.2d, v1.2d -; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d -; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d +; CHECK-NEXT: frsqrte v3.2d, v0.2d +; CHECK-NEXT: fmov v2.2d, #1.00000000 +; CHECK-NEXT: frsqrte v4.2d, v1.2d ; CHECK-NEXT: fmul v5.2d, v3.2d, v3.2d -; CHECK-NEXT: frsqrts v5.2d, v1.2d, v5.2d -; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d -; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d -; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d +; CHECK-NEXT: frsqrts v5.2d, v0.2d, v5.2d +; CHECK-NEXT: fmul v6.2d, v4.2d, v4.2d +; CHECK-NEXT: frsqrts v6.2d, v1.2d, v6.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d ; CHECK-NEXT: fmul v5.2d, 
v3.2d, v3.2d -; CHECK-NEXT: frsqrts v5.2d, v1.2d, v5.2d -; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d -; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d -; CHECK-NEXT: frsqrts v0.2d, v0.2d, v4.2d +; CHECK-NEXT: frsqrts v5.2d, v0.2d, v5.2d +; CHECK-NEXT: fmul v4.2d, v4.2d, v6.2d +; CHECK-NEXT: fmul v6.2d, v4.2d, v4.2d +; CHECK-NEXT: frsqrts v6.2d, v1.2d, v6.2d +; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d +; CHECK-NEXT: fmul v5.2d, v3.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v0.2d, v3.2d +; CHECK-NEXT: frsqrts v5.2d, v0.2d, v5.2d +; CHECK-NEXT: fmul v4.2d, v4.2d, v6.2d +; CHECK-NEXT: fmul v6.2d, v4.2d, v4.2d +; CHECK-NEXT: frsqrts v6.2d, v1.2d, v6.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d -; CHECK-NEXT: fmul v4.2d, v3.2d, v3.2d -; CHECK-NEXT: frsqrts v1.2d, v1.2d, v4.2d -; CHECK-NEXT: fmul v0.2d, v2.2d, v0.2d -; CHECK-NEXT: fmul v1.2d, v3.2d, v1.2d +; CHECK-NEXT: fcmeq v5.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v3.16b, v5.16b +; CHECK-NEXT: fmul v3.2d, v1.2d, v4.2d +; CHECK-NEXT: fcmeq v4.2d, v1.2d, #0.0 +; CHECK-NEXT: fdiv v0.2d, v2.2d, v0.2d +; CHECK-NEXT: fmul v3.2d, v3.2d, v6.2d +; CHECK-NEXT: bif v1.16b, v3.16b, v4.16b +; CHECK-NEXT: fdiv v1.2d, v2.2d, v1.2d ; CHECK-NEXT: ret %1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) %2 = fdiv fast <4 x double> , %1 @@ -454,6 +497,7 @@ ; CHECK-LABEL: sqrt_fdiv_common_operand: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte d1, d0 +; CHECK-NEXT: fcmp d0, #0.0 ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 @@ -463,7 +507,9 @@ ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 -; CHECK-NEXT: fmul d0, d0, d1 +; CHECK-NEXT: fmul d1, d0, d1 +; CHECK-NEXT: fcsel d1, d0, d1, eq +; CHECK-NEXT: fdiv d0, d0, d1 ; CHECK-NEXT: ret %sqrt = call fast double @llvm.sqrt.f64(double %x) %r = fdiv fast double %x, %sqrt @@ -516,9 +562,9 @@ ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 ; CHECK-NEXT: fmul d1, d0, d1 -; CHECK-NEXT: fcsel d2, d0, d1, eq -; CHECK-NEXT: fmov d0, d1 -; CHECK-NEXT: str d2, [x0] +; CHECK-NEXT: fcsel d1, d0, d1, eq +; CHECK-NEXT: fdiv d0, d0, d1 +; CHECK-NEXT: str d1, [x0] ; CHECK-NEXT: ret %sqrt = call fast double @llvm.sqrt.f64(double %x) store double %sqrt, ptr %p @@ -530,7 +576,7 @@ ; FAULT-LABEL: sqrt_simplify_before_recip_3_uses: ; FAULT: // %bb.0: ; FAULT-NEXT: fsqrt d0, d0 -; FAULT-NEXT: mov x8, #4631107791820423168 +; FAULT-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; FAULT-NEXT: fmov d1, #1.00000000 ; FAULT-NEXT: fmov d2, x8 ; FAULT-NEXT: fdiv d1, d1, d0 @@ -542,17 +588,22 @@ ; CHECK-LABEL: sqrt_simplify_before_recip_3_uses: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte d1, d0 -; CHECK-NEXT: mov x8, #4631107791820423168 +; CHECK-NEXT: fcmp d0, #0.0 +; CHECK-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 +; CHECK-NEXT: fmul d2, d1, d1 +; CHECK-NEXT: frsqrts d2, d0, d2 +; CHECK-NEXT: fmul d1, d1, d2 +; CHECK-NEXT: fmul d2, d1, d1 +; CHECK-NEXT: frsqrts d2, d0, d2 +; CHECK-NEXT: fmul d1, d1, d2 +; CHECK-NEXT: fmul d2, d1, d1 +; CHECK-NEXT: frsqrts d2, d0, d2 +; CHECK-NEXT: fmul d1, d1, d2 +; CHECK-NEXT: fmov d2, #1.00000000 +; CHECK-NEXT: fmul d1, d0, d1 +; CHECK-NEXT: fcsel d1, d0, d1, eq +; CHECK-NEXT: fdiv d1, d2, d1 ; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: fmul d3, d1, d1 -; CHECK-NEXT: frsqrts d3, d0, d3 -; CHECK-NEXT: fmul d1, d1, d3 -; CHECK-NEXT: fmul d3, d1, d1 -; CHECK-NEXT: frsqrts d3, d0, d3 -; CHECK-NEXT: fmul d1, d1, d3 -; CHECK-NEXT: fmul d3, d1, d1 -; CHECK-NEXT: frsqrts d3, d0, d3 -; CHECK-NEXT: fmul d1, d1, 
d3 ; CHECK-NEXT: fmul d0, d0, d1 ; CHECK-NEXT: fmul d2, d1, d2 ; CHECK-NEXT: str d1, [x0] @@ -571,9 +622,9 @@ ; FAULT-LABEL: sqrt_simplify_before_recip_3_uses_order: ; FAULT: // %bb.0: ; FAULT-NEXT: fsqrt d0, d0 -; FAULT-NEXT: mov x8, #4631107791820423168 +; FAULT-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; FAULT-NEXT: fmov d1, x8 -; FAULT-NEXT: mov x8, #140737488355328 +; FAULT-NEXT: mov x8, #140737488355328 // =0x800000000000 ; FAULT-NEXT: movk x8, #16453, lsl #48 ; FAULT-NEXT: fmov d2, x8 ; FAULT-NEXT: fdiv d1, d1, d0 @@ -585,8 +636,9 @@ ; CHECK-LABEL: sqrt_simplify_before_recip_3_uses_order: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte d1, d0 -; CHECK-NEXT: mov x9, #140737488355328 -; CHECK-NEXT: mov x8, #4631107791820423168 +; CHECK-NEXT: fcmp d0, #0.0 +; CHECK-NEXT: mov x9, #140737488355328 // =0x800000000000 +; CHECK-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-NEXT: movk x9, #16453, lsl #48 ; CHECK-NEXT: fmov d3, x9 ; CHECK-NEXT: fmul d2, d1, d1 @@ -598,6 +650,10 @@ ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 +; CHECK-NEXT: fmov d2, #1.00000000 +; CHECK-NEXT: fmul d1, d0, d1 +; CHECK-NEXT: fcsel d1, d0, d1, eq +; CHECK-NEXT: fdiv d1, d2, d1 ; CHECK-NEXT: fmov d2, x8 ; CHECK-NEXT: fmul d0, d0, d1 ; CHECK-NEXT: fmul d2, d1, d2 @@ -620,8 +676,8 @@ ; FAULT: // %bb.0: ; FAULT-NEXT: fsqrt d0, d0 ; FAULT-NEXT: fmov d1, #1.00000000 -; FAULT-NEXT: mov x9, #140737488355328 -; FAULT-NEXT: mov x8, #4631107791820423168 +; FAULT-NEXT: mov x9, #140737488355328 // =0x800000000000 +; FAULT-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; FAULT-NEXT: movk x9, #16453, lsl #48 ; FAULT-NEXT: fmov d2, x8 ; FAULT-NEXT: fmov d3, x9 @@ -637,8 +693,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte d1, d0 ; CHECK-NEXT: fcmp d0, #0.0 -; CHECK-NEXT: mov x9, #140737488355328 -; CHECK-NEXT: mov x8, #4631107791820423168 +; CHECK-NEXT: mov x9, #140737488355328 // =0x800000000000 +; CHECK-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 ; CHECK-NEXT: movk x9, #16453, lsl #48 ; CHECK-NEXT: fmov d3, x9 ; CHECK-NEXT: fmul d2, d1, d1 @@ -650,13 +706,15 @@ ; CHECK-NEXT: fmul d2, d1, d1 ; CHECK-NEXT: frsqrts d2, d0, d2 ; CHECK-NEXT: fmul d1, d1, d2 -; CHECK-NEXT: fmul d2, d0, d1 -; CHECK-NEXT: fmul d3, d1, d3 -; CHECK-NEXT: str d1, [x0] -; CHECK-NEXT: fcsel d2, d0, d2, eq -; CHECK-NEXT: fdiv d0, d0, d2 +; CHECK-NEXT: fmov d2, #1.00000000 +; CHECK-NEXT: fmul d1, d0, d1 +; CHECK-NEXT: fcsel d1, d0, d1, eq +; CHECK-NEXT: fdiv d1, d2, d1 ; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: fmul d0, d0, d1 ; CHECK-NEXT: fmul d2, d1, d2 +; CHECK-NEXT: fmul d3, d1, d3 +; CHECK-NEXT: str d1, [x0] ; CHECK-NEXT: str d2, [x1] ; CHECK-NEXT: str d3, [x2] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/srem-lkk.ll b/llvm/test/CodeGen/AArch64/srem-lkk.ll --- a/llvm/test/CodeGen/AArch64/srem-lkk.ll +++ b/llvm/test/CodeGen/AArch64/srem-lkk.ll @@ -4,14 +4,14 @@ define i32 @fold_srem_positive_odd(i32 %x) { ; CHECK-LABEL: fold_srem_positive_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: mov w8, #37253 // =0x9185 ; CHECK-NEXT: movk w8, #44150, lsl #16 ; CHECK-NEXT: smull x8, w0, w8 ; CHECK-NEXT: lsr x8, x8, #32 ; CHECK-NEXT: add w8, w8, w0 ; CHECK-NEXT: asr w9, w8, #6 ; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: mov w9, #95 // =0x5f ; CHECK-NEXT: msub w0, w8, w9, w0 ; CHECK-NEXT: ret %1 = srem i32 %x, 95 @@ -22,13 +22,13 @@ define i32 @fold_srem_positive_even(i32 %x) { ; CHECK-LABEL: 
fold_srem_positive_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #36849 +; CHECK-NEXT: mov w8, #36849 // =0x8ff1 ; CHECK-NEXT: movk w8, #15827, lsl #16 ; CHECK-NEXT: smull x8, w0, w8 ; CHECK-NEXT: lsr x9, x8, #63 ; CHECK-NEXT: asr x8, x8, #40 ; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #1060 +; CHECK-NEXT: mov w9, #1060 // =0x424 ; CHECK-NEXT: msub w0, w8, w9, w0 ; CHECK-NEXT: ret %1 = srem i32 %x, 1060 @@ -39,13 +39,13 @@ define i32 @fold_srem_negative_odd(i32 %x) { ; CHECK-LABEL: fold_srem_negative_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65445 +; CHECK-NEXT: mov w8, #65445 // =0xffa5 ; CHECK-NEXT: movk w8, #42330, lsl #16 ; CHECK-NEXT: smull x8, w0, w8 ; CHECK-NEXT: lsr x9, x8, #63 ; CHECK-NEXT: asr x8, x8, #40 ; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #-723 +; CHECK-NEXT: mov w9, #-723 // =0xfffffd2d ; CHECK-NEXT: msub w0, w8, w9, w0 ; CHECK-NEXT: ret %1 = srem i32 %x, -723 @@ -56,13 +56,13 @@ define i32 @fold_srem_negative_even(i32 %x) { ; CHECK-LABEL: fold_srem_negative_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #62439 +; CHECK-NEXT: mov w8, #62439 // =0xf3e7 ; CHECK-NEXT: movk w8, #64805, lsl #16 ; CHECK-NEXT: smull x8, w0, w8 ; CHECK-NEXT: lsr x9, x8, #63 ; CHECK-NEXT: asr x8, x8, #40 ; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w9, #-22981 +; CHECK-NEXT: mov w9, #-22981 // =0xffffa63b ; CHECK-NEXT: msub w0, w8, w9, w0 ; CHECK-NEXT: ret %1 = srem i32 %x, -22981 @@ -74,14 +74,14 @@ define i32 @combine_srem_sdiv(i32 %x) { ; CHECK-LABEL: combine_srem_sdiv: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: mov w8, #37253 // =0x9185 ; CHECK-NEXT: movk w8, #44150, lsl #16 ; CHECK-NEXT: smull x8, w0, w8 ; CHECK-NEXT: lsr x8, x8, #32 ; CHECK-NEXT: add w8, w8, w0 ; CHECK-NEXT: asr w9, w8, #6 ; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: mov w9, #95 // =0x5f ; CHECK-NEXT: msub w9, w8, w9, w0 ; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret @@ -95,14 +95,14 @@ define i64 @dont_fold_srem_i64(i64 %x) { ; CHECK-LABEL: dont_fold_srem_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #58849 +; CHECK-NEXT: mov x8, #58849 // =0xe5e1 ; CHECK-NEXT: movk x8, #48148, lsl #16 ; CHECK-NEXT: movk x8, #33436, lsl #32 ; CHECK-NEXT: movk x8, #21399, lsl #48 ; CHECK-NEXT: smulh x8, x0, x8 ; CHECK-NEXT: asr x9, x8, #5 ; CHECK-NEXT: add x8, x9, x8, lsr #63 -; CHECK-NEXT: mov w9, #98 +; CHECK-NEXT: mov w9, #98 // =0x62 ; CHECK-NEXT: msub x0, x8, x9, x0 ; CHECK-NEXT: ret %1 = srem i64 %x, 98 diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll @@ -4,12 +4,12 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; CHECK-LABEL: test_srem_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #33099 -; CHECK-NEXT: mov w9, #24493 +; CHECK-NEXT: mov w8, #33099 // =0x814b +; CHECK-NEXT: mov w9, #24493 // =0x5fad ; CHECK-NEXT: movk w8, #8026, lsl #16 ; CHECK-NEXT: movk w9, #41, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #48987 +; CHECK-NEXT: mov w9, #48987 // =0xbf5b ; CHECK-NEXT: movk w9, #82, lsl #16 ; CHECK-NEXT: and w8, w8, #0x1fffffff ; CHECK-NEXT: cmp w8, w9 @@ -24,7 +24,7 @@ ; CHECK-LABEL: test_srem_even: ; CHECK: // %bb.0: ; CHECK-NEXT: sbfx w9, w0, #0, #4 -; CHECK-NEXT: mov w8, #6 +; CHECK-NEXT: mov w8, #6 // =0x6 ; CHECK-NEXT: add w9, w9, w9, lsl #1 ; CHECK-NEXT: ubfx w10, w9, #7, #1 ; CHECK-NEXT: add w9, w10, w9, lsr #4 @@ -57,10 +57,10 @@ 
define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; CHECK-LABEL: test_srem_vec: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #7282 +; CHECK-NEXT: mov x8, #7282 // =0x1c72 ; CHECK-NEXT: sbfx x9, x0, #0, #33 ; CHECK-NEXT: movk x8, #29127, lsl #16 -; CHECK-NEXT: mov x11, #7281 +; CHECK-NEXT: mov x11, #7281 // =0x1c71 ; CHECK-NEXT: movk x8, #50972, lsl #32 ; CHECK-NEXT: movk x11, #29127, lsl #16 ; CHECK-NEXT: movk x8, #7281, lsl #48 @@ -83,7 +83,7 @@ ; CHECK-NEXT: add x11, x11, x11, lsl #3 ; CHECK-NEXT: fmov d0, x9 ; CHECK-NEXT: add x10, x10, x11 -; CHECK-NEXT: mov x9, #8589934591 +; CHECK-NEXT: mov x9, #8589934591 // =0x1ffffffff ; CHECK-NEXT: adrp x11, .LCPI3_0 ; CHECK-NEXT: adrp x12, .LCPI3_1 ; CHECK-NEXT: mov v0.d[1], x8 diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll b/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll @@ -4,12 +4,12 @@ define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone { ; CHECK-LABEL: test_minsize: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #5 -; CHECK-NEXT: mov w9, #42 +; CHECK-NEXT: mov w8, #5 // =0x5 +; CHECK-NEXT: mov w9, #42 // =0x2a ; CHECK-NEXT: sdiv w8, w0, w8 ; CHECK-NEXT: add w8, w8, w8, lsl #2 ; CHECK-NEXT: cmp w0, w8 -; CHECK-NEXT: mov w8, #-10 +; CHECK-NEXT: mov w8, #-10 // =0xfffffff6 ; CHECK-NEXT: csel w0, w9, w8, eq ; CHECK-NEXT: ret %rem = srem i32 %X, 5 @@ -21,16 +21,17 @@ define i32 @test_optsize(i32 %X) optsize nounwind readnone { ; CHECK-LABEL: test_optsize: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #858993459 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: mov w8, #-10 -; CHECK-NEXT: mov w9, #42 -; CHECK-NEXT: csel w0, w9, w8, lo +; CHECK-NEXT: mov w8, #26215 // =0x6667 +; CHECK-NEXT: movk w8, #26214, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #42 // =0x2a +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: mov w8, #-10 // =0xfffffff6 +; CHECK-NEXT: csel w0, w9, w8, eq ; CHECK-NEXT: ret %rem = srem i32 %X, 5 %cmp = icmp eq i32 %rem, 0 diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll @@ -6,22 +6,23 @@ ; CHECK-LABEL: test_srem_odd_even: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 -; CHECK-NEXT: adrp x9, .LCPI0_1 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_1 +; CHECK-NEXT: mov v1.s[1], v0.s[1] +; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_1] ; CHECK-NEXT: adrp x8, .LCPI0_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1] -; CHECK-NEXT: adrp x9, .LCPI0_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI0_3] -; CHECK-NEXT: adrp x8, .LCPI0_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_4] -; CHECK-NEXT: orr v0.16b, v1.16b, 
v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -35,17 +36,25 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_allones_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: adrp x8, .LCPI1_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] +; CHECK-NEXT: adrp x8, .LCPI1_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_2] +; CHECK-NEXT: adrp x8, .LCPI1_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, @@ -56,18 +65,26 @@ define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_allones_ne: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: adrp x8, .LCPI2_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: adrp x8, .LCPI2_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_2] +; CHECK-NEXT: adrp x8, .LCPI2_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: cmhi v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, @@ -79,21 +96,26 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; 
CHECK-NEXT: shl v0.4s, v2.4s, #31 -; CHECK-NEXT: ushr v1.4s, v2.4s, #1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: adrp x8, .LCPI3_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -103,21 +125,26 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_ne: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: shl v0.4s, v2.4s, #31 -; CHECK-NEXT: ushr v1.4s, v2.4s, #1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: adrp x8, .LCPI4_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, @@ -130,22 +157,25 @@ ; CHECK-LABEL: test_srem_odd_even_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI5_0 -; CHECK-NEXT: adrp x9, .LCPI5_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] +; CHECK-NEXT: adrp x8, .LCPI5_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1] ; CHECK-NEXT: adrp x8, .LCPI5_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_1] -; CHECK-NEXT: adrp x9, .LCPI5_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI5_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI5_3] -; CHECK-NEXT: adrp x8, .LCPI5_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr 
q3, [x8, :lo12:.LCPI5_2] +; CHECK-NEXT: adrp x8, .LCPI5_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -156,22 +186,25 @@ ; CHECK-LABEL: test_srem_odd_even_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI6_0 -; CHECK-NEXT: adrp x9, .LCPI6_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] +; CHECK-NEXT: adrp x8, .LCPI6_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1] ; CHECK-NEXT: adrp x8, .LCPI6_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_1] -; CHECK-NEXT: adrp x9, .LCPI6_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI6_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI6_3] -; CHECK-NEXT: adrp x8, .LCPI6_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2] +; CHECK-NEXT: adrp x8, .LCPI6_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_3] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, @@ -186,22 +219,23 @@ ; CHECK-LABEL: test_srem_odd_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI7_0 -; CHECK-NEXT: adrp x9, .LCPI7_1 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_0] +; CHECK-NEXT: adrp x8, .LCPI7_1 +; CHECK-NEXT: mov v1.s[2], v0.s[2] +; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_1] ; CHECK-NEXT: adrp x8, .LCPI7_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI7_1] -; CHECK-NEXT: adrp x9, .LCPI7_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI7_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI7_3] -; CHECK-NEXT: adrp x8, .LCPI7_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_2] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -214,22 +248,19 @@ ; CHECK-LABEL: test_srem_even_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI8_0 -; CHECK-NEXT: adrp x9, .LCPI8_1 -; 
CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: adrp x8, .LCPI8_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_1] -; CHECK-NEXT: adrp x9, .LCPI8_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI8_3] -; CHECK-NEXT: adrp x8, .LCPI8_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: adrp x8, .LCPI8_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #3 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_1] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -242,22 +273,24 @@ ; CHECK-LABEL: test_srem_odd_even_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: adrp x9, .LCPI9_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: adrp x8, .LCPI9_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1] ; CHECK-NEXT: adrp x8, .LCPI9_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI9_1] -; CHECK-NEXT: adrp x9, .LCPI9_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI9_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI9_3] -; CHECK-NEXT: adrp x8, .LCPI9_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v2.16b, v0.16b, v2.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2] +; CHECK-NEXT: adrp x8, .LCPI9_3 +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_3] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -271,17 +304,25 @@ define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: adrp x8, .LCPI10_1 +; CHECK-NEXT: mov v1.s[2], v0.s[2] +; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI10_1] +; CHECK-NEXT: adrp x8, .LCPI10_2 +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI10_2] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add 
v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, @@ -294,21 +335,24 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: shl v0.4s, v2.4s, #31 -; CHECK-NEXT: ushr v1.4s, v2.4s, #1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] +; CHECK-NEXT: adrp x8, .LCPI11_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_1] +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: adrp x8, .LCPI11_2 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -321,22 +365,26 @@ ; CHECK-LABEL: test_srem_odd_even_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI12_0 -; CHECK-NEXT: adrp x9, .LCPI12_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: adrp x8, .LCPI12_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1] ; CHECK-NEXT: adrp x8, .LCPI12_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI12_1] -; CHECK-NEXT: adrp x9, .LCPI12_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI12_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI12_3] -; CHECK-NEXT: adrp x8, .LCPI12_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v2.16b, v0.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2] +; CHECK-NEXT: adrp x8, .LCPI12_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -440,22 +488,25 @@ ; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: adrp x9, .LCPI16_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] +; CHECK-NEXT: adrp x8, .LCPI16_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; 
CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] ; CHECK-NEXT: adrp x8, .LCPI16_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI16_1] -; CHECK-NEXT: adrp x9, .LCPI16_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI16_3] -; CHECK-NEXT: adrp x8, .LCPI16_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2] +; CHECK-NEXT: adrp x8, .LCPI16_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_3] +; CHECK-NEXT: mov v2.s[1], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -468,22 +519,25 @@ ; CHECK-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI17_0 -; CHECK-NEXT: adrp x9, .LCPI17_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: adrp x8, .LCPI17_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1] ; CHECK-NEXT: adrp x8, .LCPI17_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI17_1] -; CHECK-NEXT: adrp x9, .LCPI17_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI17_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI17_3] -; CHECK-NEXT: adrp x8, .LCPI17_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2] +; CHECK-NEXT: adrp x8, .LCPI17_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_3] +; CHECK-NEXT: mov v2.s[1], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -496,22 +550,25 @@ ; CHECK-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI18_0 -; CHECK-NEXT: adrp x9, .LCPI18_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: adrp x8, .LCPI18_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] ; CHECK-NEXT: adrp x8, .LCPI18_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_1] -; CHECK-NEXT: adrp x9, .LCPI18_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI18_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI18_3] -; CHECK-NEXT: adrp x8, .LCPI18_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_4] -; CHECK-NEXT: orr v0.16b, 
v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2] +; CHECK-NEXT: adrp x8, .LCPI18_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_3] +; CHECK-NEXT: mov v2.s[1], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -525,17 +582,25 @@ define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_allones_and_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: adrp x8, .LCPI19_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_1] +; CHECK-NEXT: adrp x8, .LCPI19_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_2] +; CHECK-NEXT: adrp x8, .LCPI19_3 +; CHECK-NEXT: ushr v3.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: and v2.16b, v3.16b, v2.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI19_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, @@ -548,21 +613,28 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_and_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: adrp x8, .LCPI20_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: shl v0.4s, v2.4s, #31 -; CHECK-NEXT: ushr v1.4s, v2.4s, #1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: adrp x9, .LCPI20_3 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] +; CHECK-NEXT: adrp x8, .LCPI20_1 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI20_3] +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1] +; CHECK-NEXT: adrp x8, .LCPI20_2 +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_2] +; CHECK-NEXT: adrp x8, .LCPI20_4 +; CHECK-NEXT: ushr v4.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: and v2.16b, v4.16b, v3.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_4] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ 
-575,22 +647,27 @@ ; CHECK-LABEL: test_srem_odd_even_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI21_0 -; CHECK-NEXT: adrp x9, .LCPI21_1 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI21_3 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: adrp x8, .LCPI21_1 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI21_3] +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1] ; CHECK-NEXT: adrp x8, .LCPI21_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI21_1] -; CHECK-NEXT: adrp x9, .LCPI21_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI21_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI21_3] +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_2] ; CHECK-NEXT: adrp x8, .LCPI21_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ushr v4.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: and v2.16b, v4.16b, v3.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_4] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -605,22 +682,26 @@ ; CHECK-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI22_0 -; CHECK-NEXT: adrp x9, .LCPI22_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] +; CHECK-NEXT: adrp x8, .LCPI22_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1] ; CHECK-NEXT: adrp x8, .LCPI22_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI22_1] -; CHECK-NEXT: adrp x9, .LCPI22_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI22_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI22_3] -; CHECK-NEXT: adrp x8, .LCPI22_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v2.16b, v0.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2] +; CHECK-NEXT: adrp x8, .LCPI22_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -633,22 +714,23 @@ ; CHECK-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI23_0 -; CHECK-NEXT: adrp x9, .LCPI23_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] +; CHECK-NEXT: adrp x8, .LCPI23_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_1] +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: adrp x8, .LCPI23_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI23_1] 
-; CHECK-NEXT: adrp x9, .LCPI23_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI23_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI23_3] -; CHECK-NEXT: adrp x8, .LCPI23_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2] +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -661,22 +743,26 @@ ; CHECK-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI24_0 -; CHECK-NEXT: adrp x9, .LCPI24_1 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] +; CHECK-NEXT: adrp x8, .LCPI24_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_1] ; CHECK-NEXT: adrp x8, .LCPI24_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI24_1] -; CHECK-NEXT: adrp x9, .LCPI24_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI24_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI24_3] -; CHECK-NEXT: adrp x8, .LCPI24_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: and v2.16b, v0.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2] +; CHECK-NEXT: adrp x8, .LCPI24_3 +; CHECK-NEXT: ushr v2.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: mov v2.s[2], wzr +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -690,22 +776,26 @@ ; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI25_0 -; CHECK-NEXT: adrp x9, .LCPI25_1 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: movi v3.2d, #0x000000ffffffff ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0] +; CHECK-NEXT: adrp x8, .LCPI25_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_1] ; CHECK-NEXT: adrp x8, .LCPI25_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI25_1] -; CHECK-NEXT: adrp x9, .LCPI25_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI25_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI25_3] -; CHECK-NEXT: adrp x8, .LCPI25_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: 
ldr q2, [x8, :lo12:.LCPI25_2] +; CHECK-NEXT: adrp x8, .LCPI25_3 +; CHECK-NEXT: ushr v4.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: and v2.16b, v4.16b, v3.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI25_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -717,22 +807,26 @@ ; CHECK-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI26_0 -; CHECK-NEXT: adrp x9, .LCPI26_1 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: movi v3.2d, #0x000000ffffffff ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0] +; CHECK-NEXT: adrp x8, .LCPI26_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1] ; CHECK-NEXT: adrp x8, .LCPI26_2 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI26_1] -; CHECK-NEXT: adrp x9, .LCPI26_3 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI26_2] -; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI26_3] -; CHECK-NEXT: adrp x8, .LCPI26_4 -; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_4] -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_2] +; CHECK-NEXT: adrp x8, .LCPI26_3 +; CHECK-NEXT: ushr v4.4s, v1.4s, #31 +; CHECK-NEXT: sshl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: and v2.16b, v4.16b, v3.16b +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI26_3] +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -5,18 +5,18 @@ define <4 x i32> @test_srem_odd_25(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_25: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47185 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, lsl #16 +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: movi v3.4s, #25 ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 -; CHECK-NEXT: mov w8, #28834 -; CHECK-NEXT: movk w8, #2621, lsl #16 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #3 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: dup v0.4s, w8 -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, @@ -29,22 +29,19 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_100: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47184 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, 
lsl #16 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: movi v3.4s, #100 ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 -; CHECK-NEXT: mov w8, #23592 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: movk w8, #655, lsl #16 -; CHECK-NEXT: shl v0.4s, v2.4s, #30 -; CHECK-NEXT: ushr v1.4s, v2.4s, #2 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #5 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -58,18 +55,18 @@ define <4 x i32> @test_srem_odd_neg25(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_neg25: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47185 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 -; CHECK-NEXT: mov w8, #28834 -; CHECK-NEXT: movk w8, #2621, lsl #16 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: adrp x8, .LCPI2_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #3 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: dup v0.4s, w8 -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, @@ -82,22 +79,19 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_neg100: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47184 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 -; CHECK-NEXT: mov w8, #23592 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: movk w8, #655, lsl #16 -; CHECK-NEXT: shl v0.4s, v2.4s, #30 -; CHECK-NEXT: ushr v1.4s, v2.4s, #2 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: sshr v2.4s, v1.4s, #5 +; CHECK-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -112,7 +106,7 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_undef1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: mov w8, #34079 // =0x851f ; CHECK-NEXT: movk w8, #20971, lsl #16 ; 
CHECK-NEXT: movi v3.4s, #25 ; CHECK-NEXT: dup v1.4s, w8 @@ -135,7 +129,7 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_undef1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: mov w8, #34079 // =0x851f ; CHECK-NEXT: movk w8, #20971, lsl #16 ; CHECK-NEXT: movi v3.4s, #100 ; CHECK-NEXT: dup v1.4s, w8 diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -8,14 +8,15 @@ define i32 @test_srem_odd(i32 %X) nounwind { ; CHECK-LABEL: test_srem_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #858993459 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #26215 // =0x6667 +; CHECK-NEXT: movk w8, #26214, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 5 %cmp = icmp eq i32 %srem, 0 @@ -26,15 +27,16 @@ define i32 @test_srem_odd_25(i32 %X) nounwind { ; CHECK-LABEL: test_srem_odd_25: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47185 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #28835 -; CHECK-NEXT: movk w9, #2621, lsl #16 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #35 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #25 // =0x19 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 25 %cmp = icmp eq i32 %srem, 0 @@ -46,12 +48,18 @@ define i32 @test_srem_odd_bit30(i32 %X) nounwind { ; CHECK-LABEL: test_srem_odd_bit30: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: mov w9, #1 -; CHECK-NEXT: movk w8, #27306, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: cmp w8, #3 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: sbfiz x10, x0, #29, #32 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov w8, #3 // =0x3 +; CHECK-NEXT: lsr x10, x9, #63 +; CHECK-NEXT: asr x9, x9, #59 +; CHECK-NEXT: movk w8, #16384, lsl #16 +; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: msub w8, w9, w8, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 1073741827 %cmp = icmp eq i32 %srem, 0 @@ -63,12 +71,17 @@ define i32 @test_srem_odd_bit31(i32 %X) nounwind { ; CHECK-LABEL: test_srem_odd_bit31: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #21845 -; CHECK-NEXT: mov w9, #1 -; CHECK-NEXT: movk w8, #54613, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: cmp w8, #3 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x9, w0 +; CHECK-NEXT: mov w8, #-2147483645 // =0x80000003 +; CHECK-NEXT: add x9, x9, x9, lsl #29 +; CHECK-NEXT: neg x9, x9 +; CHECK-NEXT: lsr x10, x9, #63 +; CHECK-NEXT: asr x9, x9, #60 +; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: msub w8, w9, w8, w0 +; CHECK-NEXT: cmp w8, 
#0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 2147483651 %cmp = icmp eq i32 %srem, 0 @@ -83,13 +96,15 @@ define i16 @test_srem_even(i16 %X) nounwind { ; CHECK-LABEL: test_srem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #4680 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: lsl w10, w8, #15 -; CHECK-NEXT: bfxil w10, w8, #1, #15 -; CHECK-NEXT: cmp w9, w10, uxth -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #18725 // =0x4925 +; CHECK-NEXT: sxth w9, w0 +; CHECK-NEXT: mul w8, w9, w8 +; CHECK-NEXT: asr w9, w8, #18 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #14 // =0xe +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: tst w8, #0xffff +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %srem = srem i16 %X, 14 %cmp = icmp ne i16 %srem, 0 @@ -100,16 +115,16 @@ define i32 @test_srem_even_100(i32 %X) nounwind { ; CHECK-LABEL: test_srem_even_100: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #47184 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #1310, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #23593 -; CHECK-NEXT: movk w9, #655, lsl #16 -; CHECK-NEXT: ror w8, w8, #2 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #37 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #100 // =0x64 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 100 %cmp = icmp eq i32 %srem, 0 @@ -121,13 +136,17 @@ define i32 @test_srem_even_bit30(i32 %X) nounwind { ; CHECK-LABEL: test_srem_even_bit30: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #20165 -; CHECK-NEXT: mov w9, #8 -; CHECK-NEXT: movk w8, #64748, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: ror w8, w8, #3 -; CHECK-NEXT: cmp w8, #3 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #65433 // =0xff99 +; CHECK-NEXT: movk w8, #16383, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #60 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: mov w9, #104 // =0x68 +; CHECK-NEXT: movk w9, #16384, lsl #16 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 1073741928 %cmp = icmp eq i32 %srem, 0 @@ -139,13 +158,18 @@ define i32 @test_srem_even_bit31(i32 %X) nounwind { ; CHECK-LABEL: test_srem_even_bit31: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #1285 -; CHECK-NEXT: mov w9, #2 -; CHECK-NEXT: movk w8, #50437, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: cmp w8, #3 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #65433 // =0xff99 +; CHECK-NEXT: movk w8, #32767, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: sub w8, w8, w0 +; CHECK-NEXT: asr w9, w8, #30 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #102 // =0x66 +; CHECK-NEXT: movk w9, #32768, lsl #16 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 2147483750 %cmp = icmp eq i32 %srem, 0 @@ -161,15 +185,15 @@ define i32 @test_srem_odd_setne(i32 %X) nounwind { ; CHECK-LABEL: test_srem_odd_setne: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 
-; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #13106 -; CHECK-NEXT: movk w9, #13107, lsl #16 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mov w8, #26215 // =0x6667 +; CHECK-NEXT: movk w8, #26214, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %srem = srem i32 %X, 5 %cmp = icmp ne i32 %srem, 0 @@ -181,15 +205,14 @@ define i32 @test_srem_negative_odd(i32 %X) nounwind { ; CHECK-LABEL: test_srem_negative_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #39321 -; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #13106 -; CHECK-NEXT: movk w9, #13107, lsl #16 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mov w8, #-1717986919 // =0x99999999 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x9, x8, #63 +; CHECK-NEXT: asr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmn w0, w8 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %srem = srem i32 %X, -5 %cmp = icmp ne i32 %srem, 0 @@ -199,14 +222,17 @@ define i32 @test_srem_negative_even(i32 %X) nounwind { ; CHECK-LABEL: test_srem_negative_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mov w8, #56173 // =0xdb6d +; CHECK-NEXT: movk w8, #28086, lsl #16 +; CHECK-NEXT: smull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: sub w8, w8, w0 +; CHECK-NEXT: asr w9, w8, #3 +; CHECK-NEXT: add w8, w9, w8, lsr #31 +; CHECK-NEXT: mov w9, #-14 // =0xfffffff2 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %srem = srem i32 %X, -14 %cmp = icmp ne i32 %srem, 0 @@ -222,7 +248,7 @@ define i32 @test_srem_one(i32 %X) nounwind { ; CHECK-LABEL: test_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret %srem = srem i32 %X, 1 %cmp = icmp eq i32 %srem, 0 @@ -268,7 +294,7 @@ define i32 @test_srem_allones(i32 %X) nounwind { ; CHECK-LABEL: test_srem_allones: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret %srem = srem i32 %X, 4294967295 %cmp = icmp eq i32 %srem, 0 diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll --- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -7,12 +7,12 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w8, v0.h[1] ; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: mov w10, #63421 -; CHECK-NEXT: mov w11, #37253 +; CHECK-NEXT: mov w10, #63421 // =0xf7bd +; CHECK-NEXT: mov w11, #37253 // =0x9185 ; CHECK-NEXT: movk w10, #31710, lsl #16 ; CHECK-NEXT: movk w11, #44150, lsl #16 ; CHECK-NEXT: smov w13, v0.h[2] -; CHECK-NEXT: mov w12, #33437 +; CHECK-NEXT: mov w12, #33437 // =0x829d ; CHECK-NEXT: smull x10, w8, w10 ; CHECK-NEXT: movk w12, #21399, lsl #16 ; CHECK-NEXT: smull x11, w9, w11 @@ -24,8 +24,8 @@ ; CHECK-NEXT: asr w15, w11, #6 ; CHECK-NEXT: add w10, w14, w10, lsr #31 ; CHECK-NEXT: add w11, w15, w11, lsr #31 -; CHECK-NEXT: mov 
w14, #95 -; CHECK-NEXT: mov w15, #-124 +; CHECK-NEXT: mov w14, #95 // =0x5f +; CHECK-NEXT: mov w15, #-124 // =0xffffff84 ; CHECK-NEXT: smull x12, w13, w12 ; CHECK-NEXT: msub w9, w11, w14, w9 ; CHECK-NEXT: msub w8, w10, w15, w8 @@ -33,9 +33,9 @@ ; CHECK-NEXT: asr x11, x12, #37 ; CHECK-NEXT: smov w12, v0.h[3] ; CHECK-NEXT: add w10, w11, w10 -; CHECK-NEXT: mov w11, #98 +; CHECK-NEXT: mov w11, #98 // =0x62 ; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov w9, #63249 +; CHECK-NEXT: mov w9, #63249 // =0xf711 ; CHECK-NEXT: movk w9, #48808, lsl #16 ; CHECK-NEXT: msub w10, w10, w11, w13 ; CHECK-NEXT: smull x9, w12, w9 @@ -43,7 +43,7 @@ ; CHECK-NEXT: lsr x8, x9, #63 ; CHECK-NEXT: asr x9, x9, #40 ; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: mov w9, #-1003 +; CHECK-NEXT: mov w9, #-1003 // =0xfffffc15 ; CHECK-NEXT: mov v0.h[2], w10 ; CHECK-NEXT: msub w8, w8, w9, w12 ; CHECK-NEXT: mov v0.h[3], w8 @@ -58,11 +58,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: mov w8, #37253 // =0x9185 ; CHECK-NEXT: movk w8, #44150, lsl #16 ; CHECK-NEXT: smov w10, v0.h[1] ; CHECK-NEXT: smov w14, v0.h[2] -; CHECK-NEXT: mov w12, #95 +; CHECK-NEXT: mov w12, #95 // =0x5f ; CHECK-NEXT: smull x11, w9, w8 ; CHECK-NEXT: smull x13, w10, w8 ; CHECK-NEXT: lsr x11, x11, #32 @@ -105,12 +105,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w9, v0.h[0] -; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: mov w8, #37253 // =0x9185 ; CHECK-NEXT: movk w8, #44150, lsl #16 ; CHECK-NEXT: smov w10, v0.h[1] ; CHECK-NEXT: smov w11, v0.h[2] ; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: mov w14, #95 +; CHECK-NEXT: mov w14, #95 // =0x5f ; CHECK-NEXT: smull x13, w9, w8 ; CHECK-NEXT: smull x15, w10, w8 ; CHECK-NEXT: lsr x13, x13, #32 @@ -158,7 +158,7 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w9, v0.h[1] ; CHECK-NEXT: smov w10, v0.h[0] -; CHECK-NEXT: mov w8, #37253 +; CHECK-NEXT: mov w8, #37253 // =0x9185 ; CHECK-NEXT: smov w12, v0.h[2] ; CHECK-NEXT: movk w8, #44150, lsl #16 ; CHECK-NEXT: negs w11, w9 @@ -181,7 +181,7 @@ ; CHECK-NEXT: csneg w9, w9, w10, mi ; CHECK-NEXT: asr w10, w8, #6 ; CHECK-NEXT: add w8, w10, w8, lsr #31 -; CHECK-NEXT: mov w10, #95 +; CHECK-NEXT: mov w10, #95 // =0x5f ; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: msub w8, w8, w10, w11 ; CHECK-NEXT: mov v0.h[3], w8 @@ -197,11 +197,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w8, v0.h[1] -; CHECK-NEXT: mov w9, #30865 +; CHECK-NEXT: mov w9, #30865 // =0x7891 ; CHECK-NEXT: movk w9, #51306, lsl #16 ; CHECK-NEXT: smov w10, v0.h[2] -; CHECK-NEXT: mov w11, #17097 -; CHECK-NEXT: mov w12, #654 +; CHECK-NEXT: mov w11, #17097 // =0x42c9 +; CHECK-NEXT: mov w12, #654 // =0x28e ; CHECK-NEXT: movk w11, #45590, lsl #16 ; CHECK-NEXT: smull x9, w8, w9 ; CHECK-NEXT: smull x11, w10, w11 @@ -211,13 +211,13 @@ ; CHECK-NEXT: asr w13, w9, #9 ; CHECK-NEXT: add w11, w11, w10 ; CHECK-NEXT: add w9, w13, w9, lsr #31 -; CHECK-NEXT: mov w13, #23 +; CHECK-NEXT: mov w13, #23 // =0x17 ; CHECK-NEXT: msub w8, w9, w12, w8 ; CHECK-NEXT: asr w9, w11, #4 ; CHECK-NEXT: smov w12, v0.h[3] ; CHECK-NEXT: add w9, w9, w11, lsr #31 ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov w11, #47143 +; CHECK-NEXT: mov w11, #47143 // =0xb827 ; CHECK-NEXT: movk w11, #24749, lsl #16 ; CHECK-NEXT: msub w9, w9, w13, w10 ; CHECK-NEXT: smull x10, w12, w11 @@ -225,7 +225,7 @@ ; CHECK-NEXT: lsr x8, x10, #63 ; CHECK-NEXT: asr 
x10, x10, #43 ; CHECK-NEXT: add w8, w10, w8 -; CHECK-NEXT: mov w10, #5423 +; CHECK-NEXT: mov w10, #5423 // =0x152f ; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: msub w8, w8, w10, w12 ; CHECK-NEXT: mov v0.h[3], w8 @@ -241,11 +241,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: smov w9, v0.h[2] -; CHECK-NEXT: mov w8, #17097 +; CHECK-NEXT: mov w8, #17097 // =0x42c9 ; CHECK-NEXT: movk w8, #45590, lsl #16 ; CHECK-NEXT: smov w10, v0.h[1] ; CHECK-NEXT: smov w12, v0.h[3] -; CHECK-NEXT: mov w11, #23 +; CHECK-NEXT: mov w11, #23 // =0x17 ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: smull x8, w9, w8 ; CHECK-NEXT: lsr x8, x8, #32 @@ -256,7 +256,7 @@ ; CHECK-NEXT: and w10, w10, #0x7fff ; CHECK-NEXT: and w13, w13, #0x7fff ; CHECK-NEXT: csneg w10, w10, w13, mi -; CHECK-NEXT: mov w13, #47143 +; CHECK-NEXT: mov w13, #47143 // =0xb827 ; CHECK-NEXT: movk w13, #24749, lsl #16 ; CHECK-NEXT: msub w8, w8, w11, w9 ; CHECK-NEXT: smull x9, w12, w13 @@ -264,7 +264,7 @@ ; CHECK-NEXT: lsr x10, x9, #63 ; CHECK-NEXT: asr x9, x9, #43 ; CHECK-NEXT: add w9, w9, w10 -; CHECK-NEXT: mov w10, #5423 +; CHECK-NEXT: mov w10, #5423 // =0x152f ; CHECK-NEXT: mov v1.h[2], w8 ; CHECK-NEXT: msub w8, w9, w10, w12 ; CHECK-NEXT: mov v1.h[3], w8 @@ -278,14 +278,14 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) { ; CHECK-LABEL: dont_fold_srem_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #8549 +; CHECK-NEXT: mov x8, #8549 // =0x2165 ; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: movk x8, #22795, lsl #16 -; CHECK-NEXT: mov x12, #6055 +; CHECK-NEXT: mov x12, #6055 // =0x17a7 ; CHECK-NEXT: movk x8, #17096, lsl #32 ; CHECK-NEXT: movk x12, #58853, lsl #16 ; CHECK-NEXT: movk x8, #45590, lsl #48 -; CHECK-NEXT: mov x14, #21445 +; CHECK-NEXT: mov x14, #21445 // =0x53c5 ; CHECK-NEXT: mov x10, v1.d[1] ; CHECK-NEXT: movk x12, #47142, lsl #32 ; CHECK-NEXT: smulh x8, x9, x8 @@ -297,16 +297,16 @@ ; CHECK-NEXT: asr x13, x8, #4 ; CHECK-NEXT: movk x14, #25653, lsl #48 ; CHECK-NEXT: add x8, x13, x8, lsr #63 -; CHECK-NEXT: mov w13, #23 +; CHECK-NEXT: mov w13, #23 // =0x17 ; CHECK-NEXT: smulh x12, x10, x12 ; CHECK-NEXT: smulh x14, x11, x14 ; CHECK-NEXT: msub x8, x8, x13, x9 ; CHECK-NEXT: asr x13, x12, #11 ; CHECK-NEXT: add x12, x13, x12, lsr #63 ; CHECK-NEXT: asr x13, x14, #8 -; CHECK-NEXT: mov w9, #5423 +; CHECK-NEXT: mov w9, #5423 // =0x152f ; CHECK-NEXT: add x13, x13, x14, lsr #63 -; CHECK-NEXT: mov w14, #654 +; CHECK-NEXT: mov w14, #654 // =0x28e ; CHECK-NEXT: msub x9, x12, x9, x10 ; CHECK-NEXT: fmov d1, x8 ; CHECK-NEXT: msub x10, x13, x14, x11 diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -134,15 +134,17 @@ define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-LABEL: v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1 { v0.b }[0], [x1] -; CHECK-NEXT: add x8, x1, #1 -; CHECK-NEXT: ld1 { v1.b }[0], [x0] -; CHECK-NEXT: add x9, x0, #1 -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: ldrsb w8, [x1] +; CHECK-NEXT: ldrsb w9, [x0] +; CHECK-NEXT: ldrsb w10, [x1, #1] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: ldrsb w9, [x0, #1] +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: shl v1.2s, v1.2s, #24 -; CHECK-NEXT: sqsub v0.2s, v1.2s, v0.2s +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushr 
v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 @@ -174,15 +176,17 @@ define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-LABEL: v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1 { v0.h }[0], [x1] -; CHECK-NEXT: add x8, x1, #2 -; CHECK-NEXT: ld1 { v1.h }[0], [x0] -; CHECK-NEXT: add x9, x0, #2 -; CHECK-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: ldrsh w8, [x1] +; CHECK-NEXT: ldrsh w9, [x0] +; CHECK-NEXT: ldrsh w10, [x1, #2] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: ldrsh w9, [x0, #2] +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: sqsub v0.2s, v1.2s, v0.2s +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 diff --git a/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll b/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll --- a/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll +++ b/llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll @@ -16,12 +16,12 @@ ; CHECK-NEXT: bl return_in_block ; CHECK-NEXT: adrp x8, in_block_store ; CHECK-NEXT: add x8, x8, :lo12:in_block_store -; CHECK-NEXT: str d0, [x8] -; CHECK-NEXT: str d1, [x8, #8] -; CHECK-NEXT: str d2, [x8, #16] ; CHECK-NEXT: str d3, [x8, #24] -; CHECK-NEXT: str d4, [x8, #32] ; CHECK-NEXT: str d5, [x8, #40] +; CHECK-NEXT: str d4, [x8, #32] +; CHECK-NEXT: str d2, [x8, #16] +; CHECK-NEXT: str d1, [x8, #8] +; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w30 @@ -40,9 +40,10 @@ ; CHECK-NEXT: bl return_in_block ; CHECK-NEXT: adrp x8, in_block_store ; CHECK-NEXT: add x8, x8, :lo12:in_block_store -; CHECK-NEXT: stp d0, d1, [x8] -; CHECK-NEXT: stp d2, d3, [x8, #16] -; CHECK-NEXT: stp d4, d5, [x8, #32] +; CHECK-NEXT: stp d3, d4, [x8, #24] +; CHECK-NEXT: str d5, [x8, #40] +; CHECK-NEXT: stp d1, d2, [x8, #8] +; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %1 = call %T_IN_BLOCK @return_in_block() @@ -59,9 +60,10 @@ ; CHECK-NEXT: bl return_in_block ; CHECK-NEXT: adrp x8, in_block_store ; CHECK-NEXT: add x8, x8, :lo12:in_block_store -; CHECK-NEXT: stp d0, d1, [x8] -; CHECK-NEXT: stp d2, d3, [x8, #16] -; CHECK-NEXT: stp d4, d5, [x8, #32] +; CHECK-NEXT: stp d3, d4, [x8, #24] +; CHECK-NEXT: str d5, [x8, #40] +; CHECK-NEXT: stp d1, d2, [x8, #8] +; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w30 diff --git a/llvm/test/CodeGen/AArch64/sve-aba.ll b/llvm/test/CodeGen/AArch64/sve-aba.ll --- a/llvm/test/CodeGen/AArch64/sve-aba.ll +++ b/llvm/test/CodeGen/AArch64/sve-aba.ll @@ -77,7 +77,9 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: sxtb z1.h, p0/m, z1.h ; CHECK-NEXT: sxtb z2.h, p0/m, z2.h -; CHECK-NEXT: saba z0.h, z1.h, z2.h +; CHECK-NEXT: sub z1.h, z1.h, z2.h +; CHECK-NEXT: abs z1.h, p0/m, z1.h +; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: ret %b.sext = sext %b to %c.sext = sext %c to @@ -128,7 +130,9 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxth z1.s, p0/m, z1.s ; CHECK-NEXT: sxth z2.s, p0/m, z2.s -; CHECK-NEXT: saba z0.s, z1.s, z2.s +; CHECK-NEXT: sub z1.s, z1.s, z2.s +; CHECK-NEXT: abs z1.s, p0/m, z1.s +; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: ret %b.sext = sext %b to 
%c.sext = sext %c to @@ -179,7 +183,9 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtw z1.d, p0/m, z1.d ; CHECK-NEXT: sxtw z2.d, p0/m, z2.d -; CHECK-NEXT: saba z0.d, z1.d, z2.d +; CHECK-NEXT: sub z1.d, z1.d, z2.d +; CHECK-NEXT: abs z1.d, p0/m, z1.d +; CHECK-NEXT: add z0.d, z0.d, z1.d ; CHECK-NEXT: ret %b.sext = sext %b to %c.sext = sext %c to @@ -231,9 +237,13 @@ define @uaba_b_promoted_ops( %a, %b, %c) #0 { ; CHECK-LABEL: uaba_b_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: mov z2.b, p1/z, #1 // =0x1 -; CHECK-NEXT: uaba z0.b, z1.b, z2.b +; CHECK-NEXT: mov z1.b, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.b, p0/z, #1 // =0x1 +; CHECK-NEXT: add z2.b, p1/m, z2.b, z1.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: abs z1.b, p0/m, z2.b +; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: ret %b.zext = zext %b to %c.zext = zext %c to @@ -283,7 +293,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z2.h, z2.h, #0xff -; CHECK-NEXT: uaba z0.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: sub z1.h, z1.h, z2.h +; CHECK-NEXT: abs z1.h, p0/m, z1.h +; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: ret %b.zext = zext %b to %c.zext = zext %c to @@ -333,7 +346,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and z1.s, z1.s, #0xffff ; CHECK-NEXT: and z2.s, z2.s, #0xffff -; CHECK-NEXT: uaba z0.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sub z1.s, z1.s, z2.s +; CHECK-NEXT: abs z1.s, p0/m, z1.s +; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: ret %b.zext = zext %b to %c.zext = zext %c to @@ -383,7 +399,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff ; CHECK-NEXT: and z2.d, z2.d, #0xffffffff -; CHECK-NEXT: uaba z0.d, z1.d, z2.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sub z1.d, z1.d, z2.d +; CHECK-NEXT: abs z1.d, p0/m, z1.d +; CHECK-NEXT: add z0.d, z0.d, z1.d ; CHECK-NEXT: ret %b.zext = zext %b to %c.zext = zext %c to diff --git a/llvm/test/CodeGen/AArch64/sve-abd.ll b/llvm/test/CodeGen/AArch64/sve-abd.ll --- a/llvm/test/CodeGen/AArch64/sve-abd.ll +++ b/llvm/test/CodeGen/AArch64/sve-abd.ll @@ -56,7 +56,8 @@ ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: sxtb z0.h, p0/m, z0.h ; CHECK-NEXT: sxtb z1.h, p0/m, z1.h -; CHECK-NEXT: sabd z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: ret %a.sext = sext %a to %b.sext = sext %b to @@ -85,7 +86,8 @@ ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxth z0.s, p0/m, z0.s ; CHECK-NEXT: sxth z1.s, p0/m, z1.s -; CHECK-NEXT: sabd z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: ret %a.sext = sext %a to %b.sext = sext %b to @@ -114,7 +116,8 @@ ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: sxtw z1.d, p0/m, z1.d -; CHECK-NEXT: sabd z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: ret %a.sext = sext %a to %b.sext = sext %b to @@ -144,10 +147,12 @@ define @uabd_b_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_b_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p2.b -; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 -; CHECK-NEXT: mov z1.b, p1/z, #1 // =0x1 -; CHECK-NEXT: uabd z0.b, p2/m, z0.b, z1.b +; CHECK-NEXT: mov z0.b, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 +; CHECK-NEXT: add z1.b, p1/m, z1.b, z0.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: abs z0.b, p0/m, z1.b ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = 
zext %b to @@ -173,10 +178,11 @@ define @uabd_h_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_h_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEXT: uabd z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to @@ -202,10 +208,11 @@ define @uabd_s_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_s_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: and z1.s, z1.s, #0xffff -; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to @@ -231,10 +238,11 @@ define @uabd_d_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_d_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff -; CHECK-NEXT: uabd z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to @@ -248,17 +256,9 @@ define @uabd_non_matching_extension( %a, %b) #0 { ; CHECK-LABEL: uabd_non_matching_extension: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0xff -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpkhi z3.d, z1.s -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub z0.d, z0.d, z1.d -; CHECK-NEXT: sub z1.d, z2.d, z3.d -; CHECK-NEXT: abs z1.d, p0/m, z1.d -; CHECK-NEXT: abs z0.d, p0/m, z0.d -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to @@ -273,10 +273,11 @@ define @uabd_non_matching_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_non_matching_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: and z1.s, z1.s, #0xffff -; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -64,13 +64,13 @@ ; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1] ; CHECK-NEXT: fmov s0, #1.00000000 ; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: mov w1, #1 -; CHECK-NEXT: mov w2, #2 -; CHECK-NEXT: mov w3, #3 -; CHECK-NEXT: mov w4, #4 -; CHECK-NEXT: mov w5, #5 -; CHECK-NEXT: mov w6, #6 -; CHECK-NEXT: mov w7, #7 +; CHECK-NEXT: mov w1, #1 // =0x1 +; CHECK-NEXT: mov w2, #2 // =0x2 +; CHECK-NEXT: mov w3, #3 // =0x3 +; CHECK-NEXT: mov w4, #4 // =0x4 +; CHECK-NEXT: mov w5, #5 // =0x5 +; CHECK-NEXT: mov w6, #6 // =0x6 +; CHECK-NEXT: mov w7, #7 // =0x7 ; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z16.d }, p0, [x9] @@ -158,18 +158,18 @@ ; CHECK-LABEL: foo4: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z6.d }, p0/z, [x3, #1, mul vl] -; CHECK-NEXT: ld1d { z7.d }, p0/z, [x3] -; CHECK-NEXT: ld1d { z24.d }, p0/z, [x3, #3, mul vl] -; CHECK-NEXT: ld1d { z25.d }, p0/z, [x3, 
#2, mul vl] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x3] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x3, #1, mul vl] +; CHECK-NEXT: ld1d { z24.d }, p0/z, [x3, #2, mul vl] +; CHECK-NEXT: ld1d { z25.d }, p0/z, [x3, #3, mul vl] ; CHECK-NEXT: st1d { z4.d }, p0, [x0, #3, mul vl] ; CHECK-NEXT: st1d { z3.d }, p0, [x0, #2, mul vl] ; CHECK-NEXT: st1d { z2.d }, p0, [x0, #1, mul vl] ; CHECK-NEXT: st1d { z1.d }, p0, [x0] -; CHECK-NEXT: st1d { z25.d }, p0, [x1, #2, mul vl] -; CHECK-NEXT: st1d { z24.d }, p0, [x1, #3, mul vl] -; CHECK-NEXT: st1d { z7.d }, p0, [x1] -; CHECK-NEXT: st1d { z6.d }, p0, [x1, #1, mul vl] +; CHECK-NEXT: st1d { z25.d }, p0, [x1, #3, mul vl] +; CHECK-NEXT: st1d { z24.d }, p0, [x1, #2, mul vl] +; CHECK-NEXT: st1d { z7.d }, p0, [x1, #1, mul vl] +; CHECK-NEXT: st1d { z6.d }, p0, [x1] ; CHECK-NEXT: st1d { z5.d }, p0, [x2] ; CHECK-NEXT: ret entry: @@ -184,18 +184,18 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr x8, [sp] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8, #1, mul vl] -; CHECK-NEXT: ld1d { z6.d }, p0/z, [x8] -; CHECK-NEXT: ld1d { z7.d }, p0/z, [x8, #3, mul vl] -; CHECK-NEXT: ld1d { z24.d }, p0/z, [x8, #2, mul vl] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x8, #2, mul vl] +; CHECK-NEXT: ld1d { z24.d }, p0/z, [x8, #3, mul vl] ; CHECK-NEXT: st1d { z4.d }, p0, [x6, #3, mul vl] ; CHECK-NEXT: st1d { z3.d }, p0, [x6, #2, mul vl] ; CHECK-NEXT: st1d { z2.d }, p0, [x6, #1, mul vl] ; CHECK-NEXT: st1d { z1.d }, p0, [x6] -; CHECK-NEXT: st1d { z24.d }, p0, [x7, #2, mul vl] -; CHECK-NEXT: st1d { z7.d }, p0, [x7, #3, mul vl] -; CHECK-NEXT: st1d { z6.d }, p0, [x7] -; CHECK-NEXT: st1d { z5.d }, p0, [x7, #1, mul vl] +; CHECK-NEXT: st1d { z24.d }, p0, [x7, #3, mul vl] +; CHECK-NEXT: st1d { z7.d }, p0, [x7, #2, mul vl] +; CHECK-NEXT: st1d { z6.d }, p0, [x7, #1, mul vl] +; CHECK-NEXT: st1d { z5.d }, p0, [x7] ; CHECK-NEXT: ret entry: store volatile %x1, * %ptr1 @@ -208,14 +208,14 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x2] -; CHECK-NEXT: ld1d { z6.d }, p0/z, [x2, #2, mul vl] -; CHECK-NEXT: ld1d { z7.d }, p0/z, [x2, #1, mul vl] +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x2, #1, mul vl] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x2, #2, mul vl] ; CHECK-NEXT: st1d { z5.d }, p0, [x0, #3, mul vl] ; CHECK-NEXT: st1d { z4.d }, p0, [x0, #2, mul vl] ; CHECK-NEXT: st1d { z3.d }, p0, [x0, #1, mul vl] ; CHECK-NEXT: st1d { z2.d }, p0, [x0] -; CHECK-NEXT: st1d { z7.d }, p0, [x1, #1, mul vl] -; CHECK-NEXT: st1d { z6.d }, p0, [x1, #2, mul vl] +; CHECK-NEXT: st1d { z7.d }, p0, [x1, #2, mul vl] +; CHECK-NEXT: st1d { z6.d }, p0, [x1, #1, mul vl] ; CHECK-NEXT: st1d { z1.d }, p0, [x1] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll @@ -21,7 +21,7 @@ define void @masked_gather_base_plus_stride_v4f64(ptr %dst, ptr %src) #0 { ; CHECK-LABEL: masked_gather_base_plus_stride_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-32 +; CHECK-NEXT: mov x8, #-32 // =0xffffffffffffffe0 ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: index z0.d, #-2, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1, z0.d, lsl #3] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll --- 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll @@ -46,7 +46,7 @@ define void @ctlz_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: ctlz_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -134,7 +134,7 @@ define void @ctlz_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: ctlz_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -222,7 +222,7 @@ define void @ctlz_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: ctlz_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -314,7 +314,7 @@ define void @ctlz_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: ctlz_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -406,7 +406,7 @@ define void @ctpop_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: ctpop_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -496,7 +496,7 @@ define void @ctpop_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: ctpop_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -588,7 +588,7 @@ define void @ctpop_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: ctpop_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -682,7 +682,7 @@ define void @ctpop_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: ctpop_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -779,7 +779,7 @@ define void @cttz_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: cttz_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -877,7 +877,7 @@ define void @cttz_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: cttz_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -977,7 +977,7 @@ define void @cttz_v16i32(ptr %a) #0 { ; 
VBITS_GE_256-LABEL: cttz_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1077,7 +1077,7 @@ define void @cttz_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: cttz_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll @@ -47,7 +47,7 @@ define void @bitcast_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: bitcast_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -135,7 +135,7 @@ define void @bitcast_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: bitcast_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -223,7 +223,7 @@ define void @bitcast_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: bitcast_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll @@ -42,7 +42,7 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: build_vector_minus2_dec32_v4i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #-32 +; VBITS_GE_256-NEXT: mov x8, #-32 // =0xffffffffffffffe0 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: index z0.d, #-2, x8 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] @@ -53,11 +53,6 @@ ; Constant but not a sequence. 
define void @build_vector_no_stride_v4i64(ptr %a) #0 { -; VBITS_GE_256-LABEL: .LCPI4_0: -; VBITS_GE_256: .xword 0 -; VBITS_GE_256-NEXT: .xword 4 -; VBITS_GE_256-NEXT: .xword 1 -; VBITS_GE_256-NEXT: .xword 8 ; VBITS_GE_256-LABEL: build_vector_no_stride_v4i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: adrp x8, .LCPI4_0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll @@ -56,7 +56,7 @@ ; VBITS_GE_256-LABEL: concat_v64i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2, x8] @@ -214,7 +214,7 @@ ; VBITS_GE_256-LABEL: concat_v32i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1] @@ -343,7 +343,7 @@ ; VBITS_GE_256-LABEL: concat_v16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] @@ -448,7 +448,7 @@ ; VBITS_GE_256-LABEL: concat_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] @@ -557,7 +557,7 @@ ; VBITS_GE_256-LABEL: concat_v32f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2, x8, lsl #1] @@ -686,7 +686,7 @@ ; VBITS_GE_256-LABEL: concat_v16f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2, x8, lsl #2] @@ -791,7 +791,7 @@ ; VBITS_GE_256-LABEL: concat_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x2, x8, lsl #3] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll @@ -45,7 +45,7 @@ define void @extract_subvector_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: extract_subvector_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { 
z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] @@ -137,7 +137,7 @@ define void @extract_subvector_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: extract_subvector_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] @@ -228,7 +228,7 @@ define void @extract_subvector_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: extract_subvector_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] @@ -308,7 +308,7 @@ define void @extract_subvector_v8i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x8, #4 // =0x4 ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -322,12 +322,12 @@ define void @extract_subvector_v16i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: extract_subvector_v16i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 -; VBITS_GE_256-NEXT: mov x9, #12 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x9, #12 // =0xc ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_256-NEXT: ret @@ -340,7 +340,7 @@ define void @extract_subvector_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -392,7 +392,7 @@ define void @extract_subvector_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: extract_subvector_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] @@ -483,7 +483,7 @@ define void @extract_subvector_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: extract_subvector_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] @@ -563,7 +563,7 @@ define void @extract_subvector_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: extract_subvector_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll @@ -46,7 +46,7 
@@ define half @extractelement_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: extractelement_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: mov z0.h, z0.h[15] @@ -69,7 +69,7 @@ ; CHECK-LABEL: extractelement_v64f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: whilels p0.h, xzr, x8 ; CHECK-NEXT: lastb h0, p0, z0.h @@ -83,7 +83,7 @@ ; CHECK-LABEL: extractelement_v128f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: mov w8, #127 +; CHECK-NEXT: mov w8, #127 // =0x7f ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: whilels p0.h, xzr, x8 ; CHECK-NEXT: lastb h0, p0, z0.h @@ -130,7 +130,7 @@ define float @extractelement_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: extractelement_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: mov z0.s, z0.s[7] @@ -153,7 +153,7 @@ ; CHECK-LABEL: extractelement_v32f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: whilels p0.s, xzr, x8 ; CHECK-NEXT: lastb s0, p0, z0.s @@ -167,7 +167,7 @@ ; CHECK-LABEL: extractelement_v64f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: whilels p0.s, xzr, x8 ; CHECK-NEXT: lastb s0, p0, z0.s @@ -212,7 +212,7 @@ define double @extractelement_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: extractelement_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: mov z0.d, z0.d[3] @@ -235,7 +235,7 @@ ; CHECK-LABEL: extractelement_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: mov w8, #15 +; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: whilels p0.d, xzr, x8 ; CHECK-NEXT: lastb d0, p0, z0.d @@ -249,7 +249,7 @@ ; CHECK-LABEL: extractelement_v32f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: whilels p0.d, xzr, x8 ; CHECK-NEXT: lastb d0, p0, z0.d diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll @@ -65,7 +65,7 @@ define void @test_copysign_v32f16_v32f16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: test_copysign_v32f16_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -189,7 +189,7 @@ define void @test_copysign_v16f32_v16f32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: test_copysign_v16f32_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; 
VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -298,7 +298,7 @@ define void @test_copysign_v8f64_v8f64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: test_copysign_v8f64_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -436,30 +436,17 @@ ; SplitVecRes mismatched define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { -; CHECK_NO_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32: -; CHECK_NO_EXTEND_ROUND: // %bb.0: -; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d, vl4 -; CHECK_NO_EXTEND_ROUND-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK_NO_EXTEND_ROUND-NEXT: ld1w { z1.d }, p0/z, [x1] -; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z1.d, p0/m, z1.s -; CHECK_NO_EXTEND_ROUND-NEXT: and z0.d, z0.d, #0x7fffffffffffffff -; CHECK_NO_EXTEND_ROUND-NEXT: and z1.d, z1.d, #0x8000000000000000 -; CHECK_NO_EXTEND_ROUND-NEXT: orr z0.d, z0.d, z1.d -; CHECK_NO_EXTEND_ROUND-NEXT: st1d { z0.d }, p0, [x0] -; CHECK_NO_EXTEND_ROUND-NEXT: ret -; -; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32: -; CHECK_EXTEND_ROUND: // %bb.0: -; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d, vl4 -; CHECK_EXTEND_ROUND-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK_EXTEND_ROUND-NEXT: ldr q1, [x1] -; CHECK_EXTEND_ROUND-NEXT: uunpklo z1.d, z1.s -; CHECK_EXTEND_ROUND-NEXT: fcvt z1.d, p0/m, z1.s -; CHECK_EXTEND_ROUND-NEXT: and z0.d, z0.d, #0x7fffffffffffffff -; CHECK_EXTEND_ROUND-NEXT: and z1.d, z1.d, #0x8000000000000000 -; CHECK_EXTEND_ROUND-NEXT: orr z0.d, z0.d, z1.d -; CHECK_EXTEND_ROUND-NEXT: st1d { z0.d }, p0, [x0] -; CHECK_EXTEND_ROUND-NEXT: ret +; CHECK-LABEL: test_copysign_v4f64_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x1] +; CHECK-NEXT: fcvt z1.d, p0/m, z1.s +; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x float>, ptr %bp %tmp0 = fpext <4 x float> %b to <4 x double> @@ -556,3 +543,6 @@ declare <32 x double> @llvm.copysign.v32f64(<32 x double> %a, <32 x double> %b) #0 attributes #0 = { "target-features"="+sve" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK_EXTEND_ROUND: {{.*}} +; CHECK_NO_EXTEND_ROUND: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll @@ -48,7 +48,7 @@ define void @fadd_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fadd_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -146,7 +146,7 @@ define void @fadd_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fadd_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -244,7 +244,7 @@ define void @fadd_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fadd_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -346,7 +346,7 @@ define void @fdiv_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fdiv_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -444,7 +444,7 @@ define void @fdiv_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fdiv_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -542,7 +542,7 @@ define void @fdiv_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fdiv_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -648,7 +648,7 @@ define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-LABEL: fma_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -758,7 +758,7 @@ define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-LABEL: fma_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -867,7 +867,7 @@ define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-LABEL: fma_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -977,7 +977,7 @@ define void @fmul_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: 
fmul_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1075,7 +1075,7 @@ define void @fmul_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fmul_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1173,7 +1173,7 @@ define void @fmul_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fmul_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1273,7 +1273,7 @@ define void @fneg_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: fneg_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1361,7 +1361,7 @@ define void @fneg_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: fneg_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1449,7 +1449,7 @@ define void @fneg_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: fneg_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1541,7 +1541,7 @@ define void @fsqrt_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: fsqrt_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1629,7 +1629,7 @@ define void @fsqrt_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: fsqrt_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1717,7 +1717,7 @@ define void @fsqrt_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: fsqrt_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1811,7 +1811,7 @@ define void @fsub_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fsub_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1909,7 +1909,7 @@ define void @fsub_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fsub_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; 
VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -2007,7 +2007,7 @@ define void @fsub_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fsub_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -2107,7 +2107,7 @@ define void @fabs_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: fabs_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -2195,7 +2195,7 @@ define void @fabs_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: fabs_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -2283,7 +2283,7 @@ define void @fabs_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: fabs_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-compares.ll @@ -52,7 +52,7 @@ define void @fcmp_oeq_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-LABEL: fcmp_oeq_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -162,7 +162,7 @@ define void @fcmp_oeq_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-LABEL: fcmp_oeq_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -272,7 +272,7 @@ define void @fcmp_oeq_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-LABEL: fcmp_oeq_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll @@ -54,7 +54,7 @@ define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvt_v16f16_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0] @@ -157,7 +157,7 @@ define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) #0 { ; 
VBITS_GE_256-LABEL: fcvt_v8f16_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.d }, p0/z, [x0] @@ -257,7 +257,7 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvt_v8f32_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0] @@ -357,7 +357,7 @@ define void @fcvt_v16f32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvt_v16f32_v16f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -460,7 +460,7 @@ define void @fcvt_v8f64_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvt_v8f64_v8f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -562,7 +562,7 @@ define void @fcvt_v8f64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvt_v8f64_v8f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-fma.ll @@ -55,7 +55,7 @@ define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-LABEL: fma_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -171,7 +171,7 @@ define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-LABEL: fma_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -286,7 +286,7 @@ define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_256-LABEL: fma_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-minmax.ll @@ -48,7 +48,7 @@ define void @fmaxnm_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fmaxnm_v32f16: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; 
VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -146,7 +146,7 @@ define void @fmaxnm_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fmaxnm_v16f32: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x8, #8 // =0x8 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -244,7 +244,7 @@ define void @fmaxnm_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fmaxnm_v8f64: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x8, #4 // =0x4 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 ; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -346,7 +346,7 @@ define void @fminnm_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fminnm_v32f16: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -444,7 +444,7 @@ define void @fminnm_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fminnm_v16f32: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x8, #8 // =0x8 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -542,7 +542,7 @@ define void @fminnm_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fminnm_v8f64: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x8, #4 // =0x4 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 ; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -644,7 +644,7 @@ define void @fmax_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fmax_v32f16: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -742,7 +742,7 @@ define void @fmax_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fmax_v16f32: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x8, #8 // =0x8 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -840,7 +840,7 @@ define void @fmax_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fmax_v8f64: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x8, #4 // =0x4 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 ; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -942,7 +942,7 @@ define void @fmin_v32f16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fmin_v32f16: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1040,7 +1040,7 @@ define void @fmin_v16f32(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fmin_v16f32: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #8 +; VBITS_EQ_256-NEXT: mov x8, #8 // =0x8 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1138,7 +1138,7 @@ define void 
@fmin_v8f64(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: fmin_v8f64: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: mov x8, #4 // =0x4 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 ; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-rounding.ll @@ -46,7 +46,7 @@ define void @frintp_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintp_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -134,7 +134,7 @@ define void @frintp_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintp_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -222,7 +222,7 @@ define void @frintp_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintp_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -314,7 +314,7 @@ define void @frintm_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintm_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -402,7 +402,7 @@ define void @frintm_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintm_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -490,7 +490,7 @@ define void @frintm_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintm_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -582,7 +582,7 @@ define void @frinti_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: frinti_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -670,7 +670,7 @@ define void @frinti_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: frinti_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -758,7 +758,7 @@ define void @frinti_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: frinti_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { 
z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -850,7 +850,7 @@ define void @frintx_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintx_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -938,7 +938,7 @@ define void @frintx_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintx_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1026,7 +1026,7 @@ define void @frintx_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintx_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1118,7 +1118,7 @@ define void @frinta_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: frinta_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1206,7 +1206,7 @@ define void @frinta_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: frinta_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1294,7 +1294,7 @@ define void @frinta_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: frinta_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1386,7 +1386,7 @@ define void @frintn_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintn_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1474,7 +1474,7 @@ define void @frintn_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintn_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1562,7 +1562,7 @@ define void @frintn_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintn_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1654,7 +1654,7 @@ define void @frintz_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintz_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1742,7 +1742,7 @@ define void 
@frintz_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintz_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1830,7 +1830,7 @@ define void @frintz_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: frintz_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll @@ -54,7 +54,7 @@ define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: and w9, w2, #0x1 ; VBITS_GE_256-NEXT: ptrue p1.h @@ -178,7 +178,7 @@ define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: and w9, w2, #0x1 ; VBITS_GE_256-NEXT: ptrue p1.s @@ -282,9 +282,8 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: and x8, x2, #0x1 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p1.d @@ -303,10 +302,9 @@ define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 -; VBITS_GE_256-NEXT: // kill: def $w2 killed $w2 def $x2 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: and x9, x2, #0x1 +; VBITS_GE_256-NEXT: and w9, w2, #0x1 ; VBITS_GE_256-NEXT: ptrue p1.d ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -322,9 +320,8 @@ ; ; VBITS_GE_512-LABEL: select_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2 +; VBITS_GE_512-NEXT: and w8, w2, #0x1 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: and x8, x2, #0x1 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: ptrue p1.d @@ -343,9 +340,8 @@ define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: and x8, x2, #0x1 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p1.d @@ -364,9 +360,8 @@ define void @select_v32f64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: and x8, x2, #0x1 ; CHECK-NEXT: ld1d { 
z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll @@ -50,7 +50,7 @@ define void @fcvtzu_v32f16_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvtzu_v32f16_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -147,7 +147,7 @@ ; VBITS_GE_256-LABEL: fcvtzu_v16f16_v16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h @@ -250,7 +250,7 @@ ; VBITS_GE_256-LABEL: fcvtzu_v8f16_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -355,7 +355,7 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvtzu_v16f32_v16i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -451,7 +451,7 @@ define void @fcvtzu_v16f32_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvtzu_v16f32_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -548,7 +548,7 @@ ; VBITS_GE_256-LABEL: fcvtzu_v8f32_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s @@ -650,7 +650,7 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -750,7 +750,7 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -847,7 +847,7 @@ define void @fcvtzu_v8f64_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvtzu_v8f64_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -943,7 +943,7 @@ define void @fcvtzs_v32f16_v32i16(ptr %a, ptr %b) #0 { ; 
VBITS_GE_256-LABEL: fcvtzs_v32f16_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1040,7 +1040,7 @@ ; VBITS_GE_256-LABEL: fcvtzs_v16f16_v16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h @@ -1143,7 +1143,7 @@ ; VBITS_GE_256-LABEL: fcvtzs_v8f16_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -1248,7 +1248,7 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvtzs_v16f32_v16i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1344,7 +1344,7 @@ define void @fcvtzs_v16f32_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvtzs_v16f32_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1441,7 +1441,7 @@ ; VBITS_GE_256-LABEL: fcvtzs_v8f32_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s @@ -1543,7 +1543,7 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1643,7 +1643,7 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1740,7 +1740,7 @@ define void @fcvtzs_v8f64_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: fcvtzs_v8f64_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll @@ -51,7 +51,7 @@ define void @select_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: select_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; 
VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -164,7 +164,7 @@ define void @select_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: select_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -278,7 +278,7 @@ define void @select_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: select_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll @@ -114,7 +114,7 @@ ; CHECK-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload ; CHECK-NEXT: add x9, sp, #128 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x8, #4 // =0x4 ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ldr z1, [x9] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll @@ -11,58 +11,45 @@ define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v6, ptr %v7, ptr %v8, ; CHECK-LABEL: func1: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x25, [sp, #-64]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -40 -; CHECK-NEXT: .cfi_offset w24, -48 -; CHECK-NEXT: .cfi_offset w25, -56 -; CHECK-NEXT: .cfi_offset w29, -64 -; CHECK-NEXT: add x8, sp, #64 -; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: add x10, sp, #160 -; CHECK-NEXT: add x11, sp, #192 +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x8, sp, #56 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x20, sp, #192 +; CHECK-NEXT: add x9, sp, #88 +; CHECK-NEXT: add x10, sp, #120 +; CHECK-NEXT: add x11, sp, #152 +; CHECK-NEXT: add x12, sp, #240 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: add x8, sp, #272 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9] ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x10] ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x11] -; CHECK-NEXT: ldp x18, x19, [sp, #368] -; CHECK-NEXT: add x21, sp, #160 -; CHECK-NEXT: add x22, sp, #128 -; CHECK-NEXT: ldp x24, x14, [sp, #296] -; CHECK-NEXT: add x23, sp, #64 -; CHECK-NEXT: ldr x25, [sp, #288] -; CHECK-NEXT: ldp x9, x8, [sp, #344] -; CHECK-NEXT: ldp x11, x10, [sp, #328] -; CHECK-NEXT: ldp x13, x12, [sp, #312] -; CHECK-NEXT: ldr x15, [sp, #120] -; CHECK-NEXT: ldur q4, [sp, #104] -; CHECK-NEXT: ldp x16, x17, [sp, #224] -; CHECK-NEXT: st1d { z3.d }, p0, [x20] -; CHECK-NEXT: st1d { z2.d }, p0, [x21] -; CHECK-NEXT: st1d { z1.d }, p0, [x22] -; CHECK-NEXT: st1d { z0.d }, p0, [x23] -; CHECK-NEXT: stp x18, x19, [sp, #368] -; CHECK-NEXT: stp x25, x24, [sp, #288] -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: stp x16, x17, [sp, #224] -; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: stur q4, [sp, #104] -; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: str x15, [sp, #120] -; CHECK-NEXT: stp x14, x13, [sp, #304] -; CHECK-NEXT: stp x12, x11, [sp, #320] -; CHECK-NEXT: stp x10, x9, [sp, #336] -; CHECK-NEXT: str x8, [sp, #352] -; CHECK-NEXT: ldp x29, x25, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x12] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8] +; CHECK-NEXT: add x14, sp, #272 +; CHECK-NEXT: ldp x8, x9, [sp, #32] +; CHECK-NEXT: add x15, sp, #240 +; CHECK-NEXT: add x16, sp, #152 +; CHECK-NEXT: ldp x12, x13, [sp, #320] +; CHECK-NEXT: add x17, sp, #120 +; CHECK-NEXT: add x18, sp, #88 +; CHECK-NEXT: ldr q6, [sp, #16] +; CHECK-NEXT: ldr x10, [sp, #184] +; CHECK-NEXT: ldr x11, [sp, #304] +; CHECK-NEXT: st1d { z5.d }, p0, [x14] +; CHECK-NEXT: add x14, sp, #56 +; CHECK-NEXT: st1d { z4.d }, p0, [x15] +; CHECK-NEXT: st1d { z3.d }, p0, [x16] +; CHECK-NEXT: st1d { z2.d }, p0, [x17] +; CHECK-NEXT: st1d { z1.d }, p0, [x18] +; CHECK-NEXT: st1d { z0.d }, p0, [x14] +; CHECK-NEXT: stp x12, x13, [sp, #320] +; CHECK-NEXT: str x11, [sp, #304] +; CHECK-NEXT: str x10, [sp, #184] +; CHECK-NEXT: stp x8, x9, [sp, #32] +; CHECK-NEXT: str q6, [sp, #16] +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: b func2 ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr %v13, ptr %v14, ptr %v15, ptr %v16, ptr %v17, ptr %v18, ptr %v19, ptr %v20, ptr %v21, ptr %v22, ptr %v23, ptr %v24, diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll @@ -36,7 +36,7 @@ define <16 x half> @insertelement_v16f16(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #15 +; CHECK-NEXT: mov w9, #15 // =0xf ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: fmov h2, #5.00000000 @@ -55,8 +55,8 @@ define <32 x half> @insertelement_v32f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: insertelement_v32f16: ; 
VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #16 -; VBITS_GE_256-NEXT: mov w10, #15 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov w10, #15 // =0xf ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: fmov h3, #5.00000000 ; VBITS_GE_256-NEXT: index z4.h, #0, #1 @@ -72,7 +72,7 @@ ; ; VBITS_GE_512-LABEL: insertelement_v32f16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov w9, #31 +; VBITS_GE_512-NEXT: mov w9, #31 // =0x1f ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: fmov h2, #5.00000000 @@ -91,7 +91,7 @@ define <64 x half> @insertelement_v64f16(ptr %a) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #63 +; CHECK-NEXT: mov w9, #63 // =0x3f ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: fmov h2, #5.00000000 @@ -110,7 +110,7 @@ define <128 x half> @insertelement_v128f16(ptr %a) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #127 +; CHECK-NEXT: mov w9, #127 // =0x7f ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: fmov h2, #5.00000000 @@ -153,7 +153,7 @@ define <8 x float> @insertelement_v8f32(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #7 +; CHECK-NEXT: mov w9, #7 // =0x7 ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fmov s2, #5.00000000 @@ -172,8 +172,8 @@ define <16 x float> @insertelement_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: insertelement_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 -; VBITS_GE_256-NEXT: mov w10, #7 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov w10, #7 // =0x7 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: fmov s3, #5.00000000 ; VBITS_GE_256-NEXT: index z4.s, #0, #1 @@ -189,7 +189,7 @@ ; ; VBITS_GE_512-LABEL: insertelement_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov w9, #15 +; VBITS_GE_512-NEXT: mov w9, #15 // =0xf ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: fmov s2, #5.00000000 @@ -208,7 +208,7 @@ define <32 x float> @insertelement_v32f32(ptr %a) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #31 +; CHECK-NEXT: mov w9, #31 // =0x1f ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fmov s2, #5.00000000 @@ -227,7 +227,7 @@ define <64 x float> @insertelement_v64f32(ptr %a) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #63 +; CHECK-NEXT: mov w9, #63 // =0x3f ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fmov s2, #5.00000000 @@ -247,7 +247,7 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v1f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #4617315517961601024 +; CHECK-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000 ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret %r = insertelement <1 x double> %op1, double 5.0, i64 0 @@ -268,7 +268,7 @@ define <4 x double> @insertelement_v4f64(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #3 +; CHECK-NEXT: mov w9, #3 // =0x3 ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fmov 
d2, #5.00000000 @@ -287,8 +287,8 @@ define <8 x double> @insertelement_v8f64(ptr %a) #0 { ; VBITS_GE_256-LABEL: insertelement_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 -; VBITS_GE_256-NEXT: mov w10, #3 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 +; VBITS_GE_256-NEXT: mov w10, #3 // =0x3 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: fmov d3, #5.00000000 ; VBITS_GE_256-NEXT: index z4.d, #0, #1 @@ -304,7 +304,7 @@ ; ; VBITS_GE_512-LABEL: insertelement_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov w9, #7 +; VBITS_GE_512-NEXT: mov w9, #7 // =0x7 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: fmov d2, #5.00000000 @@ -323,7 +323,7 @@ define <16 x double> @insertelement_v16f64(ptr %a) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #15 +; CHECK-NEXT: mov w9, #15 // =0xf ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fmov d2, #5.00000000 @@ -342,7 +342,7 @@ define <32 x double> @insertelement_v32f64(ptr %a) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #31 +; CHECK-NEXT: mov w9, #31 // =0x1f ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fmov d2, #5.00000000 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll @@ -48,7 +48,7 @@ define void @add_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: add_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -146,7 +146,7 @@ define void @add_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: add_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -244,7 +244,7 @@ define void @add_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: add_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -342,7 +342,7 @@ define void @add_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: add_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -388,7 +388,7 @@ define void @add_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: add_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -449,7 +449,7 @@ define void @mul_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: mul_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { 
z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -547,7 +547,7 @@ define void @mul_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: mul_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -645,7 +645,7 @@ define void @mul_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: mul_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -749,7 +749,7 @@ define void @mul_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: mul_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -851,7 +851,7 @@ define void @sub_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: sub_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -949,7 +949,7 @@ define void @sub_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: sub_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1047,7 +1047,7 @@ define void @sub_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: sub_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1145,7 +1145,7 @@ define void @sub_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: sub_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1246,7 +1246,7 @@ define void @abs_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: abs_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -1334,7 +1334,7 @@ define void @abs_v32i16(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: abs_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1352,9 +1352,9 @@ define void @abs_v64i16(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: abs_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #48 -; CHECK-NEXT: mov x9, #16 -; CHECK-NEXT: mov x10, #32 +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: mov x9, #16 // =0x10 +; CHECK-NEXT: mov x10, #48 // =0x30 ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] 
@@ -1378,13 +1378,13 @@ define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 { ; CHECK-LABEL: abs_v128i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #96 -; CHECK-NEXT: mov x9, #48 -; CHECK-NEXT: mov x10, #16 -; CHECK-NEXT: mov x11, #80 -; CHECK-NEXT: mov x12, #32 -; CHECK-NEXT: mov x13, #112 -; CHECK-NEXT: mov x14, #64 +; CHECK-NEXT: mov x8, #112 // =0x70 +; CHECK-NEXT: mov x9, #32 // =0x20 +; CHECK-NEXT: mov x10, #16 // =0x10 +; CHECK-NEXT: mov x11, #64 // =0x40 +; CHECK-NEXT: mov x12, #48 // =0x30 +; CHECK-NEXT: mov x13, #96 // =0x60 +; CHECK-NEXT: mov x14, #80 // =0x50 ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x10, lsl #1] @@ -1454,7 +1454,7 @@ define void @abs_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: abs_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1542,7 +1542,7 @@ define void @abs_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: abs_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-compares.ll @@ -52,7 +52,7 @@ define void @icmp_eq_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: icmp_eq_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -162,7 +162,7 @@ define void @icmp_eq_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: icmp_eq_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -272,7 +272,7 @@ define void @icmp_eq_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: icmp_eq_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -382,7 +382,7 @@ define void @icmp_eq_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: icmp_eq_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -370,25 +370,25 @@ define void @sdiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q3, q0, [x1] +; VBITS_GE_128-NEXT: ldp q0, q3, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: 
sshll2 v6.4s, v3.8h, #0 -; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0 -; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: sshll2 v4.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 -; VBITS_GE_128-NEXT: sshll2 v7.4s, v1.8h, #0 -; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: ldp q2, q1, [x0] +; VBITS_GE_128-NEXT: sshll2 v6.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: sshll v3.4s, v3.4h, #0 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 -; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: sshll2 v7.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z2.s ; VBITS_GE_128-NEXT: movprfx z2, z7 ; VBITS_GE_128-NEXT: sdiv z2.s, p0/m, z2.s, z6.s +; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z3.s ; VBITS_GE_128-NEXT: uzp1 v1.8h, v1.8h, v2.8h ; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v4.8h -; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: stp q0, q1, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v16i16: @@ -543,24 +543,24 @@ define void @sdiv_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] -; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z5.s -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q6, q4, [x1] -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q4, q6, [x1] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: movprfx z0, z2 ; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z6.s ; VBITS_GE_128-NEXT: movprfx z1, z3 ; VBITS_GE_128-NEXT: sdiv z1.s, p0/m, z1.s, z4.s -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -666,24 +666,24 @@ define void @sdiv_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.d, vl2 -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] -; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z4.d +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z5.d -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q6, q4, [x1] -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z4.d +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q4, q6, [x1] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: movprfx z0, z2 ; VBITS_GE_128-NEXT: sdiv z0.d, p0/m, z0.d, z6.d ; VBITS_GE_128-NEXT: movprfx z1, z3 ; VBITS_GE_128-NEXT: sdiv z1.d, p0/m, z1.d, z4.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; 
VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1093,25 +1093,25 @@ define void @udiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q3, q0, [x1] +; VBITS_GE_128-NEXT: ldp q0, q3, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0 -; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0 -; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ushll2 v4.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 -; VBITS_GE_128-NEXT: ushll2 v7.4s, v1.8h, #0 -; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 +; VBITS_GE_128-NEXT: ldp q2, q1, [x0] +; VBITS_GE_128-NEXT: ushll2 v6.4s, v3.8h, #0 +; VBITS_GE_128-NEXT: ushll v3.4s, v3.4h, #0 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 -; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z3.s +; VBITS_GE_128-NEXT: ushll2 v7.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z2.s ; VBITS_GE_128-NEXT: movprfx z2, z7 ; VBITS_GE_128-NEXT: udiv z2.s, p0/m, z2.s, z6.s +; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z3.s ; VBITS_GE_128-NEXT: uzp1 v1.8h, v1.8h, v2.8h ; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v4.8h -; VBITS_GE_128-NEXT: stp q1, q0, [x0] +; VBITS_GE_128-NEXT: stp q0, q1, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v16i16: @@ -1257,24 +1257,24 @@ define void @udiv_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] -; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z5.s -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q6, q4, [x1] -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z4.s +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q4, q6, [x1] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: movprfx z0, z2 ; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z6.s ; VBITS_GE_128-NEXT: movprfx z1, z3 ; VBITS_GE_128-NEXT: udiv z1.s, p0/m, z1.s, z4.s -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1380,24 +1380,24 @@ define void @udiv_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.d, vl2 -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] -; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z4.d +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z5.d -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q6, q4, [x1] -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z4.d +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q4, q6, [x1] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; 
VBITS_GE_128-NEXT: movprfx z0, z2 ; VBITS_GE_128-NEXT: udiv z0.d, p0/m, z0.d, z6.d ; VBITS_GE_128-NEXT: movprfx z1, z3 ; VBITS_GE_128-NEXT: udiv z1.d, p0/m, z1.d, z4.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -73,7 +73,7 @@ ; VBITS_GE_256-LABEL: sext_v32i8_v32i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: add z0.b, z0.b, z0.b @@ -157,7 +157,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h @@ -242,7 +242,7 @@ ; VBITS_GE_256-LABEL: sext_v8i8_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h @@ -322,7 +322,7 @@ ; VBITS_GE_256-LABEL: sext_v16i16_v16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: add z0.h, z0.h, z0.h @@ -406,7 +406,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s @@ -486,7 +486,7 @@ ; VBITS_GE_256-LABEL: sext_v8i32_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: add z0.s, z0.s, z0.s @@ -569,7 +569,7 @@ ; VBITS_GE_256-LABEL: zext_v32i8_v32i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: add z0.b, z0.b, z0.b @@ -653,7 +653,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -738,7 +738,7 @@ ; VBITS_GE_256-LABEL: zext_v8i8_v8i64: ; 
VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -818,7 +818,7 @@ ; VBITS_GE_256-LABEL: zext_v16i16_v16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: add z0.h, z0.h, z0.h @@ -902,7 +902,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s @@ -982,7 +982,7 @@ ; VBITS_GE_256-LABEL: zext_v8i32_v8i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: add z0.s, z0.s, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll @@ -48,7 +48,7 @@ define void @and_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: and_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -146,7 +146,7 @@ define void @and_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: and_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -244,7 +244,7 @@ define void @and_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: and_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -342,7 +342,7 @@ define void @and_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: and_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -444,7 +444,7 @@ define void @or_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: or_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -542,7 +542,7 @@ define void @or_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: or_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ 
-640,7 +640,7 @@ define void @or_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: or_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -738,7 +738,7 @@ define void @or_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: or_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -840,7 +840,7 @@ define void @xor_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: xor_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -938,7 +938,7 @@ define void @xor_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: xor_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1036,7 +1036,7 @@ define void @xor_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: xor_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1134,7 +1134,7 @@ define void @xor_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: xor_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll @@ -48,7 +48,7 @@ define void @smax_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smax_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -146,7 +146,7 @@ define void @smax_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smax_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -244,7 +244,7 @@ define void @smax_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smax_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -350,7 +350,7 @@ define void @smax_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smax_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue 
p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -452,7 +452,7 @@ define void @smin_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smin_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -550,7 +550,7 @@ define void @smin_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smin_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -648,7 +648,7 @@ define void @smin_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smin_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -754,7 +754,7 @@ define void @smin_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smin_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -856,7 +856,7 @@ define void @umax_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umax_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -954,7 +954,7 @@ define void @umax_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umax_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1052,7 +1052,7 @@ define void @umax_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umax_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1158,7 +1158,7 @@ define void @umax_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umax_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1260,7 +1260,7 @@ define void @umin_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umin_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -1358,7 +1358,7 @@ define void @umin_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umin_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] 
@@ -1456,7 +1456,7 @@ define void @umin_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umin_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1562,7 +1562,7 @@ define void @umin_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umin_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -53,11 +53,22 @@ define void @smulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: smulh_v32i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 // =0x10 +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1sb { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -73,26 +84,37 @@ define void @smulh_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smulh_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 +; VBITS_GE_256-NEXT: mov w8, #16 // =0x10 +; VBITS_GE_256-NEXT: mov w9, #32 // =0x20 +; VBITS_GE_256-NEXT: mov w10, #48 // =0x30 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1sb { z0.h }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1sb { z1.h }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1sb { z2.h }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1sb { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1sb { z4.h }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1sb { z5.h }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1sb { z6.h }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1sb { z7.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.h, p0/m, z2.h, z5.h +; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z4.h +; VBITS_GE_256-NEXT: mul z0.h, p0/m, z0.h, z6.h +; VBITS_GE_256-NEXT: mul z3.h, p0/m, z3.h, z7.h +; VBITS_GE_256-NEXT: lsr z2.h, z2.h, #8 +; VBITS_GE_256-NEXT: lsr z1.h, z1.h, #8 +; VBITS_GE_256-NEXT: lsr z0.h, z0.h, #8 +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: lsr z3.h, z3.h, #8 +; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z3.b +; VBITS_GE_256-NEXT: splice z2.b, p0, z2.b, z0.b ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] -; 
VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, z2.b -; VBITS_GE_256-NEXT: smulh z1.b, p0/m, z1.b, z3.b -; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: smulh_v64i8: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 -; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <64 x i8>, ptr %a %op2 = load <64 x i8>, ptr %b %insert = insertelement <64 x i16> undef, i16 8, i64 0 @@ -109,11 +131,22 @@ define void @smulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: smulh_v128i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #64 // =0x40 +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1sb { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl64 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -129,11 +162,22 @@ define void @smulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: smulh_v256i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #128 // =0x80 +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1sb { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl128 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl256 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <256 x i8>, ptr %a %op2 = load <256 x i8>, ptr %b @@ -185,11 +229,22 @@ define void @smulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: smulh_v16i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 // =0x8 +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1sh { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ld1h { z0.h }, 
p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -205,26 +260,37 @@ define void @smulh_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smulh_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x10, #24 // =0x18 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z1.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z2.s }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1sh { z4.s }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z5.s }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z6.s }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1sh { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.s, p0/m, z2.s, z5.s +; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z4.s +; VBITS_GE_256-NEXT: mul z0.s, p0/m, z0.s, z6.s +; VBITS_GE_256-NEXT: mul z3.s, p0/m, z3.s, z7.s +; VBITS_GE_256-NEXT: lsr z2.s, z2.s, #16 +; VBITS_GE_256-NEXT: lsr z1.s, z1.s, #16 +; VBITS_GE_256-NEXT: lsr z0.s, z0.s, #16 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: lsr z3.s, z3.s, #16 +; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z2.h -; VBITS_GE_256-NEXT: smulh z1.h, p0/m, z1.h, z3.h -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: smulh_v32i16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <32 x i16>, ptr %a %op2 = load <32 x i16>, ptr %b %1 = sext <32 x i16> %op1 to <32 x i32> @@ -239,11 +305,22 @@ define void @smulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: smulh_v64i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1sh { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, 
p0/z, [x1] -; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a %op2 = load <64 x i16>, ptr %b @@ -259,11 +336,22 @@ define void @smulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: smulh_v128i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #64 // =0x40 +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1sh { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i16>, ptr %a %op2 = load <128 x i16>, ptr %b @@ -315,11 +403,22 @@ define void @smulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: smulh_v8i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1sw { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1sw { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -335,26 +434,37 @@ define void @smulh_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: smulh_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x10, #12 // =0xc +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z1.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z2.d }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1sw { z4.d }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z5.d }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z6.d }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1sw { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.d, p0/m, z2.d, z5.d +; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z4.d +; VBITS_GE_256-NEXT: mul z0.d, p0/m, z0.d, z6.d +; VBITS_GE_256-NEXT: mul z3.d, p0/m, z3.d, z7.d +; VBITS_GE_256-NEXT: lsr z2.d, z2.d, #32 +; VBITS_GE_256-NEXT: lsr z1.d, z1.d, #32 +; VBITS_GE_256-NEXT: lsr z0.d, z0.d, #32 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 +; VBITS_GE_256-NEXT: lsr z3.d, z3.d, #32 +; VBITS_GE_256-NEXT: 
splice z1.s, p0, z1.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p0, z2.s, z0.s ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z2.s -; VBITS_GE_256-NEXT: smulh z1.s, p0/m, z1.s, z3.s -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: smulh_v16i32: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <16 x i32>, ptr %a %op2 = load <16 x i32>, ptr %b %1 = sext <16 x i32> %op1 to <16 x i64> @@ -369,11 +479,22 @@ define void @smulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: smulh_v32i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 // =0x10 +; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1sw { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1sw { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl16 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i32>, ptr %a %op2 = load <32 x i32>, ptr %b @@ -389,11 +510,22 @@ define void @smulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: smulh_v64i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1sw { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1sw { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i32>, ptr %a %op2 = load <64 x i32>, ptr %b @@ -582,11 +714,22 @@ define void @umulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: umulh_v32i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16 // =0x10 +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: 
ld1b { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -602,26 +745,37 @@ define void @umulh_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umulh_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 +; VBITS_GE_256-NEXT: mov w8, #16 // =0x10 +; VBITS_GE_256-NEXT: mov w9, #32 // =0x20 +; VBITS_GE_256-NEXT: mov w10, #48 // =0x30 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1b { z0.h }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.h }, p0/z, [x0, x9] +; VBITS_GE_256-NEXT: ld1b { z2.h }, p0/z, [x0, x10] +; VBITS_GE_256-NEXT: ld1b { z3.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z4.h }, p0/z, [x1, x9] +; VBITS_GE_256-NEXT: ld1b { z5.h }, p0/z, [x1, x10] +; VBITS_GE_256-NEXT: ld1b { z6.h }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z7.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.h, p0/m, z2.h, z5.h +; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z4.h +; VBITS_GE_256-NEXT: mul z0.h, p0/m, z0.h, z6.h +; VBITS_GE_256-NEXT: mul z3.h, p0/m, z3.h, z7.h +; VBITS_GE_256-NEXT: lsr z2.h, z2.h, #8 +; VBITS_GE_256-NEXT: lsr z1.h, z1.h, #8 +; VBITS_GE_256-NEXT: lsr z0.h, z0.h, #8 +; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: lsr z3.h, z3.h, #8 +; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z2.b +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z2.b, z3.b, z3.b +; VBITS_GE_256-NEXT: splice z2.b, p0, z2.b, z0.b ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8] -; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z2.b -; VBITS_GE_256-NEXT: umulh z1.b, p0/m, z1.b, z3.b -; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x9] +; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: umulh_v64i8: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 -; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <64 x i8>, ptr %a %op2 = load <64 x i8>, ptr %b %1 = zext <64 x i8> %op1 to <64 x i16> @@ -636,11 +790,22 @@ define void @umulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: umulh_v128i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #64 // =0x40 +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: 
lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl64 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -658,11 +823,22 @@ define void @umulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: umulh_v256i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #128 // =0x80 +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z3.h }, p0/z, [x1] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl128 +; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl256 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: st1b { z1.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <256 x i8>, ptr %a %op2 = load <256 x i8>, ptr %b @@ -715,11 +891,22 @@ define void @umulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: umulh_v16i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #8 // =0x8 +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -735,26 +922,37 @@ define void @umulh_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umulh_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x10, #24 // =0x18 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z4.s }, p0/z, [x1, x9, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z5.s }, p0/z, [x1, x10, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z6.s }, p0/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z7.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.s, p0/m, z2.s, z5.s +; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z4.s +; VBITS_GE_256-NEXT: mul z0.s, p0/m, z0.s, z6.s +; VBITS_GE_256-NEXT: mul z3.s, p0/m, z3.s, z7.s +; VBITS_GE_256-NEXT: lsr z2.s, z2.s, #16 +; VBITS_GE_256-NEXT: lsr z1.s, 
z1.s, #16 +; VBITS_GE_256-NEXT: lsr z0.s, z0.s, #16 +; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: lsr z3.s, z3.s, #16 +; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z2.h +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h +; VBITS_GE_256-NEXT: splice z2.h, p0, z2.h, z0.h ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z2.h -; VBITS_GE_256-NEXT: umulh z1.h, p0/m, z1.h, z3.h -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1] +; VBITS_GE_256-NEXT: st1h { z2.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: umulh_v32i16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <32 x i16>, ptr %a %op2 = load <32 x i16>, ptr %b %1 = zext <32 x i16> %op1 to <32 x i32> @@ -769,11 +967,22 @@ define void @umulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: umulh_v64i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a %op2 = load <64 x i16>, ptr %b @@ -789,11 +998,22 @@ define void @umulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: umulh_v128i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #64 // =0x40 +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z3.s }, p0/z, [x1] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i16>, ptr %a %op2 = load <128 x i16>, ptr %b @@ -845,11 +1065,22 @@ define void @umulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: 
umulh_v8i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -867,26 +1098,37 @@ define void @umulh_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: umulh_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x10, #12 // =0xc +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z4.d }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z5.d }, p0/z, [x1, x10, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z6.d }, p0/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z7.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mul z2.d, p0/m, z2.d, z5.d +; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z4.d +; VBITS_GE_256-NEXT: mul z0.d, p0/m, z0.d, z6.d +; VBITS_GE_256-NEXT: mul z3.d, p0/m, z3.d, z7.d +; VBITS_GE_256-NEXT: lsr z2.d, z2.d, #32 +; VBITS_GE_256-NEXT: lsr z1.d, z1.d, #32 +; VBITS_GE_256-NEXT: lsr z0.d, z0.d, #32 +; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 +; VBITS_GE_256-NEXT: lsr z3.d, z3.d, #32 +; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z2.s +; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z2.s, z3.s, z3.s +; VBITS_GE_256-NEXT: splice z2.s, p0, z2.s, z0.s ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z2.s -; VBITS_GE_256-NEXT: umulh z1.s, p0/m, z1.s, z3.s -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret -; -; VBITS_GE_512-LABEL: umulh_v16i32: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_GE_512-NEXT: ret %op1 = load <16 x i32>, ptr %a %op2 = load <16 x i32>, ptr %b %1 = zext <16 x i32> %op1 to <16 x i64> @@ -901,11 +1143,22 @@ define void @umulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: umulh_v32i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #16 // 
=0x10 +; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl16 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i32>, ptr %a %op2 = load <32 x i32>, ptr %b @@ -921,11 +1174,22 @@ define void @umulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: umulh_v64i32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #32 // =0x20 +; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z3.d }, p0/z, [x1] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i32>, ptr %a %op2 = load <64 x i32>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll @@ -48,7 +48,7 @@ define i8 @uaddv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: uaddv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -138,7 +138,7 @@ define i16 @uaddv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: uaddv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -228,7 +228,7 @@ define i32 @uaddv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: uaddv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -317,7 +317,7 @@ define i64 @uaddv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: uaddv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -406,7 +406,7 @@ define i8 @smaxv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: smaxv_v64i8: ; 
VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -491,7 +491,7 @@ define i16 @smaxv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: smaxv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -576,7 +576,7 @@ define i32 @smaxv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: smaxv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -663,7 +663,7 @@ define i64 @smaxv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: smaxv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -752,7 +752,7 @@ define i8 @sminv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: sminv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -837,7 +837,7 @@ define i16 @sminv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: sminv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -922,7 +922,7 @@ define i32 @sminv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: sminv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1009,7 +1009,7 @@ define i64 @sminv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: sminv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1098,7 +1098,7 @@ define i8 @umaxv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: umaxv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -1183,7 +1183,7 @@ define i16 @umaxv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: umaxv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1268,7 +1268,7 @@ define i32 @umaxv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: umaxv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, 
lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1355,7 +1355,7 @@ define i64 @umaxv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: umaxv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1444,7 +1444,7 @@ define i8 @uminv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: uminv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -1529,7 +1529,7 @@ define i16 @uminv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: uminv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1614,7 +1614,7 @@ define i32 @uminv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: uminv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1701,7 +1701,7 @@ define i64 @uminv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: uminv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -391,26 +391,26 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: sshll2 v4.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] ; VBITS_GE_128-NEXT: sshll2 v5.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: sshll v7.4s, v0.4h, #0 -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: sshll2 v4.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: sshll2 v17.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v6.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z17.s ; VBITS_GE_128-NEXT: sshll v16.4s, v2.4h, #0 -; VBITS_GE_128-NEXT: sshll2 v17.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sdivr z5.s, p0/m, z5.s, z6.s ; VBITS_GE_128-NEXT: sshll v6.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sdivr z7.s, p0/m, z7.s, z16.s ; VBITS_GE_128-NEXT: sshll v16.4s, v3.4h, #0 -; VBITS_GE_128-NEXT: sdivr z4.s, p0/m, z4.s, z17.s ; VBITS_GE_128-NEXT: uzp1 v5.8h, v7.8h, v5.8h ; VBITS_GE_128-NEXT: sdivr z6.s, p0/m, z6.s, z16.s ; VBITS_GE_128-NEXT: uzp1 v4.8h, v6.8h, v4.8h ; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v0.8h ; VBITS_GE_128-NEXT: mls v3.8h, v4.8h, v1.8h -; VBITS_GE_128-NEXT: stp q2, q3, [x0] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v16i16: @@ -583,25 +583,25 @@ define void @srem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, 
#32] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z16, z0 ; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z4.s +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] ; VBITS_GE_128-NEXT: mls v0.4s, v16.4s, v4.4s -; VBITS_GE_128-NEXT: movprfx z16, z1 -; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z5.s -; VBITS_GE_128-NEXT: ldp q7, q6, [x1] ; VBITS_GE_128-NEXT: movprfx z4, z3 ; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z6.s +; VBITS_GE_128-NEXT: movprfx z16, z1 +; VBITS_GE_128-NEXT: sdiv z16.s, p0/m, z16.s, z5.s ; VBITS_GE_128-NEXT: mls v1.4s, v16.4s, v5.4s ; VBITS_GE_128-NEXT: movprfx z5, z2 ; VBITS_GE_128-NEXT: sdiv z5.s, p0/m, z5.s, z7.s -; VBITS_GE_128-NEXT: mls v2.4s, v5.4s, v7.4s ; VBITS_GE_128-NEXT: mls v3.4s, v4.4s, v6.4s -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] -; VBITS_GE_128-NEXT: stp q2, q3, [x0] +; VBITS_GE_128-NEXT: mls v2.4s, v5.4s, v7.4s +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v16i32: @@ -730,27 +730,27 @@ define void @srem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.d, vl2 -; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_128-NEXT: ldp q3, q2, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z16, z1 ; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z3.d ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z3.d ; VBITS_GE_128-NEXT: movprfx z3, z0 ; VBITS_GE_128-NEXT: sdiv z3.d, p0/m, z3.d, z2.d ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z3.d, z2.d -; VBITS_GE_128-NEXT: ldp q4, q5, [x0] -; VBITS_GE_128-NEXT: ldp q7, q6, [x1] +; VBITS_GE_128-NEXT: ldp q5, q4, [x0] +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] ; VBITS_GE_128-NEXT: movprfx z16, z5 ; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z6.d ; VBITS_GE_128-NEXT: movprfx z2, z4 ; VBITS_GE_128-NEXT: sdiv z2.d, p0/m, z2.d, z7.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: movprfx z0, z4 ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z2.d, z7.d ; VBITS_GE_128-NEXT: movprfx z1, z5 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z6.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v8i64: @@ -1209,26 +1209,26 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x1] +; VBITS_GE_128-NEXT: ldp q1, q0, [x1] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 +; VBITS_GE_128-NEXT: ushll2 v4.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] ; VBITS_GE_128-NEXT: ushll2 v5.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: ushll v7.4s, v0.4h, #0 -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ushll2 v4.4s, v1.8h, #0 +; VBITS_GE_128-NEXT: ushll2 v17.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v6.4s, v2.8h, #0 +; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z17.s ; VBITS_GE_128-NEXT: ushll v16.4s, v2.4h, #0 -; VBITS_GE_128-NEXT: ushll2 v17.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: udivr z5.s, p0/m, z5.s, z6.s ; VBITS_GE_128-NEXT: ushll v6.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: udivr z7.s, p0/m, z7.s, z16.s ; VBITS_GE_128-NEXT: ushll v16.4s, v3.4h, #0 -; VBITS_GE_128-NEXT: udivr z4.s, p0/m, z4.s, z17.s ; VBITS_GE_128-NEXT: uzp1 v5.8h, v7.8h, v5.8h ; 
VBITS_GE_128-NEXT: udivr z6.s, p0/m, z6.s, z16.s ; VBITS_GE_128-NEXT: uzp1 v4.8h, v6.8h, v4.8h ; VBITS_GE_128-NEXT: mls v2.8h, v5.8h, v0.8h ; VBITS_GE_128-NEXT: mls v3.8h, v4.8h, v1.8h -; VBITS_GE_128-NEXT: stp q2, q3, [x0] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v16i16: @@ -1401,25 +1401,25 @@ define void @urem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.s, vl4 -; VBITS_GE_128-NEXT: ldp q2, q3, [x0] -; VBITS_GE_128-NEXT: ldp q4, q5, [x1, #32] +; VBITS_GE_128-NEXT: ldp q3, q2, [x0] +; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z16, z0 ; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z4.s +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] ; VBITS_GE_128-NEXT: mls v0.4s, v16.4s, v4.4s -; VBITS_GE_128-NEXT: movprfx z16, z1 -; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z5.s -; VBITS_GE_128-NEXT: ldp q7, q6, [x1] ; VBITS_GE_128-NEXT: movprfx z4, z3 ; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z6.s +; VBITS_GE_128-NEXT: movprfx z16, z1 +; VBITS_GE_128-NEXT: udiv z16.s, p0/m, z16.s, z5.s ; VBITS_GE_128-NEXT: mls v1.4s, v16.4s, v5.4s ; VBITS_GE_128-NEXT: movprfx z5, z2 ; VBITS_GE_128-NEXT: udiv z5.s, p0/m, z5.s, z7.s -; VBITS_GE_128-NEXT: mls v2.4s, v5.4s, v7.4s ; VBITS_GE_128-NEXT: mls v3.4s, v4.4s, v6.4s -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] -; VBITS_GE_128-NEXT: stp q2, q3, [x0] +; VBITS_GE_128-NEXT: mls v2.4s, v5.4s, v7.4s +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] +; VBITS_GE_128-NEXT: stp q3, q2, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v16i32: @@ -1548,27 +1548,27 @@ define void @urem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: ldp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: ptrue p0.d, vl2 -; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32] +; VBITS_GE_128-NEXT: ldp q3, q2, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z16, z1 ; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z3.d ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z3.d ; VBITS_GE_128-NEXT: movprfx z3, z0 ; VBITS_GE_128-NEXT: udiv z3.d, p0/m, z3.d, z2.d ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z3.d, z2.d -; VBITS_GE_128-NEXT: ldp q4, q5, [x0] -; VBITS_GE_128-NEXT: ldp q7, q6, [x1] +; VBITS_GE_128-NEXT: ldp q5, q4, [x0] +; VBITS_GE_128-NEXT: ldp q6, q7, [x1] ; VBITS_GE_128-NEXT: movprfx z16, z5 ; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z6.d ; VBITS_GE_128-NEXT: movprfx z2, z4 ; VBITS_GE_128-NEXT: udiv z2.d, p0/m, z2.d, z7.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32] +; VBITS_GE_128-NEXT: stp q1, q0, [x0, #32] ; VBITS_GE_128-NEXT: movprfx z0, z4 ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z2.d, z7.d ; VBITS_GE_128-NEXT: movprfx z1, z5 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z6.d -; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q1, q0, [x0] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll @@ -54,7 +54,7 @@ define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue 
p0.b, vl32 ; VBITS_GE_256-NEXT: and w9, w2, #0x1 ; VBITS_GE_256-NEXT: ptrue p1.b @@ -178,7 +178,7 @@ define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: and w9, w2, #0x1 ; VBITS_GE_256-NEXT: ptrue p1.h @@ -302,7 +302,7 @@ define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: and w9, w2, #0x1 ; VBITS_GE_256-NEXT: ptrue p1.s @@ -406,9 +406,8 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: and x8, x2, #0x1 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p1.d @@ -427,10 +426,9 @@ define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 -; VBITS_GE_256-NEXT: // kill: def $w2 killed $w2 def $x2 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 -; VBITS_GE_256-NEXT: and x9, x2, #0x1 +; VBITS_GE_256-NEXT: and w9, w2, #0x1 ; VBITS_GE_256-NEXT: ptrue p1.d ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -446,9 +444,8 @@ ; ; VBITS_GE_512-LABEL: select_v8i64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2 +; VBITS_GE_512-NEXT: and w8, w2, #0x1 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: and x8, x2, #0x1 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: ptrue p1.d @@ -467,9 +464,8 @@ define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: and x8, x2, #0x1 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p1.d @@ -488,9 +484,8 @@ define void @select_v32i64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: and w8, w2, #0x1 ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: and x8, x2, #0x1 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-shifts.ll @@ -50,7 +50,7 @@ define void @ashr_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ashr_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -150,7 +150,7 @@ define void @ashr_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ashr_v32i16: ; VBITS_GE_256: // %bb.0: -; 
VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -250,7 +250,7 @@ define void @ashr_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ashr_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -350,7 +350,7 @@ define void @ashr_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ashr_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -454,7 +454,7 @@ define void @lshr_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: lshr_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -554,7 +554,7 @@ define void @lshr_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: lshr_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -654,7 +654,7 @@ define void @lshr_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: lshr_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -754,7 +754,7 @@ define void @lshr_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: lshr_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -856,7 +856,7 @@ define void @shl_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shl_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -954,7 +954,7 @@ define void @shl_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shl_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1052,7 +1052,7 @@ define void @shl_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shl_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1150,7 +1150,7 @@ define void @shl_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shl_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; 
VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -50,7 +50,7 @@ define void @ucvtf_v32i16_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ucvtf_v32i16_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -147,7 +147,7 @@ ; VBITS_GE_256-LABEL: ucvtf_v16i16_v16f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h @@ -252,7 +252,7 @@ ; VBITS_GE_256-LABEL: ucvtf_v8i16_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -352,7 +352,7 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -454,7 +454,7 @@ define void @ucvtf_v16i32_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ucvtf_v16i32_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -551,7 +551,7 @@ ; VBITS_GE_256-LABEL: ucvtf_v8i32_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s @@ -653,7 +653,7 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -757,7 +757,7 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -861,7 +861,7 @@ define void @ucvtf_v8i64_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: ucvtf_v8i64_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -957,7 +957,7 @@ define void @scvtf_v32i16_v32f16(ptr %a, ptr %b) #0 { ; 
VBITS_GE_256-LABEL: scvtf_v32i16_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -1054,7 +1054,7 @@ ; VBITS_GE_256-LABEL: scvtf_v16i16_v16f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h @@ -1165,7 +1165,7 @@ ; VBITS_GE_256-LABEL: scvtf_v8i16_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h @@ -1271,7 +1271,7 @@ define void @scvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: scvtf_v16i32_v16f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1373,7 +1373,7 @@ define void @scvtf_v16i32_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: scvtf_v16i32_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1470,7 +1470,7 @@ ; VBITS_GE_256-LABEL: scvtf_v8i32_v8f64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s @@ -1578,7 +1578,7 @@ define <8 x half> @scvtf_v8i64_v8f16(ptr %a) #0 { ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1682,7 +1682,7 @@ define void @scvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -1786,7 +1786,7 @@ define void @scvtf_v8i64_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: scvtf_v8i64_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll @@ -50,7 +50,7 @@ define void @select_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: select_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue 
p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -163,7 +163,7 @@ define void @select_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: select_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -276,7 +276,7 @@ define void @select_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: select_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -390,7 +390,7 @@ define void @select_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: select_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads.ll @@ -52,7 +52,7 @@ define <16 x float> @load_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: load_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -87,9 +87,9 @@ define <32 x float> @load_v32f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: load_v32f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #16 -; VBITS_GE_256-NEXT: mov x10, #24 -; VBITS_GE_256-NEXT: mov x11, #8 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x10, #24 // =0x18 +; VBITS_GE_256-NEXT: mov x11, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] @@ -103,7 +103,7 @@ ; ; VBITS_GE_512-LABEL: load_v32f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov x9, #16 +; VBITS_GE_512-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -131,13 +131,13 @@ define <64 x float> @load_v64f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: load_v64f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 -; VBITS_GE_256-NEXT: mov x10, #48 -; VBITS_GE_256-NEXT: mov x11, #56 -; VBITS_GE_256-NEXT: mov x12, #32 -; VBITS_GE_256-NEXT: mov x13, #40 -; VBITS_GE_256-NEXT: mov x14, #16 -; VBITS_GE_256-NEXT: mov x15, #24 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 +; VBITS_GE_256-NEXT: mov x10, #48 // =0x30 +; VBITS_GE_256-NEXT: mov x11, #56 // =0x38 +; VBITS_GE_256-NEXT: mov x12, #32 // =0x20 +; VBITS_GE_256-NEXT: mov x13, #40 // =0x28 +; VBITS_GE_256-NEXT: mov x14, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x15, #24 // =0x18 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x10, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x11, lsl #2] @@ -159,9 +159,9 @@ ; ; VBITS_GE_512-LABEL: load_v64f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov x9, #32 -; VBITS_GE_512-NEXT: mov x10, #48 -; 
VBITS_GE_512-NEXT: mov x11, #16 +; VBITS_GE_512-NEXT: mov x9, #32 // =0x20 +; VBITS_GE_512-NEXT: mov x10, #48 // =0x30 +; VBITS_GE_512-NEXT: mov x11, #16 // =0x10 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0, x10, lsl #2] @@ -175,7 +175,7 @@ ; ; VBITS_GE_1024-LABEL: load_v64f32: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: mov x9, #32 +; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll @@ -42,7 +42,7 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -146,7 +146,7 @@ define void @masked_gather_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -244,7 +244,7 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -338,7 +338,7 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -60,7 +60,7 @@ ; VBITS_GE_256-LABEL: masked_gather_v8i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr d0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] @@ -214,7 +214,7 @@ ; VBITS_GE_256-LABEL: masked_gather_v8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] @@ -350,7 +350,7 @@ ; VBITS_GE_256-LABEL: masked_gather_v8i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: 
ld1d { z1.d }, p1/z, [x1, x8, lsl #3] @@ -497,7 +497,7 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -623,7 +623,7 @@ ; VBITS_GE_256-LABEL: masked_gather_v8f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: fcmeq v0.8h, v0.8h, #0.0 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] @@ -759,7 +759,7 @@ ; VBITS_GE_256-LABEL: masked_gather_v8f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] @@ -906,7 +906,7 @@ define void @masked_gather_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_gather_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -980,13 +980,29 @@ ; CHECK-LABEL: masked_gather_32b_scaled_sext_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, sxtw #1] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x1] +; CHECK-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #32 +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: and p2.b, p2/z, p2.b, p1.b +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x2, z0.d, lsl #1] +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; CHECK-NEXT: ld1h { z1.d }, p1/z, [x2, z1.d, lsl #1] +; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1002,11 +1018,25 @@ ; CHECK-LABEL: masked_gather_32b_scaled_sext_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x2, z1.s, sxtw #2] -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1sw { z2.d }, p1/z, [x1] +; CHECK-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #64 +; CHECK-NEXT: and p2.b, p2/z, p2.b, 
p1.b +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: ld1w { z2.d }, p2/z, [x2, z2.d, lsl #2] +; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; CHECK-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2] +; CHECK-NEXT: ptrue p1.s, vl16 +; CHECK-NEXT: uzp1 z1.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: splice z1.s, p1, z1.s, z0.s +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1042,13 +1072,29 @@ ; CHECK-LABEL: masked_gather_32b_scaled_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, uxtw #1] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ld1w { z1.d }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1] +; CHECK-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #32 +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: and p2.b, p2/z, p2.b, p1.b +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x2, z0.d, lsl #1] +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; CHECK-NEXT: ld1h { z1.d }, p1/z, [x2, z1.d, lsl #1] +; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1064,13 +1110,29 @@ ; CHECK-LABEL: masked_gather_32b_unscaled_sext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, sxtw] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x1] +; CHECK-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #32 +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: and p2.b, p2/z, p2.b, p1.b +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x2, z0.d] +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; CHECK-NEXT: ld1h { z1.d }, p1/z, [x2, z1.d] +; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1087,13 +1149,29 @@ ; CHECK-LABEL: masked_gather_32b_unscaled_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: 
punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z1.s, uxtw] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ld1w { z1.d }, p1/z, [x1, x8, lsl #2] +; CHECK-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1] +; CHECK-NEXT: mov z2.h, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #32 +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: and p2.b, p2/z, p2.b, p1.b +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: ld1h { z0.d }, p2/z, [x2, z0.d] +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; CHECK-NEXT: ld1h { z1.d }, p1/z, [x2, z1.d] +; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: splice z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -88,7 +88,7 @@ define <16 x float> @masked_load_v16f32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -155,7 +155,7 @@ define <64 x i8> @masked_load_v64i8(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w9, #32 +; VBITS_GE_256-NEXT: mov w9, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x9] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -188,7 +188,7 @@ define <32 x i16> @masked_load_v32i16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -221,7 +221,7 @@ define <16 x i32> @masked_load_v16i32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -254,7 +254,7 @@ define <8 x i64> @masked_load_v8i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -287,7 +287,7 @@ define <8 x i64> @masked_load_passthru_v8i64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_passthru_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -323,7 +323,7 @@ define <8 x double> 
@masked_load_passthru_v8f64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_passthru_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -360,7 +360,7 @@ ; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -392,7 +392,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -426,7 +426,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr d0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -460,7 +460,7 @@ ; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -492,7 +492,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -525,7 +525,7 @@ ; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -556,7 +556,7 @@ ; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -588,7 +588,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -622,7 +622,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr d0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] @@ -656,7 +656,7 @@ ; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: 
mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -688,7 +688,7 @@ ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] @@ -721,7 +721,7 @@ ; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -751,7 +751,7 @@ define <32 x i16> @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] @@ -791,7 +791,7 @@ define <16 x i32> @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] @@ -834,7 +834,7 @@ define <8 x i64> @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -879,7 +879,7 @@ define <16 x i32> @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] @@ -921,7 +921,7 @@ define <8 x i64> @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -964,7 +964,7 @@ define <8 x i64> @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -1004,7 +1004,7 @@ define <32 x i16> @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #16 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1, x9, lsl #1] ; 
VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] @@ -1044,7 +1044,7 @@ define <16 x i32> @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] @@ -1087,7 +1087,7 @@ define <8 x i64> @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -1132,7 +1132,7 @@ define <16 x i32> @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1, x9, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] @@ -1174,7 +1174,7 @@ define <8 x i64> @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -1217,7 +1217,7 @@ define <8 x i64> @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x9, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] @@ -1450,7 +1450,7 @@ ; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -1481,7 +1481,7 @@ ; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x9, #4 +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpgt p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -57,7 +57,7 @@ ; VBITS_GE_256-LABEL: masked_scatter_v8i8: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr d0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: cmeq v1.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: zip1 v5.8b, v0.8b, v0.8b @@ -203,7 +203,7 @@ ; VBITS_GE_256-LABEL: masked_scatter_v8i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: cmeq 
v1.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] @@ -332,7 +332,7 @@ ; VBITS_GE_256-LABEL: masked_scatter_v8i32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] @@ -467,7 +467,7 @@ define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_scatter_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -581,7 +581,7 @@ ; VBITS_GE_256-LABEL: masked_scatter_v8f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ldr q0, [x0] -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: fcmeq v1.8h, v0.8h, #0.0 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] @@ -710,7 +710,7 @@ ; VBITS_GE_256-LABEL: masked_scatter_v8f32: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] @@ -845,7 +845,7 @@ define void @masked_scatter_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: masked_scatter_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -911,13 +911,15 @@ ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw #1] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: st1h { z0.d }, p0, [x2, z1.d, lsl #1] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -932,10 +934,13 @@ ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: st1w { z0.s }, p0, [x2, z1.s, sxtw #2] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2] ; CHECK-NEXT: ret %vals = load <32 x float>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -968,13 +973,15 @@ ; CHECK-LABEL: masked_scatter_32b_scaled_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: ld1w { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw #1] 
+; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: st1h { z0.d }, p0, [x2, z1.d, lsl #1] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -989,13 +996,15 @@ ; CHECK-LABEL: masked_scatter_32b_unscaled_sext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: ld1sw { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: st1h { z0.d }, p0, [x2, z1.d] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1011,13 +1020,15 @@ ; CHECK-LABEL: masked_scatter_32b_unscaled_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: ld1w { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: st1h { z0.d }, p0, [x2, z1.d] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -84,7 +84,7 @@ define void @masked_store_v16f32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: masked_store_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -146,7 +146,7 @@ define void @masked_store_trunc_v8i64i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -187,7 +187,7 @@ define void @masked_store_trunc_v8i64i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -231,7 +231,7 @@ define void @masked_store_trunc_v8i64i32(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -272,7 +272,7 @@ define void @masked_store_trunc_v16i32i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: 
mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -316,7 +316,7 @@ define void @masked_store_trunc_v16i32i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -360,7 +360,7 @@ define void @masked_store_trunc_v32i16i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-LABEL: masked_store_trunc_v32i16i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-rev.ll @@ -165,7 +165,7 @@ define void @test_revhv32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: test_revhv32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ptrue p1.d ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] @@ -472,16 +472,16 @@ ; CHECK-NEXT: st1 { v1.h }[4], [x9] ; CHECK-NEXT: orr x9, x8, #0x4 ; CHECK-NEXT: st1 { v1.h }[5], [x10] -; CHECK-NEXT: mov w10, #26 +; CHECK-NEXT: mov w10, #26 // =0x1a ; CHECK-NEXT: orr x10, x8, x10 ; CHECK-NEXT: st1 { v0.h }[3], [x12] ; CHECK-NEXT: st1 { v1.h }[1], [x9] ; CHECK-NEXT: orr x9, x8, #0x2 ; CHECK-NEXT: st1 { v1.h }[7], [x11] -; CHECK-NEXT: mov w11, #20 -; CHECK-NEXT: mov w12, #18 +; CHECK-NEXT: mov w11, #20 // =0x14 +; CHECK-NEXT: mov w12, #18 // =0x12 ; CHECK-NEXT: st1 { v0.h }[6], [x10] -; CHECK-NEXT: mov w10, #10 +; CHECK-NEXT: mov w10, #10 // =0xa ; CHECK-NEXT: orr x11, x8, x11 ; CHECK-NEXT: st1 { v1.h }[2], [x9] ; CHECK-NEXT: orr x9, x8, x12 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll @@ -32,7 +32,7 @@ define void @zip_v32i16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: zip_v32i16: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10 ; VBITS_EQ_256-NEXT: ptrue p0.h ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -241,7 +241,7 @@ define void @trn_v32i16(ptr %a, ptr %b) #0 { ; VBITS_EQ_256-LABEL: trn_v32i16: ; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: mov x8, #16 +; VBITS_EQ_256-NEXT: mov x8, #16 // =0x10 ; VBITS_EQ_256-NEXT: ptrue p0.h ; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -514,7 +514,7 @@ define void @uzp_v32i16(ptr %a, ptr %b) #1 { ; CHECK-LABEL: uzp_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-rev.ll @@ -50,7 +50,7 @@ define void @bitreverse_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: bitreverse_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -142,7 +142,7 @@ define void @bitreverse_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: bitreverse_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -234,7 +234,7 @@ define void @bitreverse_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: bitreverse_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -326,7 +326,7 @@ define void @bitreverse_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: bitreverse_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -418,7 +418,7 @@ define void @bswap_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: bswap_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -506,7 +506,7 @@ define void @bswap_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: bswap_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -594,7 +594,7 @@ define void @bswap_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: bswap_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll @@ -47,7 +47,7 @@ define void @sdiv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: sdiv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -141,7 +141,7 @@ define void @sdiv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: sdiv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -236,7 +236,7 @@ define void @sdiv_v16i32(ptr %a) #0 { ; 
VBITS_GE_256-LABEL: sdiv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -331,7 +331,7 @@ define void @sdiv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: sdiv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -56,9 +56,9 @@ ; CHECK-NEXT: mov v1.b[5], w10 ; CHECK-NEXT: umov w10, v0.b[14] ; CHECK-NEXT: mov v2.b[5], w8 -; CHECK-NEXT: mov x8, #16 +; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: mov v1.b[6], w9 -; CHECK-NEXT: mov x9, #24 +; CHECK-NEXT: mov x9, #24 // =0x18 ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: mov v2.b[6], w10 ; CHECK-NEXT: umov w10, v0.b[15] @@ -71,7 +71,7 @@ ; CHECK-NEXT: mov v2.b[7], w10 ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: mov x11, #8 +; CHECK-NEXT: mov x11, #8 // =0x8 ; CHECK-NEXT: lsl z0.s, z0.s, #31 ; CHECK-NEXT: lsl z3.s, z3.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -47,7 +47,7 @@ define void @splat_v64i8(i8 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: mov z0.b, w0 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] @@ -130,7 +130,7 @@ define void @splat_v32i16(i16 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: mov z0.h, w0 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] @@ -213,7 +213,7 @@ define void @splat_v16i32(i32 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: mov z0.s, w0 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] @@ -296,7 +296,7 @@ define void @splat_v8i64(i64 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: mov z0.d, x0 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] @@ -386,7 +386,7 @@ define void @splat_v32f16(half %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, h0 @@ -476,7 +476,7 @@ define void @splat_v16f32(float %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov 
x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, s0 @@ -564,7 +564,7 @@ define void @splat_v8f64(double %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov z0.d, d0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-stores.ll @@ -52,7 +52,7 @@ define void @store_v16f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: store_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] @@ -86,9 +86,9 @@ define void @store_v32f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: store_v32f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #24 -; VBITS_GE_256-NEXT: mov x9, #16 -; VBITS_GE_256-NEXT: mov x10, #8 +; VBITS_GE_256-NEXT: mov x8, #24 // =0x18 +; VBITS_GE_256-NEXT: mov x9, #16 // =0x10 +; VBITS_GE_256-NEXT: mov x10, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -99,7 +99,7 @@ ; ; VBITS_GE_512-LABEL: store_v32f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov x8, #16 +; VBITS_GE_512-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] @@ -126,17 +126,17 @@ define void @store_v64f32(ptr %a) #0 { ; VBITS_GE_256-LABEL: store_v64f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #56 -; VBITS_GE_256-NEXT: mov x9, #48 +; VBITS_GE_256-NEXT: mov x8, #56 // =0x38 +; VBITS_GE_256-NEXT: mov x9, #48 // =0x30 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, #0 // =0x0 -; VBITS_GE_256-NEXT: mov x10, #40 -; VBITS_GE_256-NEXT: mov x11, #32 +; VBITS_GE_256-NEXT: mov x10, #40 // =0x28 +; VBITS_GE_256-NEXT: mov x11, #32 // =0x20 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: mov x8, #24 -; VBITS_GE_256-NEXT: mov x12, #16 +; VBITS_GE_256-NEXT: mov x8, #24 // =0x18 +; VBITS_GE_256-NEXT: mov x12, #16 // =0x10 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] -; VBITS_GE_256-NEXT: mov x9, #8 +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x10, lsl #2] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x11, lsl #2] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -147,9 +147,9 @@ ; ; VBITS_GE_512-LABEL: store_v64f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: mov x8, #48 -; VBITS_GE_512-NEXT: mov x9, #32 -; VBITS_GE_512-NEXT: mov x10, #16 +; VBITS_GE_512-NEXT: mov x8, #48 // =0x30 +; VBITS_GE_512-NEXT: mov x9, #32 // =0x20 +; VBITS_GE_512-NEXT: mov x10, #16 // =0x10 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov z0.s, #0 // =0x0 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -160,7 +160,7 @@ ; ; VBITS_GE_1024-LABEL: store_v64f32: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: mov x8, #32 +; VBITS_GE_1024-NEXT: mov x8, #32 // =0x20 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 ; VBITS_GE_1024-NEXT: mov z0.s, 
#0 // =0x0 ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll @@ -46,7 +46,7 @@ define void @subvector_v32i16(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: subvector_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -101,7 +101,7 @@ define void @subvector_v16i32(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: subvector_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -157,7 +157,7 @@ define void @subvector_v8i64(ptr %in, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: subvector_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: mov x8, #4 // =0x4 ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -234,7 +234,7 @@ define void @subvector_v32f16(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: subvector_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -289,7 +289,7 @@ define void @subvector_v16f32(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: subvector_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -343,7 +343,7 @@ define void @subvector_v8f64(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: subvector_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll @@ -34,7 +34,7 @@ define void @store_trunc_v8i64i8(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256-LABEL: store_trunc_v8i64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -88,7 +88,7 @@ ; Currently does not use the truncating store ; VBITS_GE_256-LABEL: store_trunc_v8i64i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -115,7 +115,7 @@ define void @store_trunc_v8i64i32(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256-LABEL: store_trunc_v8i64i32: ; VBITS_GE_256: // %bb.0: -; 
VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -143,7 +143,7 @@ ; Currently does not use the truncating store ; VBITS_GE_256-LABEL: store_trunc_v16i32i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -170,7 +170,7 @@ define void @store_trunc_v16i32i16(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256-LABEL: store_trunc_v16i32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -197,7 +197,7 @@ define void @store_trunc_v32i16i8(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256-LABEL: store_trunc_v32i16i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll @@ -26,7 +26,7 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: trunc_v32i16_v32i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -112,7 +112,7 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) #0 { ; VBITS_GE_256-LABEL: trunc_v16i32_v16i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -196,7 +196,7 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: trunc_v16i32_v16i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -283,7 +283,7 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) #0 { ; VBITS_GE_256-LABEL: trunc_v8i64_v8i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -366,7 +366,7 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) #0 { ; VBITS_GE_256-LABEL: trunc_v8i64_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -450,7 +450,7 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) #0 { ; VBITS_GE_256-LABEL: trunc_v8i64_v8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; 
VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll @@ -50,7 +50,7 @@ define void @shuffle_ext_byone_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] @@ -93,7 +93,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v128i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: mov w8, #127 +; CHECK-NEXT: mov w8, #127 // =0x7f ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: whilels p1.b, xzr, x8 @@ -127,7 +127,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v256i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl256 -; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov w8, #255 // =0xff ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: whilels p1.b, xzr, x8 @@ -215,7 +215,7 @@ define void @shuffle_ext_byone_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] @@ -254,7 +254,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v64i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: whilels p1.h, xzr, x8 @@ -280,7 +280,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v128i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: mov w8, #127 +; CHECK-NEXT: mov w8, #127 // =0x7f ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: whilels p1.h, xzr, x8 @@ -351,7 +351,7 @@ define void @shuffle_ext_byone_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] @@ -388,7 +388,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v32i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: whilels p1.s, xzr, x8 @@ -410,7 +410,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v64i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: whilels p1.s, xzr, x8 @@ -463,7 +463,7 @@ define void @shuffle_ext_byone_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, 
vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] @@ -499,7 +499,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: mov w8, #15 +; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: whilels p1.d, xzr, x8 @@ -519,7 +519,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v32i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: whilels p1.d, xzr, x8 @@ -578,7 +578,7 @@ define void @shuffle_ext_byone_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] @@ -614,7 +614,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v64f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: whilels p1.h, xzr, x8 @@ -640,7 +640,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v128f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: mov w8, #127 +; CHECK-NEXT: mov w8, #127 // =0x7f ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: whilels p1.h, xzr, x8 @@ -710,7 +710,7 @@ define void @shuffle_ext_byone_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] @@ -744,7 +744,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v32f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: whilels p1.s, xzr, x8 @@ -766,7 +766,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v64f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: whilels p1.s, xzr, x8 @@ -818,7 +818,7 @@ define void @shuffle_ext_byone_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: shuffle_ext_byone_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] @@ -851,7 +851,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: mov w8, #15 +; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: whilels p1.d, xzr, x8 @@ -871,7 +871,7 @@ ; CHECK-LABEL: shuffle_ext_byone_v32f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 -; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, 
[x1] ; CHECK-NEXT: whilels p1.d, xzr, x8 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-zext.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -opaque-pointers -aarch64-sve-vector-bits-min=256 -verify-machineinstrs | FileCheck %s --check-prefixes=SVE256 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=neoverse-v1 -O3 -opaque-pointers -aarch64-sve-vector-bits-min=128 -verify-machineinstrs | FileCheck %s --check-prefixes=NEON @@ -6,24 +7,38 @@ define internal i32 @test(ptr nocapture readonly %p1, i32 %i1, ptr nocapture readonly %p2, i32 %i2) { ; SVE256-LABEL: test: -; SVE256: ld1b { z0.h }, p0/z, -; SVE256: ld1b { z1.h }, p0/z, -; SVE256: sub z0.h, z0.h, z1.h -; SVE256-NEXT: sunpklo z1.s, z0.h -; SVE256-NEXT: ext z0.b, z0.b, z0.b, #16 -; SVE256-NEXT: sunpklo z0.s, z0.h -; SVE256-NEXT: add z0.s, z1.s, z0.s -; SVE256-NEXT: uaddv d0, p1, z0.s +; SVE256: // %bb.0: // %L.entry +; SVE256-NEXT: ptrue p0.s, vl8 +; SVE256-NEXT: mov w9, wzr +; SVE256-NEXT: mov w10, wzr +; SVE256-NEXT: mov w8, wzr +; SVE256-NEXT: mov w11, #-16 // =0xfffffff0 +; SVE256-NEXT: mov w12, #8 // =0x8 +; SVE256-NEXT: .p2align 5, , 16 +; SVE256-NEXT: .LBB0_1: // %L1 +; SVE256-NEXT: // =>This Inner Loop Header: Depth=1 +; SVE256-NEXT: sxtw x13, w9 +; SVE256-NEXT: sxtw x15, w10 +; SVE256-NEXT: adds w11, w11, #1 +; SVE256-NEXT: add w10, w10, w3 +; SVE256-NEXT: ld1b { z1.s }, p0/z, [x0, x13] +; SVE256-NEXT: add x14, x0, x13 +; SVE256-NEXT: add x16, x2, x15 +; SVE256-NEXT: ld1b { z3.s }, p0/z, [x2, x15] +; SVE256-NEXT: add w9, w9, w1 +; SVE256-NEXT: ld1b { z0.s }, p0/z, [x14, x12] +; SVE256-NEXT: ld1b { z2.s }, p0/z, [x16, x12] +; SVE256-NEXT: sub z1.s, z1.s, z3.s +; SVE256-NEXT: sub z0.s, z0.s, z2.s +; SVE256-NEXT: add z0.s, z1.s, z0.s +; SVE256-NEXT: uaddv d0, p0, z0.s +; SVE256-NEXT: fmov x13, d0 +; SVE256-NEXT: add w8, w13, w8 +; SVE256-NEXT: b.lo .LBB0_1 +; SVE256-NEXT: // %bb.2: // %L2 +; SVE256-NEXT: mov w0, w8 +; SVE256-NEXT: ret -; NEON-LABEL: test: -; NEON: ldr q0, [x0, w9, sxtw] -; NEON: ldr q1, [x2, w10, sxtw] -; NEON: usubl2 v2.8h, v0.16b, v1.16b -; NEON-NEXT: usubl v0.8h, v0.8b, v1.8b -; NEON: saddl2 v1.4s, v0.8h, v2.8h -; NEON-NEXT: saddl v0.4s, v0.4h, v2.4h -; NEON-NEXT: add v0.4s, v0.4s, v1.4s -; NEON-NEXT: addv s0, v0.4s L.entry: br label %L1 @@ -55,3 +70,5 @@ } declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; NEON: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll --- a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll +++ b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll @@ -33,9 +33,8 @@ ; CHECK-LABEL: sti32ldi32ext: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: sxtw z0.d, p0/m, z0.d -; CHECK-NEXT: st1w { z1.d }, p0, [x0] +; CHECK-NEXT: st1w { z0.d }, p0, [x0] +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] ; CHECK-NEXT: ret entry: %0 = trunc %v to diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll --- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll +++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll @@ -6,7 +6,7 @@ define void @scatter_i8_index_offset_maximum(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_i8_index_offset_maximum: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #33554431 +; CHECK-NEXT: mov w8, #33554431 // =0x1ffffff ; CHECK-NEXT: add x9, x0, x1 ; CHECK-NEXT: index z1.s, #0, w8 ; CHECK-NEXT: st1b { z0.s }, p0, [x9, z1.s, sxtw] @@ -27,7 +27,7 @@ define void @scatter_i16_index_offset_minimum(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_i16_index_offset_minimum: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-33554432 +; CHECK-NEXT: mov w8, #-33554432 // =0xfe000000 ; CHECK-NEXT: add x9, x0, x1, lsl #1 ; CHECK-NEXT: index z1.s, #0, w8 ; CHECK-NEXT: st1h { z0.s }, p0, [x9, z1.s, sxtw #1] @@ -102,10 +102,10 @@ ; CHECK-LABEL: scatter_i8_index_offset_maximum_plus_one: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: mov w9, #67108864 +; CHECK-NEXT: mov w9, #67108864 // =0x4000000 ; CHECK-NEXT: lsr x8, x8, #4 ; CHECK-NEXT: add x11, x0, x1 -; CHECK-NEXT: mov w10, #33554432 +; CHECK-NEXT: mov w10, #33554432 // =0x2000000 ; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: madd x8, x8, x9, x11 ; CHECK-NEXT: uunpklo z2.d, z0.s @@ -132,11 +132,11 @@ ; CHECK-LABEL: scatter_i8_index_offset_minimum_minus_one: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: mov x9, #-2 +; CHECK-NEXT: mov x9, #-2 // =0xfffffffffffffffe ; CHECK-NEXT: lsr x8, x8, #4 ; CHECK-NEXT: movk x9, #64511, lsl #16 ; CHECK-NEXT: add x11, x0, x1 -; CHECK-NEXT: mov x10, #-33554433 +; CHECK-NEXT: mov x10, #-33554433 // =0xfffffffffdffffff ; CHECK-NEXT: madd x8, x8, x9, x11 ; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: uunpklo z2.d, z0.s @@ -163,10 +163,10 @@ ; CHECK-LABEL: scatter_i8_index_stride_too_big: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: mov x9, #-9223372036854775808 +; CHECK-NEXT: mov x9, #-9223372036854775808 // =0x8000000000000000 ; CHECK-NEXT: lsr x8, x8, #4 ; CHECK-NEXT: add x11, x0, x1 -; CHECK-NEXT: mov x10, #4611686018427387904 +; CHECK-NEXT: mov x10, #4611686018427387904 // =0x4000000000000000 ; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: madd x8, x8, x9, x11 ; CHECK-NEXT: uunpklo z2.d, z0.s @@ -214,7 +214,7 @@ define @gather_f32_index_offset_8(ptr %base, i64 %offset, %pg) #0 { ; CHECK-LABEL: gather_f32_index_offset_8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: mov w8, #32 // =0x20 ; CHECK-NEXT: add x9, x0, x1, lsl #5 ; CHECK-NEXT: index z0.s, #0, w8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9, z0.s, sxtw] @@ -255,7 +255,7 @@ define void @scatter_f16_index_offset_8(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_f16_index_offset_8: ; CHECK: // %bb.0: -; 
CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: mov w8, #16 // =0x10 ; CHECK-NEXT: add x9, x0, x1, lsl #4 ; CHECK-NEXT: index z1.s, #0, w8 ; CHECK-NEXT: st1h { z0.s }, p0, [x9, z1.s, sxtw] @@ -274,7 +274,7 @@ define void @scatter_f16_index_add_add(ptr %base, i64 %offset, i64 %offset2, %pg, %data) #0 { ; CHECK-LABEL: scatter_f16_index_add_add: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: mov w8, #16 // =0x10 ; CHECK-NEXT: add x9, x0, x2, lsl #4 ; CHECK-NEXT: add x9, x9, x1, lsl #4 ; CHECK-NEXT: index z1.s, #0, w8 @@ -297,7 +297,7 @@ define void @scatter_f16_index_add_add_mul(ptr %base, i64 %offset, i64 %offset2, %pg, %data) #0 { ; CHECK-LABEL: scatter_f16_index_add_add_mul: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #128 +; CHECK-NEXT: mov w8, #128 // =0x80 ; CHECK-NEXT: add x9, x0, x2, lsl #7 ; CHECK-NEXT: add x9, x9, x1, lsl #7 ; CHECK-NEXT: index z1.s, #0, w8 @@ -322,7 +322,7 @@ define @masked_gather_nxv2i64_const_with_vec_offsets( %vector_offsets, %pg) #0 { ; CHECK-LABEL: masked_gather_nxv2i64_const_with_vec_offsets: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 // =0x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3] ; CHECK-NEXT: ret %ptrs = getelementptr i64, ptr inttoptr (i64 8 to ptr), %vector_offsets @@ -347,7 +347,7 @@ define @masked_gather_nxv2i64_null_with__vec_plus_imm_offsets( %vector_offsets, %pg) #0 { ; CHECK-LABEL: masked_gather_nxv2i64_null_with__vec_plus_imm_offsets: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 // =0x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3] ; CHECK-NEXT: ret %scalar_offset.ins = insertelement undef, i64 1, i64 0 @@ -375,7 +375,7 @@ ; CHECK-LABEL: masked_gather_nxv4i32_u8_offsets: ; CHECK: // %bb.0: ; CHECK-NEXT: and z0.s, z0.s, #0xff -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i32, ptr %base, %offsets.zext @@ -400,7 +400,7 @@ define void @masked_scatter_nxv2i64_const_with_vec_offsets( %vector_offsets, %pg, %data) #0 { ; CHECK-LABEL: masked_scatter_nxv2i64_const_with_vec_offsets: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 // =0x8 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, z0.d, lsl #3] ; CHECK-NEXT: ret %ptrs = getelementptr i64, ptr inttoptr (i64 8 to ptr), %vector_offsets @@ -425,7 +425,7 @@ define void @masked_scatter_nxv2i64_null_with__vec_plus_imm_offsets( %vector_offsets, %pg, %data) #0 { ; CHECK-LABEL: masked_scatter_nxv2i64_null_with__vec_plus_imm_offsets: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w8, #8 // =0x8 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, z0.d, lsl #3] ; CHECK-NEXT: ret %scalar_offset.ins = insertelement undef, i64 1, i64 0 @@ -453,7 +453,7 @@ ; CHECK-LABEL: masked_scatter_nxv4i32_u8_offsets: ; CHECK: // %bb.0: ; CHECK-NEXT: and z0.s, z0.s, #0xff -; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, uxtw #2] +; CHECK-NEXT: st1w { z1.s }, p0, [x0, z0.s, sxtw #2] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i32, ptr %base, %offsets.zext diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll --- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll +++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll @@ -77,18 +77,30 @@ ; CHECK-LABEL: narrow_i64_gather_index_i8_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: add x8, x1, x2 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1b { 
z0.s }, p0/z, [x1, x2] -; CHECK-NEXT: ld1b { z1.s }, p0/z, [x8, #1, mul vl] -; CHECK-NEXT: ld1b { z2.s }, p0/z, [x8, #2, mul vl] -; CHECK-NEXT: ld1b { z3.s }, p0/z, [x8, #3, mul vl] -; CHECK-NEXT: ld1b { z3.s }, p0/z, [x1, z3.s, uxtw] -; CHECK-NEXT: ld1b { z2.s }, p0/z, [x1, z2.s, uxtw] -; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1, z0.s, uxtw] -; CHECK-NEXT: ld1b { z1.s }, p0/z, [x1, z1.s, uxtw] -; CHECK-NEXT: uzp1 z2.h, z2.h, z3.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1, x2] +; CHECK-NEXT: ld1b { z1.d }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ld1b { z2.d }, p0/z, [x8, #2, mul vl] +; CHECK-NEXT: ld1b { z3.d }, p0/z, [x8, #3, mul vl] +; CHECK-NEXT: ld1b { z4.d }, p0/z, [x8, #4, mul vl] +; CHECK-NEXT: ld1b { z5.d }, p0/z, [x8, #5, mul vl] +; CHECK-NEXT: ld1b { z6.d }, p0/z, [x8, #6, mul vl] +; CHECK-NEXT: ld1b { z7.d }, p0/z, [x8, #7, mul vl] +; CHECK-NEXT: ld1b { z7.d }, p0/z, [x1, z7.d] +; CHECK-NEXT: ld1b { z6.d }, p0/z, [x1, z6.d] +; CHECK-NEXT: ld1b { z5.d }, p0/z, [x1, z5.d] +; CHECK-NEXT: ld1b { z4.d }, p0/z, [x1, z4.d] +; CHECK-NEXT: ld1b { z3.d }, p0/z, [x1, z3.d] +; CHECK-NEXT: ld1b { z2.d }, p0/z, [x1, z2.d] +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1, z0.d] +; CHECK-NEXT: ld1b { z1.d }, p0/z, [x1, z1.d] +; CHECK-NEXT: uzp1 z6.s, z6.s, z7.s +; CHECK-NEXT: uzp1 z4.s, z4.s, z5.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z4.h, z6.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b ; CHECK-NEXT: ret %1 = getelementptr inbounds i8, i8* %in, i64 %ptr %2 = bitcast i8* %1 to * @@ -103,18 +115,30 @@ ; CHECK-LABEL: narrow_i64_gather_index_i8_sext: ; CHECK: // %bb.0: ; CHECK-NEXT: add x8, x1, x2 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x1, x2] -; CHECK-NEXT: ld1sb { z1.s }, p0/z, [x8, #1, mul vl] -; CHECK-NEXT: ld1sb { z2.s }, p0/z, [x8, #2, mul vl] -; CHECK-NEXT: ld1sb { z3.s }, p0/z, [x8, #3, mul vl] -; CHECK-NEXT: ld1b { z3.s }, p0/z, [x1, z3.s, sxtw] -; CHECK-NEXT: ld1b { z2.s }, p0/z, [x1, z2.s, sxtw] -; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1, z0.s, sxtw] -; CHECK-NEXT: ld1b { z1.s }, p0/z, [x1, z1.s, sxtw] -; CHECK-NEXT: uzp1 z2.h, z2.h, z3.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h -; CHECK-NEXT: uzp1 z0.b, z0.b, z2.b +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x1, x2] +; CHECK-NEXT: ld1sb { z1.d }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ld1sb { z2.d }, p0/z, [x8, #2, mul vl] +; CHECK-NEXT: ld1sb { z3.d }, p0/z, [x8, #3, mul vl] +; CHECK-NEXT: ld1sb { z4.d }, p0/z, [x8, #4, mul vl] +; CHECK-NEXT: ld1sb { z5.d }, p0/z, [x8, #5, mul vl] +; CHECK-NEXT: ld1sb { z6.d }, p0/z, [x8, #6, mul vl] +; CHECK-NEXT: ld1sb { z7.d }, p0/z, [x8, #7, mul vl] +; CHECK-NEXT: ld1b { z7.d }, p0/z, [x1, z7.d] +; CHECK-NEXT: ld1b { z6.d }, p0/z, [x1, z6.d] +; CHECK-NEXT: ld1b { z5.d }, p0/z, [x1, z5.d] +; CHECK-NEXT: ld1b { z4.d }, p0/z, [x1, z4.d] +; CHECK-NEXT: ld1b { z3.d }, p0/z, [x1, z3.d] +; CHECK-NEXT: ld1b { z2.d }, p0/z, [x1, z2.d] +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1, z0.d] +; CHECK-NEXT: ld1b { z1.d }, p0/z, [x1, z1.d] +; CHECK-NEXT: uzp1 z6.s, z6.s, z7.s +; CHECK-NEXT: uzp1 z4.s, z4.s, z5.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z4.h, z6.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b ; CHECK-NEXT: ret %1 = getelementptr inbounds i8, i8* %in, i64 %ptr %2 = bitcast i8* %1 to * @@ -129,12 +153,18 @@ ; 
CHECK-LABEL: narrow_i64_gather_index_i16_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: add x8, x1, x2, lsl #1 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1, x2, lsl #1] -; CHECK-NEXT: ld1h { z1.s }, p0/z, [x8, #1, mul vl] -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1, z0.s, uxtw #1] -; CHECK-NEXT: ld1h { z1.s }, p0/z, [x1, z1.s, uxtw #1] -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1, x2, lsl #1] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ld1h { z2.d }, p0/z, [x8, #2, mul vl] +; CHECK-NEXT: ld1h { z3.d }, p0/z, [x8, #3, mul vl] +; CHECK-NEXT: ld1h { z3.d }, p0/z, [x1, z3.d, lsl #1] +; CHECK-NEXT: ld1h { z2.d }, p0/z, [x1, z2.d, lsl #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1, z0.d, lsl #1] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x1, z1.d, lsl #1] +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h ; CHECK-NEXT: ret %1 = getelementptr inbounds i16, i16* %in, i64 %ptr %2 = bitcast i16* %1 to * @@ -149,12 +179,18 @@ ; CHECK-LABEL: narrow_i64_gather_index_i16_sext: ; CHECK: // %bb.0: ; CHECK-NEXT: add x8, x1, x2, lsl #1 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x1, x2, lsl #1] -; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x8, #1, mul vl] -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1, z0.s, sxtw #1] -; CHECK-NEXT: ld1h { z1.s }, p0/z, [x1, z1.s, sxtw #1] -; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x1, x2, lsl #1] +; CHECK-NEXT: ld1sh { z1.d }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ld1sh { z2.d }, p0/z, [x8, #2, mul vl] +; CHECK-NEXT: ld1sh { z3.d }, p0/z, [x8, #3, mul vl] +; CHECK-NEXT: ld1h { z3.d }, p0/z, [x1, z3.d, lsl #1] +; CHECK-NEXT: ld1h { z2.d }, p0/z, [x1, z2.d, lsl #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1, z0.d, lsl #1] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x1, z1.d, lsl #1] +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h ; CHECK-NEXT: ret %1 = getelementptr inbounds i16, i16* %in, i64 %ptr %2 = bitcast i16* %1 to * @@ -168,9 +204,13 @@ define @no_narrow_i64_gather_index_i32(i32* %out, i32* %in, %d, i64 %ptr){ ; CHECK-LABEL: no_narrow_i64_gather_index_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1, x2, lsl #2] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1, z0.s, uxtw #2] +; CHECK-NEXT: add x8, x1, x2, lsl #2 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1, x2, lsl #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1, z0.d, lsl #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x1, z1.d, lsl #2] +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: ret %1 = getelementptr inbounds i32, i32* %in, i64 %ptr %2 = bitcast i32* %1 to * diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll @@ -44,8 +44,8 @@ define @index_ii_range() { ; CHECK-LABEL: index_ii_range: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #16 -; CHECK-NEXT: mov x9, #-17 +; CHECK-NEXT: mov w8, #16 // =0x10 +; CHECK-NEXT: mov x9, #-17 // =0xffffffffffffffef ; CHECK-NEXT: index z0.d, x9, x8 ; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.index.nxv2i64(i64 -17, i64 16) @@ -55,8 +55,7 @@ define @index_ii_range_combine(i16 %a) { ; CHECK-LABEL: index_ii_range_combine: ; CHECK: // %bb.0: -; CHECK-NEXT: index 
z0.h, #0, #8 -; CHECK-NEXT: orr z0.h, z0.h, #0x2 +; CHECK-NEXT: index z0.h, #2, #8 ; CHECK-NEXT: ret %val = insertelement poison, i16 2, i32 0 %val1 = shufflevector %val, poison, zeroinitializer @@ -109,7 +108,7 @@ define @index_ir_range(i32 %a) { ; CHECK-LABEL: index_ir_range: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-17 +; CHECK-NEXT: mov w8, #-17 // =0xffffffef ; CHECK-NEXT: index z0.s, w8, w0 ; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.index.nxv4i32(i32 -17, i32 %a) @@ -174,7 +173,7 @@ define @index_ri_range(i16 %a) { ; CHECK-LABEL: index_ri_range: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: mov w8, #16 // =0x10 ; CHECK-NEXT: index z0.h, w0, w8 ; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.index.nxv8i16(i16 %a, i16 16) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll --- a/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-zext.ll @@ -100,7 +100,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.s ; CHECK-NEXT: ret %wide.load = call @llvm.masked.load.nxv2i16(* %in, i32 2, %mask, undef) %zext = zext %wide.load to diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll @@ -48,11 +48,11 @@ define void @ctlz_v32i8(ptr %a) #0 { ; CHECK-LABEL: ctlz_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: clz z1.b, p0/m, z1.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: clz z0.b, p0/m, z0.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op) @@ -101,11 +101,11 @@ define void @ctlz_v16i16(ptr %a) #0 { ; CHECK-LABEL: ctlz_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: clz z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: clz z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op) @@ -140,11 +140,11 @@ define void @ctlz_v8i32(ptr %a) #0 { ; CHECK-LABEL: ctlz_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: clz z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: clz z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op) @@ -179,11 +179,11 @@ define void @ctlz_v4i64(ptr %a) #0 { ; CHECK-LABEL: ctlz_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: clz z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: clz z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op) @@ -235,11 +235,11 @@ define void @ctpop_v32i8(ptr %a) #0 { 
; CHECK-LABEL: ctpop_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: cnt z0.b, p0/m, z0.b ; CHECK-NEXT: cnt z1.b, p0/m, z1.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: cnt z0.b, p0/m, z0.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op) @@ -287,11 +287,11 @@ define void @ctpop_v16i16(ptr %a) #0 { ; CHECK-LABEL: ctpop_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: cnt z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: cnt z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op) @@ -326,11 +326,11 @@ define void @ctpop_v8i32(ptr %a) #0 { ; CHECK-LABEL: ctpop_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: cnt z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: cnt z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op) @@ -365,11 +365,11 @@ define void @ctpop_v4i64(ptr %a) #0 { ; CHECK-LABEL: ctpop_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: cnt z0.d, p0/m, z0.d ; CHECK-NEXT: cnt z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: cnt z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op) @@ -424,13 +424,13 @@ define void @cttz_v32i8(ptr %a) #0 { ; CHECK-LABEL: cttz_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: rbit z0.b, p0/m, z0.b -; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: rbit z1.b, p0/m, z1.b ; CHECK-NEXT: clz z1.b, p0/m, z1.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.b, p0/m, z0.b +; CHECK-NEXT: clz z0.b, p0/m, z0.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op) @@ -481,13 +481,13 @@ define void @cttz_v16i16(ptr %a) #0 { ; CHECK-LABEL: cttz_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: rbit z0.h, p0/m, z0.h -; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: rbit z1.h, p0/m, z1.h ; CHECK-NEXT: clz z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.h, p0/m, z0.h +; CHECK-NEXT: clz z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op) @@ -524,13 +524,13 @@ define void @cttz_v8i32(ptr %a) #0 { ; CHECK-LABEL: cttz_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: rbit z0.s, p0/m, z0.s -; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: rbit z1.s, p0/m, z1.s ; CHECK-NEXT: clz z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.s, p0/m, z0.s +; CHECK-NEXT: clz z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 
x i32> @llvm.cttz.v8i32(<8 x i32> %op) @@ -567,13 +567,13 @@ define void @cttz_v4i64(ptr %a) #0 { ; CHECK-LABEL: cttz_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: rbit z0.d, p0/m, z0.d -; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: rbit z1.d, p0/m, z1.d ; CHECK-NEXT: clz z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.d, p0/m, z0.d +; CHECK-NEXT: clz z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll @@ -77,10 +77,10 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -156,10 +156,10 @@ define void @concat_v32i16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -219,10 +219,10 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -267,10 +267,10 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -334,10 +334,10 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v32f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -397,10 +397,10 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp 
q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -445,10 +445,10 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: concat_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: stp q0, q1, [x2, #32] -; CHECK-NEXT: stp q2, q3, [x2] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q1, q0, [x2, #32] +; CHECK-NEXT: stp q3, q2, [x2] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -44,15 +44,15 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) #0 { ; CHECK-LABEL: test_copysign_v16f16_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: and z0.h, z0.h, #0x8000 -; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: and z2.h, z2.h, #0x7fff -; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: and z0.h, z0.h, #0x8000 ; CHECK-NEXT: and z3.h, z3.h, #0x7fff ; CHECK-NEXT: orr z1.d, z3.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z2.h, z2.h, #0x7fff +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %a = load <16 x half>, ptr %ap %b = load <16 x half>, ptr %bp @@ -100,15 +100,15 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) #0 { ; CHECK-LABEL: test_copysign_v8f32_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: and z0.s, z0.s, #0x80000000 -; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: and z2.s, z2.s, #0x7fffffff -; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: and z0.s, z0.s, #0x80000000 ; CHECK-NEXT: and z3.s, z3.s, #0x7fffffff ; CHECK-NEXT: orr z1.d, z3.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z2.s, z2.s, #0x7fffffff +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %a = load <8 x float>, ptr %ap %b = load <8 x float>, ptr %bp @@ -139,15 +139,15 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) #0 { ; CHECK-LABEL: test_copysign_v4f64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] -; CHECK-NEXT: and z0.d, z0.d, #0x8000000000000000 -; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 -; CHECK-NEXT: and z2.d, z2.d, #0x7fffffffffffffff -; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: and z0.d, z0.d, #0x8000000000000000 ; CHECK-NEXT: and z3.d, z3.d, #0x7fffffffffffffff ; CHECK-NEXT: orr z1.d, z3.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z2.d, z2.d, #0x7fffffffffffffff +; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x double>, ptr %bp @@ -237,17 +237,17 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #2 // =0x2 ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ld1w { z2.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: 
ldp q0, q2, [x0] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x1, x8, lsl #2] ; CHECK-NEXT: ld1w { z3.d }, p0/z, [x1] ; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; CHECK-NEXT: fcvt z1.d, p0/m, z1.s ; CHECK-NEXT: fcvt z3.d, p0/m, z3.s -; CHECK-NEXT: fcvt z2.d, p0/m, z2.s -; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff +; CHECK-NEXT: and z2.d, z2.d, #0x7fffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 ; CHECK-NEXT: and z3.d, z3.d, #0x8000000000000000 -; CHECK-NEXT: and z2.d, z2.d, #0x8000000000000000 +; CHECK-NEXT: orr z1.d, z2.d, z1.d ; CHECK-NEXT: orr z0.d, z0.d, z3.d -; CHECK-NEXT: orr z1.d, z1.d, z2.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %a = load <4 x double>, ptr %ap diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll @@ -49,12 +49,12 @@ define void @fadd_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fadd_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -92,12 +92,12 @@ define void @fadd_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fadd_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -122,12 +122,12 @@ define void @fadd_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fadd_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -182,12 +182,12 @@ define void @fdiv_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fdiv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fdiv z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -225,12 +225,12 @@ define void @fdiv_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fdiv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fdiv z1.s, p0/m, z1.s, 
z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -255,12 +255,12 @@ define void @fdiv_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fdiv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fdiv z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -318,14 +318,14 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fma_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ldp q5, q4, [x2] ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z4.h ; CHECK-NEXT: movprfx z1, z5 ; CHECK-NEXT: fmla z1.h, p0/m, z2.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -366,14 +366,14 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fma_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ldp q5, q4, [x2] ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z4.s ; CHECK-NEXT: movprfx z1, z5 ; CHECK-NEXT: fmla z1.s, p0/m, z2.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -400,14 +400,14 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fma_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: ldp q4, q5, [x2] +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ldp q5, q4, [x2] ; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z4.d ; CHECK-NEXT: movprfx z1, z5 ; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -463,12 +463,12 @@ define void @fmul_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmul_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -506,12 +506,12 @@ define void @fmul_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmul_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] 
; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -536,12 +536,12 @@ define void @fmul_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmul_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmul z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -593,11 +593,11 @@ define void @fneg_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fneg_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: fneg z0.h, p0/m, z0.h ; CHECK-NEXT: fneg z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fneg z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = fneg <16 x half> %op @@ -632,11 +632,11 @@ define void @fneg_v8f32(ptr %a) #0 { ; CHECK-LABEL: fneg_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: fneg z0.s, p0/m, z0.s ; CHECK-NEXT: fneg z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fneg z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = fneg <8 x float> %op @@ -659,11 +659,11 @@ define void @fneg_v4f64(ptr %a) #0 { ; CHECK-LABEL: fneg_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fneg z0.d, p0/m, z0.d ; CHECK-NEXT: fneg z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fneg z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = fneg <4 x double> %op @@ -714,11 +714,11 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fsqrt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h ; CHECK-NEXT: fsqrt z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op) @@ -753,11 +753,11 @@ define void @fsqrt_v8f32(ptr %a) #0 { ; CHECK-LABEL: fsqrt_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s ; CHECK-NEXT: fsqrt z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op) @@ -780,11 +780,11 @@ define void @fsqrt_v4f64(ptr %a) #0 { ; CHECK-LABEL: fsqrt_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d ; CHECK-NEXT: fsqrt z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op) @@ -838,12 +838,12 @@ define void @fsub_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: 
fsub_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fsub z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -881,12 +881,12 @@ define void @fsub_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fsub_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -911,12 +911,12 @@ define void @fsub_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fsub_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fsub z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -968,11 +968,11 @@ define void @fabs_v16f16(ptr %a) #0 { ; CHECK-LABEL: fabs_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: fabs z0.h, p0/m, z0.h ; CHECK-NEXT: fabs z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fabs z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op) @@ -1007,11 +1007,11 @@ define void @fabs_v8f32(ptr %a) #0 { ; CHECK-LABEL: fabs_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: fabs z0.s, p0/m, z0.s ; CHECK-NEXT: fabs z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fabs z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op) @@ -1034,11 +1034,11 @@ define void @fabs_v4f64(ptr %a) #0 { ; CHECK-LABEL: fabs_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fabs z0.d, p0/m, z0.d ; CHECK-NEXT: fabs z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fabs z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll @@ -56,14 +56,14 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_oeq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; 
CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -106,14 +106,14 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_oeq_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s -; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -156,14 +156,14 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_oeq_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d -; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -180,18 +180,18 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ueq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p2.h, p0/z, z1.h, z0.h ; CHECK-NEXT: mov p1.b, p2/m, p2.b ; CHECK-NEXT: fcmuo p2.h, p0/z, z2.h, z3.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: sel p0.b, p0, p0.b, p2.b ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: sel p0.b, p0, p0.b, p2.b ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -208,18 +208,18 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_one_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmgt p2.h, p0/z, z1.h, z0.h ; CHECK-NEXT: mov p1.b, p2/m, p2.b ; CHECK-NEXT: fcmgt p2.h, p0/z, z3.h, z2.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: sel p0.b, p0, p0.b, p2.b ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: sel p0.b, p0, p0.b, p2.b ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -236,14 +236,14 @@ define 
void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_une_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmne p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmne p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -260,14 +260,14 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ogt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -284,17 +284,17 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ugt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h ; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h -; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -311,14 +311,14 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_olt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -335,17 +335,17 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ult_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h ; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d ; 
CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -362,14 +362,14 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_oge_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -386,17 +386,17 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_uge_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h ; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h -; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -413,14 +413,14 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ole_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -437,17 +437,17 @@ define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ule_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h ; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -464,14 +464,14 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_uno_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmuo 
p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -488,17 +488,17 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ord_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h ; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d ; CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: stp q0, q1, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -515,14 +515,14 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_eq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -539,14 +539,14 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_ne_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmne p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmne p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -563,14 +563,14 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_gt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x2] +; CHECK-NEXT: stp q1, q0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -587,14 +587,14 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fcmp_lt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q3, q0, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] 
+; CHECK-NEXT: ldp q2, q1, [x0]
 ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: stp q0, q1, [x2]
+; CHECK-NEXT: stp q1, q0, [x2]
 ; CHECK-NEXT: ret
 %op1 = load <16 x half>, ptr %a
 %op2 = load <16 x half>, ptr %b
@@ -611,14 +611,14 @@
 define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) #0 {
 ; CHECK-LABEL: fcmp_ge_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
 ; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: ldp q2, q1, [x0]
 ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: stp q0, q1, [x2]
+; CHECK-NEXT: stp q1, q0, [x2]
 ; CHECK-NEXT: ret
 %op1 = load <16 x half>, ptr %a
 %op2 = load <16 x half>, ptr %b
@@ -635,14 +635,14 @@
 define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) #0 {
 ; CHECK-LABEL: fcmp_le_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
 ; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: ldp q2, q1, [x0]
 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h
+; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: stp q0, q1, [x2]
+; CHECK-NEXT: stp q1, q0, [x2]
 ; CHECK-NEXT: ret
 %op1 = load <16 x half>, ptr %a
 %op2 = load <16 x half>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -7,14 +7,14 @@
 define void @fp_convert_combine_crash(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: fp_convert_combine_crash:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
 ; CHECK-NEXT: fmov z2.s, #8.00000000
 ; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z2.s
-; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
 ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s
-; CHECK-NEXT: stp q0, q1, [x1]
+; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: stp q1, q0, [x1]
 ; CHECK-NEXT: ret
 %f = load <8 x float>, ptr %a
 %mul.i = fmul <8 x float> %f, , ptr %a
 %res = fpext <16 x half> %op1 to <16 x float>
@@ -204,8 +204,8 @@
 define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: fcvt_v8f16_v8f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #4 // =0x4
-; CHECK-NEXT: mov x9, #6 // =0x6
+; CHECK-NEXT: mov x8, #6 // =0x6
+; CHECK-NEXT: mov x9, #4 // =0x4
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: mov x10, #2 // =0x2
 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
@@ -214,12 +214,12 @@
 ; CHECK-NEXT: ld1h { z3.d }, p0/z, [x0]
 ; CHECK-NEXT: fcvt z0.d, p0/m, z0.h
 ; CHECK-NEXT: fcvt z1.d, p0/m, z1.h
-; CHECK-NEXT: stp q0, q1, [x1, #32]
-; CHECK-NEXT: movprfx z0, z3
-; CHECK-NEXT: fcvt z0.d, p0/m, z3.h
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fcvt z1.d, p0/m, z2.h
-; CHECK-NEXT: stp q0, q1, [x1]
+; CHECK-NEXT: stp q1, q0, [x1, #32]
+; CHECK-NEXT: movprfx z0, z2
+; CHECK-NEXT: fcvt z0.d, p0/m, z2.h
+; CHECK-NEXT: movprfx z1, z3
+; CHECK-NEXT: fcvt z1.d, p0/m, z3.h
+; CHECK-NEXT: stp q1, q0, [x1]
 ; CHECK-NEXT: ret
 %op1 = load <8 x half>, ptr %a
 %res = fpext <8 x half> %op1 to <8 x double>
@@ -230,16 +230,16 @@
 define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: fcvt_v16f16_v16f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, #14 // =0xe
-; CHECK-NEXT: mov x10, #12 // =0xc
+; CHECK-NEXT: mov x9, #12 // =0xc
+; CHECK-NEXT: mov x10, #14 // =0xe
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: mov x8, #2 // =0x2
-; CHECK-NEXT: mov x11, #6 // =0x6
-; CHECK-NEXT: mov x12, #4 // =0x4
+; CHECK-NEXT: mov x11, #4 // =0x4
+; CHECK-NEXT: mov x12, #6 // =0x6
 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x9, lsl #1]
 ; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, x10, lsl #1]
-; CHECK-NEXT: mov x9, #8 // =0x8
-; CHECK-NEXT: mov x10, #10 // =0xa
+; CHECK-NEXT: mov x9, #10 // =0xa
+; CHECK-NEXT: mov x10, #8 // =0x8
 ; CHECK-NEXT: ld1h { z2.d }, p0/z, [x0, x8, lsl #1]
 ; CHECK-NEXT: ld1h { z3.d }, p0/z, [x0, x11, lsl #1]
 ; CHECK-NEXT: ld1h { z5.d }, p0/z, [x0, x12, lsl #1]
@@ -248,22 +248,22 @@
 ; CHECK-NEXT: ld1h { z4.d }, p0/z, [x0, x9, lsl #1]
 ; CHECK-NEXT: ld1h { z6.d }, p0/z, [x0, x10, lsl #1]
 ; CHECK-NEXT: ld1h { z7.d }, p0/z, [x0]
-; CHECK-NEXT: stp q1, q0, [x1, #96]
+; CHECK-NEXT: stp q0, q1, [x1, #96]
 ; CHECK-NEXT: movprfx z1, z4
 ; CHECK-NEXT: fcvt z1.d, p0/m, z4.h
 ; CHECK-NEXT: movprfx z0, z6
 ; CHECK-NEXT: fcvt z0.d, p0/m, z6.h
-; CHECK-NEXT: stp q1, q0, [x1, #64]
+; CHECK-NEXT: stp q0, q1, [x1, #64]
 ; CHECK-NEXT: movprfx z1, z5
 ; CHECK-NEXT: fcvt z1.d, p0/m, z5.h
 ; CHECK-NEXT: movprfx z0, z3
 ; CHECK-NEXT: fcvt z0.d, p0/m, z3.h
-; CHECK-NEXT: stp q1, q0, [x1, #32]
-; CHECK-NEXT: movprfx z1, z7
-; CHECK-NEXT: fcvt z1.d, p0/m, z7.h
-; CHECK-NEXT: movprfx z0, z2
-; CHECK-NEXT: fcvt z0.d, p0/m, z2.h
-; CHECK-NEXT: stp q1, q0, [x1]
+; CHECK-NEXT: stp q0, q1, [x1, #32]
+; CHECK-NEXT: movprfx z1, z2
+; CHECK-NEXT: fcvt z1.d, p0/m, z2.h
+; CHECK-NEXT: movprfx z0, z7
+; CHECK-NEXT: fcvt z0.d, p0/m, z7.h
+; CHECK-NEXT: stp q0, q1, [x1]
 ; CHECK-NEXT: ret
 %op1 = load <16 x half>, ptr %a
 %res = fpext <16 x half> %op1 to <16 x double>
@@ -322,8 +322,8 @@
 define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: fcvt_v8f32_v8f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #4 // =0x4
-; CHECK-NEXT: mov x9, #6 // =0x6
+; CHECK-NEXT: mov x8, #6 // =0x6
+; CHECK-NEXT: mov x9, #4 // =0x4
 ; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: mov x10, #2 // =0x2
 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
@@ -332,12 +332,12 @@
 ; CHECK-NEXT: ld1w { z3.d }, p0/z, [x0]
 ; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
 ; CHECK-NEXT: fcvt z1.d, p0/m, z1.s
-; CHECK-NEXT: stp q0, q1, [x1, #32]
-; CHECK-NEXT: movprfx z0, z3
-; CHECK-NEXT: fcvt z0.d, p0/m, z3.s
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fcvt z1.d, p0/m, z2.s
-; CHECK-NEXT: stp q0, q1, [x1]
+; CHECK-NEXT: stp q1, q0, [x1, #32]
+; CHECK-NEXT: movprfx z0, z2
+; CHECK-NEXT: fcvt z0.d, p0/m, z2.s
+; CHECK-NEXT: movprfx z1, z3
+; CHECK-NEXT: fcvt z1.d, p0/m, z3.s
+; CHECK-NEXT: stp q1, q0, [x1]
 ; CHECK-NEXT: ret
 %op1 = load <8 x float>, ptr %a
 %res = fpext <8 x float> %op1 to <8 x double>
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -40,14 +40,14 @@
 define void @fma_v16f16(ptr %a, ptr %b, ptr %c) #0 {
 ; CHECK-LABEL: fma_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
 ; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: ldp q4, q5, [x2]
+; CHECK-NEXT: ldp q2, q1, [x0]
+; CHECK-NEXT: ldp q5, q4, [x2]
 ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z4.h
 ; CHECK-NEXT: movprfx z1, z5
 ; CHECK-NEXT: fmla z1.h, p0/m, z2.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: stp q1, q0, [x0]
 ; CHECK-NEXT: ret
 %op1 = load <16 x half>, ptr %a
 %op2 = load <16 x half>, ptr %b
@@ -91,14 +91,14 @@
 define void @fma_v8f32(ptr %a, ptr %b, ptr %c) #0 {
 ; CHECK-LABEL: fma_v8f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
 ; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: ldp q4, q5, [x2]
+; CHECK-NEXT: ldp q2, q1, [x0]
+; CHECK-NEXT: ldp q5, q4, [x2]
 ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z4.s
 ; CHECK-NEXT: movprfx z1, z5
 ; CHECK-NEXT: fmla z1.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: stp q1, q0, [x0]
 ; CHECK-NEXT: ret
 %op1 = load <8 x float>, ptr %a
 %op2 = load <8 x float>, ptr %b
@@ -140,14 +140,14 @@
 define void @fma_v4f64(ptr %a, ptr %b, ptr %c) #0 {
 ; CHECK-LABEL: fma_v4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q3, q0, [x1]
 ; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: ldp q4, q5, [x2]
+; CHECK-NEXT: ldp q2, q1, [x0]
+; CHECK-NEXT: ldp q5, q4, [x2]
 ; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z4.d
 ; CHECK-NEXT: movprfx z1, z5
 ; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z3.d
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: stp q1, q0, [x0]
 ; CHECK-NEXT: ret
 %op1 = load <4 x double>, ptr %a
 %op2 = load <4 x double>, ptr %b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -36,12 +36,12 @@
 define void @fmaxnm_v16f16(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: fmaxnm_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
 ; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: ldp q3, q2, [x1]
 ; CHECK-NEXT: fmaxnm z1.h, p0/m, z1.h, z3.h
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z2.h
+; CHECK-NEXT: stp q1, q0, [x0]
 ; CHECK-NEXT: ret
 %op1 = load <16 x half>, ptr %a
 %op2 = load <16 x half>, ptr %b
@@ -79,12 +79,12 @@
 define void @fmaxnm_v8f32(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: fmaxnm_v8f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
 ; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: ldp q3, q2, [x1]
 ; CHECK-NEXT: fmaxnm z1.s, p0/m, z1.s, z3.s
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z2.s
+; CHECK-NEXT: stp q1, q0, [x0]
 ; CHECK-NEXT: ret
 %op1 = load <8 x float>, ptr %a
 %op2 = load <8 x float>, ptr %b
@@ -120,12 +120,12 @@
 define void @fmaxnm_v4f64(ptr %a, ptr %b) #0 {
 ; CHECK-LABEL: fmaxnm_v4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ldp q1, q0, [x0]
 ; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: fmaxnm
z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmaxnm z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -167,12 +167,12 @@ define void @fminnm_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fminnm_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fminnm z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -210,12 +210,12 @@ define void @fminnm_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fminnm_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fminnm z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -251,12 +251,12 @@ define void @fminnm_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fminnm_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fminnm z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -298,12 +298,12 @@ define void @fmax_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmax_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmax z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -341,12 +341,12 @@ define void @fmax_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmax_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmax z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -382,12 +382,12 @@ define void @fmax_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmax_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmax z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x 
double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -429,12 +429,12 @@ define void @fmin_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmin_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmin z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -472,12 +472,12 @@ define void @fmin_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmin_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmin z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -513,12 +513,12 @@ define void @fmin_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fmin_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fmin z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll @@ -46,11 +46,11 @@ define void @frintp_v16f16(ptr %a) #0 { ; CHECK-LABEL: frintp_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: frintp z0.h, p0/m, z0.h ; CHECK-NEXT: frintp z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintp z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op) @@ -85,11 +85,11 @@ define void @frintp_v8f32(ptr %a) #0 { ; CHECK-LABEL: frintp_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: frintp z0.s, p0/m, z0.s ; CHECK-NEXT: frintp z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintp z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op) @@ -122,11 +122,11 @@ define void @frintp_v4f64(ptr %a) #0 { ; CHECK-LABEL: frintp_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: frintp z0.d, p0/m, z0.d ; CHECK-NEXT: frintp z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintp z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op) @@ -177,11 +177,11 @@ define void @frintm_v16f16(ptr %a) #0 { ; 
CHECK-LABEL: frintm_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: frintm z0.h, p0/m, z0.h ; CHECK-NEXT: frintm z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintm z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op) @@ -216,11 +216,11 @@ define void @frintm_v8f32(ptr %a) #0 { ; CHECK-LABEL: frintm_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: frintm z0.s, p0/m, z0.s ; CHECK-NEXT: frintm z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintm z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op) @@ -253,11 +253,11 @@ define void @frintm_v4f64(ptr %a) #0 { ; CHECK-LABEL: frintm_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: frintm z0.d, p0/m, z0.d ; CHECK-NEXT: frintm z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintm z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op) @@ -308,11 +308,11 @@ define void @frinti_v16f16(ptr %a) #0 { ; CHECK-LABEL: frinti_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: frinti z0.h, p0/m, z0.h ; CHECK-NEXT: frinti z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frinti z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op) @@ -347,11 +347,11 @@ define void @frinti_v8f32(ptr %a) #0 { ; CHECK-LABEL: frinti_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: frinti z0.s, p0/m, z0.s ; CHECK-NEXT: frinti z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frinti z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op) @@ -384,11 +384,11 @@ define void @frinti_v4f64(ptr %a) #0 { ; CHECK-LABEL: frinti_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: frinti z0.d, p0/m, z0.d ; CHECK-NEXT: frinti z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frinti z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op) @@ -439,11 +439,11 @@ define void @frintx_v16f16(ptr %a) #0 { ; CHECK-LABEL: frintx_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: frintx z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintx z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op) @@ -478,11 +478,11 @@ define void @frintx_v8f32(ptr %a) #0 { ; CHECK-LABEL: frintx_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: 
ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: frintx z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op) @@ -515,11 +515,11 @@ define void @frintx_v4f64(ptr %a) #0 { ; CHECK-LABEL: frintx_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintx z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op) @@ -570,11 +570,11 @@ define void @frinta_v16f16(ptr %a) #0 { ; CHECK-LABEL: frinta_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: frinta z0.h, p0/m, z0.h ; CHECK-NEXT: frinta z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frinta z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.round.v16f16(<16 x half> %op) @@ -609,11 +609,11 @@ define void @frinta_v8f32(ptr %a) #0 { ; CHECK-LABEL: frinta_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: frinta z0.s, p0/m, z0.s ; CHECK-NEXT: frinta z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frinta z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.round.v8f32(<8 x float> %op) @@ -646,11 +646,11 @@ define void @frinta_v4f64(ptr %a) #0 { ; CHECK-LABEL: frinta_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: frinta z0.d, p0/m, z0.d ; CHECK-NEXT: frinta z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frinta z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.round.v4f64(<4 x double> %op) @@ -701,11 +701,11 @@ define void @frintn_v16f16(ptr %a) #0 { ; CHECK-LABEL: frintn_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: frintn z0.h, p0/m, z0.h ; CHECK-NEXT: frintn z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintn z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op) @@ -740,11 +740,11 @@ define void @frintn_v8f32(ptr %a) #0 { ; CHECK-LABEL: frintn_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: frintn z0.s, p0/m, z0.s ; CHECK-NEXT: frintn z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintn z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op) @@ -777,11 +777,11 @@ define void @frintn_v4f64(ptr %a) #0 { ; CHECK-LABEL: frintn_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: frintn z0.d, p0/m, z0.d ; CHECK-NEXT: 
frintn z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintn z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op) @@ -832,11 +832,11 @@ define void @frintz_v16f16(ptr %a) #0 { ; CHECK-LABEL: frintz_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: frintz z0.h, p0/m, z0.h ; CHECK-NEXT: frintz z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintz z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op) @@ -871,11 +871,11 @@ define void @frintz_v8f32(ptr %a) #0 { ; CHECK-LABEL: frintz_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: frintz z0.s, p0/m, z0.s ; CHECK-NEXT: frintz z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintz z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op) @@ -908,11 +908,11 @@ define void @frintz_v4f64(ptr %a) #0 { ; CHECK-LABEL: frintz_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: frintz z0.d, p0/m, z0.d ; CHECK-NEXT: frintz z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: frintz z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll @@ -55,16 +55,16 @@ ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: ldr q3, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: ldr q3, [x1] ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: cmpne p0.h, p0/z, z4.h, #0 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h ; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x half>, ptr %a %op2 = load volatile <16 x half>, ptr %b @@ -109,16 +109,16 @@ ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: ldr q3, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: ldr q3, [x1] ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: cmpne p0.s, p0/z, z4.s, #0 ; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s ; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x float>, ptr %a %op2 = load volatile <8 x float>, ptr %b @@ -149,8 +149,7 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v2f64: ; CHECK: // %bb.0: -; 
CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: and w8, w0, #0x1 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 @@ -166,18 +165,17 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) #0 { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: and w8, w2, #0x1 +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: ldr q3, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: ldr q3, [x1] ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: cmpne p0.d, p0/z, z4.d, #0 ; CHECK-NEXT: sel z0.d, p0, z0.d, z2.d ; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x double>, ptr %a %op2 = load volatile <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -36,11 +36,11 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fcvtzu_v16f16_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h ; CHECK-NEXT: fcvtzu z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i16> @@ -374,11 +374,11 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fcvtzu_v8f32_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x float> %op1 to <8 x i32> @@ -741,11 +741,11 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fcvtzu_v4f64_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i64> @@ -786,11 +786,11 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fcvtzs_v16f16_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h ; CHECK-NEXT: fcvtzs z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i16> @@ -1125,11 +1125,11 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fcvtzs_v8f32_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, 
q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 x float> %op1 to <8 x i32> @@ -1494,11 +1494,11 @@ define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fcvtzs_v4f64_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i64> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -70,14 +70,14 @@ define void @select_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z2.h -; CHECK-NEXT: sel z1.h, p1, z1.h, z2.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, z3.h +; CHECK-NEXT: sel z1.h, p1, z1.h, z2.h ; CHECK-NEXT: sel z0.h, p0, z0.h, z3.h -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -127,14 +127,14 @@ define void @select_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z2.s -; CHECK-NEXT: sel z1.s, p1, z1.s, z2.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z3.s +; CHECK-NEXT: sel z1.s, p1, z1.s, z2.s ; CHECK-NEXT: sel z0.s, p0, z0.s, z3.s -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -185,14 +185,14 @@ define void @select_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z2.d -; CHECK-NEXT: sel z1.d, p1, z1.d, z2.d ; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, z3.d +; CHECK-NEXT: sel z1.d, p1, z1.d, z2.d ; CHECK-NEXT: sel z0.d, p0, z0.d, z3.d -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll @@ -45,11 +45,11 @@ define void @add_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: add_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add z0.b, z0.b, z2.b +; CHECK-NEXT: ldp q1, 
q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add z1.b, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add z0.b, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -97,11 +97,11 @@ define void @add_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: add_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add z1.h, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -137,11 +137,11 @@ define void @add_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: add_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add z1.s, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -177,11 +177,11 @@ define void @add_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: add_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -236,12 +236,12 @@ define void @mul_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: mul_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: mul z1.b, p0/m, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mul z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -292,12 +292,12 @@ define void @mul_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: mul_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -335,12 +335,12 @@ define void @mul_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: mul_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: mul z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -378,12 +378,12 @@ define void @mul_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: mul_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: 
ldp q2, q3, [x1] -; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: mul z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -435,11 +435,11 @@ define void @sub_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: sub_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: sub z0.b, z0.b, z2.b +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: sub z1.b, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: sub z0.b, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -487,11 +487,11 @@ define void @sub_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: sub_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: sub z0.h, z0.h, z2.h +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: sub z1.h, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: sub z0.h, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -527,11 +527,11 @@ define void @sub_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: sub_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: sub z0.s, z0.s, z2.s +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: sub z1.s, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: sub z0.s, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -567,11 +567,11 @@ define void @sub_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: sub_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: sub z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: sub z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: sub z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -624,11 +624,11 @@ define void @abs_v32i8(ptr %a) #0 { ; CHECK-LABEL: abs_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: abs z0.b, p0/m, z0.b ; CHECK-NEXT: abs z1.b, p0/m, z1.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: abs z0.b, p0/m, z0.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false) @@ -676,11 +676,11 @@ define void @abs_v16i16(ptr %a) #0 { ; CHECK-LABEL: abs_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: abs z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: abs z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false) @@ -715,11 +715,11 @@ define void @abs_v8i32(ptr %a) #0 { ; CHECK-LABEL: abs_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: abs z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: 
abs z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) @@ -754,11 +754,11 @@ define void @abs_v4i64(ptr %a) #0 { ; CHECK-LABEL: abs_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: abs z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll @@ -40,14 +40,14 @@ define void @icmp_eq_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: icmp_eq_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z2.b -; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z3.b +; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -90,14 +90,14 @@ define void @icmp_eq_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: icmp_eq_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -140,14 +140,14 @@ define void @icmp_eq_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: icmp_eq_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z2.s -; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -190,14 +190,14 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: icmp_eq_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z2.d -; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: 
stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -214,14 +214,14 @@ define void @icmp_ne_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: icmp_ne_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, z2.b -; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, z3.b +; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -260,14 +260,14 @@ define void @icmp_sgt_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: icmp_sgt_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: cmpgt p1.h, p0/z, z0.h, z2.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: cmpgt p0.h, p0/z, z1.h, z3.h +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -306,14 +306,14 @@ define void @icmp_slt_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: icmp_slt_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: cmpgt p1.s, p0/z, z2.s, z0.s -; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: cmpgt p0.s, p0/z, z3.s, z1.s +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -104,14 +104,14 @@ define void @sdiv_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: sdiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ptrue p1.h, vl4 ; CHECK-NEXT: ptrue p2.b, vl8 -; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: sunpklo z2.h, z2.b ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: ldp q3, q1, [x1] ; CHECK-NEXT: sunpklo z5.h, z5.b ; CHECK-NEXT: sunpklo z7.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 @@ -173,7 +173,7 @@ ; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b ; CHECK-NEXT: splice z1.b, p2, z1.b, z0.b ; CHECK-NEXT: splice z2.b, p2, z2.b, z4.b -; CHECK-NEXT: stp q1, q2, [x0] +; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -242,34 +242,34 @@ define void @sdiv_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: sdiv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: sunpklo 
z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: sunpklo z4.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z7.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: sunpklo z6.s, z3.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: sunpklo z5.s, z2.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: sunpklo z7.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: sunpklo z2.s, z3.h +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z2.s ; CHECK-NEXT: movprfx z2, z7 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z6.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h ; CHECK-NEXT: splice z2.h, p1, z2.h, z1.h ; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h -; CHECK-NEXT: stp q2, q3, [x0] +; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -307,12 +307,12 @@ define void @sdiv_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: sdiv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -350,12 +350,12 @@ define void @sdiv_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: sdiv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: sdiv z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: sdiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -464,14 +464,14 @@ define void @udiv_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: udiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ptrue p1.h, vl4 ; CHECK-NEXT: ptrue p2.b, vl8 -; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: ldp q3, q1, [x1] ; CHECK-NEXT: uunpklo z5.h, z5.b ; CHECK-NEXT: uunpklo z7.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 @@ -533,7 +533,7 @@ ; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b ; CHECK-NEXT: splice z1.b, p2, z1.b, z0.b ; CHECK-NEXT: splice z2.b, p2, z2.b, z4.b -; CHECK-NEXT: stp q1, q2, [x0] +; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -602,34 +602,34 @@ define void @udiv_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: udiv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q3, q0, [x1] +; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: uunpklo z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: uunpklo z4.s, z0.h ; CHECK-NEXT: ext 
z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z7.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: uunpklo z6.s, z3.h +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: uunpklo z5.s, z2.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s +; CHECK-NEXT: uunpklo z7.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: uunpklo z2.s, z3.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z2.s ; CHECK-NEXT: movprfx z2, z7 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z6.s -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h ; CHECK-NEXT: splice z2.h, p1, z2.h, z1.h ; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h -; CHECK-NEXT: stp q2, q3, [x0] +; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -667,12 +667,12 @@ define void @udiv_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: udiv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: udiv z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -710,12 +710,12 @@ define void @udiv_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: udiv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: udiv z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -727,9 +727,9 @@ define void @udiv_constantsplat_v8i32(ptr %a) #0 { ; CHECK-LABEL: udiv_constantsplat_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: mov w8, #8969 // =0x2309 ; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: movk w8, #22765, lsl #16 ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: movprfx z3, z0 @@ -743,7 +743,7 @@ ; CHECK-NEXT: add z1.s, z1.s, z2.s ; CHECK-NEXT: lsr z0.s, z0.s, #6 ; CHECK-NEXT: lsr z1.s, z1.s, #6 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = udiv <8 x i32> %op1, diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll @@ -16,10 +16,10 @@ define void @add_v32i8(ptr %a) #0 { ; CHECK-LABEL: add_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: add z0.b, z0.b, #7 // =0x7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z1.b, z1.b, #7 // =0x7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add z0.b, z0.b, #7 // =0x7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr 
%a %ins = insertelement <32 x i8> undef, i8 7, i32 0 @@ -32,10 +32,10 @@ define void @add_v16i16(ptr %a) #0 { ; CHECK-LABEL: add_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: add z0.h, z0.h, #15 // =0xf +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z1.h, z1.h, #15 // =0xf -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add z0.h, z0.h, #15 // =0xf +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -48,10 +48,10 @@ define void @add_v8i32(ptr %a) #0 { ; CHECK-LABEL: add_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: add z0.s, z0.s, #31 // =0x1f +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z1.s, z1.s, #31 // =0x1f -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add z0.s, z0.s, #31 // =0x1f +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -64,10 +64,10 @@ define void @add_v4i64(ptr %a) #0 { ; CHECK-LABEL: add_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: add z0.d, z0.d, #63 // =0x3f +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: add z1.d, z1.d, #63 // =0x3f -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add z0.d, z0.d, #63 // =0x3f +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -84,10 +84,10 @@ define void @and_v32i8(ptr %a) #0 { ; CHECK-LABEL: and_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: and z0.b, z0.b, #0x7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: and z1.b, z1.b, #0x7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z0.b, z0.b, #0x7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 @@ -100,10 +100,10 @@ define void @and_v16i16(ptr %a) #0 { ; CHECK-LABEL: and_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: and z0.h, z0.h, #0xf +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: and z1.h, z1.h, #0xf -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z0.h, z0.h, #0xf +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -116,10 +116,10 @@ define void @and_v8i32(ptr %a) #0 { ; CHECK-LABEL: and_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: and z0.s, z0.s, #0x1f +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: and z1.s, z1.s, #0x1f -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z0.s, z0.s, #0x1f +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -132,10 +132,10 @@ define void @and_v4i64(ptr %a) #0 { ; CHECK-LABEL: and_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: and z0.d, z0.d, #0x3f +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: and z1.d, z1.d, #0x3f -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z0.d, z0.d, #0x3f +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -152,10 +152,10 @@ define void @ashr_v32i8(ptr %a) #0 { ; CHECK-LABEL: ashr_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: asr z0.b, z0.b, #7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: asr z1.b, z1.b, #7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asr z0.b, z0.b, #7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = 
load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 @@ -168,10 +168,10 @@ define void @ashr_v16i16(ptr %a) #0 { ; CHECK-LABEL: ashr_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: asr z0.h, z0.h, #15 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: asr z1.h, z1.h, #15 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asr z0.h, z0.h, #15 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -184,10 +184,10 @@ define void @ashr_v8i32(ptr %a) #0 { ; CHECK-LABEL: ashr_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: asr z0.s, z0.s, #31 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: asr z1.s, z1.s, #31 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asr z0.s, z0.s, #31 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -200,10 +200,10 @@ define void @ashr_v4i64(ptr %a) #0 { ; CHECK-LABEL: ashr_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: asr z0.d, z0.d, #63 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: asr z1.d, z1.d, #63 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asr z0.d, z0.d, #63 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -220,13 +220,13 @@ define void @icmp_eq_v32i8(ptr %a) #0 { ; CHECK-LABEL: icmp_eq_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, #7 -; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, #7 +; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -240,13 +240,13 @@ define void @icmp_sge_v16i16(ptr %a) #0 { ; CHECK-LABEL: icmp_sge_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: cmpge p1.h, p0/z, z0.h, #15 -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: cmpge p0.h, p0/z, z1.h, #15 +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -260,13 +260,13 @@ define void @icmp_sgt_v8i32(ptr %a) #0 { ; CHECK-LABEL: icmp_sgt_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: cmpgt p1.s, p0/z, z0.s, #-8 -; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: cmpgt p0.s, p0/z, z1.s, #-8 +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 -8, i64 0 @@ -280,13 +280,13 @@ define void @icmp_ult_v4i64(ptr %a) #0 { ; CHECK-LABEL: icmp_ult_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cmplo p1.d, p0/z, z0.d, #63 -; CHECK-NEXT: mov z0.d, p1/z, #-1 
// =0xffffffffffffffff ; CHECK-NEXT: cmplo p0.d, p0/z, z1.d, #63 +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -304,10 +304,10 @@ define void @lshr_v32i8(ptr %a) #0 { ; CHECK-LABEL: lshr_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: lsr z0.b, z0.b, #7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: lsr z1.b, z1.b, #7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsr z0.b, z0.b, #7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -320,10 +320,10 @@ define void @lshr_v16i16(ptr %a) #0 { ; CHECK-LABEL: lshr_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: lsr z0.h, z0.h, #15 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: lsr z1.h, z1.h, #15 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsr z0.h, z0.h, #15 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -336,10 +336,10 @@ define void @lshr_v8i32(ptr %a) #0 { ; CHECK-LABEL: lshr_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: lsr z0.s, z0.s, #31 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: lsr z1.s, z1.s, #31 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsr z0.s, z0.s, #31 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -352,10 +352,10 @@ define void @lshr_v4i64(ptr %a) #0 { ; CHECK-LABEL: lshr_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: lsr z0.d, z0.d, #63 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: lsr z1.d, z1.d, #63 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsr z0.d, z0.d, #63 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -372,10 +372,10 @@ define void @mul_v32i8(ptr %a) #0 { ; CHECK-LABEL: mul_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: mul z0.b, z0.b, #7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: mul z1.b, z1.b, #7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mul z0.b, z0.b, #7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -388,10 +388,10 @@ define void @mul_v16i16(ptr %a) #0 { ; CHECK-LABEL: mul_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: mul z0.h, z0.h, #15 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: mul z1.h, z1.h, #15 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mul z0.h, z0.h, #15 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -404,10 +404,10 @@ define void @mul_v8i32(ptr %a) #0 { ; CHECK-LABEL: mul_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: mul z0.s, z0.s, #31 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: mul z1.s, z1.s, #31 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mul z0.s, z0.s, #31 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -420,10 +420,10 @@ define void @mul_v4i64(ptr %a) #0 { ; CHECK-LABEL: mul_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: mul z0.d, 
z0.d, #63 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: mul z1.d, z1.d, #63 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mul z0.d, z0.d, #63 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -440,10 +440,10 @@ define void @or_v32i8(ptr %a) #0 { ; CHECK-LABEL: or_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: orr z0.b, z0.b, #0x7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: orr z1.b, z1.b, #0x7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z0.b, z0.b, #0x7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -456,10 +456,10 @@ define void @or_v16i16(ptr %a) #0 { ; CHECK-LABEL: or_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: orr z0.h, z0.h, #0xf +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: orr z1.h, z1.h, #0xf -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z0.h, z0.h, #0xf +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -472,10 +472,10 @@ define void @or_v8i32(ptr %a) #0 { ; CHECK-LABEL: or_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: orr z0.s, z0.s, #0x1f +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: orr z1.s, z1.s, #0x1f -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z0.s, z0.s, #0x1f +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -488,10 +488,10 @@ define void @or_v4i64(ptr %a) #0 { ; CHECK-LABEL: or_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: orr z0.d, z0.d, #0x3f +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: orr z1.d, z1.d, #0x3f -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z0.d, z0.d, #0x3f +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -508,10 +508,10 @@ define void @shl_v32i8(ptr %a) #0 { ; CHECK-LABEL: shl_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: lsl z0.b, z0.b, #7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: lsl z1.b, z1.b, #7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsl z0.b, z0.b, #7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -524,10 +524,10 @@ define void @shl_v16i16(ptr %a) #0 { ; CHECK-LABEL: shl_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: lsl z0.h, z0.h, #15 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: lsl z1.h, z1.h, #15 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsl z0.h, z0.h, #15 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -540,10 +540,10 @@ define void @shl_v8i32(ptr %a) #0 { ; CHECK-LABEL: shl_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: lsl z0.s, z0.s, #31 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: lsl z1.s, z1.s, #31 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsl z0.s, z0.s, #31 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -556,10 +556,10 @@ define void @shl_v4i64(ptr %a) #0 { ; CHECK-LABEL: shl_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: lsl z0.d, z0.d, #63 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: lsl 
z1.d, z1.d, #63 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsl z0.d, z0.d, #63 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -576,10 +576,10 @@ define void @smax_v32i8(ptr %a) #0 { ; CHECK-LABEL: smax_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: smax z0.b, z0.b, #7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: smax z1.b, z1.b, #7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smax z0.b, z0.b, #7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -592,10 +592,10 @@ define void @smax_v16i16(ptr %a) #0 { ; CHECK-LABEL: smax_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: smax z0.h, z0.h, #15 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: smax z1.h, z1.h, #15 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smax z0.h, z0.h, #15 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -608,10 +608,10 @@ define void @smax_v8i32(ptr %a) #0 { ; CHECK-LABEL: smax_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: smax z0.s, z0.s, #31 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: smax z1.s, z1.s, #31 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smax z0.s, z0.s, #31 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -624,10 +624,10 @@ define void @smax_v4i64(ptr %a) #0 { ; CHECK-LABEL: smax_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: smax z0.d, z0.d, #63 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: smax z1.d, z1.d, #63 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smax z0.d, z0.d, #63 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -644,10 +644,10 @@ define void @smin_v32i8(ptr %a) #0 { ; CHECK-LABEL: smin_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: smin z0.b, z0.b, #7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: smin z1.b, z1.b, #7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smin z0.b, z0.b, #7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -660,10 +660,10 @@ define void @smin_v16i16(ptr %a) #0 { ; CHECK-LABEL: smin_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: smin z0.h, z0.h, #15 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: smin z1.h, z1.h, #15 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smin z0.h, z0.h, #15 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -676,10 +676,10 @@ define void @smin_v8i32(ptr %a) #0 { ; CHECK-LABEL: smin_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: smin z0.s, z0.s, #31 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: smin z1.s, z1.s, #31 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smin z0.s, z0.s, #31 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -692,10 +692,10 @@ define void @smin_v4i64(ptr %a) #0 { ; CHECK-LABEL: smin_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: smin z0.d, z0.d, #63 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: smin z1.d, z1.d, #63 -; CHECK-NEXT: stp 
q0, q1, [x0] +; CHECK-NEXT: smin z0.d, z0.d, #63 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -712,10 +712,10 @@ define void @sub_v32i8(ptr %a) #0 { ; CHECK-LABEL: sub_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: sub z0.b, z0.b, #7 // =0x7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: sub z1.b, z1.b, #7 // =0x7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: sub z0.b, z0.b, #7 // =0x7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -728,10 +728,10 @@ define void @sub_v16i16(ptr %a) #0 { ; CHECK-LABEL: sub_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: sub z0.h, z0.h, #15 // =0xf +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: sub z1.h, z1.h, #15 // =0xf -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: sub z0.h, z0.h, #15 // =0xf +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -744,10 +744,10 @@ define void @sub_v8i32(ptr %a) #0 { ; CHECK-LABEL: sub_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: sub z0.s, z0.s, #31 // =0x1f +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: sub z1.s, z1.s, #31 // =0x1f -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: sub z0.s, z0.s, #31 // =0x1f +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -760,10 +760,10 @@ define void @sub_v4i64(ptr %a) #0 { ; CHECK-LABEL: sub_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: sub z0.d, z0.d, #63 // =0x3f +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: sub z1.d, z1.d, #63 // =0x3f -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: sub z0.d, z0.d, #63 // =0x3f +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -780,10 +780,10 @@ define void @umax_v32i8(ptr %a) #0 { ; CHECK-LABEL: umax_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: umax z0.b, z0.b, #7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: umax z1.b, z1.b, #7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umax z0.b, z0.b, #7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -796,10 +796,10 @@ define void @umax_v16i16(ptr %a) #0 { ; CHECK-LABEL: umax_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: umax z0.h, z0.h, #15 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: umax z1.h, z1.h, #15 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umax z0.h, z0.h, #15 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -812,10 +812,10 @@ define void @umax_v8i32(ptr %a) #0 { ; CHECK-LABEL: umax_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: umax z0.s, z0.s, #31 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: umax z1.s, z1.s, #31 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umax z0.s, z0.s, #31 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -828,10 +828,10 @@ define void @umax_v4i64(ptr %a) #0 { ; CHECK-LABEL: umax_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: umax z0.d, z0.d, #63 +; CHECK-NEXT: ldp q1, q0, [x0] ; 
CHECK-NEXT: umax z1.d, z1.d, #63 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umax z0.d, z0.d, #63 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -848,10 +848,10 @@ define void @umin_v32i8(ptr %a) #0 { ; CHECK-LABEL: umin_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: umin z0.b, z0.b, #7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: umin z1.b, z1.b, #7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umin z0.b, z0.b, #7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -864,10 +864,10 @@ define void @umin_v16i16(ptr %a) #0 { ; CHECK-LABEL: umin_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: umin z0.h, z0.h, #15 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: umin z1.h, z1.h, #15 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umin z0.h, z0.h, #15 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -880,10 +880,10 @@ define void @umin_v8i32(ptr %a) #0 { ; CHECK-LABEL: umin_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: umin z0.s, z0.s, #31 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: umin z1.s, z1.s, #31 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umin z0.s, z0.s, #31 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -896,10 +896,10 @@ define void @umin_v4i64(ptr %a) #0 { ; CHECK-LABEL: umin_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: umin z0.d, z0.d, #63 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: umin z1.d, z1.d, #63 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umin z0.d, z0.d, #63 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -916,10 +916,10 @@ define void @xor_v32i8(ptr %a) #0 { ; CHECK-LABEL: xor_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: eor z0.b, z0.b, #0x7 +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: eor z1.b, z1.b, #0x7 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: eor z0.b, z0.b, #0x7 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -932,10 +932,10 @@ define void @xor_v16i16(ptr %a) #0 { ; CHECK-LABEL: xor_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: eor z0.h, z0.h, #0xf +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: eor z1.h, z1.h, #0xf -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: eor z0.h, z0.h, #0xf +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -948,10 +948,10 @@ define void @xor_v8i32(ptr %a) #0 { ; CHECK-LABEL: xor_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: eor z0.s, z0.s, #0x1f +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: eor z1.s, z1.s, #0x1f -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: eor z0.s, z0.s, #0x1f +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -964,10 +964,10 @@ define void @xor_v4i64(ptr %a) #0 { ; CHECK-LABEL: xor_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: eor z0.d, z0.d, #0x3f +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: eor z1.d, z1.d, #0x3f -; 
CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: eor z0.d, z0.d, #0x3f +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll @@ -34,11 +34,11 @@ define void @and_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: and_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -74,11 +74,11 @@ define void @and_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: and_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -114,11 +114,11 @@ define void @and_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: and_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -154,11 +154,11 @@ define void @and_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: and_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -198,11 +198,11 @@ define void @or_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: or_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -238,11 +238,11 @@ define void @or_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: or_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -278,11 +278,11 @@ define void @or_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: or_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: 
ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -318,11 +318,11 @@ define void @or_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: or_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: orr z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -362,11 +362,11 @@ define void @xor_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: xor_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: eor z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -402,11 +402,11 @@ define void @xor_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: xor_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: eor z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -442,11 +442,11 @@ define void @xor_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: xor_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: eor z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -482,11 +482,11 @@ define void @xor_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: xor_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: eor z1.d, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: eor z0.d, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll @@ -36,12 +36,12 @@ define void @smax_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smax_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smax z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: smax z1.b, p0/m, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smax z0.b, p0/m, z0.b, z2.b +; 
CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -79,12 +79,12 @@ define void @smax_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smax_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smax z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: smax z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smax z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -122,12 +122,12 @@ define void @smax_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smax_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smax z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: smax z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smax z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -167,12 +167,12 @@ define void @smax_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smax_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smax z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: smax z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smax z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -214,12 +214,12 @@ define void @smin_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smin_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smin z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: smin z1.b, p0/m, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smin z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -257,12 +257,12 @@ define void @smin_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smin_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smin z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: smin z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smin z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -300,12 +300,12 @@ define void @smin_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smin_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smin z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: smin z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smin z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -345,12 +345,12 @@ define void @smin_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smin_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: 
ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smin z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: smin z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smin z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -392,12 +392,12 @@ define void @umax_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umax_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umax z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: umax z1.b, p0/m, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umax z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -435,12 +435,12 @@ define void @umax_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umax_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umax z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: umax z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umax z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -478,12 +478,12 @@ define void @umax_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umax_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umax z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: umax z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umax z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -523,12 +523,12 @@ define void @umax_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umax_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umax z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: umax z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umax z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -570,12 +570,12 @@ define void @umin_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umin_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umin z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: umin z1.b, p0/m, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umin z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -613,12 +613,12 @@ define void @umin_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umin_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umin z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: umin z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umin z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = 
load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -656,12 +656,12 @@ define void @umin_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umin_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umin z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: umin z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umin z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -701,12 +701,12 @@ define void @umin_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umin_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umin z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: umin z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umin z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -71,12 +71,34 @@ define void @smulh_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smulh_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z2.b -; CHECK-NEXT: smulh z1.b, p0/m, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mov w8, #8 // =0x8 +; CHECK-NEXT: mov w9, #16 // =0x10 +; CHECK-NEXT: mov w10, #24 // =0x18 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, x9] +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0, x10] +; CHECK-NEXT: ld1sb { z3.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z4.h }, p0/z, [x1, x9] +; CHECK-NEXT: ld1sb { z5.h }, p0/z, [x1, x10] +; CHECK-NEXT: ld1sb { z6.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1sb { z7.h }, p0/z, [x1] +; CHECK-NEXT: mul z2.h, p0/m, z2.h, z5.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z4.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z6.h +; CHECK-NEXT: mul z3.h, p0/m, z3.h, z7.h +; CHECK-NEXT: lsr z2.h, z2.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: lsr z3.h, z3.h, #8 +; CHECK-NEXT: splice z1.b, p0, z1.b, z2.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b +; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -146,12 +168,34 @@ define void @smulh_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smulh_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z2.h -; CHECK-NEXT: smulh z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: mov x9, #8 // =0x8 +; CHECK-NEXT: mov x10, #12 // =0xc +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1sh { z2.s 
}, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1sh { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z4.s }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1sh { z5.s }, p0/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1sh { z6.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1sh { z7.s }, p0/z, [x1] +; CHECK-NEXT: mul z2.s, p0/m, z2.s, z5.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z4.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: mul z3.s, p0/m, z3.s, z7.s +; CHECK-NEXT: lsr z2.s, z2.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: lsr z3.s, z3.s, #16 +; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z3.h, z3.h +; CHECK-NEXT: splice z2.h, p0, z2.h, z0.h +; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -201,12 +245,34 @@ define void @smulh_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smulh_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z2.s -; CHECK-NEXT: smulh z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mov x8, #2 // =0x2 +; CHECK-NEXT: mov x9, #4 // =0x4 +; CHECK-NEXT: mov x10, #6 // =0x6 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1sw { z2.d }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1sw { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1sw { z4.d }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1sw { z5.d }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1sw { z6.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1sw { z7.d }, p0/z, [x1] +; CHECK-NEXT: mul z2.d, p0/m, z2.d, z5.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z4.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z6.d +; CHECK-NEXT: mul z3.d, p0/m, z3.d, z7.d +; CHECK-NEXT: lsr z2.d, z2.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: lsr z3.d, z3.d, #32 +; CHECK-NEXT: splice z1.s, p0, z1.s, z2.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z2.s, z3.s, z3.s +; CHECK-NEXT: splice z2.s, p0, z2.s, z0.s +; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -258,12 +324,12 @@ define void @smulh_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: smulh_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: smulh z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -337,12 +403,34 @@ define void @umulh_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umulh_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z2.b -; CHECK-NEXT: umulh z1.b, p0/m, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mov w8, #8 // =0x8 +; CHECK-NEXT: mov w9, #16 // =0x10 +; CHECK-NEXT: mov w10, #24 // =0x18 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ld1b { 
z0.h }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0, x9] +; CHECK-NEXT: ld1b { z2.h }, p0/z, [x0, x10] +; CHECK-NEXT: ld1b { z3.h }, p0/z, [x0] +; CHECK-NEXT: ld1b { z4.h }, p0/z, [x1, x9] +; CHECK-NEXT: ld1b { z5.h }, p0/z, [x1, x10] +; CHECK-NEXT: ld1b { z6.h }, p0/z, [x1, x8] +; CHECK-NEXT: ld1b { z7.h }, p0/z, [x1] +; CHECK-NEXT: mul z2.h, p0/m, z2.h, z5.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z4.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z6.h +; CHECK-NEXT: mul z3.h, p0/m, z3.h, z7.h +; CHECK-NEXT: lsr z2.h, z2.h, #8 +; CHECK-NEXT: lsr z1.h, z1.h, #8 +; CHECK-NEXT: lsr z0.h, z0.h, #8 +; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: lsr z3.h, z3.h, #8 +; CHECK-NEXT: splice z1.b, p0, z1.b, z2.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b +; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -412,12 +500,34 @@ define void @umulh_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umulh_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z2.h -; CHECK-NEXT: umulh z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mov x8, #4 // =0x4 +; CHECK-NEXT: mov x9, #8 // =0x8 +; CHECK-NEXT: mov x10, #12 // =0xc +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z2.s }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: ld1h { z3.s }, p0/z, [x0] +; CHECK-NEXT: ld1h { z4.s }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: ld1h { z5.s }, p0/z, [x1, x10, lsl #1] +; CHECK-NEXT: ld1h { z6.s }, p0/z, [x1, x8, lsl #1] +; CHECK-NEXT: ld1h { z7.s }, p0/z, [x1] +; CHECK-NEXT: mul z2.s, p0/m, z2.s, z5.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z4.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z6.s +; CHECK-NEXT: mul z3.s, p0/m, z3.s, z7.s +; CHECK-NEXT: lsr z2.s, z2.s, #16 +; CHECK-NEXT: lsr z1.s, z1.s, #16 +; CHECK-NEXT: lsr z0.s, z0.s, #16 +; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: lsr z3.s, z3.s, #16 +; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z3.h, z3.h +; CHECK-NEXT: splice z2.h, p0, z2.h, z0.h +; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -467,12 +577,34 @@ define void @umulh_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umulh_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z2.s -; CHECK-NEXT: umulh z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mov x8, #2 // =0x2 +; CHECK-NEXT: mov x9, #4 // =0x4 +; CHECK-NEXT: mov x10, #6 // =0x6 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x8, lsl #2] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z2.d }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.d }, p0/z, [x0] +; CHECK-NEXT: ld1w { z4.d }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: ld1w { z5.d }, p0/z, [x1, x10, lsl #2] +; CHECK-NEXT: ld1w { z6.d }, p0/z, [x1, x8, lsl #2] +; CHECK-NEXT: ld1w { z7.d }, p0/z, [x1] +; CHECK-NEXT: mul z2.d, p0/m, z2.d, z5.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z4.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, 
z6.d +; CHECK-NEXT: mul z3.d, p0/m, z3.d, z7.d +; CHECK-NEXT: lsr z2.d, z2.d, #32 +; CHECK-NEXT: lsr z1.d, z1.d, #32 +; CHECK-NEXT: lsr z0.d, z0.d, #32 +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: lsr z3.d, z3.d, #32 +; CHECK-NEXT: splice z1.s, p0, z1.s, z2.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z2.s, z3.s, z3.s +; CHECK-NEXT: splice z2.s, p0, z2.s, z0.s +; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -524,12 +656,12 @@ define void @umulh_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: umulh_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: umulh z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -109,13 +109,13 @@ define void @srem_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: srem_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ldp q0, q2, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: ldp q3, q1, [x1] ; CHECK-NEXT: mov z5.d, z0.d ; CHECK-NEXT: sunpklo z7.h, z0.b ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 +; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: sunpklo z5.h, z5.b ; CHECK-NEXT: sunpklo z18.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 @@ -183,7 +183,7 @@ ; CHECK-NEXT: splice z5.b, p0, z5.b, z6.b ; CHECK-NEXT: mls z2.b, p1/m, z7.b, z3.b ; CHECK-NEXT: mls z0.b, p1/m, z5.b, z1.b -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -241,26 +241,26 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: srem_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ldp q0, q2, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ptrue p1.h, vl8 -; CHECK-NEXT: mov z17.d, z2.d -; CHECK-NEXT: ext z17.b, z17.b, z2.b, #8 -; CHECK-NEXT: ldp q3, q1, [x1] ; CHECK-NEXT: mov z5.d, z0.d ; CHECK-NEXT: sunpklo z7.s, z0.h ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 +; CHECK-NEXT: ldp q1, q3, [x1] +; CHECK-NEXT: mov z17.d, z2.d ; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: mov z16.d, z3.d -; CHECK-NEXT: ext z16.b, z16.b, z3.b, #8 +; CHECK-NEXT: ext z17.b, z17.b, z2.b, #8 ; CHECK-NEXT: mov z4.d, z1.d ; CHECK-NEXT: sunpklo z6.s, z1.h ; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 ; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: mov z16.d, z3.d ; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: ext z16.b, z16.b, z3.b, #8 +; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: sunpklo z7.s, z16.h ; CHECK-NEXT: sunpklo z16.s, z17.h -; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: movprfx z5, z16 ; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z7.s ; CHECK-NEXT: sunpklo z7.s, z3.h @@ -275,7 +275,7 @@ ; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h ; CHECK-NEXT: mls z2.h, p1/m, z7.h, z3.h ; CHECK-NEXT: mls z0.h, p1/m, z5.h, z1.h -; CHECK-NEXT: stp q2, q0, [x0] +; 
CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -317,16 +317,16 @@ define void @srem_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: srem_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z2.s ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z3.s -; CHECK-NEXT: mls z0.s, p0/m, z4.s, z2.s ; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mls z0.s, p0/m, z4.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -368,16 +368,16 @@ define void @srem_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: srem_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: sdiv z4.d, p0/m, z4.d, z2.d ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: sdiv z5.d, p0/m, z5.d, z3.d -; CHECK-NEXT: mls z0.d, p0/m, z4.d, z2.d ; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mls z0.d, p0/m, z4.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -492,13 +492,13 @@ define void @urem_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: urem_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ldp q0, q2, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: ldp q3, q1, [x1] ; CHECK-NEXT: mov z5.d, z0.d ; CHECK-NEXT: uunpklo z7.h, z0.b ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 +; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: uunpklo z5.h, z5.b ; CHECK-NEXT: uunpklo z18.s, z5.h ; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 @@ -566,7 +566,7 @@ ; CHECK-NEXT: splice z5.b, p0, z5.b, z6.b ; CHECK-NEXT: mls z2.b, p1/m, z7.b, z3.b ; CHECK-NEXT: mls z0.b, p1/m, z5.b, z1.b -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -624,26 +624,26 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: urem_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ldp q0, q2, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ptrue p1.h, vl8 -; CHECK-NEXT: mov z17.d, z2.d -; CHECK-NEXT: ext z17.b, z17.b, z2.b, #8 -; CHECK-NEXT: ldp q3, q1, [x1] ; CHECK-NEXT: mov z5.d, z0.d ; CHECK-NEXT: uunpklo z7.s, z0.h ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 +; CHECK-NEXT: ldp q1, q3, [x1] +; CHECK-NEXT: mov z17.d, z2.d ; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: mov z16.d, z3.d -; CHECK-NEXT: ext z16.b, z16.b, z3.b, #8 +; CHECK-NEXT: ext z17.b, z17.b, z2.b, #8 ; CHECK-NEXT: mov z4.d, z1.d ; CHECK-NEXT: uunpklo z6.s, z1.h ; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 ; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: mov z16.d, z3.d ; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: ext z16.b, z16.b, z3.b, #8 +; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: uunpklo z7.s, z16.h ; CHECK-NEXT: uunpklo z16.s, z17.h -; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: movprfx z5, z16 ; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z7.s ; CHECK-NEXT: uunpklo z7.s, z3.h @@ -658,7 +658,7 @@ ; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h ; CHECK-NEXT: mls z2.h, p1/m, z7.h, z3.h ; CHECK-NEXT: mls 
z0.h, p1/m, z5.h, z1.h -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -700,16 +700,16 @@ define void @urem_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: urem_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z2.s ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z3.s -; CHECK-NEXT: mls z0.s, p0/m, z4.s, z2.s ; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mls z0.s, p0/m, z4.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -751,16 +751,16 @@ define void @urem_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: urem_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: udiv z4.d, p0/m, z4.d, z2.d ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: udiv z5.d, p0/m, z5.d, z3.d -; CHECK-NEXT: mls z0.d, p0/m, z4.d, z2.d ; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: mls z0.d, p0/m, z4.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll @@ -55,16 +55,16 @@ ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: ldr q3, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: ldr q3, [x1] ; CHECK-NEXT: mov z4.b, w8 ; CHECK-NEXT: cmpne p0.b, p0/z, z4.b, #0 ; CHECK-NEXT: sel z0.b, p0, z0.b, z2.b ; CHECK-NEXT: sel z1.b, p0, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i8>, ptr %a %op2 = load volatile <32 x i8>, ptr %b @@ -125,16 +125,16 @@ ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: ldr q3, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: ldr q3, [x1] ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: cmpne p0.h, p0/z, z4.h, #0 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h ; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x i16>, ptr %a %op2 = load volatile <16 x i16>, ptr %b @@ -179,16 +179,16 @@ ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: ldr q3, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: 
ldr q3, [x1] ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: cmpne p0.s, p0/z, z4.s, #0 ; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s ; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x i32>, ptr %a %op2 = load volatile <8 x i32>, ptr %b @@ -200,8 +200,7 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: and w8, w0, #0x1 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 @@ -217,8 +216,7 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) #0 { ; CHECK-LABEL: select_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: and w8, w0, #0x1 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 @@ -234,18 +232,17 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) #0 { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: and w8, w2, #0x1 +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: ldr q1, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ldr q2, [x1] -; CHECK-NEXT: ldr q3, [x1, #16] +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: ldr q3, [x1] ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: cmpne p0.d, p0/z, z4.d, #0 ; CHECK-NEXT: sel z0.d, p0, z0.d, z2.d ; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x i64>, ptr %a %op2 = load volatile <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll @@ -51,12 +51,12 @@ define void @ashr_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: ashr_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: asr z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: asr z1.b, p0/m, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asr z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -109,12 +109,12 @@ define void @ashr_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: ashr_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: asr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: asr z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -152,12 +152,12 @@ define void @ashr_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: ashr_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; 
CHECK-NEXT: asr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: asr z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -195,12 +195,12 @@ define void @ashr_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: ashr_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: asr z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: asr z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asr z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -257,12 +257,12 @@ define void @lshr_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: lshr_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: lsr z1.b, p0/m, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -315,12 +315,12 @@ define void @lshr_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: lshr_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: lsr z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -358,12 +358,12 @@ define void @lshr_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: lshr_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: lsr z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -401,12 +401,12 @@ define void @lshr_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: lshr_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -476,12 +476,12 @@ define void @shl_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shl_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: lsl z1.b, p0/m, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -519,12 +519,12 @@ 
define void @shl_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shl_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: lsl z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -562,12 +562,12 @@ define void @shl_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shl_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: lsl z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -605,12 +605,12 @@ define void @shl_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shl_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -36,11 +36,11 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: ucvtf_v16i16_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ucvtf z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x half> @@ -348,11 +348,11 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: ucvtf_v8i32_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x float> @@ -577,11 +577,11 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: ucvtf_v4i64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x double> @@ -622,11 +622,11 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: scvtf_v16i16_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: 
ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: scvtf z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: scvtf z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x half> @@ -897,11 +897,11 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: scvtf_v8i32_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: scvtf z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x float> @@ -972,38 +972,36 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: scvtf_v16i32_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: scvtf z2.d, p0/m, z2.d ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: mov z7.d, z3.d -; CHECK-NEXT: sunpklo z3.d, z3.s -; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 -; CHECK-NEXT: scvtf z3.d, p0/m, z3.d -; CHECK-NEXT: sunpklo z7.d, z7.s -; CHECK-NEXT: sunpklo z6.d, z6.s +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: sunpklo z4.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: scvtf z7.d, p0/m, z7.d +; CHECK-NEXT: ldp q2, q3, [x0, #32] ; CHECK-NEXT: sunpklo z5.d, z1.s ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: stp q3, q7, [x1, #96] +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: sunpklo z6.d, z2.s +; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: sunpklo z7.d, z3.s +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: scvtf z7.d, p0/m, z7.d +; CHECK-NEXT: scvtf z3.d, p0/m, z3.d +; CHECK-NEXT: scvtf z2.d, p0/m, z2.d +; CHECK-NEXT: stp q7, q3, [x1, #96] ; CHECK-NEXT: movprfx z3, z6 ; CHECK-NEXT: scvtf z3.d, p0/m, z6.d -; CHECK-NEXT: stp q2, q3, [x1, #64] +; CHECK-NEXT: stp q3, q2, [x1, #64] ; CHECK-NEXT: movprfx z2, z5 ; CHECK-NEXT: scvtf z2.d, p0/m, z5.d -; CHECK-NEXT: scvtf z1.d, p0/m, z1.d -; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q2, q1, [x1, #32] -; CHECK-NEXT: movprfx z2, z4 -; CHECK-NEXT: scvtf z2.d, p0/m, z4.d -; CHECK-NEXT: stp q2, q0, [x1] +; CHECK-NEXT: movprfx z1, z4 +; CHECK-NEXT: scvtf z1.d, p0/m, z4.d +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i32>, ptr %a %res = sitofp <16 x i32> %op1 to <16 x double> @@ -1111,11 +1109,11 @@ define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: scvtf_v4i64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: scvtf z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x double> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -60,14 +60,14 @@ define void @select_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z2.b -; CHECK-NEXT: sel z1.b, p1, z1.b, z2.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, z3.b +; CHECK-NEXT: sel z1.b, p1, z1.b, z2.b ; CHECK-NEXT: sel z0.b, p0, z0.b, z3.b -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -135,14 +135,14 @@ define void @select_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z2.h -; CHECK-NEXT: sel z1.h, p1, z1.h, z2.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z3.h +; CHECK-NEXT: sel z1.h, p1, z1.h, z2.h ; CHECK-NEXT: sel z0.h, p0, z0.h, z3.h -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -192,14 +192,14 @@ define void @select_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z2.s -; CHECK-NEXT: sel z1.s, p1, z1.s, z2.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z3.s +; CHECK-NEXT: sel z1.s, p1, z1.s, z2.s ; CHECK-NEXT: sel z0.s, p0, z0.s, z3.s -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -212,8 +212,7 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) #0 { ; CHECK-LABEL: select_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: and w8, w0, #0x1 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 @@ -248,14 +247,14 @@ define void @select_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z2.d -; CHECK-NEXT: sel z1.d, p1, z1.d, z2.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z3.d +; CHECK-NEXT: sel z1.d, p1, z1.d, z2.d ; CHECK-NEXT: sel z0.d, p0, z0.d, z3.d -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll @@ -8,13 +8,13 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldp q2, q1, [x0, #32] ; CHECK-NEXT: add z2.s, z2.s, z2.s -; CHECK-NEXT: ldp q3, q4, [x0] +; CHECK-NEXT: ldp q4, q3, [x0] ; CHECK-NEXT: mov z0.s, z1.s[2] ; CHECK-NEXT: add z1.s, z1.s, z1.s ; CHECK-NEXT: stp 
q2, q1, [x0, #32] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: add z2.s, z3.s, z3.s -; CHECK-NEXT: add z1.s, z4.s, z4.s +; CHECK-NEXT: add z2.s, z4.s, z4.s +; CHECK-NEXT: add z1.s, z3.s, z3.s ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret entry: @@ -31,14 +31,14 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: add z1.s, z1.s, z1.s -; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: add z4.s, z0.s, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: stp q1, q4, [x0, #32] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 -; CHECK-NEXT: add z1.s, z2.s, z2.s -; CHECK-NEXT: add z2.s, z3.s, z3.s +; CHECK-NEXT: add z1.s, z3.s, z3.s +; CHECK-NEXT: add z2.s, z2.s, z2.s ; CHECK-NEXT: stp q1, q2, [x0] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll @@ -52,11 +52,11 @@ define void @add_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: add_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add z0.b, z0.b, z2.b +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add z1.b, z1.b, z3.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add z0.b, z0.b, z2.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -124,11 +124,11 @@ define void @add_v16i16(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: add_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: add z1.h, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: add z0.h, z0.h, z2.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -168,11 +168,11 @@ define void @abs_v8i32(ptr %a) #0 { ; CHECK-LABEL: abs_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: abs z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: abs z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) @@ -197,11 +197,11 @@ define void @abs_v4i64(ptr %a) #0 { ; CHECK-LABEL: abs_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: abs z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: abs z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) @@ -261,12 +261,12 @@ define void @fadd_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fadd_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z3.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z2.h 
+; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -310,12 +310,12 @@ define void @fadd_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fadd_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -343,12 +343,12 @@ define void @fadd_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: fadd_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll @@ -7,11 +7,11 @@ define void @test_revbv16i16(ptr %a) #0 { ; CHECK-LABEL: test_revbv16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: revb z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revb z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> @@ -23,11 +23,11 @@ define void @test_revbv8i32(ptr %a) #0 { ; CHECK-LABEL: test_revbv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: revb z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revb z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> @@ -39,11 +39,11 @@ define void @test_revbv4i64(ptr %a) #0 { ; CHECK-LABEL: test_revbv4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: revb z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revb z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> @@ -55,11 +55,11 @@ define void @test_revhv8i32(ptr %a) #0 { ; CHECK-LABEL: test_revhv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: revh z0.s, p0/m, z0.s ; CHECK-NEXT: revh z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revh z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> @@ -71,11 +71,11 @@ define void @test_revhv8f32(ptr %a) #0 { ; 
CHECK-LABEL: test_revhv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: revh z0.s, p0/m, z0.s ; CHECK-NEXT: revh z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revh z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <16 x half>, ptr %a %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> @@ -87,11 +87,11 @@ define void @test_revhv4i64(ptr %a) #0 { ; CHECK-LABEL: test_revhv4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: revh z0.d, p0/m, z0.d ; CHECK-NEXT: revh z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revh z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> @@ -103,11 +103,11 @@ define void @test_revwv4i64(ptr %a) #0 { ; CHECK-LABEL: test_revwv4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: revw z0.d, p0/m, z0.d ; CHECK-NEXT: revw z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revw z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -119,11 +119,11 @@ define void @test_revwv4f64(ptr %a) #0 { ; CHECK-LABEL: test_revwv4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: revw z0.d, p0/m, z0.d ; CHECK-NEXT: revw z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revw z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <8 x float>, ptr %a %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> @@ -148,11 +148,11 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: test_revwv8i32v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: revw z0.d, p0/m, z0.d ; CHECK-NEXT: revw z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revw z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = load <8 x i32>, ptr %b @@ -164,15 +164,15 @@ define void @test_revhv32i16(ptr %a) #0 { ; CHECK-LABEL: test_revhv32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: revh z0.d, p0/m, z0.d -; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: revh z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0, #32] -; CHECK-NEXT: revh z0.d, p0/m, z2.d +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: revh z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0, #32] ; CHECK-NEXT: revh z1.d, p0/m, z3.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revh z0.d, p0/m, z2.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <32 x i16>, ptr %a %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> @@ -183,18 +183,19 @@ define void @test_rev_elts_fail(ptr %a) #0 { ; CHECK-LABEL: test_rev_elts_fail: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: mov z2.d, z0.d[1] ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: fmov x9, d2 ; CHECK-NEXT: mov z0.d, z1.d[1] +; 
CHECK-NEXT: fmov x10, d1 ; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: stp x9, x8, [sp, #-32]! -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: stp x11, x10, [sp, #16] -; CHECK-NEXT: ldp q1, q0, [sp] -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp x9, x8, [sp, #16] +; CHECK-NEXT: stp x11, x10, [sp] +; CHECK-NEXT: ldp q0, q1, [sp] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %tmp1 = load <4 x i64>, ptr %a @@ -208,11 +209,11 @@ define void @test_revdv4i64_sve2p1(ptr %a) #1 { ; CHECK-LABEL: test_revdv4i64_sve2p1: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: revd z0.q, p0/m, z0.q ; CHECK-NEXT: revd z1.q, p0/m, z1.q -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revd z0.q, p0/m, z0.q +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> @@ -223,11 +224,11 @@ define void @test_revdv4f64_sve2p1(ptr %a) #1 { ; CHECK-LABEL: test_revdv4f64_sve2p1: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: revd z0.q, p0/m, z0.q ; CHECK-NEXT: revd z1.q, p0/m, z1.q -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revd z0.q, p0/m, z0.q +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> undef, <4 x i32> @@ -240,7 +241,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: mov z2.s, z0.s[1] ; CHECK-NEXT: mov z3.s, z0.s[2] ; CHECK-NEXT: mov z4.s, z0.s[3] @@ -251,16 +252,16 @@ ; CHECK-NEXT: mov z0.s, z1.s[1] ; CHECK-NEXT: mov z2.s, z1.s[2] ; CHECK-NEXT: mov z3.s, z1.s[3] -; CHECK-NEXT: stp w9, w8, [sp, #24] +; CHECK-NEXT: stp w9, w8, [sp, #8] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: stp w11, w10, [sp, #16] +; CHECK-NEXT: stp w11, w10, [sp] ; CHECK-NEXT: fmov w10, s2 ; CHECK-NEXT: fmov w11, s3 -; CHECK-NEXT: stp w9, w8, [sp, #8] -; CHECK-NEXT: stp w11, w10, [sp] -; CHECK-NEXT: ldp q0, q1, [sp] -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp w9, w8, [sp, #24] +; CHECK-NEXT: stp w11, w10, [sp, #16] +; CHECK-NEXT: ldp q1, q0, [sp] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret %tmp1 = load <8 x i32>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -287,13 +287,13 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: zip1 z4.d, z1.d, z2.d -; CHECK-NEXT: trn2 z1.d, z1.d, z2.d -; CHECK-NEXT: zip1 z2.d, z0.d, z3.d -; CHECK-NEXT: trn2 z0.d, z0.d, z3.d +; CHECK-NEXT: trn2 z4.d, z1.d, z2.d +; CHECK-NEXT: zip1 z1.d, z1.d, z2.d +; CHECK-NEXT: trn2 z2.d, z0.d, z3.d +; CHECK-NEXT: zip1 z0.d, z0.d, z3.d ; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z4.d ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b @@ -364,15 +364,15 @@ define void @trn_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: trn_v32i8: ; CHECK: 
// %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: trn1 z4.b, z1.b, z2.b -; CHECK-NEXT: trn2 z1.b, z1.b, z2.b -; CHECK-NEXT: add z1.b, z4.b, z1.b +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: trn1 z5.b, z0.b, z3.b ; CHECK-NEXT: trn2 z0.b, z0.b, z3.b ; CHECK-NEXT: add z0.b, z5.b, z0.b -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: trn1 z4.b, z1.b, z2.b +; CHECK-NEXT: trn2 z1.b, z1.b, z2.b +; CHECK-NEXT: add z1.b, z4.b, z1.b +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = load <32 x i8>, ptr %b @@ -434,15 +434,15 @@ define void @trn_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: trn_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: trn1 z4.h, z1.h, z2.h -; CHECK-NEXT: trn2 z1.h, z1.h, z2.h -; CHECK-NEXT: add z1.h, z4.h, z1.h +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: trn1 z5.h, z0.h, z3.h ; CHECK-NEXT: trn2 z0.h, z0.h, z3.h ; CHECK-NEXT: add z0.h, z5.h, z0.h -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: trn1 z4.h, z1.h, z2.h +; CHECK-NEXT: trn2 z1.h, z1.h, z2.h +; CHECK-NEXT: add z1.h, z4.h, z1.h +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = load <16 x i16>, ptr %b @@ -456,15 +456,15 @@ define void @trn_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: trn_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: zip1 z4.s, z1.s, z2.s -; CHECK-NEXT: trn2 z1.s, z1.s, z2.s -; CHECK-NEXT: add z1.s, z4.s, z1.s -; CHECK-NEXT: trn1 z5.s, z0.s, z3.s +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q3, q2, [x1] +; CHECK-NEXT: zip1 z5.s, z0.s, z3.s ; CHECK-NEXT: trn2 z0.s, z0.s, z3.s ; CHECK-NEXT: add z0.s, z5.s, z0.s -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: trn1 z4.s, z1.s, z2.s +; CHECK-NEXT: trn2 z1.s, z1.s, z2.s +; CHECK-NEXT: add z1.s, z4.s, z1.s +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = load <8 x i32>, ptr %b @@ -478,16 +478,16 @@ define void @trn_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: trn_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: zip1 z4.d, z1.d, z2.d -; CHECK-NEXT: trn2 z1.d, z1.d, z2.d -; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z4.d +; CHECK-NEXT: ldp q3, q2, [x1] ; CHECK-NEXT: zip1 z5.d, z0.d, z3.d ; CHECK-NEXT: trn2 z0.d, z0.d, z3.d ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z5.d -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: zip1 z4.d, z1.d, z2.d +; CHECK-NEXT: trn2 z1.d, z1.d, z2.d +; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z4.d +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b @@ -521,14 +521,14 @@ define void @trn_v8i32_undef(ptr %a) #0 { ; CHECK-LABEL: trn_v8i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: trn1 z2.s, z0.s, z0.s -; CHECK-NEXT: trn2 z0.s, z0.s, z0.s -; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: trn1 z3.s, z1.s, z1.s ; CHECK-NEXT: trn2 z1.s, z1.s, z1.s ; CHECK-NEXT: add z1.s, z3.s, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: trn1 z2.s, z0.s, z0.s +; CHECK-NEXT: trn2 z0.s, z0.s, z0.s +; CHECK-NEXT: add z0.s, z2.s, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -728,197 +728,197 @@ ; 
CHECK-NEXT: .cfi_offset b13, -48 ; CHECK-NEXT: .cfi_offset b14, -56 ; CHECK-NEXT: .cfi_offset b15, -64 -; CHECK-NEXT: ldp q0, q3, [x0] +; CHECK-NEXT: ldp q0, q7, [x1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov z27.b, z0.b[14] -; CHECK-NEXT: mov z28.b, z0.b[12] -; CHECK-NEXT: mov z30.b, z0.b[8] -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: mov z2.b, z3.b[12] -; CHECK-NEXT: mov z4.b, z3.b[10] -; CHECK-NEXT: mov z1.b, z3.b[14] -; CHECK-NEXT: ldp q10, q11, [x1] -; CHECK-NEXT: strb w8, [sp, #40] +; CHECK-NEXT: mov z29.b, z0.b[14] +; CHECK-NEXT: mov z30.b, z0.b[12] +; CHECK-NEXT: mov z8.b, z0.b[8] +; CHECK-NEXT: fmov w8, s7 +; CHECK-NEXT: mov z2.b, z7.b[12] +; CHECK-NEXT: ldp q12, q13, [x0] +; CHECK-NEXT: mov z1.b, z7.b[14] +; CHECK-NEXT: mov z3.b, z7.b[10] +; CHECK-NEXT: strb w8, [sp, #8] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strb w9, [sp, #32] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: mov z6.b, z3.b[6] -; CHECK-NEXT: mov z7.b, z3.b[4] +; CHECK-NEXT: mov z5.b, z7.b[6] ; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: strb w8, [sp, #46] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strb w9, [sp, #45] -; CHECK-NEXT: fmov w9, s7 -; CHECK-NEXT: mov z5.b, z3.b[8] -; CHECK-NEXT: strb w10, [sp, #47] -; CHECK-NEXT: fmov w10, s5 -; CHECK-NEXT: strb w8, [sp, #43] -; CHECK-NEXT: fmov w8, s27 -; CHECK-NEXT: strb w9, [sp, #42] -; CHECK-NEXT: fmov w9, s28 -; CHECK-NEXT: mov z16.b, z3.b[2] -; CHECK-NEXT: mov z31.b, z0.b[6] -; CHECK-NEXT: strb w10, [sp, #44] +; CHECK-NEXT: strb w9, [sp] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w8, [sp, #14] +; CHECK-NEXT: fmov w8, s5 +; CHECK-NEXT: mov z4.b, z7.b[8] +; CHECK-NEXT: mov z6.b, z7.b[4] +; CHECK-NEXT: strb w10, [sp, #15] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w9, [sp, #13] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strb w8, [sp, #11] +; CHECK-NEXT: fmov w8, s29 +; CHECK-NEXT: mov z16.b, z7.b[2] +; CHECK-NEXT: strb w10, [sp, #12] ; CHECK-NEXT: fmov w10, s16 -; CHECK-NEXT: strb w8, [sp, #39] +; CHECK-NEXT: strb w9, [sp, #10] +; CHECK-NEXT: fmov w9, s30 +; CHECK-NEXT: strb w8, [sp, #7] +; CHECK-NEXT: fmov w8, s8 +; CHECK-NEXT: mov z31.b, z0.b[10] +; CHECK-NEXT: mov z9.b, z0.b[6] +; CHECK-NEXT: mov z11.b, z0.b[2] +; CHECK-NEXT: strb w10, [sp, #9] +; CHECK-NEXT: fmov w10, s31 +; CHECK-NEXT: strb w9, [sp, #6] +; CHECK-NEXT: fmov w9, s9 +; CHECK-NEXT: strb w8, [sp, #4] +; CHECK-NEXT: fmov w8, s11 +; CHECK-NEXT: mov z10.b, z0.b[4] +; CHECK-NEXT: strb w10, [sp, #5] +; CHECK-NEXT: fmov w10, s10 +; CHECK-NEXT: mov z30.b, z13.b[2] +; CHECK-NEXT: strb w9, [sp, #3] +; CHECK-NEXT: fmov w9, s13 +; CHECK-NEXT: strb w8, [sp, #1] ; CHECK-NEXT: fmov w8, s30 -; CHECK-NEXT: strb w9, [sp, #38] -; CHECK-NEXT: fmov w9, s31 -; CHECK-NEXT: mov z29.b, z0.b[10] -; CHECK-NEXT: mov z9.b, z0.b[2] -; CHECK-NEXT: strb w10, [sp, #41] -; CHECK-NEXT: fmov w10, s29 +; CHECK-NEXT: mov z3.b, z13.b[14] +; CHECK-NEXT: mov z6.b, z13.b[8] +; CHECK-NEXT: strb w10, [sp, #2] +; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: fmov w13, s6 +; CHECK-NEXT: mov z3.b, z12.b[14] +; CHECK-NEXT: mov z6.b, z12.b[8] +; CHECK-NEXT: strb w9, [sp, #40] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w8, [sp, #41] +; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: mov z4.b, z13.b[12] +; CHECK-NEXT: mov z16.b, z13.b[6] +; CHECK-NEXT: fmov w11, s4 +; CHECK-NEXT: fmov w14, s16 +; CHECK-NEXT: mov z4.b, z12.b[12] +; CHECK-NEXT: mov z16.b, z12.b[6] +; CHECK-NEXT: mov z15.b, z12.b[2] +; CHECK-NEXT: strb w10, [sp, #47] +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: strb w9, [sp, #39] +; CHECK-NEXT: fmov w9, s16 ; CHECK-NEXT: strb w8, [sp, #36] -; 
CHECK-NEXT: fmov w8, s9 +; CHECK-NEXT: fmov w8, s15 +; CHECK-NEXT: mov z17.b, z7.b[15] +; CHECK-NEXT: mov z18.b, z7.b[13] +; CHECK-NEXT: mov z14.b, z12.b[4] +; CHECK-NEXT: strb w10, [sp, #38] +; CHECK-NEXT: fmov w10, s14 ; CHECK-NEXT: strb w9, [sp, #35] -; CHECK-NEXT: fmov w9, s11 -; CHECK-NEXT: mov z8.b, z0.b[4] -; CHECK-NEXT: mov z16.b, z11.b[4] -; CHECK-NEXT: mov z27.b, z11.b[2] -; CHECK-NEXT: strb w10, [sp, #37] -; CHECK-NEXT: fmov w10, s8 +; CHECK-NEXT: fmov w9, s17 ; CHECK-NEXT: strb w8, [sp, #33] -; CHECK-NEXT: fmov w8, s16 -; CHECK-NEXT: strb w9, [sp, #8] -; CHECK-NEXT: fmov w9, s27 -; CHECK-NEXT: mov z5.b, z11.b[10] -; CHECK-NEXT: mov z6.b, z11.b[8] -; CHECK-NEXT: mov z2.b, z11.b[14] -; CHECK-NEXT: fmov w12, s5 -; CHECK-NEXT: fmov w13, s6 -; CHECK-NEXT: mov z5.b, z10.b[10] -; CHECK-NEXT: mov z6.b, z10.b[8] +; CHECK-NEXT: fmov w8, s18 +; CHECK-NEXT: mov z19.b, z7.b[11] +; CHECK-NEXT: mov z20.b, z7.b[9] +; CHECK-NEXT: mov z21.b, z7.b[7] ; CHECK-NEXT: strb w10, [sp, #34] -; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: strb w8, [sp, #10] -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strb w9, [sp, #9] -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: mov z4.b, z11.b[12] -; CHECK-NEXT: mov z7.b, z11.b[6] -; CHECK-NEXT: mov z28.b, z11.b[15] -; CHECK-NEXT: mov z29.b, z11.b[13] -; CHECK-NEXT: mov z30.b, z11.b[11] -; CHECK-NEXT: mov z31.b, z11.b[9] -; CHECK-NEXT: mov z8.b, z11.b[7] -; CHECK-NEXT: mov z9.b, z11.b[5] -; CHECK-NEXT: mov z12.b, z11.b[3] -; CHECK-NEXT: mov z13.b, z11.b[1] -; CHECK-NEXT: mov z2.b, z10.b[14] -; CHECK-NEXT: mov z11.b, z10.b[4] -; CHECK-NEXT: mov z14.b, z10.b[2] -; CHECK-NEXT: strb w10, [sp, #15] -; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: strb w8, [sp, #5] -; CHECK-NEXT: fmov w8, s11 -; CHECK-NEXT: strb w9, [sp, #4] -; CHECK-NEXT: fmov w9, s14 -; CHECK-NEXT: mov z17.b, z3.b[15] -; CHECK-NEXT: mov z18.b, z3.b[13] -; CHECK-NEXT: fmov w14, s7 -; CHECK-NEXT: mov z7.b, z10.b[6] -; CHECK-NEXT: strb w10, [sp, #7] -; CHECK-NEXT: fmov w10, s7 -; CHECK-NEXT: strb w8, [sp, #2] -; CHECK-NEXT: fmov w8, s17 -; CHECK-NEXT: strb w9, [sp, #1] -; CHECK-NEXT: fmov w9, s18 -; CHECK-NEXT: mov z19.b, z3.b[11] -; CHECK-NEXT: mov z20.b, z3.b[9] -; CHECK-NEXT: mov z21.b, z3.b[7] -; CHECK-NEXT: strb w10, [sp, #3] ; CHECK-NEXT: fmov w10, s19 -; CHECK-NEXT: strb w8, [sp, #63] -; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: strb w9, [sp, #62] -; CHECK-NEXT: fmov w9, s21 -; CHECK-NEXT: mov z22.b, z3.b[5] -; CHECK-NEXT: mov z23.b, z3.b[3] -; CHECK-NEXT: mov z3.b, z0.b[13] -; CHECK-NEXT: strb w10, [sp, #61] -; CHECK-NEXT: fmov w10, s22 -; CHECK-NEXT: strb w8, [sp, #60] -; CHECK-NEXT: fmov w8, s23 -; CHECK-NEXT: strb w9, [sp, #59] -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov z24.b, z0.b[11] -; CHECK-NEXT: mov z25.b, z0.b[9] -; CHECK-NEXT: mov z26.b, z0.b[5] -; CHECK-NEXT: strb w10, [sp, #58] -; CHECK-NEXT: fmov w10, s24 -; CHECK-NEXT: strb w8, [sp, #57] -; CHECK-NEXT: fmov w8, s25 -; CHECK-NEXT: strb w9, [sp, #54] -; CHECK-NEXT: fmov w9, s26 -; CHECK-NEXT: mov z1.b, z0.b[3] -; CHECK-NEXT: mov z0.b, z0.b[1] -; CHECK-NEXT: strb w10, [sp, #53] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: strb w8, [sp, #52] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: strb w9, [sp, #50] -; CHECK-NEXT: fmov w9, s28 -; CHECK-NEXT: strb w10, [sp, #49] -; CHECK-NEXT: fmov w10, s29 -; CHECK-NEXT: strb w8, [sp, #48] -; CHECK-NEXT: fmov w8, s30 ; CHECK-NEXT: strb w9, [sp, #31] -; CHECK-NEXT: fmov w9, s31 -; CHECK-NEXT: strb w10, [sp, #30] -; CHECK-NEXT: fmov w10, s8 -; CHECK-NEXT: strb w8, [sp, #29] -; CHECK-NEXT: fmov w8, s9 +; CHECK-NEXT: fmov w9, 
s20 +; CHECK-NEXT: strb w8, [sp, #30] +; CHECK-NEXT: fmov w8, s21 +; CHECK-NEXT: mov z22.b, z7.b[5] +; CHECK-NEXT: mov z23.b, z7.b[3] +; CHECK-NEXT: mov z24.b, z7.b[1] +; CHECK-NEXT: strb w10, [sp, #29] +; CHECK-NEXT: fmov w10, s22 ; CHECK-NEXT: strb w9, [sp, #28] -; CHECK-NEXT: fmov w9, s12 -; CHECK-NEXT: fmov w11, s4 -; CHECK-NEXT: mov z15.b, z10.b[15] -; CHECK-NEXT: mov z16.b, z10.b[13] -; CHECK-NEXT: strb w10, [sp, #27] -; CHECK-NEXT: fmov w10, s13 -; CHECK-NEXT: strb w8, [sp, #26] -; CHECK-NEXT: fmov w8, s15 +; CHECK-NEXT: fmov w9, s23 +; CHECK-NEXT: strb w8, [sp, #27] +; CHECK-NEXT: fmov w8, s24 +; CHECK-NEXT: mov z7.b, z0.b[15] +; CHECK-NEXT: mov z25.b, z0.b[13] +; CHECK-NEXT: mov z26.b, z0.b[11] +; CHECK-NEXT: strb w10, [sp, #26] +; CHECK-NEXT: fmov w10, s7 ; CHECK-NEXT: strb w9, [sp, #25] -; CHECK-NEXT: fmov w9, s16 -; CHECK-NEXT: mov z4.b, z10.b[12] -; CHECK-NEXT: mov z27.b, z10.b[11] -; CHECK-NEXT: strb w11, [sp, #14] -; CHECK-NEXT: mov z2.b, z10.b[9] -; CHECK-NEXT: fmov w11, s4 -; CHECK-NEXT: mov z4.b, z10.b[7] -; CHECK-NEXT: strb w10, [sp, #24] +; CHECK-NEXT: fmov w9, s25 +; CHECK-NEXT: strb w8, [sp, #24] +; CHECK-NEXT: fmov w8, s26 +; CHECK-NEXT: mov z27.b, z0.b[9] +; CHECK-NEXT: mov z28.b, z0.b[7] +; CHECK-NEXT: mov z1.b, z0.b[5] +; CHECK-NEXT: strb w10, [sp, #23] ; CHECK-NEXT: fmov w10, s27 -; CHECK-NEXT: strb w8, [sp, #23] -; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strb w9, [sp, #22] -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: mov z5.b, z10.b[5] -; CHECK-NEXT: mov z6.b, z10.b[3] -; CHECK-NEXT: mov z7.b, z10.b[1] -; CHECK-NEXT: fmov w15, s10 -; CHECK-NEXT: strb w10, [sp, #21] -; CHECK-NEXT: fmov w10, s5 -; CHECK-NEXT: strb w8, [sp, #20] -; CHECK-NEXT: fmov w8, s6 +; CHECK-NEXT: fmov w9, s28 +; CHECK-NEXT: strb w8, [sp, #21] +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z2.b, z0.b[3] +; CHECK-NEXT: mov z0.b, z0.b[1] +; CHECK-NEXT: mov z31.b, z13.b[15] +; CHECK-NEXT: strb w10, [sp, #20] +; CHECK-NEXT: fmov w10, s2 ; CHECK-NEXT: strb w9, [sp, #19] -; CHECK-NEXT: fmov w9, s7 -; CHECK-NEXT: strb w15, [sp] -; CHECK-NEXT: strb w12, [sp, #13] -; CHECK-NEXT: ldr q17, [sp, #32] -; CHECK-NEXT: strb w13, [sp, #12] -; CHECK-NEXT: ldr q0, [sp, #48] -; CHECK-NEXT: strb w14, [sp, #11] -; CHECK-NEXT: strb w11, [sp, #6] -; CHECK-NEXT: strb w10, [sp, #18] -; CHECK-NEXT: ldr q18, [sp] -; CHECK-NEXT: strb w8, [sp, #17] -; CHECK-NEXT: add z0.b, z17.b, z0.b +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strb w8, [sp, #18] +; CHECK-NEXT: fmov w8, s31 +; CHECK-NEXT: mov z8.b, z13.b[13] +; CHECK-NEXT: mov z9.b, z13.b[11] +; CHECK-NEXT: mov z10.b, z13.b[9] +; CHECK-NEXT: strb w10, [sp, #17] +; CHECK-NEXT: fmov w10, s8 ; CHECK-NEXT: strb w9, [sp, #16] -; CHECK-NEXT: ldr q1, [sp, #16] +; CHECK-NEXT: fmov w9, s9 +; CHECK-NEXT: strb w8, [sp, #63] +; CHECK-NEXT: fmov w8, s10 +; CHECK-NEXT: mov z29.b, z13.b[4] +; CHECK-NEXT: mov z5.b, z13.b[10] +; CHECK-NEXT: mov z11.b, z13.b[7] +; CHECK-NEXT: fmov w16, s29 +; CHECK-NEXT: mov z29.b, z13.b[5] +; CHECK-NEXT: mov z13.b, z13.b[3] +; CHECK-NEXT: strb w10, [sp, #62] +; CHECK-NEXT: fmov w10, s11 +; CHECK-NEXT: strb w9, [sp, #61] +; CHECK-NEXT: fmov w9, s29 +; CHECK-NEXT: strb w8, [sp, #60] +; CHECK-NEXT: fmov w8, s13 +; CHECK-NEXT: mov z30.b, z12.b[13] +; CHECK-NEXT: mov z3.b, z12.b[11] +; CHECK-NEXT: mov z4.b, z12.b[9] +; CHECK-NEXT: strb w10, [sp, #59] +; CHECK-NEXT: fmov w10, s30 +; CHECK-NEXT: strb w9, [sp, #58] +; CHECK-NEXT: fmov w9, s3 +; CHECK-NEXT: strb w8, [sp, #57] +; CHECK-NEXT: fmov w8, s4 +; CHECK-NEXT: fmov w12, s5 +; CHECK-NEXT: mov z5.b, z12.b[10] +; 
CHECK-NEXT: strb w11, [sp, #46] +; CHECK-NEXT: fmov w11, s5 +; CHECK-NEXT: mov z5.b, z12.b[5] +; CHECK-NEXT: mov z6.b, z12.b[3] +; CHECK-NEXT: mov z16.b, z12.b[1] +; CHECK-NEXT: fmov w15, s12 +; CHECK-NEXT: strb w10, [sp, #54] +; CHECK-NEXT: fmov w10, s5 +; CHECK-NEXT: strb w9, [sp, #53] +; CHECK-NEXT: fmov w9, s6 +; CHECK-NEXT: strb w8, [sp, #52] +; CHECK-NEXT: fmov w8, s16 +; CHECK-NEXT: strb w15, [sp, #32] +; CHECK-NEXT: ldr q17, [sp] +; CHECK-NEXT: strb w12, [sp, #45] +; CHECK-NEXT: ldr q0, [sp, #16] +; CHECK-NEXT: strb w13, [sp, #44] +; CHECK-NEXT: strb w14, [sp, #43] +; CHECK-NEXT: strb w16, [sp, #42] +; CHECK-NEXT: add z0.b, z17.b, z0.b +; CHECK-NEXT: strb w11, [sp, #37] +; CHECK-NEXT: strb w10, [sp, #50] +; CHECK-NEXT: ldr q18, [sp, #32] +; CHECK-NEXT: strb w9, [sp, #49] +; CHECK-NEXT: strb w8, [sp, #48] +; CHECK-NEXT: ldr q1, [sp, #48] ; CHECK-NEXT: ldp d9, d8, [sp, #112] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #96] // 16-byte Folded Reload ; CHECK-NEXT: add z1.b, z18.b, z1.b ; CHECK-NEXT: ldp d13, d12, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ldp d15, d14, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #128 ; CHECK-NEXT: ret @@ -969,38 +969,38 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #64 ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q0, q1, [x1] ; CHECK-NEXT: mov z17.h, z0.h[4] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: mov z18.h, z0.h[2] ; CHECK-NEXT: mov z19.h, z0.h[7] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z3.h, z1.h[4] -; CHECK-NEXT: ldp q21, q22, [x1] +; CHECK-NEXT: ldp q21, q22, [x0] ; CHECK-NEXT: mov z2.h, z1.h[6] ; CHECK-NEXT: mov z4.h, z1.h[2] -; CHECK-NEXT: strh w8, [sp, #40] +; CHECK-NEXT: strh w8, [sp, #8] ; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: fmov w10, s2 ; CHECK-NEXT: mov z5.h, z1.h[7] ; CHECK-NEXT: mov z6.h, z1.h[5] ; CHECK-NEXT: mov z7.h, z1.h[3] -; CHECK-NEXT: strh w8, [sp, #44] +; CHECK-NEXT: strh w8, [sp, #12] ; CHECK-NEXT: fmov w8, s17 ; CHECK-NEXT: mov z16.h, z1.h[1] ; CHECK-NEXT: mov z1.h, z0.h[6] -; CHECK-NEXT: strh w9, [sp, #32] +; CHECK-NEXT: strh w9, [sp] ; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: strh w10, [sp, #46] +; CHECK-NEXT: strh w10, [sp, #14] ; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: strh w8, [sp, #36] +; CHECK-NEXT: strh w8, [sp, #4] ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z2.h, z22.h[6] -; CHECK-NEXT: strh w9, [sp, #42] -; CHECK-NEXT: strh w10, [sp, #38] +; CHECK-NEXT: strh w9, [sp, #10] +; CHECK-NEXT: strh w10, [sp, #6] ; CHECK-NEXT: fmov w9, s22 ; CHECK-NEXT: fmov w10, s21 -; CHECK-NEXT: strh w8, [sp, #34] +; CHECK-NEXT: strh w8, [sp, #2] ; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: mov z3.h, z22.h[4] ; CHECK-NEXT: mov z4.h, z22.h[2] @@ -1009,67 +1009,67 @@ ; CHECK-NEXT: mov z23.h, z22.h[3] ; CHECK-NEXT: mov z24.h, z22.h[1] ; CHECK-NEXT: mov z22.h, z21.h[6] -; CHECK-NEXT: strh w9, [sp, #8] +; CHECK-NEXT: strh w9, [sp, #40] ; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: strh w10, [sp] +; CHECK-NEXT: strh w10, [sp, #32] ; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: strh w8, [sp, #14] +; CHECK-NEXT: strh w8, [sp, #46] ; CHECK-NEXT: fmov w8, s22 ; CHECK-NEXT: mov z25.h, z21.h[4] ; CHECK-NEXT: mov z26.h, z21.h[2] -; CHECK-NEXT: strh w9, [sp, #12] +; CHECK-NEXT: strh w9, [sp, #44] ; CHECK-NEXT: fmov w9, s25 -; CHECK-NEXT: strh w10, [sp, #10] +; CHECK-NEXT: strh w10, [sp, #42] ; CHECK-NEXT: fmov w10, s26 -; CHECK-NEXT: strh w8, [sp, #6] +; CHECK-NEXT: strh w8, [sp, #38] ; CHECK-NEXT: fmov w8, s5 -; 
CHECK-NEXT: strh w9, [sp, #4] +; CHECK-NEXT: strh w9, [sp, #36] ; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: strh w10, [sp, #2] +; CHECK-NEXT: strh w10, [sp, #34] ; CHECK-NEXT: fmov w10, s7 -; CHECK-NEXT: strh w8, [sp, #62] +; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s16 ; CHECK-NEXT: mov z20.h, z0.h[5] ; CHECK-NEXT: mov z1.h, z0.h[3] -; CHECK-NEXT: strh w9, [sp, #60] +; CHECK-NEXT: strh w9, [sp, #28] ; CHECK-NEXT: fmov w9, s19 -; CHECK-NEXT: strh w10, [sp, #58] +; CHECK-NEXT: strh w10, [sp, #26] ; CHECK-NEXT: fmov w10, s20 -; CHECK-NEXT: strh w8, [sp, #56] +; CHECK-NEXT: strh w8, [sp, #24] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z0.h, z0.h[1] -; CHECK-NEXT: strh w9, [sp, #54] +; CHECK-NEXT: strh w9, [sp, #22] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w10, [sp, #52] +; CHECK-NEXT: strh w10, [sp, #20] ; CHECK-NEXT: fmov w10, s17 -; CHECK-NEXT: strh w8, [sp, #50] +; CHECK-NEXT: strh w8, [sp, #18] ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z27.h, z21.h[7] -; CHECK-NEXT: strh w9, [sp, #48] +; CHECK-NEXT: strh w9, [sp, #16] ; CHECK-NEXT: fmov w9, s23 -; CHECK-NEXT: strh w10, [sp, #30] +; CHECK-NEXT: strh w10, [sp, #62] ; CHECK-NEXT: fmov w10, s24 -; CHECK-NEXT: strh w8, [sp, #28] +; CHECK-NEXT: strh w8, [sp, #60] ; CHECK-NEXT: fmov w8, s27 ; CHECK-NEXT: mov z28.h, z21.h[5] ; CHECK-NEXT: mov z2.h, z21.h[3] ; CHECK-NEXT: mov z3.h, z21.h[1] -; CHECK-NEXT: strh w9, [sp, #26] +; CHECK-NEXT: strh w9, [sp, #58] ; CHECK-NEXT: fmov w9, s28 -; CHECK-NEXT: strh w10, [sp, #24] +; CHECK-NEXT: strh w10, [sp, #56] ; CHECK-NEXT: fmov w10, s2 -; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: strh w8, [sp, #54] ; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: ldr q4, [sp, #32] -; CHECK-NEXT: strh w9, [sp, #20] -; CHECK-NEXT: ldr q5, [sp] -; CHECK-NEXT: strh w10, [sp, #18] -; CHECK-NEXT: ldr q0, [sp, #48] -; CHECK-NEXT: strh w8, [sp, #16] -; CHECK-NEXT: ldr q1, [sp, #16] +; CHECK-NEXT: ldr q4, [sp] +; CHECK-NEXT: strh w9, [sp, #52] +; CHECK-NEXT: ldr q5, [sp, #32] +; CHECK-NEXT: strh w10, [sp, #50] +; CHECK-NEXT: ldr q0, [sp, #16] +; CHECK-NEXT: strh w8, [sp, #48] +; CHECK-NEXT: ldr q1, [sp, #48] ; CHECK-NEXT: add z0.h, z4.h, z0.h ; CHECK-NEXT: add z1.h, z5.h, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret %tmp1 = load <16 x i16>, ptr %a @@ -1086,28 +1086,31 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #64 ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: mov z4.s, z0.s[2] -; CHECK-NEXT: stp s0, s4, [sp, #24] -; CHECK-NEXT: mov z4.s, z3.s[2] -; CHECK-NEXT: mov z5.s, z2.s[2] -; CHECK-NEXT: stp s4, s2, [sp, #4] -; CHECK-NEXT: stp s5, s1, [sp, #12] -; CHECK-NEXT: mov z5.s, z0.s[3] -; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: mov z5.s, z1.s[3] +; CHECK-NEXT: ldp q3, q4, [x0] +; CHECK-NEXT: mov z2.s, z0.s[2] +; CHECK-NEXT: stp s0, s2, [sp, #8] +; CHECK-NEXT: mov z2.s, z1.s[2] ; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: stp s0, s5, [sp, #40] -; CHECK-NEXT: mov z0.s, z3.s[3] -; CHECK-NEXT: str s1, [sp, #32] -; CHECK-NEXT: mov z1.s, z3.s[1] -; CHECK-NEXT: stp s1, s0, [sp, #48] -; CHECK-NEXT: ldp q4, q2, [sp] -; CHECK-NEXT: ldp q0, q1, [sp, #32] -; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s -; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z4.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: str s2, [sp, #4] +; CHECK-NEXT: stp s1, s5, [sp, #48] +; CHECK-NEXT: str s3, [sp, #16] +; CHECK-NEXT: mov z3.s, z3.s[1] +; CHECK-NEXT: 
mov z1.s, z4.s[3] +; CHECK-NEXT: str s3, [sp, #32] +; CHECK-NEXT: str s1, [sp, #44] +; CHECK-NEXT: mov z1.s, z4.s[1] +; CHECK-NEXT: str s1, [sp, #40] +; CHECK-NEXT: mov z0.s, z4.s[2] +; CHECK-NEXT: ldp q3, q1, [sp, #32] +; CHECK-NEXT: stp s4, s0, [sp, #24] +; CHECK-NEXT: ldp q0, q2, [sp] +; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret %tmp1 = load <8 x float>, ptr %a @@ -1122,15 +1125,15 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{ ; CHECK-LABEL: uzp_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: ldp q3, q2, [x1] +; CHECK-NEXT: ldp q1, q0, [x1] +; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: zip1 z4.d, z1.d, z0.d ; CHECK-NEXT: trn2 z0.d, z1.d, z0.d ; CHECK-NEXT: add z0.d, z4.d, z0.d ; CHECK-NEXT: zip1 z5.d, z3.d, z2.d ; CHECK-NEXT: trn2 z1.d, z3.d, z2.d ; CHECK-NEXT: add z1.d, z5.d, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = load <4 x i64>, ptr %b @@ -1251,13 +1254,13 @@ ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x1] -; CHECK-NEXT: zip1 z4.d, z1.d, z2.d -; CHECK-NEXT: trn2 z1.d, z1.d, z2.d -; CHECK-NEXT: zip1 z2.d, z0.d, z3.d -; CHECK-NEXT: trn2 z0.d, z0.d, z3.d +; CHECK-NEXT: trn2 z4.d, z1.d, z2.d +; CHECK-NEXT: zip1 z1.d, z1.d, z2.d +; CHECK-NEXT: trn2 z2.d, z0.d, z3.d +; CHECK-NEXT: zip1 z0.d, z0.d, z3.d ; CHECK-NEXT: fadd z2.d, p0/m, z2.d, z4.d ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll @@ -47,11 +47,11 @@ define void @bitreverse_v32i8(ptr %a) #0 { ; CHECK-LABEL: bitreverse_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: rbit z0.b, p0/m, z0.b ; CHECK-NEXT: rbit z1.b, p0/m, z1.b -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.b, p0/m, z0.b +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op) @@ -99,11 +99,11 @@ define void @bitreverse_v16i16(ptr %a) #0 { ; CHECK-LABEL: bitreverse_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: rbit z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op) @@ -138,11 +138,11 @@ define void @bitreverse_v8i32(ptr %a) #0 { ; CHECK-LABEL: bitreverse_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: rbit z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op) @@ -177,11 +177,11 @@ 
define void @bitreverse_v4i64(ptr %a) #0 { ; CHECK-LABEL: bitreverse_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: rbit z0.d, p0/m, z0.d ; CHECK-NEXT: rbit z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: rbit z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op) @@ -233,11 +233,11 @@ define void @bswap_v16i16(ptr %a) #0 { ; CHECK-LABEL: bswap_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: revb z1.h, p0/m, z1.h -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revb z0.h, p0/m, z0.h +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op) @@ -272,11 +272,11 @@ define void @bswap_v8i32(ptr %a) #0 { ; CHECK-LABEL: bswap_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: revb z1.s, p0/m, z1.s -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revb z0.s, p0/m, z0.s +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op) @@ -311,11 +311,11 @@ define void @bswap_v4i64(ptr %a) #0 { ; CHECK-LABEL: bswap_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: revb z1.d, p0/m, z1.d -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: revb z0.d, p0/m, z0.d +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll @@ -43,11 +43,11 @@ define void @sdiv_v32i8(ptr %a) #0 { ; CHECK-LABEL: sdiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5 ; CHECK-NEXT: asrd z1.b, p0/m, z1.b, #5 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer) @@ -95,11 +95,11 @@ define void @sdiv_v16i16(ptr %a) #0 { ; CHECK-LABEL: sdiv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 ; CHECK-NEXT: asrd z1.h, p0/m, z1.h, #5 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer) @@ -134,11 +134,11 @@ define void @sdiv_v8i32(ptr %a) #0 { ; CHECK-LABEL: sdiv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; 
CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 ; CHECK-NEXT: asrd z1.s, p0/m, z1.s, #5 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer) @@ -174,11 +174,11 @@ define void @sdiv_v4i64(ptr %a) #0 { ; CHECK-LABEL: sdiv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5 ; CHECK-NEXT: asrd z1.d, p0/m, z1.d, #5 -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll @@ -129,7 +129,8 @@ define void @store_v2i32(ptr %a) #0 { ; CHECK-LABEL: store_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: str xzr, [x0] +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret store <2 x i32> zeroinitializer, ptr %a ret void @@ -138,7 +139,8 @@ define void @store_v2f32(ptr %a) #0 { ; CHECK-LABEL: store_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: str xzr, [x0] +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret store <2 x float> zeroinitializer, ptr %a ret void @@ -147,7 +149,8 @@ define void @store_v4i32(ptr %a) #0 { ; CHECK-LABEL: store_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: stp xzr, xzr, [x0] +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret store <4 x i32> zeroinitializer, ptr %a ret void @@ -156,7 +159,8 @@ define void @store_v4f32(ptr %a) #0 { ; CHECK-LABEL: store_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: stp xzr, xzr, [x0] +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret store <4 x float> zeroinitializer, ptr %a ret void @@ -205,7 +209,8 @@ define void @store_v2i64(ptr %a) #0 { ; CHECK-LABEL: store_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: stp xzr, xzr, [x0] +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret store <2 x i64> zeroinitializer, ptr %a ret void @@ -214,7 +219,8 @@ define void @store_v2f64(ptr %a) #0 { ; CHECK-LABEL: store_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: stp xzr, xzr, [x0] +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret store <2 x double> zeroinitializer, ptr %a ret void diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll @@ -60,8 +60,8 @@ define void @subvector_v32i8(ptr %in, ptr %out) #0 { ; CHECK-LABEL: subvector_v32i8: ; CHECK: // %bb.0: // %bb1 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %in br label %bb1 @@ -125,8 +125,8 @@ 
define void @subvector_v16i16(ptr %in, ptr %out) #0 { ; CHECK-LABEL: subvector_v16i16: ; CHECK: // %bb.0: // %bb1 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %in br label %bb1 @@ -168,8 +168,8 @@ define void @subvector_v8i32(ptr %in, ptr %out) #0 { ; CHECK-LABEL: subvector_v8i32: ; CHECK: // %bb.0: // %bb1 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %in br label %bb1 @@ -197,8 +197,8 @@ define void @subvector_v4i64(ptr %in, ptr %out) #0 { ; CHECK-LABEL: subvector_v4i64: ; CHECK: // %bb.0: // %bb1 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <4 x i64>, ptr %in br label %bb1 @@ -254,8 +254,8 @@ define void @subvector_v16f16(ptr %in, ptr %out) #0 { ; CHECK-LABEL: subvector_v16f16: ; CHECK: // %bb.0: // %bb1 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <16 x half>, ptr %in br label %bb1 @@ -297,8 +297,8 @@ define void @subvector_v8f32(ptr %in, ptr %out) #0 { ; CHECK-LABEL: subvector_v8f32: ; CHECK: // %bb.0: // %bb1 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <8 x float>,ptr %in br label %bb1 @@ -326,8 +326,8 @@ define void @subvector_v4f64(ptr %in, ptr %out) #0 { ; CHECK-LABEL: subvector_v4f64: ; CHECK: // %bb.0: // %bb1 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <4 x double>, ptr %in br label %bb1 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll @@ -50,29 +50,29 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) #0 { ; CHECK-LABEL: trunc_v64i16_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #64] +; CHECK-NEXT: ldp q0, q1, [x0, #96] ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: ldp q2, q3, [x0, #96] +; CHECK-NEXT: ldp q2, q3, [x0, #64] ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ldp q4, q5, [x0, #32] ; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b ; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b ; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b -; CHECK-NEXT: ldp q6, q7, [x0, #32] +; CHECK-NEXT: ldp q6, q7, [x0] ; CHECK-NEXT: uzp1 z1.b, z5.b, z5.b ; CHECK-NEXT: splice z4.b, p0, z4.b, z1.b ; CHECK-NEXT: uzp1 z3.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z1.b, z7.b, z7.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b ; CHECK-NEXT: add z1.b, z2.b, z2.b -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: add z0.b, z4.b, z4.b ; CHECK-NEXT: add z1.b, z3.b, z3.b -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <64 x i16>, ptr %in %b = trunc <64 x i16> %a to <64 x i8> @@ -85,51 +85,51 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) #0 
{ ; CHECK-LABEL: trunc_v128i16_v128i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #192] +; CHECK-NEXT: ldp q0, q1, [x0, #224] ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: ldp q2, q3, [x0, #224] +; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: ldp q6, q7, [x0, #128] +; CHECK-NEXT: ldp q6, q7, [x0, #160] ; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b ; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b ; CHECK-NEXT: add z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b -; CHECK-NEXT: ldp q1, q3, [x0, #160] +; CHECK-NEXT: ldp q1, q3, [x0, #128] ; CHECK-NEXT: uzp1 z7.b, z7.b, z7.b ; CHECK-NEXT: splice z6.b, p0, z6.b, z7.b ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b -; CHECK-NEXT: ldp q16, q17, [x0, #64] +; CHECK-NEXT: ldp q16, q17, [x0, #96] ; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b ; CHECK-NEXT: splice z1.b, p0, z1.b, z3.b ; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: uzp1 z16.b, z16.b, z16.b -; CHECK-NEXT: ldp q7, q18, [x0, #96] +; CHECK-NEXT: ldp q7, q18, [x0, #64] ; CHECK-NEXT: uzp1 z17.b, z17.b, z17.b ; CHECK-NEXT: splice z16.b, p0, z16.b, z17.b ; CHECK-NEXT: uzp1 z7.b, z7.b, z7.b -; CHECK-NEXT: ldp q4, q5, [x0, #32] +; CHECK-NEXT: ldp q4, q5, [x0] ; CHECK-NEXT: uzp1 z3.b, z18.b, z18.b ; CHECK-NEXT: splice z7.b, p0, z7.b, z3.b ; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b -; CHECK-NEXT: ldp q19, q20, [x0] +; CHECK-NEXT: ldp q19, q20, [x0, #32] ; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b -; CHECK-NEXT: stp q0, q2, [x1, #96] +; CHECK-NEXT: stp q2, q0, [x1, #96] ; CHECK-NEXT: add z0.b, z6.b, z6.b ; CHECK-NEXT: splice z4.b, p0, z4.b, z3.b -; CHECK-NEXT: stp q0, q1, [x1, #64] +; CHECK-NEXT: stp q1, q0, [x1, #64] ; CHECK-NEXT: add z0.b, z16.b, z16.b ; CHECK-NEXT: uzp1 z18.b, z19.b, z19.b ; CHECK-NEXT: add z1.b, z7.b, z7.b -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: add z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z17.b, z20.b, z20.b ; CHECK-NEXT: splice z18.b, p0, z18.b, z17.b ; CHECK-NEXT: add z0.b, z18.b, z18.b -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <128 x i16>, ptr %in %b = trunc <128 x i16> %a to <128 x i8> @@ -224,11 +224,11 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) #0 { ; CHECK-LABEL: trunc_v64i32_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #128] +; CHECK-NEXT: ldp q0, q1, [x0, #192] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: ptrue p1.b, vl8 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: ldp q2, q3, [x0, #160] +; CHECK-NEXT: ldp q2, q3, [x0, #224] ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b @@ -236,36 +236,36 @@ ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: ldp q1, q17, [x0, #224] +; CHECK-NEXT: ldp q1, q17, [x0, #160] ; CHECK-NEXT: splice z0.b, p1, z0.b, z2.b ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: ldp q18, q2, [x0, #192] +; CHECK-NEXT: ldp q18, q2, [x0, #128] ; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z17.h ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h -; CHECK-NEXT: ldp q4, q5, [x0, #64] +; CHECK-NEXT: ldp q4, q5, [x0] ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: splice z18.h, p0, z18.h, z2.h ; CHECK-NEXT: uzp1 z2.b, z18.b, z18.b ; CHECK-NEXT: splice z2.b, p1, 
z2.b, z1.b ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: ldp q6, q7, [x0, #96] +; CHECK-NEXT: ldp q6, q7, [x0, #32] ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: ldp q3, q16, [x0] +; CHECK-NEXT: ldp q3, q16, [x0, #64] ; CHECK-NEXT: uzp1 z1.h, z7.h, z7.h ; CHECK-NEXT: splice z6.h, p0, z6.h, z1.h ; CHECK-NEXT: uzp1 z1.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z4.b, p1, z4.b, z1.b ; CHECK-NEXT: add z1.b, z2.b, z2.b -; CHECK-NEXT: ldp q19, q20, [x0, #32] +; CHECK-NEXT: ldp q19, q20, [x0, #96] ; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: splice z3.h, p0, z3.h, z16.h ; CHECK-NEXT: add z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b @@ -275,7 +275,7 @@ ; CHECK-NEXT: uzp1 z16.b, z18.b, z18.b ; CHECK-NEXT: splice z3.b, p1, z3.b, z16.b ; CHECK-NEXT: add z0.b, z3.b, z3.b -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i8> @@ -331,29 +331,29 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) #0 { ; CHECK-LABEL: trunc_v32i32_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #64] +; CHECK-NEXT: ldp q0, q1, [x0, #96] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: ldp q2, q3, [x0, #96] +; CHECK-NEXT: ldp q2, q3, [x0, #64] ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ldp q4, q5, [x0, #32] ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: ldp q6, q7, [x0, #32] +; CHECK-NEXT: ldp q6, q7, [x0] ; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z1.h ; CHECK-NEXT: uzp1 z3.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z1.h, z7.h, z7.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z1.h ; CHECK-NEXT: add z1.h, z2.h, z2.h -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: add z0.h, z4.h, z4.h ; CHECK-NEXT: add z1.h, z3.h, z3.h -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <32 x i32>, ptr %in %b = trunc <32 x i32> %a to <32 x i16> @@ -366,51 +366,51 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) #0 { ; CHECK-LABEL: trunc_v64i32_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #192] +; CHECK-NEXT: ldp q0, q1, [x0, #224] ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: ldp q2, q3, [x0, #224] +; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: ldp q6, q7, [x0, #128] +; CHECK-NEXT: ldp q6, q7, [x0, #160] ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h ; CHECK-NEXT: add z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: ldp q1, q3, [x0, #160] +; CHECK-NEXT: ldp q1, q3, [x0, #128] ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: ldp q16, q17, [x0, #64] +; CHECK-NEXT: ldp q16, q17, [x0, #96] ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z3.h ; CHECK-NEXT: add z1.h, z1.h, z1.h ; CHECK-NEXT: 
uzp1 z16.h, z16.h, z16.h -; CHECK-NEXT: ldp q7, q18, [x0, #96] +; CHECK-NEXT: ldp q7, q18, [x0, #64] ; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h ; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h -; CHECK-NEXT: ldp q4, q5, [x0, #32] +; CHECK-NEXT: ldp q4, q5, [x0] ; CHECK-NEXT: uzp1 z3.h, z18.h, z18.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z3.h ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: ldp q19, q20, [x0] +; CHECK-NEXT: ldp q19, q20, [x0, #32] ; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h -; CHECK-NEXT: stp q0, q2, [x1, #96] +; CHECK-NEXT: stp q2, q0, [x1, #96] ; CHECK-NEXT: add z0.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z3.h -; CHECK-NEXT: stp q0, q1, [x1, #64] +; CHECK-NEXT: stp q1, q0, [x1, #64] ; CHECK-NEXT: add z0.h, z16.h, z16.h ; CHECK-NEXT: uzp1 z18.h, z19.h, z19.h ; CHECK-NEXT: add z1.h, z7.h, z7.h -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: add z1.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z17.h, z20.h, z20.h ; CHECK-NEXT: splice z18.h, p0, z18.h, z17.h ; CHECK-NEXT: add z0.h, z18.h, z18.h -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i16> @@ -656,11 +656,11 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) #0 { ; CHECK-LABEL: trunc_v32i64_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #128] +; CHECK-NEXT: ldp q0, q1, [x0, #192] ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ptrue p1.h, vl4 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: ldp q2, q3, [x0, #160] +; CHECK-NEXT: ldp q2, q3, [x0, #224] ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h @@ -668,36 +668,36 @@ ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: ldp q1, q17, [x0, #224] +; CHECK-NEXT: ldp q1, q17, [x0, #160] ; CHECK-NEXT: splice z0.h, p1, z0.h, z2.h ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: ldp q18, q2, [x0, #192] +; CHECK-NEXT: ldp q18, q2, [x0, #128] ; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s ; CHECK-NEXT: splice z1.s, p0, z1.s, z17.s ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s -; CHECK-NEXT: ldp q4, q5, [x0, #64] +; CHECK-NEXT: ldp q4, q5, [x0] ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: splice z18.s, p0, z18.s, z2.s ; CHECK-NEXT: uzp1 z2.h, z18.h, z18.h ; CHECK-NEXT: splice z2.h, p1, z2.h, z1.h ; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s -; CHECK-NEXT: ldp q6, q7, [x0, #96] +; CHECK-NEXT: ldp q6, q7, [x0, #32] ; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s ; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: ldp q3, q16, [x0] +; CHECK-NEXT: ldp q3, q16, [x0, #64] ; CHECK-NEXT: uzp1 z1.s, z7.s, z7.s ; CHECK-NEXT: splice z6.s, p0, z6.s, z1.s ; CHECK-NEXT: uzp1 z1.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s ; CHECK-NEXT: splice z4.h, p1, z4.h, z1.h ; CHECK-NEXT: add z1.h, z2.h, z2.h -; CHECK-NEXT: ldp q19, q20, [x0, #32] +; CHECK-NEXT: ldp q19, q20, [x0, #96] ; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: splice z3.s, p0, z3.s, z16.s ; CHECK-NEXT: add z1.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h @@ -707,7 +707,7 @@ ; CHECK-NEXT: uzp1 z16.h, z18.h, z18.h ; CHECK-NEXT: splice z3.h, p1, z3.h, z16.h ; CHECK-NEXT: add z0.h, z3.h, z3.h -; CHECK-NEXT: stp q0, q1, [x1] +; 
CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i16> @@ -763,29 +763,29 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) #0 { ; CHECK-LABEL: trunc_v16i64_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #64] +; CHECK-NEXT: ldp q0, q1, [x0, #96] ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: ldp q2, q3, [x0, #96] +; CHECK-NEXT: ldp q2, q3, [x0, #64] ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: add z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ldp q4, q5, [x0, #32] ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s -; CHECK-NEXT: ldp q6, q7, [x0, #32] +; CHECK-NEXT: ldp q6, q7, [x0] ; CHECK-NEXT: uzp1 z1.s, z5.s, z5.s ; CHECK-NEXT: splice z4.s, p0, z4.s, z1.s ; CHECK-NEXT: uzp1 z3.s, z6.s, z6.s ; CHECK-NEXT: uzp1 z1.s, z7.s, z7.s ; CHECK-NEXT: splice z3.s, p0, z3.s, z1.s ; CHECK-NEXT: add z1.s, z2.s, z2.s -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: add z0.s, z4.s, z4.s ; CHECK-NEXT: add z1.s, z3.s, z3.s -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i32> @@ -798,51 +798,51 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) #0 { ; CHECK-LABEL: trunc_v32i64_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #192] +; CHECK-NEXT: ldp q0, q1, [x0, #224] ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: ldp q2, q3, [x0, #224] +; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: add z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: ldp q6, q7, [x0, #128] +; CHECK-NEXT: ldp q6, q7, [x0, #160] ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: add z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: ldp q1, q3, [x0, #160] +; CHECK-NEXT: ldp q1, q3, [x0, #128] ; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s ; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: ldp q16, q17, [x0, #64] +; CHECK-NEXT: ldp q16, q17, [x0, #96] ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s ; CHECK-NEXT: splice z1.s, p0, z1.s, z3.s ; CHECK-NEXT: add z1.s, z1.s, z1.s ; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s -; CHECK-NEXT: ldp q7, q18, [x0, #96] +; CHECK-NEXT: ldp q7, q18, [x0, #64] ; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s ; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s ; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s -; CHECK-NEXT: ldp q4, q5, [x0, #32] +; CHECK-NEXT: ldp q4, q5, [x0] ; CHECK-NEXT: uzp1 z3.s, z18.s, z18.s ; CHECK-NEXT: splice z7.s, p0, z7.s, z3.s ; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s -; CHECK-NEXT: ldp q19, q20, [x0] +; CHECK-NEXT: ldp q19, q20, [x0, #32] ; CHECK-NEXT: uzp1 z3.s, z5.s, z5.s -; CHECK-NEXT: stp q0, q2, [x1, #96] +; CHECK-NEXT: stp q2, q0, [x1, #96] ; CHECK-NEXT: add z0.s, z6.s, z6.s ; CHECK-NEXT: splice z4.s, p0, z4.s, z3.s -; CHECK-NEXT: stp q0, q1, [x1, #64] +; CHECK-NEXT: stp q1, q0, [x1, #64] ; CHECK-NEXT: add z0.s, z16.s, z16.s ; CHECK-NEXT: uzp1 z18.s, z19.s, z19.s ; CHECK-NEXT: add z1.s, z7.s, z7.s -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q1, q0, [x1, #32] ; CHECK-NEXT: add z1.s, z4.s, z4.s ; CHECK-NEXT: uzp1 z17.s, z20.s, z20.s ; CHECK-NEXT: splice z18.s, p0, z18.s, z17.s ; 
CHECK-NEXT: add z0.s, z18.s, z18.s -; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: stp q1, q0, [x1] ; CHECK-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll @@ -59,16 +59,15 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: mov z0.b, z0.b[15] -; CHECK-NEXT: mov z2.b, z1.b[15] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: ldr q0, [x1, #16] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: insr z1.b, w8 +; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: mov z3.b, z0.b[15] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: insr z2.b, w8 +; CHECK-NEXT: mov z1.b, z1.b[15] +; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: insr z0.b, w9 -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -123,16 +122,15 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: mov z0.h, z0.h[7] -; CHECK-NEXT: mov z2.h, z1.h[7] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: ldr q0, [x1, #16] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: insr z1.h, w8 +; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: mov z3.h, z0.h[7] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: insr z2.h, w8 +; CHECK-NEXT: mov z1.h, z1.h[7] +; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: insr z0.h, w9 -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -173,16 +171,15 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: mov z0.s, z0.s[3] -; CHECK-NEXT: mov z2.s, z1.s[3] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: ldr q0, [x1, #16] -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: insr z1.s, w8 +; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: mov z3.s, z0.s[3] +; CHECK-NEXT: fmov w8, s3 +; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: insr z2.s, w8 +; CHECK-NEXT: mov z1.s, z1.s[3] +; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: insr z0.s, w9 -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -208,16 +205,15 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: mov z0.d, z0.d[1] -; CHECK-NEXT: mov z2.d, z1.d[1] -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: ldr q0, [x1, #16] -; CHECK-NEXT: fmov x9, d2 -; CHECK-NEXT: insr z1.d, x8 +; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: mov z3.d, z0.d[1] +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: insr z2.d, x8 +; CHECK-NEXT: mov z1.d, z1.d[1] +; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: insr z0.d, x9 -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr 
%a %op2 = load <4 x i64>, ptr %b @@ -256,13 +252,13 @@ define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q2, [x1] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: mov z3.h, z1.h[7] -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: insr z2.h, h3 -; CHECK-NEXT: mov z0.h, z0.h[7] -; CHECK-NEXT: insr z1.h, h0 -; CHECK-NEXT: stp q1, q2, [x0] +; CHECK-NEXT: ldr q2, [x0, #16] +; CHECK-NEXT: insr z0.h, h3 +; CHECK-NEXT: mov z2.h, z2.h[7] +; CHECK-NEXT: insr z1.h, h2 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -301,13 +297,13 @@ define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q2, [x1] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: mov z3.s, z1.s[3] -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: insr z2.s, s3 -; CHECK-NEXT: mov z0.s, z0.s[3] -; CHECK-NEXT: insr z1.s, s0 -; CHECK-NEXT: stp q1, q2, [x0] +; CHECK-NEXT: ldr q2, [x0, #16] +; CHECK-NEXT: insr z0.s, s3 +; CHECK-NEXT: mov z2.s, z2.s[3] +; CHECK-NEXT: insr z1.s, s2 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -332,13 +328,13 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q2, [x1] +; CHECK-NEXT: ldp q1, q0, [x1] ; CHECK-NEXT: mov z3.d, z1.d[1] -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: insr z2.d, d3 -; CHECK-NEXT: mov z0.d, z0.d[1] -; CHECK-NEXT: insr z1.d, d0 -; CHECK-NEXT: stp q1, q2, [x0] +; CHECK-NEXT: ldr q2, [x0, #16] +; CHECK-NEXT: insr z0.d, d3 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: insr z1.d, d2 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -350,13 +346,13 @@ define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shuffle_ext_byone_reverse: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: mov z3.d, z1.d[1] -; CHECK-NEXT: ldr q0, [x1, #16] -; CHECK-NEXT: insr z2.d, d3 -; CHECK-NEXT: mov z0.d, z0.d[1] -; CHECK-NEXT: insr z1.d, d0 -; CHECK-NEXT: stp q1, q2, [x0] +; CHECK-NEXT: ldr q2, [x1, #16] +; CHECK-NEXT: insr z0.d, d3 +; CHECK-NEXT: mov z2.d, z2.d[1] +; CHECK-NEXT: insr z1.d, d2 +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -368,9 +364,9 @@ define void @shuffle_ext_invalid(ptr %a, ptr %b) #0 { ; CHECK-LABEL: shuffle_ext_invalid: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: ldr q1, [x0, #16] +; CHECK-NEXT: stp q1, q0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll --- a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll @@ -344,16 +344,14 @@ define i32 @decb_scalar_i32(i32 %a) { ; NO_SCALAR_INC-LABEL: decb_scalar_i32: ; NO_SCALAR_INC: // %bb.0: -; NO_SCALAR_INC-NEXT: // kill: def $w0 killed $w0 def $x0 -; NO_SCALAR_INC-NEXT: addvl x0, x0, #-4 -; NO_SCALAR_INC-NEXT: // kill: def $w0 killed $w0 killed $x0 +; NO_SCALAR_INC-NEXT: rdvl x8, #4 +; NO_SCALAR_INC-NEXT: sub w0, w0, w8 ; NO_SCALAR_INC-NEXT: ret ; ; 
CHECK-LABEL: decb_scalar_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: addvl x0, x0, #-4 -; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: rdvl x8, #4 +; CHECK-NEXT: sub w0, w0, w8 ; CHECK-NEXT: ret %vscale = call i64 @llvm.vscale.i64() @@ -367,15 +365,13 @@ ; NO_SCALAR_INC-LABEL: dech_scalar_i32: ; NO_SCALAR_INC: // %bb.0: ; NO_SCALAR_INC-NEXT: cnth x8 -; NO_SCALAR_INC-NEXT: neg x8, x8 -; NO_SCALAR_INC-NEXT: add w0, w0, w8 +; NO_SCALAR_INC-NEXT: sub w0, w0, w8 ; NO_SCALAR_INC-NEXT: ret ; ; CHECK-LABEL: dech_scalar_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: dech x0 -; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: sub w0, w0, w8 ; CHECK-NEXT: ret %vscale = call i64 @llvm.vscale.i64() @@ -389,15 +385,13 @@ ; NO_SCALAR_INC-LABEL: decw_scalar_i32: ; NO_SCALAR_INC: // %bb.0: ; NO_SCALAR_INC-NEXT: cntw x8 -; NO_SCALAR_INC-NEXT: neg x8, x8 -; NO_SCALAR_INC-NEXT: add w0, w0, w8 +; NO_SCALAR_INC-NEXT: sub w0, w0, w8 ; NO_SCALAR_INC-NEXT: ret ; ; CHECK-LABEL: decw_scalar_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: decw x0 -; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: sub w0, w0, w8 ; CHECK-NEXT: ret %vscale = call i64 @llvm.vscale.i64() @@ -411,15 +405,13 @@ ; NO_SCALAR_INC-LABEL: decd_scalar_i32: ; NO_SCALAR_INC: // %bb.0: ; NO_SCALAR_INC-NEXT: cntd x8 -; NO_SCALAR_INC-NEXT: neg x8, x8 -; NO_SCALAR_INC-NEXT: add w0, w0, w8 +; NO_SCALAR_INC-NEXT: sub w0, w0, w8 ; NO_SCALAR_INC-NEXT: ret ; ; CHECK-LABEL: decd_scalar_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: decd x0 -; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: sub w0, w0, w8 ; CHECK-NEXT: ret %vscale = call i64 @llvm.vscale.i64() %mul = mul i64 %vscale, 2 diff --git a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll --- a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll +++ b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll @@ -7,16 +7,16 @@ define void @func_vscale_none(ptr %a, ptr %b) #0 { ; CHECK-NOARG-LABEL: func_vscale_none: ; CHECK-NOARG: // %bb.0: -; CHECK-NOARG-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NOARG-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NOARG-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NOARG-NEXT: ldp q2, q3, [x0] +; CHECK-NOARG-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NOARG-NEXT: ldp q5, q4, [x1, #32] ; CHECK-NOARG-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-NOARG-NEXT: ldp q6, q4, [x1] -; CHECK-NOARG-NEXT: stp q0, q1, [x0, #32] -; CHECK-NOARG-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-NOARG-NEXT: ldp q3, q2, [x0] +; CHECK-NOARG-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NOARG-NEXT: ldp q4, q6, [x1] +; CHECK-NOARG-NEXT: stp q1, q0, [x0, #32] ; CHECK-NOARG-NEXT: add v0.4s, v3.4s, v4.4s -; CHECK-NOARG-NEXT: stp q2, q0, [x0] +; CHECK-NOARG-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-NOARG-NEXT: stp q0, q2, [x0] ; CHECK-NOARG-NEXT: ret ; ; CHECK-ARG-LABEL: func_vscale_none: @@ -39,16 +39,16 @@ define void @func_vscale1_1(ptr %a, ptr %b) #1 { ; CHECK-LABEL: func_vscale1_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #32] -; CHECK-NEXT: ldp q4, q5, [x1, #32] -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ldp q5, q4, [x1, #32] ; CHECK-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-NEXT: ldp q6, q4, [x1] -; CHECK-NEXT: stp q0, q1, 
[x0, #32] -; CHECK-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: ldp q4, q6, [x1] +; CHECK-NEXT: stp q1, q0, [x0, #32] ; CHECK-NEXT: add v0.4s, v3.4s, v4.4s -; CHECK-NEXT: stp q2, q0, [x0] +; CHECK-NEXT: add v2.4s, v2.4s, v6.4s +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i32>, ptr %a %op2 = load <16 x i32>, ptr %b @@ -62,7 +62,7 @@ define void @func_vscale2_2(ptr %a, ptr %b) #2 { ; CHECK-LABEL: func_vscale2_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x8, #8 // =0x8 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -85,7 +85,7 @@ define void @func_vscale2_4(ptr %a, ptr %b) #3 { ; CHECK-LABEL: func_vscale2_4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x8, #8 // =0x8 ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve2-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve2-fixed-length-fcopysign.ll --- a/llvm/test/CodeGen/AArch64/sve2-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve2-fixed-length-fcopysign.ll @@ -65,7 +65,7 @@ define void @test_copysign_v32f16_v32f16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: test_copysign_v32f16_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z4.h, #32767 // =0x7fff ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] @@ -182,7 +182,7 @@ define void @test_copysign_v16f32_v16f32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: test_copysign_v16f32_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z4.s, #0x7fffffff ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] @@ -284,7 +284,7 @@ define void @test_copysign_v8f64_v8f64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-LABEL: test_copysign_v8f64_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov z4.d, #0x7fffffffffffffff ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] @@ -416,28 +416,16 @@ ; SplitVecRes mismatched define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { -; CHECK_NO_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32: -; CHECK_NO_EXTEND_ROUND: // %bb.0: -; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d, vl4 -; CHECK_NO_EXTEND_ROUND-NEXT: mov z2.d, #0x7fffffffffffffff -; CHECK_NO_EXTEND_ROUND-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK_NO_EXTEND_ROUND-NEXT: ld1w { z1.d }, p0/z, [x1] -; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z1.d, p0/m, z1.s -; CHECK_NO_EXTEND_ROUND-NEXT: bsl z0.d, z0.d, z1.d, z2.d -; CHECK_NO_EXTEND_ROUND-NEXT: st1d { z0.d }, p0, [x0] -; CHECK_NO_EXTEND_ROUND-NEXT: ret -; -; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32: -; CHECK_EXTEND_ROUND: // %bb.0: -; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d, vl4 -; CHECK_EXTEND_ROUND-NEXT: mov z2.d, #0x7fffffffffffffff -; CHECK_EXTEND_ROUND-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK_EXTEND_ROUND-NEXT: ldr q1, [x1] -; CHECK_EXTEND_ROUND-NEXT: uunpklo z1.d, z1.s -; CHECK_EXTEND_ROUND-NEXT: fcvt z1.d, p0/m, z1.s -; CHECK_EXTEND_ROUND-NEXT: bsl z0.d, z0.d, z1.d, z2.d -; CHECK_EXTEND_ROUND-NEXT: st1d { z0.d }, p0, [x0] -; 
CHECK_EXTEND_ROUND-NEXT: ret +; CHECK-LABEL: test_copysign_v4f64_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov z2.d, #0x7fffffffffffffff +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x1] +; CHECK-NEXT: fcvt z1.d, p0/m, z1.s +; CHECK-NEXT: bsl z0.d, z0.d, z1.d, z2.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x float>, ptr %bp %tmp0 = fpext <4 x float> %b to <4 x double> @@ -534,3 +522,6 @@ declare <32 x double> @llvm.copysign.v32f64(<32 x double> %a, <32 x double> %b) #0 attributes #0 = { "target-features"="+sve2" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK_EXTEND_ROUND: {{.*}} +; CHECK_NO_EXTEND_ROUND: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll --- a/llvm/test/CodeGen/AArch64/swifterror.ll +++ b/llvm/test/CodeGen/AArch64/swifterror.ll @@ -19,9 +19,9 @@ ; CHECK-APPLE-NEXT: .cfi_def_cfa w29, 16 ; CHECK-APPLE-NEXT: .cfi_offset w30, -8 ; CHECK-APPLE-NEXT: .cfi_offset w29, -16 -; CHECK-APPLE-NEXT: mov w0, #16 +; CHECK-APPLE-NEXT: mov w0, #16 ; =0x10 ; CHECK-APPLE-NEXT: bl _malloc -; CHECK-APPLE-NEXT: mov w8, #1 +; CHECK-APPLE-NEXT: mov w8, #1 ; =0x1 ; CHECK-APPLE-NEXT: fmov s0, #1.00000000 ; CHECK-APPLE-NEXT: mov x21, x0 ; CHECK-APPLE-NEXT: strb w8, [x0, #8] @@ -36,11 +36,11 @@ ; CHECK-O0-AARCH64-NEXT: .cfi_def_cfa w29, 16 ; CHECK-O0-AARCH64-NEXT: .cfi_offset w30, -8 ; CHECK-O0-AARCH64-NEXT: .cfi_offset w29, -16 -; CHECK-O0-AARCH64-NEXT: mov w8, #16 +; CHECK-O0-AARCH64-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-AARCH64-NEXT: mov w0, w8 ; CHECK-O0-AARCH64-NEXT: bl _malloc ; CHECK-O0-AARCH64-NEXT: mov x21, x0 -; CHECK-O0-AARCH64-NEXT: mov w8, #1 +; CHECK-O0-AARCH64-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-AARCH64-NEXT: strb w8, [x0, #8] ; CHECK-O0-AARCH64-NEXT: fmov s0, #1.00000000 ; CHECK-O0-AARCH64-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload @@ -51,11 +51,11 @@ ; CHECK-O0-ARM64_32-NEXT: str x30, [sp, #-16]! 
; 8-byte Folded Spill ; CHECK-O0-ARM64_32-NEXT: .cfi_def_cfa_offset 16 ; CHECK-O0-ARM64_32-NEXT: .cfi_offset w30, -16 -; CHECK-O0-ARM64_32-NEXT: mov w8, #16 +; CHECK-O0-ARM64_32-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-ARM64_32-NEXT: mov w0, w8 ; CHECK-O0-ARM64_32-NEXT: bl _malloc ; CHECK-O0-ARM64_32-NEXT: mov x21, x0 -; CHECK-O0-ARM64_32-NEXT: mov w8, #1 +; CHECK-O0-ARM64_32-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-ARM64_32-NEXT: strb w8, [x0, #8] ; CHECK-O0-ARM64_32-NEXT: fmov s0, #1.00000000 ; CHECK-O0-ARM64_32-NEXT: ldr x30, [sp], #16 ; 8-byte Folded Reload @@ -423,10 +423,10 @@ ; CHECK-APPLE-NEXT: .cfi_offset w29, -16 ; CHECK-APPLE-NEXT: cbz w0, LBB3_2 ; CHECK-APPLE-NEXT: ; %bb.1: ; %gen_error -; CHECK-APPLE-NEXT: mov w0, #16 +; CHECK-APPLE-NEXT: mov w0, #16 ; =0x10 ; CHECK-APPLE-NEXT: bl _malloc ; CHECK-APPLE-NEXT: mov x21, x0 -; CHECK-APPLE-NEXT: mov w8, #1 +; CHECK-APPLE-NEXT: mov w8, #1 ; =0x1 ; CHECK-APPLE-NEXT: fmov s0, #1.00000000 ; CHECK-APPLE-NEXT: strb w8, [x0, #8] ; CHECK-APPLE-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload @@ -448,11 +448,11 @@ ; CHECK-O0-AARCH64-NEXT: str x21, [sp, #8] ; 8-byte Folded Spill ; CHECK-O0-AARCH64-NEXT: cbz w0, LBB3_2 ; CHECK-O0-AARCH64-NEXT: ; %bb.1: ; %gen_error -; CHECK-O0-AARCH64-NEXT: mov w8, #16 +; CHECK-O0-AARCH64-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-AARCH64-NEXT: mov w0, w8 ; CHECK-O0-AARCH64-NEXT: bl _malloc ; CHECK-O0-AARCH64-NEXT: mov x21, x0 -; CHECK-O0-AARCH64-NEXT: mov w8, #1 +; CHECK-O0-AARCH64-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-AARCH64-NEXT: strb w8, [x0, #8] ; CHECK-O0-AARCH64-NEXT: fmov s0, #1.00000000 ; CHECK-O0-AARCH64-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload @@ -474,11 +474,11 @@ ; CHECK-O0-ARM64_32-NEXT: str x21, [sp, #8] ; 8-byte Folded Spill ; CHECK-O0-ARM64_32-NEXT: cbz w0, LBB3_2 ; CHECK-O0-ARM64_32-NEXT: ; %bb.1: ; %gen_error -; CHECK-O0-ARM64_32-NEXT: mov w8, #16 +; CHECK-O0-ARM64_32-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-ARM64_32-NEXT: mov w0, w8 ; CHECK-O0-ARM64_32-NEXT: bl _malloc ; CHECK-O0-ARM64_32-NEXT: mov x21, x0 -; CHECK-O0-ARM64_32-NEXT: mov w8, #1 +; CHECK-O0-ARM64_32-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-ARM64_32-NEXT: strb w8, [x0, #8] ; CHECK-O0-ARM64_32-NEXT: fmov s0, #1.00000000 ; CHECK-O0-ARM64_32-NEXT: ldr x30, [sp, #16] ; 8-byte Folded Reload @@ -528,7 +528,7 @@ ; CHECK-APPLE-NEXT: fmov s8, s0 ; CHECK-APPLE-NEXT: mov w19, w0 ; CHECK-APPLE-NEXT: mov x0, x21 -; CHECK-APPLE-NEXT: mov w20, #1 +; CHECK-APPLE-NEXT: mov w20, #1 ; =0x1 ; CHECK-APPLE-NEXT: fmov s9, #1.00000000 ; CHECK-APPLE-NEXT: b LBB4_2 ; CHECK-APPLE-NEXT: LBB4_1: ; %bb_cont @@ -540,7 +540,7 @@ ; CHECK-APPLE-NEXT: cbz w19, LBB4_1 ; CHECK-APPLE-NEXT: ; %bb.3: ; %gen_error ; CHECK-APPLE-NEXT: ; in Loop: Header=BB4_2 Depth=1 -; CHECK-APPLE-NEXT: mov w0, #16 +; CHECK-APPLE-NEXT: mov w0, #16 ; =0x10 ; CHECK-APPLE-NEXT: bl _malloc ; CHECK-APPLE-NEXT: strb w20, [x0, #8] ; CHECK-APPLE-NEXT: b LBB4_1 @@ -573,11 +573,11 @@ ; CHECK-O0-AARCH64-NEXT: cbz w8, LBB4_3 ; CHECK-O0-AARCH64-NEXT: ; %bb.2: ; %gen_error ; CHECK-O0-AARCH64-NEXT: ; in Loop: Header=BB4_1 Depth=1 -; CHECK-O0-AARCH64-NEXT: mov w8, #16 +; CHECK-O0-AARCH64-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-AARCH64-NEXT: mov w0, w8 ; CHECK-O0-AARCH64-NEXT: bl _malloc ; CHECK-O0-AARCH64-NEXT: mov x9, x0 -; CHECK-O0-AARCH64-NEXT: mov w8, #1 +; CHECK-O0-AARCH64-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-AARCH64-NEXT: strb w8, [x9, #8] ; CHECK-O0-AARCH64-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill ; CHECK-O0-AARCH64-NEXT: LBB4_3: ; %bb_cont @@ -614,13 +614,13 @@ ; CHECK-O0-ARM64_32-NEXT: cbz w8, LBB4_3 
; CHECK-O0-ARM64_32-NEXT: ; %bb.2: ; %gen_error ; CHECK-O0-ARM64_32-NEXT: ; in Loop: Header=BB4_1 Depth=1 -; CHECK-O0-ARM64_32-NEXT: mov w8, #16 +; CHECK-O0-ARM64_32-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-ARM64_32-NEXT: mov w0, w8 ; CHECK-O0-ARM64_32-NEXT: bl _malloc ; CHECK-O0-ARM64_32-NEXT: mov x9, x0 ; CHECK-O0-ARM64_32-NEXT: ; kill: def $x0 killed $x9 ; CHECK-O0-ARM64_32-NEXT: mov x0, x9 -; CHECK-O0-ARM64_32-NEXT: mov w8, #1 +; CHECK-O0-ARM64_32-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-ARM64_32-NEXT: strb w8, [x9, #8] ; CHECK-O0-ARM64_32-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill ; CHECK-O0-ARM64_32-NEXT: LBB4_3: ; %bb_cont @@ -684,9 +684,9 @@ ; CHECK-APPLE-NEXT: .cfi_offset w20, -32 ; CHECK-APPLE-NEXT: mov w19, w0 ; CHECK-APPLE-NEXT: mov x20, x8 -; CHECK-APPLE-NEXT: mov w0, #16 +; CHECK-APPLE-NEXT: mov w0, #16 ; =0x10 ; CHECK-APPLE-NEXT: bl _malloc -; CHECK-APPLE-NEXT: mov w8, #1 +; CHECK-APPLE-NEXT: mov w8, #1 ; =0x1 ; CHECK-APPLE-NEXT: mov x21, x0 ; CHECK-APPLE-NEXT: strb w8, [x0, #8] ; CHECK-APPLE-NEXT: str w19, [x20, #4] @@ -705,14 +705,14 @@ ; CHECK-O0-AARCH64-NEXT: .cfi_offset w29, -16 ; CHECK-O0-AARCH64-NEXT: stur w0, [x29, #-4] ; 4-byte Folded Spill ; CHECK-O0-AARCH64-NEXT: str x8, [sp] ; 8-byte Folded Spill -; CHECK-O0-AARCH64-NEXT: mov w8, #16 +; CHECK-O0-AARCH64-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-AARCH64-NEXT: mov w0, w8 ; CHECK-O0-AARCH64-NEXT: bl _malloc ; CHECK-O0-AARCH64-NEXT: ldr x8, [sp] ; 8-byte Folded Reload ; CHECK-O0-AARCH64-NEXT: mov x10, x0 ; CHECK-O0-AARCH64-NEXT: ldur w0, [x29, #-4] ; 4-byte Folded Reload ; CHECK-O0-AARCH64-NEXT: mov x21, x10 -; CHECK-O0-AARCH64-NEXT: mov w9, #1 +; CHECK-O0-AARCH64-NEXT: mov w9, #1 ; =0x1 ; CHECK-O0-AARCH64-NEXT: strb w9, [x10, #8] ; CHECK-O0-AARCH64-NEXT: str w0, [x8, #4] ; CHECK-O0-AARCH64-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload @@ -727,14 +727,14 @@ ; CHECK-O0-ARM64_32-NEXT: .cfi_offset w30, -16 ; CHECK-O0-ARM64_32-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill ; CHECK-O0-ARM64_32-NEXT: str x8, [sp] ; 8-byte Folded Spill -; CHECK-O0-ARM64_32-NEXT: mov w8, #16 +; CHECK-O0-ARM64_32-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-ARM64_32-NEXT: mov w0, w8 ; CHECK-O0-ARM64_32-NEXT: bl _malloc ; CHECK-O0-ARM64_32-NEXT: ldr x8, [sp] ; 8-byte Folded Reload ; CHECK-O0-ARM64_32-NEXT: mov x10, x0 ; CHECK-O0-ARM64_32-NEXT: ldr w0, [sp, #12] ; 4-byte Folded Reload ; CHECK-O0-ARM64_32-NEXT: mov x21, x10 -; CHECK-O0-ARM64_32-NEXT: mov w9, #1 +; CHECK-O0-ARM64_32-NEXT: mov w9, #1 ; =0x1 ; CHECK-O0-ARM64_32-NEXT: strb w9, [x10, #8] ; CHECK-O0-ARM64_32-NEXT: str w0, [x8, #4] ; CHECK-O0-ARM64_32-NEXT: ldr x30, [sp, #16] ; 8-byte Folded Reload @@ -772,7 +772,7 @@ ; CHECK-APPLE-AARCH64-NEXT: .cfi_offset w22, -48 ; CHECK-APPLE-AARCH64-NEXT: mov x19, x0 ; CHECK-APPLE-AARCH64-NEXT: add x8, sp, #8 -; CHECK-APPLE-AARCH64-NEXT: mov w0, #1 +; CHECK-APPLE-AARCH64-NEXT: mov w0, #1 ; =0x1 ; CHECK-APPLE-AARCH64-NEXT: mov x21, xzr ; CHECK-APPLE-AARCH64-NEXT: bl _foo_sret ; CHECK-APPLE-AARCH64-NEXT: mov x0, x21 @@ -805,7 +805,7 @@ ; CHECK-O0-AARCH64-NEXT: str x0, [sp] ; 8-byte Folded Spill ; CHECK-O0-AARCH64-NEXT: mov x21, xzr ; CHECK-O0-AARCH64-NEXT: add x8, sp, #24 -; CHECK-O0-AARCH64-NEXT: mov w0, #1 +; CHECK-O0-AARCH64-NEXT: mov w0, #1 ; =0x1 ; CHECK-O0-AARCH64-NEXT: bl _foo_sret ; CHECK-O0-AARCH64-NEXT: str x21, [sp, #8] ; 8-byte Folded Spill ; CHECK-O0-AARCH64-NEXT: cbnz x21, LBB6_2 @@ -840,7 +840,7 @@ ; CHECK-APPLE-ARM64_32-NEXT: .cfi_offset w22, -48 ; CHECK-APPLE-ARM64_32-NEXT: mov x19, x0 ; CHECK-APPLE-ARM64_32-NEXT: add x8, sp, #8 -; 
CHECK-APPLE-ARM64_32-NEXT: mov w0, #1 +; CHECK-APPLE-ARM64_32-NEXT: mov w0, #1 ; =0x1 ; CHECK-APPLE-ARM64_32-NEXT: mov x21, xzr ; CHECK-APPLE-ARM64_32-NEXT: bl _foo_sret ; CHECK-APPLE-ARM64_32-NEXT: mov x0, x21 @@ -871,7 +871,7 @@ ; CHECK-O0-ARM64_32-NEXT: str x0, [sp] ; 8-byte Folded Spill ; CHECK-O0-ARM64_32-NEXT: mov x21, xzr ; CHECK-O0-ARM64_32-NEXT: add x8, sp, #24 -; CHECK-O0-ARM64_32-NEXT: mov w0, #1 +; CHECK-O0-ARM64_32-NEXT: mov w0, #1 ; =0x1 ; CHECK-O0-ARM64_32-NEXT: bl _foo_sret ; CHECK-O0-ARM64_32-NEXT: str x21, [sp, #8] ; 8-byte Folded Spill ; CHECK-O0-ARM64_32-NEXT: cmp x21, #0 @@ -926,21 +926,20 @@ ; CHECK-APPLE-AARCH64-NEXT: .cfi_def_cfa w29, 16 ; CHECK-APPLE-AARCH64-NEXT: .cfi_offset w30, -8 ; CHECK-APPLE-AARCH64-NEXT: .cfi_offset w29, -16 -; CHECK-APPLE-AARCH64-NEXT: mov w0, #16 +; CHECK-APPLE-AARCH64-NEXT: mov w0, #16 ; =0x10 ; CHECK-APPLE-AARCH64-NEXT: bl _malloc -; CHECK-APPLE-AARCH64-NEXT: mov w8, #1 -; CHECK-APPLE-AARCH64-NEXT: add x9, x29, #16 -; CHECK-APPLE-AARCH64-NEXT: ldr w10, [x29, #16] -; CHECK-APPLE-AARCH64-NEXT: orr x9, x9, #0x8 +; CHECK-APPLE-AARCH64-NEXT: mov w8, #1 ; =0x1 +; CHECK-APPLE-AARCH64-NEXT: ldr w9, [x29, #16] +; CHECK-APPLE-AARCH64-NEXT: add x10, x29, #16 +; CHECK-APPLE-AARCH64-NEXT: ldr w11, [x29, #32] ; CHECK-APPLE-AARCH64-NEXT: strb w8, [x0, #8] -; CHECK-APPLE-AARCH64-NEXT: stur w10, [x29, #-12] -; CHECK-APPLE-AARCH64-NEXT: ldr w8, [x9], #8 -; CHECK-APPLE-AARCH64-NEXT: str w8, [sp, #16] -; CHECK-APPLE-AARCH64-NEXT: ldr w8, [x9], #8 +; CHECK-APPLE-AARCH64-NEXT: add x8, x10, #24 +; CHECK-APPLE-AARCH64-NEXT: stur w9, [x29, #-12] +; CHECK-APPLE-AARCH64-NEXT: ldr w9, [x29, #24] ; CHECK-APPLE-AARCH64-NEXT: fmov s0, #1.00000000 ; CHECK-APPLE-AARCH64-NEXT: mov x21, x0 -; CHECK-APPLE-AARCH64-NEXT: stur x9, [x29, #-8] -; CHECK-APPLE-AARCH64-NEXT: str w8, [sp, #12] +; CHECK-APPLE-AARCH64-NEXT: stur x8, [x29, #-8] +; CHECK-APPLE-AARCH64-NEXT: stp w11, w9, [sp, #12] ; CHECK-APPLE-AARCH64-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload ; CHECK-APPLE-AARCH64-NEXT: add sp, sp, #48 ; CHECK-APPLE-AARCH64-NEXT: ret @@ -954,11 +953,11 @@ ; CHECK-O0-AARCH64-NEXT: .cfi_def_cfa w29, 16 ; CHECK-O0-AARCH64-NEXT: .cfi_offset w30, -8 ; CHECK-O0-AARCH64-NEXT: .cfi_offset w29, -16 -; CHECK-O0-AARCH64-NEXT: mov w8, #16 +; CHECK-O0-AARCH64-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-AARCH64-NEXT: mov w0, w8 ; CHECK-O0-AARCH64-NEXT: bl _malloc ; CHECK-O0-AARCH64-NEXT: mov x21, x0 -; CHECK-O0-AARCH64-NEXT: mov w8, #1 +; CHECK-O0-AARCH64-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-AARCH64-NEXT: strb w8, [x0, #8] ; CHECK-O0-AARCH64-NEXT: add x8, x29, #16 ; CHECK-O0-AARCH64-NEXT: stur x8, [x29, #-8] @@ -991,9 +990,9 @@ ; CHECK-APPLE-ARM64_32-NEXT: .cfi_def_cfa w29, 16 ; CHECK-APPLE-ARM64_32-NEXT: .cfi_offset w30, -8 ; CHECK-APPLE-ARM64_32-NEXT: .cfi_offset w29, -16 -; CHECK-APPLE-ARM64_32-NEXT: mov w0, #16 +; CHECK-APPLE-ARM64_32-NEXT: mov w0, #16 ; =0x10 ; CHECK-APPLE-ARM64_32-NEXT: bl _malloc -; CHECK-APPLE-ARM64_32-NEXT: mov w8, #1 +; CHECK-APPLE-ARM64_32-NEXT: mov w8, #1 ; =0x1 ; CHECK-APPLE-ARM64_32-NEXT: add x9, x29, #16 ; CHECK-APPLE-ARM64_32-NEXT: orr w10, w9, #0x4 ; CHECK-APPLE-ARM64_32-NEXT: and x11, x9, #0xfffffff0 @@ -1020,11 +1019,11 @@ ; CHECK-O0-ARM64_32-NEXT: .cfi_def_cfa_offset 48 ; CHECK-O0-ARM64_32-NEXT: str x30, [sp, #32] ; 8-byte Folded Spill ; CHECK-O0-ARM64_32-NEXT: .cfi_offset w30, -16 -; CHECK-O0-ARM64_32-NEXT: mov w8, #16 +; CHECK-O0-ARM64_32-NEXT: mov w8, #16 ; =0x10 ; CHECK-O0-ARM64_32-NEXT: mov w0, w8 ; CHECK-O0-ARM64_32-NEXT: bl _malloc ; 
CHECK-O0-ARM64_32-NEXT: mov x21, x0 -; CHECK-O0-ARM64_32-NEXT: mov w8, #1 +; CHECK-O0-ARM64_32-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-ARM64_32-NEXT: strb w8, [x0, #8] ; CHECK-O0-ARM64_32-NEXT: add x8, sp, #48 ; CHECK-O0-ARM64_32-NEXT: ; kill: def $w8 killed $w8 killed $x8 @@ -1100,9 +1099,9 @@ ; CHECK-APPLE-AARCH64-NEXT: .cfi_offset w21, -40 ; CHECK-APPLE-AARCH64-NEXT: .cfi_offset w22, -48 ; CHECK-APPLE-AARCH64-NEXT: mov x19, x0 -; CHECK-APPLE-AARCH64-NEXT: mov w8, #10 -; CHECK-APPLE-AARCH64-NEXT: mov w9, #11 -; CHECK-APPLE-AARCH64-NEXT: mov w10, #12 +; CHECK-APPLE-AARCH64-NEXT: mov w8, #10 ; =0xa +; CHECK-APPLE-AARCH64-NEXT: mov w9, #11 ; =0xb +; CHECK-APPLE-AARCH64-NEXT: mov w10, #12 ; =0xc ; CHECK-APPLE-AARCH64-NEXT: stp w9, w8, [sp, #32] ; CHECK-APPLE-AARCH64-NEXT: str w10, [sp, #28] ; CHECK-APPLE-AARCH64-NEXT: mov x21, xzr @@ -1138,11 +1137,11 @@ ; CHECK-O0-AARCH64-NEXT: ; implicit-def: $x1 ; CHECK-O0-AARCH64-NEXT: str x0, [sp, #24] ; 8-byte Folded Spill ; CHECK-O0-AARCH64-NEXT: mov x21, xzr -; CHECK-O0-AARCH64-NEXT: mov w8, #10 +; CHECK-O0-AARCH64-NEXT: mov w8, #10 ; =0xa ; CHECK-O0-AARCH64-NEXT: stur w8, [x29, #-28] -; CHECK-O0-AARCH64-NEXT: mov w8, #11 +; CHECK-O0-AARCH64-NEXT: mov w8, #11 ; =0xb ; CHECK-O0-AARCH64-NEXT: stur w8, [x29, #-32] -; CHECK-O0-AARCH64-NEXT: mov w8, #12 +; CHECK-O0-AARCH64-NEXT: mov w8, #12 ; =0xc ; CHECK-O0-AARCH64-NEXT: stur w8, [x29, #-36] ; CHECK-O0-AARCH64-NEXT: ldur w8, [x29, #-28] ; CHECK-O0-AARCH64-NEXT: ; kill: def $x8 killed $w8 @@ -1187,16 +1186,16 @@ ; CHECK-APPLE-ARM64_32-NEXT: .cfi_offset w21, -40 ; CHECK-APPLE-ARM64_32-NEXT: .cfi_offset w22, -48 ; CHECK-APPLE-ARM64_32-NEXT: mov x19, x0 -; CHECK-APPLE-ARM64_32-NEXT: mov w8, #10 -; CHECK-APPLE-ARM64_32-NEXT: mov w9, #11 -; CHECK-APPLE-ARM64_32-NEXT: mov w10, #12 +; CHECK-APPLE-ARM64_32-NEXT: mov w8, #10 ; =0xa +; CHECK-APPLE-ARM64_32-NEXT: mov w9, #11 ; =0xb +; CHECK-APPLE-ARM64_32-NEXT: mov w10, #12 ; =0xc ; CHECK-APPLE-ARM64_32-NEXT: stp w9, w8, [sp, #20] ; CHECK-APPLE-ARM64_32-NEXT: str w10, [sp, #16] ; CHECK-APPLE-ARM64_32-NEXT: mov x21, xzr -; CHECK-APPLE-ARM64_32-NEXT: mov x9, #11 -; CHECK-APPLE-ARM64_32-NEXT: movk x9, #12, lsl #32 -; CHECK-APPLE-ARM64_32-NEXT: stur x9, [sp, #4] -; CHECK-APPLE-ARM64_32-NEXT: str w8, [sp] +; CHECK-APPLE-ARM64_32-NEXT: mov x8, #10 ; =0xa +; CHECK-APPLE-ARM64_32-NEXT: movk x8, #11, lsl #32 +; CHECK-APPLE-ARM64_32-NEXT: str w10, [sp, #8] +; CHECK-APPLE-ARM64_32-NEXT: str x8, [sp] ; CHECK-APPLE-ARM64_32-NEXT: bl _foo_vararg ; CHECK-APPLE-ARM64_32-NEXT: mov x0, x21 ; CHECK-APPLE-ARM64_32-NEXT: cbnz w0, LBB8_2 @@ -1225,11 +1224,11 @@ ; CHECK-O0-ARM64_32-NEXT: ; implicit-def: $x1 ; CHECK-O0-ARM64_32-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill ; CHECK-O0-ARM64_32-NEXT: mov x21, xzr -; CHECK-O0-ARM64_32-NEXT: mov w8, #10 +; CHECK-O0-ARM64_32-NEXT: mov w8, #10 ; =0xa ; CHECK-O0-ARM64_32-NEXT: str w8, [sp, #40] -; CHECK-O0-ARM64_32-NEXT: mov w8, #11 +; CHECK-O0-ARM64_32-NEXT: mov w8, #11 ; =0xb ; CHECK-O0-ARM64_32-NEXT: str w8, [sp, #36] -; CHECK-O0-ARM64_32-NEXT: mov w8, #12 +; CHECK-O0-ARM64_32-NEXT: mov w8, #12 ; =0xc ; CHECK-O0-ARM64_32-NEXT: str w8, [sp, #32] ; CHECK-O0-ARM64_32-NEXT: ldr w8, [sp, #40] ; CHECK-O0-ARM64_32-NEXT: ldr w10, [sp, #36] @@ -1499,14 +1498,14 @@ ; CHECK-APPLE-NEXT: mov x28, x2 ; CHECK-APPLE-NEXT: mov x19, x1 ; CHECK-APPLE-NEXT: mov x22, x0 -; CHECK-APPLE-NEXT: mov w0, #1 -; CHECK-APPLE-NEXT: mov w1, #2 -; CHECK-APPLE-NEXT: mov w2, #3 -; CHECK-APPLE-NEXT: mov w3, #4 -; CHECK-APPLE-NEXT: mov w4, #5 -; CHECK-APPLE-NEXT: mov w5, #6 -; 
CHECK-APPLE-NEXT: mov w6, #7 -; CHECK-APPLE-NEXT: mov w7, #8 +; CHECK-APPLE-NEXT: mov w0, #1 ; =0x1 +; CHECK-APPLE-NEXT: mov w1, #2 ; =0x2 +; CHECK-APPLE-NEXT: mov w2, #3 ; =0x3 +; CHECK-APPLE-NEXT: mov w3, #4 ; =0x4 +; CHECK-APPLE-NEXT: mov w4, #5 ; =0x5 +; CHECK-APPLE-NEXT: mov w5, #6 ; =0x6 +; CHECK-APPLE-NEXT: mov w6, #7 ; =0x7 +; CHECK-APPLE-NEXT: mov w7, #8 ; =0x8 ; CHECK-APPLE-NEXT: mov x20, xzr ; CHECK-APPLE-NEXT: mov x21, xzr ; CHECK-APPLE-NEXT: bl _params_in_reg2 @@ -1553,21 +1552,21 @@ ; CHECK-O0-AARCH64-NEXT: ; implicit-def: $x0 ; CHECK-O0-AARCH64-NEXT: mov x20, xzr ; CHECK-O0-AARCH64-NEXT: mov x21, x20 -; CHECK-O0-AARCH64-NEXT: mov w8, #1 +; CHECK-O0-AARCH64-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-AARCH64-NEXT: mov w0, w8 -; CHECK-O0-AARCH64-NEXT: mov w8, #2 +; CHECK-O0-AARCH64-NEXT: mov w8, #2 ; =0x2 ; CHECK-O0-AARCH64-NEXT: mov w1, w8 -; CHECK-O0-AARCH64-NEXT: mov w8, #3 +; CHECK-O0-AARCH64-NEXT: mov w8, #3 ; =0x3 ; CHECK-O0-AARCH64-NEXT: mov w2, w8 -; CHECK-O0-AARCH64-NEXT: mov w8, #4 +; CHECK-O0-AARCH64-NEXT: mov w8, #4 ; =0x4 ; CHECK-O0-AARCH64-NEXT: mov w3, w8 -; CHECK-O0-AARCH64-NEXT: mov w8, #5 +; CHECK-O0-AARCH64-NEXT: mov w8, #5 ; =0x5 ; CHECK-O0-AARCH64-NEXT: mov w4, w8 -; CHECK-O0-AARCH64-NEXT: mov w8, #6 +; CHECK-O0-AARCH64-NEXT: mov w8, #6 ; =0x6 ; CHECK-O0-AARCH64-NEXT: mov w5, w8 -; CHECK-O0-AARCH64-NEXT: mov w8, #7 +; CHECK-O0-AARCH64-NEXT: mov w8, #7 ; =0x7 ; CHECK-O0-AARCH64-NEXT: mov w6, w8 -; CHECK-O0-AARCH64-NEXT: mov w8, #8 +; CHECK-O0-AARCH64-NEXT: mov w8, #8 ; =0x8 ; CHECK-O0-AARCH64-NEXT: mov w7, w8 ; CHECK-O0-AARCH64-NEXT: bl _params_in_reg2 ; CHECK-O0-AARCH64-NEXT: ldr x20, [sp, #8] ; 8-byte Folded Reload @@ -1607,21 +1606,21 @@ ; CHECK-O0-ARM64_32-NEXT: ; implicit-def: $x0 ; CHECK-O0-ARM64_32-NEXT: mov x20, xzr ; CHECK-O0-ARM64_32-NEXT: mov x21, x20 -; CHECK-O0-ARM64_32-NEXT: mov w8, #1 +; CHECK-O0-ARM64_32-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-ARM64_32-NEXT: mov w0, w8 -; CHECK-O0-ARM64_32-NEXT: mov w8, #2 +; CHECK-O0-ARM64_32-NEXT: mov w8, #2 ; =0x2 ; CHECK-O0-ARM64_32-NEXT: mov w1, w8 -; CHECK-O0-ARM64_32-NEXT: mov w8, #3 +; CHECK-O0-ARM64_32-NEXT: mov w8, #3 ; =0x3 ; CHECK-O0-ARM64_32-NEXT: mov w2, w8 -; CHECK-O0-ARM64_32-NEXT: mov w8, #4 +; CHECK-O0-ARM64_32-NEXT: mov w8, #4 ; =0x4 ; CHECK-O0-ARM64_32-NEXT: mov w3, w8 -; CHECK-O0-ARM64_32-NEXT: mov w8, #5 +; CHECK-O0-ARM64_32-NEXT: mov w8, #5 ; =0x5 ; CHECK-O0-ARM64_32-NEXT: mov w4, w8 -; CHECK-O0-ARM64_32-NEXT: mov w8, #6 +; CHECK-O0-ARM64_32-NEXT: mov w8, #6 ; =0x6 ; CHECK-O0-ARM64_32-NEXT: mov w5, w8 -; CHECK-O0-ARM64_32-NEXT: mov w8, #7 +; CHECK-O0-ARM64_32-NEXT: mov w8, #7 ; =0x7 ; CHECK-O0-ARM64_32-NEXT: mov w6, w8 -; CHECK-O0-ARM64_32-NEXT: mov w8, #8 +; CHECK-O0-ARM64_32-NEXT: mov w8, #8 ; =0x8 ; CHECK-O0-ARM64_32-NEXT: mov w7, w8 ; CHECK-O0-ARM64_32-NEXT: bl _params_in_reg2 ; CHECK-O0-ARM64_32-NEXT: ldr x20, [sp, #8] ; 8-byte Folded Reload @@ -1680,14 +1679,14 @@ ; CHECK-APPLE-NEXT: mov x28, x2 ; CHECK-APPLE-NEXT: mov x19, x1 ; CHECK-APPLE-NEXT: mov x22, x0 -; CHECK-APPLE-NEXT: mov w0, #1 -; CHECK-APPLE-NEXT: mov w1, #2 -; CHECK-APPLE-NEXT: mov w2, #3 -; CHECK-APPLE-NEXT: mov w3, #4 -; CHECK-APPLE-NEXT: mov w4, #5 -; CHECK-APPLE-NEXT: mov w5, #6 -; CHECK-APPLE-NEXT: mov w6, #7 -; CHECK-APPLE-NEXT: mov w7, #8 +; CHECK-APPLE-NEXT: mov w0, #1 ; =0x1 +; CHECK-APPLE-NEXT: mov w1, #2 ; =0x2 +; CHECK-APPLE-NEXT: mov w2, #3 ; =0x3 +; CHECK-APPLE-NEXT: mov w3, #4 ; =0x4 +; CHECK-APPLE-NEXT: mov w4, #5 ; =0x5 +; CHECK-APPLE-NEXT: mov w5, #6 ; =0x6 +; CHECK-APPLE-NEXT: mov w6, #7 ; =0x7 +; 
CHECK-APPLE-NEXT: mov w7, #8 ; =0x8 ; CHECK-APPLE-NEXT: mov x20, xzr ; CHECK-APPLE-NEXT: mov x21, xzr ; CHECK-APPLE-NEXT: bl _params_in_reg2 @@ -1711,14 +1710,14 @@ ; CHECK-APPLE-NEXT: mov x28, x6 ; CHECK-APPLE-NEXT: mov x23, x7 ; CHECK-APPLE-NEXT: str x21, [sp, #24] ; 8-byte Folded Spill -; CHECK-APPLE-NEXT: mov w0, #1 -; CHECK-APPLE-NEXT: mov w1, #2 -; CHECK-APPLE-NEXT: mov w2, #3 -; CHECK-APPLE-NEXT: mov w3, #4 -; CHECK-APPLE-NEXT: mov w4, #5 -; CHECK-APPLE-NEXT: mov w5, #6 -; CHECK-APPLE-NEXT: mov w6, #7 -; CHECK-APPLE-NEXT: mov w7, #8 +; CHECK-APPLE-NEXT: mov w0, #1 ; =0x1 +; CHECK-APPLE-NEXT: mov w1, #2 ; =0x2 +; CHECK-APPLE-NEXT: mov w2, #3 ; =0x3 +; CHECK-APPLE-NEXT: mov w3, #4 ; =0x4 +; CHECK-APPLE-NEXT: mov w4, #5 ; =0x5 +; CHECK-APPLE-NEXT: mov w5, #6 ; =0x6 +; CHECK-APPLE-NEXT: mov w6, #7 ; =0x7 +; CHECK-APPLE-NEXT: mov w7, #8 ; =0x8 ; CHECK-APPLE-NEXT: mov x20, xzr ; CHECK-APPLE-NEXT: ldr x21, [sp, #8] ; 8-byte Folded Reload ; CHECK-APPLE-NEXT: bl _params_in_reg2 @@ -1765,28 +1764,28 @@ ; CHECK-O0-AARCH64-NEXT: mov x20, xzr ; CHECK-O0-AARCH64-NEXT: str x20, [sp, #80] ; 8-byte Folded Spill ; CHECK-O0-AARCH64-NEXT: mov x21, x20 -; CHECK-O0-AARCH64-NEXT: mov w8, #1 +; CHECK-O0-AARCH64-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-AARCH64-NEXT: mov w0, w8 ; CHECK-O0-AARCH64-NEXT: str x0, [sp, #88] ; 8-byte Folded Spill -; CHECK-O0-AARCH64-NEXT: mov w8, #2 +; CHECK-O0-AARCH64-NEXT: mov w8, #2 ; =0x2 ; CHECK-O0-AARCH64-NEXT: mov w1, w8 ; CHECK-O0-AARCH64-NEXT: str x1, [sp, #96] ; 8-byte Folded Spill -; CHECK-O0-AARCH64-NEXT: mov w8, #3 +; CHECK-O0-AARCH64-NEXT: mov w8, #3 ; =0x3 ; CHECK-O0-AARCH64-NEXT: mov w2, w8 ; CHECK-O0-AARCH64-NEXT: str x2, [sp, #104] ; 8-byte Folded Spill -; CHECK-O0-AARCH64-NEXT: mov w8, #4 +; CHECK-O0-AARCH64-NEXT: mov w8, #4 ; =0x4 ; CHECK-O0-AARCH64-NEXT: mov w3, w8 ; CHECK-O0-AARCH64-NEXT: str x3, [sp, #112] ; 8-byte Folded Spill -; CHECK-O0-AARCH64-NEXT: mov w8, #5 +; CHECK-O0-AARCH64-NEXT: mov w8, #5 ; =0x5 ; CHECK-O0-AARCH64-NEXT: mov w4, w8 ; CHECK-O0-AARCH64-NEXT: str x4, [sp, #120] ; 8-byte Folded Spill -; CHECK-O0-AARCH64-NEXT: mov w8, #6 +; CHECK-O0-AARCH64-NEXT: mov w8, #6 ; =0x6 ; CHECK-O0-AARCH64-NEXT: mov w5, w8 ; CHECK-O0-AARCH64-NEXT: str x5, [sp, #128] ; 8-byte Folded Spill -; CHECK-O0-AARCH64-NEXT: mov w8, #7 +; CHECK-O0-AARCH64-NEXT: mov w8, #7 ; =0x7 ; CHECK-O0-AARCH64-NEXT: mov w6, w8 ; CHECK-O0-AARCH64-NEXT: stur x6, [x29, #-120] ; 8-byte Folded Spill -; CHECK-O0-AARCH64-NEXT: mov w8, #8 +; CHECK-O0-AARCH64-NEXT: mov w8, #8 ; =0x8 ; CHECK-O0-AARCH64-NEXT: mov w7, w8 ; CHECK-O0-AARCH64-NEXT: stur x7, [x29, #-112] ; 8-byte Folded Spill ; CHECK-O0-AARCH64-NEXT: bl _params_in_reg2 @@ -1870,28 +1869,28 @@ ; CHECK-O0-ARM64_32-NEXT: mov x20, xzr ; CHECK-O0-ARM64_32-NEXT: str x20, [sp, #80] ; 8-byte Folded Spill ; CHECK-O0-ARM64_32-NEXT: mov x21, x20 -; CHECK-O0-ARM64_32-NEXT: mov w8, #1 +; CHECK-O0-ARM64_32-NEXT: mov w8, #1 ; =0x1 ; CHECK-O0-ARM64_32-NEXT: mov w0, w8 ; CHECK-O0-ARM64_32-NEXT: str x0, [sp, #88] ; 8-byte Folded Spill -; CHECK-O0-ARM64_32-NEXT: mov w8, #2 +; CHECK-O0-ARM64_32-NEXT: mov w8, #2 ; =0x2 ; CHECK-O0-ARM64_32-NEXT: mov w1, w8 ; CHECK-O0-ARM64_32-NEXT: str x1, [sp, #96] ; 8-byte Folded Spill -; CHECK-O0-ARM64_32-NEXT: mov w8, #3 +; CHECK-O0-ARM64_32-NEXT: mov w8, #3 ; =0x3 ; CHECK-O0-ARM64_32-NEXT: mov w2, w8 ; CHECK-O0-ARM64_32-NEXT: str x2, [sp, #104] ; 8-byte Folded Spill -; CHECK-O0-ARM64_32-NEXT: mov w8, #4 +; CHECK-O0-ARM64_32-NEXT: mov w8, #4 ; =0x4 ; CHECK-O0-ARM64_32-NEXT: mov w3, w8 ; CHECK-O0-ARM64_32-NEXT: str x3, 
[sp, #112] ; 8-byte Folded Spill -; CHECK-O0-ARM64_32-NEXT: mov w8, #5 +; CHECK-O0-ARM64_32-NEXT: mov w8, #5 ; =0x5 ; CHECK-O0-ARM64_32-NEXT: mov w4, w8 ; CHECK-O0-ARM64_32-NEXT: str x4, [sp, #120] ; 8-byte Folded Spill -; CHECK-O0-ARM64_32-NEXT: mov w8, #6 +; CHECK-O0-ARM64_32-NEXT: mov w8, #6 ; =0x6 ; CHECK-O0-ARM64_32-NEXT: mov w5, w8 ; CHECK-O0-ARM64_32-NEXT: str x5, [sp, #128] ; 8-byte Folded Spill -; CHECK-O0-ARM64_32-NEXT: mov w8, #7 +; CHECK-O0-ARM64_32-NEXT: mov w8, #7 ; =0x7 ; CHECK-O0-ARM64_32-NEXT: mov w6, w8 ; CHECK-O0-ARM64_32-NEXT: str x6, [sp, #136] ; 8-byte Folded Spill -; CHECK-O0-ARM64_32-NEXT: mov w8, #8 +; CHECK-O0-ARM64_32-NEXT: mov w8, #8 ; =0x8 ; CHECK-O0-ARM64_32-NEXT: mov w7, w8 ; CHECK-O0-ARM64_32-NEXT: str x7, [sp, #144] ; 8-byte Folded Spill ; CHECK-O0-ARM64_32-NEXT: bl _params_in_reg2 diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll @@ -457,28 +457,25 @@ ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: LBB5_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp x10, x9, [x0] -; CHECK-NEXT: ldrb w11, [x0, #18] -; CHECK-NEXT: ldrh w13, [x0, #16] -; CHECK-NEXT: add x0, x0, #32 -; CHECK-NEXT: lsr x14, x10, #19 +; CHECK-NEXT: ldrb w9, [x0, #18] +; CHECK-NEXT: ldrh w10, [x0, #16] +; CHECK-NEXT: orr x9, x10, x9, lsl #16 +; CHECK-NEXT: ldp x10, x11, [x0], #32 +; CHECK-NEXT: extr x13, x11, x10, #19 ; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: ubfx x12, x9, #12, #20 -; CHECK-NEXT: lsr x15, x9, #31 -; CHECK-NEXT: orr w11, w13, w11, lsl #16 -; CHECK-NEXT: lsr x13, x9, #50 -; CHECK-NEXT: mov.s v0[1], w14 +; CHECK-NEXT: extr x12, x9, x11, #12 +; CHECK-NEXT: extr x14, x9, x11, #31 +; CHECK-NEXT: mov.s v0[1], w13 +; CHECK-NEXT: extr x13, x9, x11, #50 ; CHECK-NEXT: fmov s1, w12 -; CHECK-NEXT: lsr x12, x10, #38 -; CHECK-NEXT: orr w13, w13, w11, lsl #14 -; CHECK-NEXT: lsr x10, x10, #57 -; CHECK-NEXT: orr w9, w10, w9, lsl #7 -; CHECK-NEXT: lsr w10, w11, #5 -; CHECK-NEXT: mov.s v1[1], w15 +; CHECK-NEXT: extr x12, x11, x10, #38 +; CHECK-NEXT: extr x10, x11, x10, #57 +; CHECK-NEXT: ubfx x9, x9, #5, #27 +; CHECK-NEXT: mov.s v1[1], w14 ; CHECK-NEXT: mov.s v0[2], w12 ; CHECK-NEXT: mov.s v1[2], w13 -; CHECK-NEXT: mov.s v0[3], w9 -; CHECK-NEXT: mov.s v1[3], w10 +; CHECK-NEXT: mov.s v0[3], w10 +; CHECK-NEXT: mov.s v1[3], w9 ; CHECK-NEXT: uzp1.8h v0, v0, v1 ; CHECK-NEXT: xtn.8b v0, v0 ; CHECK-NEXT: str d0, [x1, x8, lsl #3] @@ -495,33 +492,31 @@ ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ldp x10, x9, [x0] ; CHECK-BE-NEXT: ldrh w11, [x0, #16] -; CHECK-BE-NEXT: lsr x13, x10, #45 -; CHECK-BE-NEXT: lsr x15, x10, #40 -; CHECK-BE-NEXT: lsr x12, x9, #40 -; CHECK-BE-NEXT: ubfx x14, x9, #33, #7 -; CHECK-BE-NEXT: ubfx x16, x10, #26, #14 -; CHECK-BE-NEXT: orr w12, w14, w12, lsl #7 -; CHECK-BE-NEXT: ldrb w14, [x0, #18] -; CHECK-BE-NEXT: orr w15, w16, w15, lsl #14 -; CHECK-BE-NEXT: fmov s0, w13 +; CHECK-BE-NEXT: ldrb w12, [x0, #18] ; CHECK-BE-NEXT: add x0, x0, #32 -; CHECK-BE-NEXT: fmov s1, w12 -; CHECK-BE-NEXT: ubfx x12, x9, #14, #18 -; CHECK-BE-NEXT: orr w11, w14, w11, lsl #8 -; CHECK-BE-NEXT: mov v0.s[1], w15 -; CHECK-BE-NEXT: mov v1.s[1], w12 -; CHECK-BE-NEXT: extr x12, x10, x9, #40 -; CHECK-BE-NEXT: lsl x9, x9, #24 -; CHECK-BE-NEXT: ubfx x10, x10, #7, #25 -; CHECK-BE-NEXT: orr w9, w11, w9 -; CHECK-BE-NEXT: lsr w9, w9, #19 -; CHECK-BE-NEXT: mov v0.s[2], w10 -; CHECK-BE-NEXT: ubfx x10, x12, #12, #20 -; CHECK-BE-NEXT: 
mov v1.s[2], w9 +; CHECK-BE-NEXT: lsl x13, x9, #24 +; CHECK-BE-NEXT: extr x14, x10, x9, #40 +; CHECK-BE-NEXT: orr x11, x12, x11, lsl #8 +; CHECK-BE-NEXT: lsr x12, x10, #45 +; CHECK-BE-NEXT: lsr x10, x10, #40 +; CHECK-BE-NEXT: extr x15, x14, x13, #57 +; CHECK-BE-NEXT: extr x13, x14, x13, #38 +; CHECK-BE-NEXT: fmov s0, w12 +; CHECK-BE-NEXT: extr x12, x10, x14, #50 +; CHECK-BE-NEXT: fmov s1, w15 +; CHECK-BE-NEXT: mov x15, x11 +; CHECK-BE-NEXT: bfi x15, x9, #24, #40 +; CHECK-BE-NEXT: extr x9, x10, x14, #31 +; CHECK-BE-NEXT: mov v0.s[1], w12 +; CHECK-BE-NEXT: extr x10, x10, x14, #12 +; CHECK-BE-NEXT: mov v1.s[1], w13 +; CHECK-BE-NEXT: extr x12, x14, x15, #19 +; CHECK-BE-NEXT: mov v0.s[2], w9 ; CHECK-BE-NEXT: add x9, x1, x8, lsl #3 +; CHECK-BE-NEXT: mov v1.s[2], w12 ; CHECK-BE-NEXT: add x8, x8, #1 -; CHECK-BE-NEXT: mov v0.s[3], w10 ; CHECK-BE-NEXT: cmp x8, #1000 +; CHECK-BE-NEXT: mov v0.s[3], w10 ; CHECK-BE-NEXT: mov v1.s[3], w11 ; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-BE-NEXT: xtn v0.8b, v0.8h diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll @@ -38,10 +38,13 @@ ; CHECK-LABEL: test_urem_odd_setne: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #13 // =0xd -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: and w8, w8, #0xf -; CHECK-NEXT: cmp w8, #3 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: and w9, w0, #0xf +; CHECK-NEXT: mul w8, w9, w8 +; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: orr w8, w8, w8, lsl #2 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: tst w8, #0xf +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %urem = urem i4 %X, 5 %cmp = icmp ne i4 %urem, 0 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll @@ -4,12 +4,14 @@ define i1 @t32_3_1(i32 %X) nounwind { ; CHECK-LABEL: t32_3_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: mov w9, #1431655765 +; CHECK-NEXT: mov w8, #43691 // =0xaaab ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w8, lsl #1 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 3 %cmp = icmp eq i32 %urem, 1 @@ -19,13 +21,14 @@ define i1 @t32_3_2(i32 %X) nounwind { ; CHECK-LABEL: t32_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: mov w9, #-1431655766 +; CHECK-NEXT: mov w8, #43691 // =0xaaab ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #1431655765 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #33 +; CHECK-NEXT: add w8, w8, w8, lsl #1 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: cmp w8, #2 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 3 %cmp = icmp eq i32 %urem, 2 @@ -36,12 +39,14 @@ define i1 @t32_5_1(i32 %X) nounwind { ; CHECK-LABEL: t32_5_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #858993459 +; CHECK-NEXT: mov w8, #52429 // =0xcccd ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 
+; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 1 @@ -51,13 +56,14 @@ define i1 @t32_5_2(i32 %X) nounwind { ; CHECK-LABEL: t32_5_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #1717986918 +; CHECK-NEXT: mov w8, #52429 // =0xcccd ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #858993459 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: cmp w8, #2 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 2 @@ -67,13 +73,14 @@ define i1 @t32_5_3(i32 %X) nounwind { ; CHECK-LABEL: t32_5_3: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #-1717986919 +; CHECK-NEXT: mov w8, #52429 // =0xcccd ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #858993459 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: cmp w8, #3 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 3 @@ -83,13 +90,14 @@ define i1 @t32_5_4(i32 %X) nounwind { ; CHECK-LABEL: t32_5_4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #-858993460 +; CHECK-NEXT: mov w8, #52429 // =0xcccd ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #858993459 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: cmp w8, #4 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 4 @@ -100,15 +108,14 @@ define i1 @t32_6_1(i32 %X) nounwind { ; CHECK-LABEL: t32_6_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: mov w9, #1431655765 +; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: mov w9, #6 // =0x6 ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: movk w9, #10922, lsl #16 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 1 @@ -118,15 +125,14 @@ define i1 @t32_6_2(i32 %X) nounwind { ; CHECK-LABEL: t32_6_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: mov w9, #-1431655766 +; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: mov w9, #6 // =0x6 ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: movk w9, #10922, lsl #16 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #2 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 2 @@ -136,15 +142,14 @@ define i1 @t32_6_3(i32 %X) nounwind { ; CHECK-LABEL: t32_6_3: ; 
CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: mov w9, #-1 +; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: mov w9, #6 // =0x6 ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: movk w9, #10922, lsl #16 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #3 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 3 @@ -154,15 +159,14 @@ define i1 @t32_6_4(i32 %X) nounwind { ; CHECK-LABEL: t32_6_4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: sub w9, w0, #4 +; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: mov w9, #6 // =0x6 ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: mul w8, w9, w8 -; CHECK-NEXT: mov w9, #43690 -; CHECK-NEXT: movk w9, #10922, lsl #16 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #4 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 4 @@ -172,15 +176,14 @@ define i1 @t32_6_5(i32 %X) nounwind { ; CHECK-LABEL: t32_6_5: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: sub w9, w0, #5 +; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: mov w9, #6 // =0x6 ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: mul w8, w9, w8 -; CHECK-NEXT: mov w9, #43690 -; CHECK-NEXT: movk w9, #10922, lsl #16 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #5 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 6 %cmp = icmp eq i32 %urem, 5 @@ -193,12 +196,15 @@ define i1 @t16_3_2(i16 %X) nounwind { ; CHECK-LABEL: t16_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-21845 -; CHECK-NEXT: mov w9, #-21846 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #21845 -; CHECK-NEXT: cmp w9, w8, uxth -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: and w9, w0, #0xffff +; CHECK-NEXT: mul w8, w9, w8 +; CHECK-NEXT: lsr w8, w8, #17 +; CHECK-NEXT: add w8, w8, w8, lsl #1 +; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: and w8, w8, #0xffff +; CHECK-NEXT: cmp w8, #2 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i16 %X, 3 %cmp = icmp eq i16 %urem, 2 @@ -208,12 +214,15 @@ define i1 @t8_3_2(i8 %X) nounwind { ; CHECK-LABEL: t8_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-85 -; CHECK-NEXT: mov w9, #-86 -; CHECK-NEXT: madd w8, w0, w8, w9 +; CHECK-NEXT: mov w8, #171 // =0xab +; CHECK-NEXT: and w9, w0, #0xff +; CHECK-NEXT: mul w8, w9, w8 +; CHECK-NEXT: lsr w8, w8, #9 +; CHECK-NEXT: add w8, w8, w8, lsl #1 +; CHECK-NEXT: sub w8, w0, w8 ; CHECK-NEXT: and w8, w8, #0xff -; CHECK-NEXT: cmp w8, #85 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: cmp w8, #2 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i8 %X, 3 %cmp = icmp eq i8 %urem, 2 @@ -223,13 +232,14 @@ define i1 @t64_3_2(i64 %X) nounwind { ; CHECK-LABEL: t64_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-6148914691236517206 -; CHECK-NEXT: mov x9, #-6148914691236517206 +; CHECK-NEXT: mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa ; CHECK-NEXT: movk x8, #43691 -; CHECK-NEXT: madd x8, x0, x8, x9 -; CHECK-NEXT: mov x9, 
#6148914691236517205 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umulh x8, x0, x8 +; CHECK-NEXT: lsr x8, x8, #1 +; CHECK-NEXT: add x8, x8, x8, lsl #1 +; CHECK-NEXT: sub x8, x0, x8 +; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i64 %X, 3 %cmp = icmp eq i64 %urem, 2 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll b/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll @@ -4,12 +4,12 @@ define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone { ; CHECK-LABEL: test_minsize: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #5 -; CHECK-NEXT: mov w9, #42 +; CHECK-NEXT: mov w8, #5 // =0x5 +; CHECK-NEXT: mov w9, #42 // =0x2a ; CHECK-NEXT: udiv w8, w0, w8 ; CHECK-NEXT: add w8, w8, w8, lsl #2 ; CHECK-NEXT: cmp w0, w8 -; CHECK-NEXT: mov w8, #-10 +; CHECK-NEXT: mov w8, #-10 // =0xfffffff6 ; CHECK-NEXT: csel w0, w9, w8, eq ; CHECK-NEXT: ret %rem = urem i32 %X, 5 @@ -21,15 +21,15 @@ define i32 @test_optsize(i32 %X) optsize nounwind readnone { ; CHECK-LABEL: test_optsize: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #13108 +; CHECK-NEXT: mov w8, #52429 // =0xcccd +; CHECK-NEXT: mov w9, #42 // =0x2a ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #13107, lsl #16 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: mov w8, #-10 -; CHECK-NEXT: mov w9, #42 -; CHECK-NEXT: csel w0, w9, w8, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: mov w8, #-10 // =0xfffffff6 +; CHECK-NEXT: csel w0, w9, w8, eq ; CHECK-NEXT: ret %rem = urem i32 %X, 5 %cmp = icmp eq i32 %rem, 0 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -6,20 +6,22 @@ ; CHECK-LABEL: test_urem_odd_even: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 -; CHECK-NEXT: adrp x9, .LCPI0_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI0_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] -; CHECK-NEXT: adrp x8, .LCPI0_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_1] -; CHECK-NEXT: adrp x8, .LCPI0_3 +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1] +; CHECK-NEXT: adrp x9, .LCPI0_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI0_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -34,13 +36,19 @@ ; CHECK-LABEL: test_urem_odd_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI1_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] ; CHECK-NEXT: adrp x8, 
.LCPI1_1 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_1] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI1_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -51,13 +59,19 @@ ; CHECK-LABEL: test_urem_odd_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI2_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: adrp x8, .LCPI2_1 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_1] -; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI2_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -70,20 +84,22 @@ ; CHECK-LABEL: test_urem_even_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: adrp x9, .LCPI3_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI3_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: adrp x8, .LCPI3_3 +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x9, .LCPI3_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI3_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -94,20 +110,22 @@ ; CHECK-LABEL: test_urem_even_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: adrp x9, .LCPI4_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI4_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: adrp x8, .LCPI4_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI4_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_1] -; CHECK-NEXT: adrp x8, .LCPI4_3 +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x9, .LCPI4_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: 
cmhi v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI4_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -120,20 +138,22 @@ ; CHECK-LABEL: test_urem_odd_even_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI5_0 -; CHECK-NEXT: adrp x9, .LCPI5_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI5_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] -; CHECK-NEXT: adrp x8, .LCPI5_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_1] -; CHECK-NEXT: adrp x8, .LCPI5_3 +; CHECK-NEXT: adrp x8, .LCPI5_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_1] +; CHECK-NEXT: adrp x9, .LCPI5_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_2] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI5_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -144,20 +164,22 @@ ; CHECK-LABEL: test_urem_odd_even_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI6_0 -; CHECK-NEXT: adrp x9, .LCPI6_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI6_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] -; CHECK-NEXT: adrp x8, .LCPI6_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_1] -; CHECK-NEXT: adrp x8, .LCPI6_3 +; CHECK-NEXT: adrp x8, .LCPI6_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_1] +; CHECK-NEXT: adrp x9, .LCPI6_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_2] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI6_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -173,19 +195,18 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI7_0 ; CHECK-NEXT: adrp x9, .LCPI7_2 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] ; CHECK-NEXT: adrp x8, .LCPI7_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI7_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_1] -; CHECK-NEXT: adrp x8, .LCPI7_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, 
v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI7_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -198,20 +219,22 @@ ; CHECK-LABEL: test_urem_even_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI8_0 -; CHECK-NEXT: adrp x9, .LCPI8_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI8_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: adrp x8, .LCPI8_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_1] -; CHECK-NEXT: adrp x8, .LCPI8_3 +; CHECK-NEXT: adrp x8, .LCPI8_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_1] +; CHECK-NEXT: adrp x9, .LCPI8_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_2] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI8_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -224,20 +247,22 @@ ; CHECK-LABEL: test_urem_odd_even_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: adrp x9, .LCPI9_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI9_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] -; CHECK-NEXT: adrp x8, .LCPI9_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI9_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_1] -; CHECK-NEXT: adrp x8, .LCPI9_3 +; CHECK-NEXT: adrp x8, .LCPI9_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI9_1] +; CHECK-NEXT: adrp x9, .LCPI9_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_2] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI9_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -251,15 +276,22 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 +; CHECK-NEXT: mov w8, #52429 // =0xcccd ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: adrp x8, .LCPI10_0 -; 
CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: adrp x8, .LCPI10_1 +; CHECK-NEXT: ushr v1.4s, v1.4s, #2 +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI10_1] +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -271,18 +303,23 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_even_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: mov w8, #9363 // =0x2493 +; CHECK-NEXT: ushr v2.4s, v0.4s, #1 +; CHECK-NEXT: movk w8, #37449, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v3.2d, v2.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v2.2s, v1.2s ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: shl v1.4s, v0.4s, #31 -; CHECK-NEXT: ushr v0.4s, v0.4s, #1 -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: adrp x8, .LCPI11_1 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ushr v1.4s, v1.4s, #2 +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1] +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -295,20 +332,25 @@ ; CHECK-LABEL: test_urem_odd_even_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI12_0 -; CHECK-NEXT: adrp x9, .LCPI12_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI12_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] -; CHECK-NEXT: adrp x8, .LCPI12_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI12_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_1] -; CHECK-NEXT: adrp x8, .LCPI12_3 +; CHECK-NEXT: adrp x8, .LCPI12_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI12_1] +; CHECK-NEXT: adrp x9, .LCPI12_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_2] +; CHECK-NEXT: adrp x8, .LCPI12_4 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI12_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_4] +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -324,19 +366,18 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI13_0 ; CHECK-NEXT: adrp x9, .LCPI13_2 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: adrp x8, .LCPI13_1 -; 
CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI13_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_1] -; CHECK-NEXT: adrp x8, .LCPI13_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI13_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -349,20 +390,22 @@ ; CHECK-LABEL: test_urem_even_INT_MIN: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: adrp x9, .LCPI14_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI14_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: adrp x8, .LCPI14_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_1] -; CHECK-NEXT: adrp x8, .LCPI14_3 +; CHECK-NEXT: adrp x8, .LCPI14_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_1] +; CHECK-NEXT: adrp x9, .LCPI14_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_2] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI14_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -375,20 +418,22 @@ ; CHECK-LABEL: test_urem_odd_even_INT_MIN: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: adrp x9, .LCPI15_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI15_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: adrp x8, .LCPI15_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI15_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1] -; CHECK-NEXT: adrp x8, .LCPI15_3 +; CHECK-NEXT: adrp x8, .LCPI15_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI15_1] +; CHECK-NEXT: adrp x9, .LCPI15_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_2] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI15_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -404,19 +449,18 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 ; 
CHECK-NEXT: adrp x9, .LCPI16_2 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: adrp x8, .LCPI16_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI16_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_1] -; CHECK-NEXT: adrp x8, .LCPI16_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI16_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -429,20 +473,22 @@ ; CHECK-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI17_0 -; CHECK-NEXT: adrp x9, .LCPI17_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI17_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] -; CHECK-NEXT: adrp x8, .LCPI17_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI17_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_1] -; CHECK-NEXT: adrp x8, .LCPI17_3 +; CHECK-NEXT: adrp x8, .LCPI17_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI17_1] +; CHECK-NEXT: adrp x9, .LCPI17_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_2] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI17_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -456,19 +502,18 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI18_0 ; CHECK-NEXT: adrp x9, .LCPI18_2 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] ; CHECK-NEXT: adrp x8, .LCPI18_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_1] -; CHECK-NEXT: adrp x8, .LCPI18_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI18_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -483,13 +528,22 @@ ; CHECK-LABEL: test_urem_odd_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, 
.LCPI19_0 -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI19_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] ; CHECK-NEXT: adrp x8, .LCPI19_1 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_1] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI19_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_1] +; CHECK-NEXT: adrp x8, .LCPI19_3 +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_3] +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -502,20 +556,25 @@ ; CHECK-LABEL: test_urem_even_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI20_0 -; CHECK-NEXT: adrp x9, .LCPI20_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI20_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: adrp x8, .LCPI20_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI20_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_1] -; CHECK-NEXT: adrp x8, .LCPI20_3 +; CHECK-NEXT: adrp x8, .LCPI20_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI20_1] +; CHECK-NEXT: adrp x9, .LCPI20_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_2] +; CHECK-NEXT: adrp x8, .LCPI20_4 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI20_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_4] +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -529,19 +588,21 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI21_0 ; CHECK-NEXT: adrp x9, .LCPI21_2 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] ; CHECK-NEXT: adrp x8, .LCPI21_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI21_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_1] +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI21_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1] ; CHECK-NEXT: adrp x8, .LCPI21_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -557,19 +618,21 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI22_0 
; CHECK-NEXT: adrp x9, .LCPI22_2 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] ; CHECK-NEXT: adrp x8, .LCPI22_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI22_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_1] +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI22_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1] ; CHECK-NEXT: adrp x8, .LCPI22_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -582,20 +645,25 @@ ; CHECK-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI23_0 -; CHECK-NEXT: adrp x9, .LCPI23_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI23_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] -; CHECK-NEXT: adrp x8, .LCPI23_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI23_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_1] -; CHECK-NEXT: adrp x8, .LCPI23_3 +; CHECK-NEXT: adrp x8, .LCPI23_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI23_1] +; CHECK-NEXT: adrp x9, .LCPI23_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_2] +; CHECK-NEXT: adrp x8, .LCPI23_4 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI23_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_4] +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -609,19 +677,21 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI24_0 ; CHECK-NEXT: adrp x9, .LCPI24_2 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] ; CHECK-NEXT: adrp x8, .LCPI24_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI24_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_1] +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI24_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_1] ; CHECK-NEXT: adrp x8, .LCPI24_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, 
v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -636,19 +706,21 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI25_0 ; CHECK-NEXT: adrp x9, .LCPI25_2 -; CHECK-NEXT: movi v3.4s, #1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0] ; CHECK-NEXT: adrp x8, .LCPI25_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI25_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_1] +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI25_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_1] ; CHECK-NEXT: adrp x8, .LCPI25_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -660,20 +732,25 @@ ; CHECK-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI26_0 -; CHECK-NEXT: adrp x9, .LCPI26_2 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI26_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0] -; CHECK-NEXT: adrp x8, .LCPI26_1 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI26_2] -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_1] -; CHECK-NEXT: adrp x8, .LCPI26_3 +; CHECK-NEXT: adrp x8, .LCPI26_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI26_1] +; CHECK-NEXT: adrp x9, .LCPI26_3 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_3] -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_2] +; CHECK-NEXT: adrp x8, .LCPI26_4 +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI26_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_4] +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -4,16 +4,18 @@ define <4 x i1> @t32_3(<4 x i32> %X) nounwind { ; CHECK-LABEL: t32_3: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI0_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] -; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: mov w8, #43691 // =0xaaab ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: adrp x8, .LCPI0_1 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_1] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 
v1.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v2.4s, #3 +; CHECK-NEXT: ushr v1.4s, v1.4s, #1 +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -24,17 +26,18 @@ define <4 x i1> @t32_5(<4 x i32> %X) nounwind { ; CHECK-LABEL: t32_5: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: mov w8, #52429 +; CHECK-NEXT: mov w8, #52429 // =0xcccd ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: mov w8, #13106 -; CHECK-NEXT: movk w8, #13107, lsl #16 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v2.4s, #5 +; CHECK-NEXT: ushr v1.4s, v1.4s, #2 +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -45,20 +48,18 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-LABEL: t32_6_part0: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: mov w8, #43691 // =0xaaab ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: mov w8, #43690 -; CHECK-NEXT: movk w8, #10922, lsl #16 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: shl v1.4s, v0.4s, #31 -; CHECK-NEXT: ushr v0.4s, v0.4s, #1 -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v2.4s, #6 +; CHECK-NEXT: ushr v1.4s, v1.4s, #2 +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -69,19 +70,18 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind { ; CHECK-LABEL: t32_6_part1: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: movk w8, #43690, lsl #16 +; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: movk w9, #43690, lsl #16 +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v2.4s, #6 +; CHECK-NEXT: ushr v1.4s, v1.4s, #2 +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: dup v2.4s, w9 -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: shl v1.4s, v0.4s, #31 -; CHECK-NEXT: ushr v0.4s, v0.4s, #1 -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -93,18 +93,22 @@ ; CHECK-LABEL: t32_tautological: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] -; 
CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: movi v1.2d, #0xffffffff00000000 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0] ; CHECK-NEXT: adrp x8, .LCPI4_1 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_1] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi d1, #0x00ffffffff0000 +; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: adrp x8, .LCPI4_3 +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_3] +; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s ; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll @@ -5,16 +5,18 @@ define <4 x i32> @test_urem_odd_25(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_25: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: mov w8, #28835 -; CHECK-NEXT: movk w8, #2621, lsl #16 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movk w8, #20971, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v2.4s, #25 +; CHECK-NEXT: ushr v1.4s, v1.4s, #3 +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -26,19 +28,18 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_even_100: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movk w8, #20971, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: mov w8, #23592 -; CHECK-NEXT: movk w8, #655, lsl #16 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: shl v1.4s, v0.4s, #30 -; CHECK-NEXT: ushr v0.4s, v0.4s, #2 -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v2.4s, #100 +; CHECK-NEXT: ushr v1.4s, v1.4s, #5 +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -53,13 +54,19 @@ ; CHECK-LABEL: test_urem_odd_neg25: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI2_2 ; CHECK-NEXT: ldr q1, [x8, 
:lo12:.LCPI2_0] ; CHECK-NEXT: adrp x8, .LCPI2_1 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_1] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI2_2] +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -72,16 +79,22 @@ ; CHECK-LABEL: test_urem_even_neg100: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: adrp x9, .LCPI3_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: shl v1.4s, v0.4s, #30 -; CHECK-NEXT: ushr v0.4s, v0.4s, #2 -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v3.16b +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x9, .LCPI3_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v3.2d, v1.4s, v2.4s +; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI3_3] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -96,7 +109,7 @@ define <4 x i32> @test_urem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_undef1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: mov w8, #34079 // =0x851f ; CHECK-NEXT: movk w8, #20971, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s @@ -118,7 +131,7 @@ define <4 x i32> @test_urem_even_undef1(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_even_undef1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #34079 +; CHECK-NEXT: mov w8, #34079 // =0x851f ; CHECK-NEXT: movk w8, #20971, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s @@ -197,10 +210,12 @@ define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_allones: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: neg v0.4s, v0.4s -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: cmeq v1.4s, v0.4s, v1.4s +; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll @@ -20,16 +20,22 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind { ; CHECK-LABEL: t1_all_odd_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: dup v1.4s, 
w8 ; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-NEXT: adrp x8, .LCPI1_1 +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] +; CHECK-NEXT: adrp x8, .LCPI1_2 +; CHECK-NEXT: ushr v1.4s, v1.4s, #1 +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_2] +; CHECK-NEXT: adrp x8, .LCPI1_3 +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_3] +; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s ; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -39,16 +45,23 @@ define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind { ; CHECK-LABEL: t1_all_odd_ne: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-NEXT: adrp x8, .LCPI2_1 +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: adrp x8, .LCPI2_2 +; CHECK-NEXT: ushr v1.4s, v1.4s, #1 +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_2] +; CHECK-NEXT: adrp x8, .LCPI2_3 +; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_3] +; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -58,15 +71,26 @@ define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind { ; CHECK-LABEL: t2_narrow: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: dup v1.8h, w8 ; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-NEXT: movi v3.2d, #0xffff00000000ffff +; CHECK-NEXT: movi v4.2d, #0x00ffffffff0000 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: cmhs v0.8h, v1.8h, v0.8h -; CHECK-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: ushl v1.8h, v1.8h, v2.8h +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: adrp x8, .LCPI3_3 +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: and v3.16b, v0.16b, v4.16b +; CHECK-NEXT: orr v1.16b, v3.16b, v1.16b +; CHECK-NEXT: mls v0.8h, v1.8h, v2.8h +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_3] +; CHECK-NEXT: cmeq v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <8 x i16> %X, %cmp = icmp eq <8 x i16> %urem, @@ -76,7 +100,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind { ; CHECK-LABEL: t3_wide: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-6148914691236517206 +; CHECK-NEXT: mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa ; CHECK-NEXT: fmov x9, d0 ; CHECK-NEXT: movk x8, #43691 ; CHECK-NEXT: mov x10, v0.d[1] diff --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll 
b/llvm/test/CodeGen/AArch64/urem-seteq.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll @@ -8,13 +8,13 @@ define i32 @test_urem_odd(i32 %X) nounwind { ; CHECK-LABEL: test_urem_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #13108 +; CHECK-NEXT: mov w8, #52429 // =0xcccd ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #13107, lsl #16 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 5 %cmp = icmp eq i32 %urem, 0 @@ -25,13 +25,14 @@ define i32 @test_urem_odd_25(i32 %X) nounwind { ; CHECK-LABEL: test_urem_odd_25: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #28836 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #2621, lsl #16 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: mov w9, #25 // =0x19 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #35 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 25 %cmp = icmp eq i32 %urem, 0 @@ -43,11 +44,14 @@ define i32 @test_urem_odd_bit30(i32 %X) nounwind { ; CHECK-LABEL: test_urem_odd_bit30: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #27306, lsl #16 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: cmp w8, #4 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #-11 // =0xfffffff5 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: movk w9, #16384, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #62 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 1073741827 %cmp = icmp eq i32 %urem, 0 @@ -59,11 +63,15 @@ define i32 @test_urem_odd_bit31(i32 %X) nounwind { ; CHECK-LABEL: test_urem_odd_bit31: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #10922, lsl #16 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: cmp w8, #2 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: ubfiz x9, x0, #30, #32 +; CHECK-NEXT: mov w10, w0 +; CHECK-NEXT: sub x9, x9, x10 +; CHECK-NEXT: mov w8, #-2147483645 // =0x80000003 +; CHECK-NEXT: lsr x9, x9, #61 +; CHECK-NEXT: msub w8, w9, w8, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 2147483651 %cmp = icmp eq i32 %urem, 0 @@ -78,14 +86,14 @@ define i16 @test_urem_even(i16 %X) nounwind { ; CHECK-LABEL: test_urem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #28087 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: and w9, w8, #0xfffc -; CHECK-NEXT: lsr w9, w9, #1 -; CHECK-NEXT: orr w8, w9, w8, lsl #15 -; CHECK-NEXT: ubfx w8, w8, #1, #15 -; CHECK-NEXT: cmp w8, #2340 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mov w8, #18725 // =0x4925 +; CHECK-NEXT: ubfx w9, w0, #1, #15 +; CHECK-NEXT: mul w8, w9, w8 +; CHECK-NEXT: mov w9, #14 // =0xe +; CHECK-NEXT: lsr w8, w8, #17 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: tst w8, #0xffff +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %urem = urem i16 %X, 14 %cmp = icmp ne i16 %urem, 0 @@ -96,14 +104,14 @@ define i32 @test_urem_even_100(i32 %X) nounwind { ; CHECK-LABEL: test_urem_even_100: ; CHECK: // %bb.0: -; 
CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: mov w9, #23593 -; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: movk w9, #655, lsl #16 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: ror w8, w8, #2 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: mov w9, #100 // =0x64 +; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #37 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 100 %cmp = icmp eq i32 %urem, 0 @@ -115,12 +123,14 @@ define i32 @test_urem_even_bit30(i32 %X) nounwind { ; CHECK-LABEL: test_urem_even_bit30: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #20165 -; CHECK-NEXT: movk w8, #64748, lsl #16 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: ror w8, w8, #3 -; CHECK-NEXT: cmp w8, #4 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #-415 // =0xfffffe61 +; CHECK-NEXT: mov w9, #104 // =0x68 +; CHECK-NEXT: movk w9, #16384, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #62 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 1073741928 %cmp = icmp eq i32 %urem, 0 @@ -132,12 +142,15 @@ define i32 @test_urem_even_bit31(i32 %X) nounwind { ; CHECK-LABEL: test_urem_even_bit31: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #64251 -; CHECK-NEXT: movk w8, #47866, lsl #16 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: cmp w8, #2 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #65435 // =0xff9b +; CHECK-NEXT: mov w9, #102 // =0x66 +; CHECK-NEXT: movk w8, #32767, lsl #16 +; CHECK-NEXT: movk w9, #32768, lsl #16 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #62 +; CHECK-NEXT: msub w8, w8, w9, w0 +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 2147483750 %cmp = icmp eq i32 %urem, 0 @@ -153,12 +166,13 @@ define i32 @test_urem_odd_setne(i32 %X) nounwind { ; CHECK-LABEL: test_urem_odd_setne: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: mov w9, #858993459 +; CHECK-NEXT: mov w8, #52429 // =0xcccd ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #34 +; CHECK-NEXT: add w8, w8, w8, lsl #2 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %urem = urem i32 %X, 5 %cmp = icmp ne i32 %urem, 0 @@ -170,10 +184,12 @@ define i32 @test_urem_negative_odd(i32 %X) nounwind { ; CHECK-LABEL: test_urem_negative_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #858993459 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: mov w8, #-2147483645 // =0x80000003 +; CHECK-NEXT: umull x8, w0, w8 +; CHECK-NEXT: lsr x8, x8, #63 +; CHECK-NEXT: orr w8, w8, w8, lsl #2 +; CHECK-NEXT: cmn w0, w8 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %urem = urem i32 %X, -5 %cmp = icmp ne i32 %urem, 0 @@ -183,12 +199,13 @@ define i32 @test_urem_negative_even(i32 %X) nounwind { ; CHECK-LABEL: test_urem_negative_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #37449 -; CHECK-NEXT: movk w8, #51492, lsl #16 -; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: lsr w9, w0, #1 +; CHECK-NEXT: mov w8, #-14 // =0xfffffff2 +; CHECK-NEXT: add x9, x9, x9, lsl #28 +; CHECK-NEXT: lsr x9, x9, #59 +; CHECK-NEXT: msub w8, w9, w8, w0 +; 
CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %urem = urem i32 %X, -14 %cmp = icmp ne i32 %urem, 0 @@ -204,7 +221,7 @@ define i32 @test_urem_one(i32 %X) nounwind { ; CHECK-LABEL: test_urem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret %urem = urem i32 %X, 1 %cmp = icmp eq i32 %urem, 0 @@ -242,9 +259,10 @@ define i32 @test_urem_allones(i32 %X) nounwind { ; CHECK-LABEL: test_urem_allones: ; CHECK: // %bb.0: -; CHECK-NEXT: neg w8, w0 -; CHECK-NEXT: cmp w8, #2 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: cmn w0, #1 +; CHECK-NEXT: csel w8, wzr, w0, eq +; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %urem = urem i32 %X, 4294967295 %cmp = icmp eq i32 %urem, 0 diff --git a/llvm/test/CodeGen/AArch64/ushl_sat.ll b/llvm/test/CodeGen/AArch64/ushl_sat.ll --- a/llvm/test/CodeGen/AArch64/ushl_sat.ll +++ b/llvm/test/CodeGen/AArch64/ushl_sat.ll @@ -74,7 +74,7 @@ define i16 @combine_shlsat_constfold(i16 %x, i16 %y) nounwind { ; CHECK-LABEL: combine_shlsat_constfold: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #32 +; CHECK-NEXT: mov w0, #32 // =0x20 ; CHECK-NEXT: ret %tmp = call i16 @llvm.ushl.sat.i16(i16 8, i16 2) ret i16 %tmp @@ -84,7 +84,7 @@ define i16 @combine_shlsat_satmax(i16 %x, i16 %y) nounwind { ; CHECK-LABEL: combine_shlsat_satmax: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w0, #65535 +; CHECK-NEXT: mov w0, #65535 // =0xffff ; CHECK-NEXT: ret %tmp = call i16 @llvm.ushl.sat.i16(i16 8, i16 15) ret i16 %tmp @@ -98,8 +98,8 @@ ; CHECK-LABEL: combine_shlsat_vector: ; CHECK: // %bb.0: ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: mov w0, #32 -; CHECK-NEXT: mov w1, #65535 +; CHECK-NEXT: mov w0, #32 // =0x20 +; CHECK-NEXT: mov w1, #65535 // =0xffff ; CHECK-NEXT: bl sink2xi16 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -128,9 +128,9 @@ define i16 @combine_shlsat_to_shl_no_fold(i16 %x) nounwind { ; CHECK-LABEL: combine_shlsat_to_shl_no_fold: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xfffc -; CHECK-NEXT: lsl w9, w8, #17 -; CHECK-NEXT: lsl w8, w8, #14 +; CHECK-NEXT: lsl w8, w0, #14 +; CHECK-NEXT: and w8, w8, #0x3fff0000 +; CHECK-NEXT: lsl w9, w8, #3 ; CHECK-NEXT: cmp w8, w9, lsr #3 ; CHECK-NEXT: csinv w8, w9, wzr, eq ; CHECK-NEXT: lsr w0, w8, #16 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll @@ -102,6 +102,7 @@ ; CHECK-NEXT: adrp x8, .LCPI8_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.b[15], wzr ; CHECK-NEXT: addv b0, v0.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -2066,115 +2066,98 @@ } define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { -; CHECK-BASE-LABEL: full: -; CHECK-BASE: // %bb.0: // %entry -; CHECK-BASE-NEXT: // kill: def $w3 killed $w3 def $x3 -; CHECK-BASE-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-BASE-NEXT: sxtw x8, w1 -; CHECK-BASE-NEXT: sxtw x10, w3 -; CHECK-BASE-NEXT: add x9, x0, x8 -; CHECK-BASE-NEXT: ldr d0, [x0] -; CHECK-BASE-NEXT: ldr d1, [x2] -; CHECK-BASE-NEXT: add x11, x2, x10 -; CHECK-BASE-NEXT: ldr d2, [x9] -; CHECK-BASE-NEXT: add x9, x9, 
x8 -; CHECK-BASE-NEXT: uabdl v0.8h, v0.8b, v1.8b -; CHECK-BASE-NEXT: ldr d1, [x11] -; CHECK-BASE-NEXT: add x11, x11, x10 -; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h -; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v1.8b -; CHECK-BASE-NEXT: ldr d2, [x9] -; CHECK-BASE-NEXT: ldr d3, [x11] -; CHECK-BASE-NEXT: add x9, x9, x8 -; CHECK-BASE-NEXT: add x11, x11, x10 -; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h -; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b -; CHECK-BASE-NEXT: ldr d2, [x9] -; CHECK-BASE-NEXT: ldr d3, [x11] -; CHECK-BASE-NEXT: add x9, x9, x8 -; CHECK-BASE-NEXT: add x11, x11, x10 -; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h -; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b -; CHECK-BASE-NEXT: ldr d2, [x9] -; CHECK-BASE-NEXT: ldr d3, [x11] -; CHECK-BASE-NEXT: add x9, x9, x8 -; CHECK-BASE-NEXT: add x11, x11, x10 -; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h -; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b -; CHECK-BASE-NEXT: ldr d2, [x9] -; CHECK-BASE-NEXT: ldr d3, [x11] -; CHECK-BASE-NEXT: add x9, x9, x8 -; CHECK-BASE-NEXT: add x11, x11, x10 -; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h -; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b -; CHECK-BASE-NEXT: ldr d2, [x9] -; CHECK-BASE-NEXT: ldr d3, [x11] -; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h -; CHECK-BASE-NEXT: ldr d1, [x9, x8] -; CHECK-BASE-NEXT: uabdl v2.8h, v2.8b, v3.8b -; CHECK-BASE-NEXT: ldr d3, [x11, x10] -; CHECK-BASE-NEXT: uadalp v0.4s, v2.8h -; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v3.8b -; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h -; CHECK-BASE-NEXT: addv s0, v0.4s -; CHECK-BASE-NEXT: fmov w0, s0 -; CHECK-BASE-NEXT: ret -; -; CHECK-DOT-LABEL: full: -; CHECK-DOT: // %bb.0: // %entry -; CHECK-DOT-NEXT: // kill: def $w3 killed $w3 def $x3 -; CHECK-DOT-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-DOT-NEXT: sxtw x8, w3 -; CHECK-DOT-NEXT: sxtw x9, w1 -; CHECK-DOT-NEXT: ldr d0, [x0] -; CHECK-DOT-NEXT: add x10, x0, x9 -; CHECK-DOT-NEXT: ldr d1, [x2] -; CHECK-DOT-NEXT: add x11, x2, x8 -; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000 -; CHECK-DOT-NEXT: movi v3.8b, #1 -; CHECK-DOT-NEXT: uabd v0.8b, v0.8b, v1.8b -; CHECK-DOT-NEXT: ldr d1, [x10] -; CHECK-DOT-NEXT: ldr d4, [x11] -; CHECK-DOT-NEXT: add x10, x10, x9 -; CHECK-DOT-NEXT: add x11, x11, x8 -; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b -; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b -; CHECK-DOT-NEXT: ldr d1, [x10] -; CHECK-DOT-NEXT: ldr d4, [x11] -; CHECK-DOT-NEXT: add x10, x10, x9 -; CHECK-DOT-NEXT: add x11, x11, x8 -; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b -; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b -; CHECK-DOT-NEXT: ldr d1, [x10] -; CHECK-DOT-NEXT: ldr d4, [x11] -; CHECK-DOT-NEXT: add x10, x10, x9 -; CHECK-DOT-NEXT: add x11, x11, x8 -; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b -; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b -; CHECK-DOT-NEXT: ldr d1, [x10] -; CHECK-DOT-NEXT: ldr d4, [x11] -; CHECK-DOT-NEXT: add x10, x10, x9 -; CHECK-DOT-NEXT: add x11, x11, x8 -; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b -; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b -; CHECK-DOT-NEXT: ldr d1, [x10] -; CHECK-DOT-NEXT: ldr d4, [x11] -; CHECK-DOT-NEXT: add x10, x10, x9 -; CHECK-DOT-NEXT: add x11, x11, x8 -; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b -; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b -; CHECK-DOT-NEXT: ldr d1, [x10] -; CHECK-DOT-NEXT: ldr d4, [x11] -; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b -; CHECK-DOT-NEXT: ldr d0, [x10, x9] -; CHECK-DOT-NEXT: uabd v1.8b, v1.8b, v4.8b -; CHECK-DOT-NEXT: ldr d4, [x11, x8] -; CHECK-DOT-NEXT: udot v2.2s, v1.8b, v3.8b -; CHECK-DOT-NEXT: uabd v0.8b, v0.8b, v4.8b -; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b -; 
CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s -; CHECK-DOT-NEXT: fmov w0, s0 -; CHECK-DOT-NEXT: ret +; CHECK-LABEL: full: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: sxtw x8, w3 +; CHECK-NEXT: add x10, x0, x9 +; CHECK-NEXT: add x11, x2, x8 +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: add x12, x10, x9 +; CHECK-NEXT: ldr d1, [x2] +; CHECK-NEXT: ldr d2, [x10] +; CHECK-NEXT: add x10, x11, x8 +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: add x11, x12, x9 +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ldr d1, [x12] +; CHECK-NEXT: ldr d4, [x10] +; CHECK-NEXT: add x10, x10, x8 +; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: sshll v3.4s, v0.4h, #0 +; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-NEXT: usubl v1.8h, v1.8b, v4.8b +; CHECK-NEXT: sshll2 v4.4s, v2.8h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: abs v0.4s, v0.4s +; CHECK-NEXT: abs v3.4s, v3.4s +; CHECK-NEXT: abs v4.4s, v4.4s +; CHECK-NEXT: abs v2.4s, v2.4s +; CHECK-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: sshll v5.4s, v1.4h, #0 +; CHECK-NEXT: add x11, x11, x9 +; CHECK-NEXT: add v2.4s, v2.4s, v4.4s +; CHECK-NEXT: ldr d4, [x10] +; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 +; CHECK-NEXT: add x10, x10, x8 +; CHECK-NEXT: abs v5.4s, v5.4s +; CHECK-NEXT: abs v1.4s, v1.4s +; CHECK-NEXT: usubl v3.8h, v3.8b, v4.8b +; CHECK-NEXT: ldr d4, [x11] +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add x11, x11, x9 +; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: ldr d5, [x10] +; CHECK-NEXT: sshll2 v2.4s, v3.8h, #0 +; CHECK-NEXT: add x10, x10, x8 +; CHECK-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-NEXT: abs v2.4s, v2.4s +; CHECK-NEXT: abs v3.4s, v3.4s +; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b +; CHECK-NEXT: ldr d5, [x10] +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add x10, x10, x8 +; CHECK-NEXT: add v1.4s, v3.4s, v2.4s +; CHECK-NEXT: ldr d3, [x11] +; CHECK-NEXT: sshll2 v2.4s, v4.8h, #0 +; CHECK-NEXT: add x11, x11, x9 +; CHECK-NEXT: sshll v4.4s, v4.4h, #0 +; CHECK-NEXT: abs v2.4s, v2.4s +; CHECK-NEXT: abs v4.4s, v4.4s +; CHECK-NEXT: usubl v3.8h, v3.8b, v5.8b +; CHECK-NEXT: ldr d5, [x10] +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v4.4s, v2.4s +; CHECK-NEXT: ldr d4, [x11] +; CHECK-NEXT: sshll2 v2.4s, v3.8h, #0 +; CHECK-NEXT: sshll v3.4s, v3.4h, #0 +; CHECK-NEXT: abs v2.4s, v2.4s +; CHECK-NEXT: abs v3.4s, v3.4s +; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b +; CHECK-NEXT: ldr d5, [x10, x8] +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v3.4s, v2.4s +; CHECK-NEXT: ldr d2, [x11, x9] +; CHECK-NEXT: sshll2 v3.4s, v4.8h, #0 +; CHECK-NEXT: sshll v4.4s, v4.4h, #0 +; CHECK-NEXT: abs v3.4s, v3.4s +; CHECK-NEXT: abs v4.4s, v4.4s +; CHECK-NEXT: usubl v2.8h, v2.8b, v5.8b +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v4.4s, v3.4s +; CHECK-NEXT: sshll2 v3.4s, v2.8h, #0 +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: abs v3.4s, v3.4s +; CHECK-NEXT: abs v2.4s, v2.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret entry: %idx.ext8 = sext i32 %s2 to i64 %idx.ext = sext i32 %s1 to i64 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ 
b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -117,6 +117,7 @@ ; CHECK-NEXT: adrp x8, .LCPI9_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.b[15], wzr ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll --- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -218,9 +218,9 @@ ; ALL-NEXT: stp q0, q0, [sp] ; ALL-NEXT: stp x10, x11, [sp, #48] ; ALL-NEXT: str q1, [sp, #32] -; ALL-NEXT: ldp x9, x10, [x8, #16] +; ALL-NEXT: ldp x10, x9, [x8, #16] ; ALL-NEXT: ldr q0, [x8] -; ALL-NEXT: stp x9, x10, [x2, #16] +; ALL-NEXT: stp x10, x9, [x2, #16] ; ALL-NEXT: str q0, [x2] ; ALL-NEXT: add sp, sp, #64 ; ALL-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll --- a/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll @@ -167,29 +167,29 @@ ; ALL-NEXT: ubfx x12, x9, #3, #5 ; ALL-NEXT: add x8, x8, x12 ; ALL-NEXT: and x9, x9, #0x7 +; ALL-NEXT: mvn w12, w9 +; ALL-NEXT: eor x14, x9, #0x3f ; ALL-NEXT: stp q0, q0, [sp, #32] ; ALL-NEXT: stp x10, x11, [sp, #16] -; ALL-NEXT: eor x11, x9, #0x3f ; ALL-NEXT: str q1, [sp] -; ALL-NEXT: ldp x10, x13, [x8, #8] -; ALL-NEXT: ldr x12, [x8, #24] -; ALL-NEXT: ldr x8, [x8] -; ALL-NEXT: lsl x14, x10, #1 +; ALL-NEXT: ldp x13, x10, [x8, #8] +; ALL-NEXT: ldr x11, [x8] +; ALL-NEXT: ldr x8, [x8, #24] +; ALL-NEXT: lsl x15, x10, #1 +; ALL-NEXT: lsr x11, x11, x9 ; ALL-NEXT: lsr x10, x10, x9 -; ALL-NEXT: lsl x15, x12, #1 -; ALL-NEXT: lsl x14, x14, x11 -; ALL-NEXT: lsl x11, x15, x11 -; ALL-NEXT: mvn w15, w9 +; ALL-NEXT: lsl x12, x15, x12 +; ALL-NEXT: lsl x15, x8, #1 ; ALL-NEXT: lsr x8, x8, x9 -; ALL-NEXT: lsr x12, x12, x9 ; ALL-NEXT: lsr x9, x13, x9 -; ALL-NEXT: orr x8, x8, x14 -; ALL-NEXT: orr x9, x9, x11 -; ALL-NEXT: lsl x11, x13, #1 -; ALL-NEXT: lsl x11, x11, x15 -; ALL-NEXT: orr x10, x10, x11 -; ALL-NEXT: stp x9, x12, [x2, #16] -; ALL-NEXT: stp x8, x10, [x2] +; ALL-NEXT: lsl x13, x13, #1 +; ALL-NEXT: lsl x15, x15, x14 +; ALL-NEXT: lsl x13, x13, x14 +; ALL-NEXT: orr x10, x10, x15 +; ALL-NEXT: orr x9, x9, x12 +; ALL-NEXT: orr x11, x11, x13 +; ALL-NEXT: stp x10, x8, [x2, #16] +; ALL-NEXT: stp x11, x9, [x2] ; ALL-NEXT: add sp, sp, #64 ; ALL-NEXT: ret %src = load i256, ptr %src.ptr, align 1 @@ -216,24 +216,23 @@ ; ALL-NEXT: stp q0, q0, [sp] ; ALL-NEXT: stp x10, x11, [sp, #48] ; ALL-NEXT: str q1, [sp, #32] -; ALL-NEXT: ldp x11, x10, [x8, #8] -; ALL-NEXT: ldr x13, [x8] -; ALL-NEXT: ldr x8, [x8, #24] -; ALL-NEXT: lsr x15, x11, #1 -; ALL-NEXT: lsl x11, x11, x9 -; ALL-NEXT: lsr x16, x10, #1 -; ALL-NEXT: lsr x12, x15, x12 -; ALL-NEXT: lsr x15, x13, #1 -; ALL-NEXT: lsr x16, x16, x14 -; ALL-NEXT: lsr x14, x15, x14 -; ALL-NEXT: lsl x13, x13, x9 +; ALL-NEXT: ldp x11, x10, [x8] +; ALL-NEXT: ldp x13, x8, [x8, #16] +; ALL-NEXT: lsl x15, x10, x9 +; ALL-NEXT: lsr x10, x10, #1 +; ALL-NEXT: lsr x10, x10, x12 +; ALL-NEXT: lsr x12, x13, #1 ; ALL-NEXT: lsl x8, x8, x9 -; ALL-NEXT: lsl x9, x10, x9 -; ALL-NEXT: orr x11, x11, x14 -; ALL-NEXT: orr x8, x8, x16 -; ALL-NEXT: orr x9, x9, x12 -; ALL-NEXT: stp x13, x11, [x2] -; ALL-NEXT: stp x9, x8, [x2, #16] +; ALL-NEXT: lsr x12, 
x12, x14 +; ALL-NEXT: orr x8, x8, x12 +; ALL-NEXT: lsr x12, x11, #1 +; ALL-NEXT: lsl x13, x13, x9 +; ALL-NEXT: lsl x9, x11, x9 +; ALL-NEXT: lsr x11, x12, x14 +; ALL-NEXT: orr x10, x13, x10 +; ALL-NEXT: orr x11, x15, x11 +; ALL-NEXT: stp x10, x8, [x2, #16] +; ALL-NEXT: stp x9, x11, [x2] ; ALL-NEXT: add sp, sp, #64 ; ALL-NEXT: ret %src = load i256, ptr %src.ptr, align 1 @@ -260,24 +259,24 @@ ; ALL-NEXT: eor x14, x9, #0x3f ; ALL-NEXT: stp x12, x12, [sp, #32] ; ALL-NEXT: mvn w12, w9 -; ALL-NEXT: ldp x10, x11, [x8, #8] -; ALL-NEXT: ldr x13, [x8, #24] -; ALL-NEXT: ldr x8, [x8] -; ALL-NEXT: lsl x16, x10, #1 -; ALL-NEXT: lsl x15, x11, #1 -; ALL-NEXT: lsl x16, x16, x14 -; ALL-NEXT: lsl x12, x15, x12 -; ALL-NEXT: lsl x15, x13, #1 -; ALL-NEXT: lsl x14, x15, x14 +; ALL-NEXT: ldp x13, x10, [x8, #8] +; ALL-NEXT: ldr x11, [x8] +; ALL-NEXT: ldr x8, [x8, #24] +; ALL-NEXT: lsl x15, x10, #1 ; ALL-NEXT: lsr x11, x11, x9 -; ALL-NEXT: asr x13, x13, x9 -; ALL-NEXT: lsr x8, x8, x9 -; ALL-NEXT: lsr x9, x10, x9 -; ALL-NEXT: orr x11, x11, x14 -; ALL-NEXT: orr x8, x8, x16 +; ALL-NEXT: lsr x10, x10, x9 +; ALL-NEXT: lsl x12, x15, x12 +; ALL-NEXT: lsl x15, x8, #1 +; ALL-NEXT: asr x8, x8, x9 +; ALL-NEXT: lsr x9, x13, x9 +; ALL-NEXT: lsl x13, x13, #1 +; ALL-NEXT: lsl x15, x15, x14 +; ALL-NEXT: lsl x13, x13, x14 +; ALL-NEXT: orr x10, x10, x15 ; ALL-NEXT: orr x9, x9, x12 -; ALL-NEXT: stp x11, x13, [x2, #16] -; ALL-NEXT: stp x8, x9, [x2] +; ALL-NEXT: orr x11, x11, x13 +; ALL-NEXT: stp x10, x8, [x2, #16] +; ALL-NEXT: stp x11, x9, [x2] ; ALL-NEXT: add sp, sp, #64 ; ALL-NEXT: ret %src = load i256, ptr %src.ptr, align 1 diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -910,24 +910,24 @@ ; CHECK-NEXT: ushll.8h v0, v0, #0 ; CHECK-NEXT: ushll2.4s v1, v0, #0 ; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: mov.s w11, v1[1] +; CHECK-NEXT: mov.s w10, v1[1] ; CHECK-NEXT: mov.s w13, v0[1] -; CHECK-NEXT: fmov w12, s1 -; CHECK-NEXT: mov.s w14, v1[2] +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: mov.s w12, v1[2] ; CHECK-NEXT: fmov w15, s0 ; CHECK-NEXT: mov.s w16, v0[2] ; CHECK-NEXT: mov.s w9, v1[3] -; CHECK-NEXT: mov.s w10, v0[3] -; CHECK-NEXT: orr x11, x12, x11, lsl #20 -; CHECK-NEXT: orr x12, x15, x13, lsl #20 -; CHECK-NEXT: orr x11, x11, x14, lsl #40 -; CHECK-NEXT: orr x12, x12, x16, lsl #40 -; CHECK-NEXT: lsr w13, w9, #4 -; CHECK-NEXT: lsr w14, w10, #4 -; CHECK-NEXT: orr x9, x11, x9, lsl #60 -; CHECK-NEXT: orr x10, x12, x10, lsl #60 +; CHECK-NEXT: mov.s w14, v0[3] +; CHECK-NEXT: orr x10, x11, x10, lsl #20 +; CHECK-NEXT: orr x11, x15, x13, lsl #20 +; CHECK-NEXT: orr x10, x10, x12, lsl #40 +; CHECK-NEXT: orr x11, x11, x16, lsl #40 +; CHECK-NEXT: lsr x13, x9, #4 +; CHECK-NEXT: lsr x12, x14, #4 +; CHECK-NEXT: orr x9, x10, x9, lsl #60 +; CHECK-NEXT: orr x10, x11, x14, lsl #60 ; CHECK-NEXT: strh w13, [x1, #18] -; CHECK-NEXT: strh w14, [x1, #8] +; CHECK-NEXT: strh w12, [x1, #8] ; CHECK-NEXT: stur x9, [x1, #10] ; CHECK-NEXT: str x10, [x1], #64 ; CHECK-NEXT: b.ne LBB10_1 @@ -947,26 +947,28 @@ ; CHECK-BE-NEXT: ushll2 v1.4s, v0.8h, #0 ; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: mov w9, v1.s[1] -; CHECK-BE-NEXT: mov w11, v0.s[1] -; CHECK-BE-NEXT: mov w13, v1.s[2] -; CHECK-BE-NEXT: fmov w14, s1 +; CHECK-BE-NEXT: mov w13, v0.s[1] +; CHECK-BE-NEXT: fmov w10, s1 +; CHECK-BE-NEXT: mov w11, v1.s[2] +; CHECK-BE-NEXT: fmov w14, s0 ; CHECK-BE-NEXT: mov w15, v0.s[2] -; CHECK-BE-NEXT: fmov w16, s0 -; 
CHECK-BE-NEXT: mov w10, v1.s[3] +; CHECK-BE-NEXT: mov w12, v1.s[3] ; CHECK-BE-NEXT: lsl x9, x9, #40 -; CHECK-BE-NEXT: mov w12, v0.s[3] -; CHECK-BE-NEXT: lsl x11, x11, #40 -; CHECK-BE-NEXT: orr x9, x9, x14, lsl #60 -; CHECK-BE-NEXT: orr x11, x11, x16, lsl #60 -; CHECK-BE-NEXT: orr x9, x9, x13, lsl #20 -; CHECK-BE-NEXT: orr x11, x11, x15, lsl #20 -; CHECK-BE-NEXT: lsr w13, w14, #4 -; CHECK-BE-NEXT: lsr w14, w16, #4 -; CHECK-BE-NEXT: strh w10, [x1, #18] -; CHECK-BE-NEXT: extr x9, x13, x9, #16 -; CHECK-BE-NEXT: strh w12, [x1, #8] -; CHECK-BE-NEXT: extr x10, x14, x11, #16 +; CHECK-BE-NEXT: orr x9, x9, x10, lsl #60 +; CHECK-BE-NEXT: lsr x10, x10, #4 +; CHECK-BE-NEXT: lsl x13, x13, #40 +; CHECK-BE-NEXT: orr x9, x9, x11, lsl #20 +; CHECK-BE-NEXT: orr x13, x13, x14, lsl #60 +; CHECK-BE-NEXT: lsr x14, x14, #4 +; CHECK-BE-NEXT: orr x13, x13, x15, lsl #20 +; CHECK-BE-NEXT: lsr x9, x9, #16 +; CHECK-BE-NEXT: mov w11, v0.s[3] +; CHECK-BE-NEXT: bfi x9, x10, #48, #4 +; CHECK-BE-NEXT: lsr x10, x13, #16 +; CHECK-BE-NEXT: strh w12, [x1, #18] +; CHECK-BE-NEXT: bfi x10, x14, #48, #4 ; CHECK-BE-NEXT: stur x9, [x1, #10] +; CHECK-BE-NEXT: strh w11, [x1, #8] ; CHECK-BE-NEXT: str x10, [x1], #64 ; CHECK-BE-NEXT: b.ne .LBB10_1 ; CHECK-BE-NEXT: // %bb.2: // %exit @@ -994,18 +996,22 @@ define void @zext_v4i8_to_v4i32_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v4i8_to_v4i32_in_loop: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh12: +; CHECK-NEXT: adrp x9, lCPI11_0@PAGE ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: Lloh13: +; CHECK-NEXT: ldr q0, [x9, lCPI11_0@PAGEOFF] ; CHECK-NEXT: LBB11_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr s0, [x0, x8] +; CHECK-NEXT: ldr s1, [x0, x8] ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ushll.4s v0, v0, #0 -; CHECK-NEXT: str q0, [x1], #64 +; CHECK-NEXT: tbl.16b v1, { v1 }, v0 +; CHECK-NEXT: str q1, [x1], #64 ; CHECK-NEXT: b.ne LBB11_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh13 ; ; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop: ; CHECK-BE: // %bb.0: // %entry @@ -1152,18 +1158,18 @@ define void @zext_v12i8_to_v12i32_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v12i8_to_v12i32_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh12: +; CHECK-NEXT: Lloh14: ; CHECK-NEXT: adrp x9, lCPI12_0@PAGE -; CHECK-NEXT: Lloh13: +; CHECK-NEXT: Lloh15: ; CHECK-NEXT: adrp x10, lCPI12_1@PAGE -; CHECK-NEXT: Lloh14: +; CHECK-NEXT: Lloh16: ; CHECK-NEXT: adrp x11, lCPI12_2@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh15: +; CHECK-NEXT: Lloh17: ; CHECK-NEXT: ldr q0, [x9, lCPI12_0@PAGEOFF] -; CHECK-NEXT: Lloh16: +; CHECK-NEXT: Lloh18: ; CHECK-NEXT: ldr q1, [x10, lCPI12_1@PAGEOFF] -; CHECK-NEXT: Lloh17: +; CHECK-NEXT: Lloh19: ; CHECK-NEXT: ldr q2, [x11, lCPI12_2@PAGEOFF] ; CHECK-NEXT: LBB12_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1178,9 +1184,9 @@ ; CHECK-NEXT: b.ne LBB12_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh19 +; CHECK-NEXT: .loh AdrpLdr Lloh15, Lloh18 ; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh17 -; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh16 -; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh15 ; ; CHECK-BE-LABEL: zext_v12i8_to_v12i32_in_loop: ; CHECK-BE: // %bb.0: // %entry @@ -1669,15 +1675,15 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v8i8_to_v8i64_with_add_in_sequence_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh18: +; 
CHECK-NEXT: Lloh20: ; CHECK-NEXT: adrp x9, lCPI17_0@PAGE -; CHECK-NEXT: Lloh19: +; CHECK-NEXT: Lloh21: ; CHECK-NEXT: adrp x10, lCPI17_1@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh20: +; CHECK-NEXT: Lloh22: ; CHECK-NEXT: ldr q0, [x9, lCPI17_0@PAGEOFF] ; CHECK-NEXT: add x9, x0, #8 -; CHECK-NEXT: Lloh21: +; CHECK-NEXT: Lloh23: ; CHECK-NEXT: ldr q1, [x10, lCPI17_1@PAGEOFF] ; CHECK-NEXT: LBB17_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1709,8 +1715,8 @@ ; CHECK-NEXT: b.ne LBB17_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh21 -; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh20 +; CHECK-NEXT: .loh AdrpLdr Lloh21, Lloh23 +; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh22 ; ; CHECK-BE-LABEL: zext_v8i8_to_v8i64_with_add_in_sequence_in_loop: ; CHECK-BE: // %bb.0: // %entry @@ -1971,8 +1977,8 @@ ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add x9, x0, x8 ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x8] -; CHECK-NEXT: ld1b { z1.s }, p0/z, [x9, #2, mul vl] -; CHECK-NEXT: ld1b { z2.s }, p0/z, [x9, #3, mul vl] +; CHECK-NEXT: ld1b { z1.s }, p0/z, [x9, #3, mul vl] +; CHECK-NEXT: ld1b { z2.s }, p0/z, [x9, #2, mul vl] ; CHECK-NEXT: ld1b { z3.s }, p0/z, [x9, #1, mul vl] ; CHECK-NEXT: add z0.s, z0.s, z0.s ; CHECK-NEXT: add x9, x1, x8, lsl #2 @@ -1982,8 +1988,8 @@ ; CHECK-NEXT: add z1.s, z1.s, z1.s ; CHECK-NEXT: add z0.s, z3.s, z3.s ; CHECK-NEXT: add z2.s, z2.s, z2.s -; CHECK-NEXT: st1w { z1.s }, p0, [x9, #2, mul vl] -; CHECK-NEXT: st1w { z2.s }, p0, [x9, #3, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [x9, #3, mul vl] +; CHECK-NEXT: st1w { z2.s }, p0, [x9, #2, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [x9, #1, mul vl] ; CHECK-NEXT: b.ne LBB19_1 ; CHECK-NEXT: ; %bb.2: ; %exit @@ -1997,8 +2003,8 @@ ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: add x9, x0, x8 ; CHECK-BE-NEXT: ld1b { z0.s }, p0/z, [x0, x8] -; CHECK-BE-NEXT: ld1b { z1.s }, p0/z, [x9, #2, mul vl] -; CHECK-BE-NEXT: ld1b { z2.s }, p0/z, [x9, #3, mul vl] +; CHECK-BE-NEXT: ld1b { z1.s }, p0/z, [x9, #3, mul vl] +; CHECK-BE-NEXT: ld1b { z2.s }, p0/z, [x9, #2, mul vl] ; CHECK-BE-NEXT: ld1b { z3.s }, p0/z, [x9, #1, mul vl] ; CHECK-BE-NEXT: add z0.s, z0.s, z0.s ; CHECK-BE-NEXT: add x9, x1, x8, lsl #2 @@ -2008,8 +2014,8 @@ ; CHECK-BE-NEXT: add z1.s, z1.s, z1.s ; CHECK-BE-NEXT: add z0.s, z3.s, z3.s ; CHECK-BE-NEXT: add z2.s, z2.s, z2.s -; CHECK-BE-NEXT: st1w { z1.s }, p0, [x9, #2, mul vl] -; CHECK-BE-NEXT: st1w { z2.s }, p0, [x9, #3, mul vl] +; CHECK-BE-NEXT: st1w { z1.s }, p0, [x9, #3, mul vl] +; CHECK-BE-NEXT: st1w { z2.s }, p0, [x9, #2, mul vl] ; CHECK-BE-NEXT: st1w { z0.s }, p0, [x9, #1, mul vl] ; CHECK-BE-NEXT: b.ne .LBB19_1 ; CHECK-BE-NEXT: // %bb.2: // %exit @@ -2174,22 +2180,22 @@ define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v20i8_to_v20i24_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh22: +; CHECK-NEXT: Lloh24: ; CHECK-NEXT: adrp x9, lCPI20_0@PAGE -; CHECK-NEXT: Lloh23: +; CHECK-NEXT: Lloh25: ; CHECK-NEXT: adrp x10, lCPI20_1@PAGE -; CHECK-NEXT: Lloh24: +; CHECK-NEXT: Lloh26: ; CHECK-NEXT: adrp x11, lCPI20_2@PAGE -; CHECK-NEXT: Lloh25: +; CHECK-NEXT: Lloh27: ; CHECK-NEXT: adrp x12, lCPI20_3@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh26: +; CHECK-NEXT: Lloh28: ; CHECK-NEXT: ldr q0, [x9, lCPI20_0@PAGEOFF] -; CHECK-NEXT: Lloh27: +; CHECK-NEXT: Lloh29: ; CHECK-NEXT: ldr q1, [x10, lCPI20_1@PAGEOFF] -; CHECK-NEXT: Lloh28: +; CHECK-NEXT: Lloh30: ; CHECK-NEXT: ldr q2, [x11, lCPI20_2@PAGEOFF] -; CHECK-NEXT: Lloh29: 
+; CHECK-NEXT: Lloh31: ; CHECK-NEXT: ldr q3, [x12, lCPI20_3@PAGEOFF] ; CHECK-NEXT: LBB20_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2210,10 +2216,10 @@ ; CHECK-NEXT: b.ne LBB20_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh27, Lloh31 +; CHECK-NEXT: .loh AdrpLdr Lloh26, Lloh30 ; CHECK-NEXT: .loh AdrpLdr Lloh25, Lloh29 ; CHECK-NEXT: .loh AdrpLdr Lloh24, Lloh28 -; CHECK-NEXT: .loh AdrpLdr Lloh23, Lloh27 -; CHECK-NEXT: .loh AdrpLdr Lloh22, Lloh26 ; ; CHECK-BE-LABEL: zext_v20i8_to_v20i24_in_loop: ; CHECK-BE: // %bb.0: // %entry @@ -2501,30 +2507,30 @@ define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v23i8_to_v23i48_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh30: +; CHECK-NEXT: Lloh32: ; CHECK-NEXT: adrp x9, lCPI21_0@PAGE -; CHECK-NEXT: Lloh31: +; CHECK-NEXT: Lloh33: ; CHECK-NEXT: adrp x10, lCPI21_1@PAGE -; CHECK-NEXT: Lloh32: +; CHECK-NEXT: Lloh34: ; CHECK-NEXT: adrp x11, lCPI21_2@PAGE ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: Lloh33: +; CHECK-NEXT: Lloh35: ; CHECK-NEXT: ldr q0, [x9, lCPI21_0@PAGEOFF] -; CHECK-NEXT: Lloh34: +; CHECK-NEXT: Lloh36: ; CHECK-NEXT: adrp x9, lCPI21_3@PAGE -; CHECK-NEXT: Lloh35: +; CHECK-NEXT: Lloh37: ; CHECK-NEXT: ldr q1, [x10, lCPI21_1@PAGEOFF] -; CHECK-NEXT: Lloh36: +; CHECK-NEXT: Lloh38: ; CHECK-NEXT: adrp x10, lCPI21_4@PAGE -; CHECK-NEXT: Lloh37: +; CHECK-NEXT: Lloh39: ; CHECK-NEXT: ldr q2, [x11, lCPI21_2@PAGEOFF] -; CHECK-NEXT: Lloh38: +; CHECK-NEXT: Lloh40: ; CHECK-NEXT: adrp x11, lCPI21_5@PAGE -; CHECK-NEXT: Lloh39: +; CHECK-NEXT: Lloh41: ; CHECK-NEXT: ldr q3, [x9, lCPI21_3@PAGEOFF] -; CHECK-NEXT: Lloh40: +; CHECK-NEXT: Lloh42: ; CHECK-NEXT: ldr q4, [x10, lCPI21_4@PAGEOFF] -; CHECK-NEXT: Lloh41: +; CHECK-NEXT: Lloh43: ; CHECK-NEXT: ldr q5, [x11, lCPI21_5@PAGEOFF] ; CHECK-NEXT: LBB21_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2552,15 +2558,15 @@ ; CHECK-NEXT: b.ne LBB21_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret -; CHECK-NEXT: .loh AdrpLdr Lloh38, Lloh41 -; CHECK-NEXT: .loh AdrpLdr Lloh36, Lloh40 +; CHECK-NEXT: .loh AdrpLdr Lloh40, Lloh43 +; CHECK-NEXT: .loh AdrpLdr Lloh38, Lloh42 +; CHECK-NEXT: .loh AdrpLdr Lloh36, Lloh41 +; CHECK-NEXT: .loh AdrpAdrp Lloh34, Lloh40 ; CHECK-NEXT: .loh AdrpLdr Lloh34, Lloh39 -; CHECK-NEXT: .loh AdrpAdrp Lloh32, Lloh38 -; CHECK-NEXT: .loh AdrpLdr Lloh32, Lloh37 -; CHECK-NEXT: .loh AdrpAdrp Lloh31, Lloh36 -; CHECK-NEXT: .loh AdrpLdr Lloh31, Lloh35 -; CHECK-NEXT: .loh AdrpAdrp Lloh30, Lloh34 -; CHECK-NEXT: .loh AdrpLdr Lloh30, Lloh33 +; CHECK-NEXT: .loh AdrpAdrp Lloh33, Lloh38 +; CHECK-NEXT: .loh AdrpLdr Lloh33, Lloh37 +; CHECK-NEXT: .loh AdrpAdrp Lloh32, Lloh36 +; CHECK-NEXT: .loh AdrpLdr Lloh32, Lloh35 ; ; CHECK-BE-LABEL: zext_v23i8_to_v23i48_in_loop: ; CHECK-BE: // %bb.0: // %entry @@ -2701,29 +2707,32 @@ ; CHECK-BE-NEXT: ushll v2.2d, v1.2s, #0 ; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0 ; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-BE-NEXT: ushll2 v1.2d, v1.4s, #0 ; CHECK-BE-NEXT: mov x9, v3.d[1] +; CHECK-BE-NEXT: ushll2 v1.2d, v1.4s, #0 ; CHECK-BE-NEXT: fmov x10, d3 ; CHECK-BE-NEXT: mov x11, v0.d[1] -; CHECK-BE-NEXT: fmov x12, d0 -; CHECK-BE-NEXT: mov x13, v1.d[1] -; CHECK-BE-NEXT: mov x14, v2.d[1] -; CHECK-BE-NEXT: orr x10, x9, x10, lsl #33 -; CHECK-BE-NEXT: fmov x15, d1 +; CHECK-BE-NEXT: fmov x13, d0 +; CHECK-BE-NEXT: mov x12, v1.d[1] ; CHECK-BE-NEXT: strb w9, [x1, #32] -; CHECK-BE-NEXT: fmov x16, d2 -; CHECK-BE-NEXT: lsl x11, x11, #2 -; CHECK-BE-NEXT: lsl x13, x13, #4 -; CHECK-BE-NEXT: orr 
x12, x11, x12, lsl #35 -; CHECK-BE-NEXT: lsl x14, x14, #6 -; CHECK-BE-NEXT: orr x15, x13, x15, lsl #37 -; CHECK-BE-NEXT: extr x10, x11, x10, #8 -; CHECK-BE-NEXT: orr x11, x14, x16, lsl #39 -; CHECK-BE-NEXT: extr x12, x13, x12, #8 -; CHECK-BE-NEXT: extr x9, x14, x15, #8 -; CHECK-BE-NEXT: extr x11, xzr, x11, #8 -; CHECK-BE-NEXT: stp x12, x10, [x1, #16] -; CHECK-BE-NEXT: stp x11, x9, [x1], #128 +; CHECK-BE-NEXT: mov x15, v2.d[1] +; CHECK-BE-NEXT: orr x9, x9, x10, lsl #33 +; CHECK-BE-NEXT: fmov x14, d1 +; CHECK-BE-NEXT: lsr x9, x9, #8 +; CHECK-BE-NEXT: lsl x10, x11, #2 +; CHECK-BE-NEXT: orr x9, x9, x11, lsl #58 +; CHECK-BE-NEXT: orr x10, x10, x13, lsl #35 +; CHECK-BE-NEXT: fmov x11, d2 +; CHECK-BE-NEXT: lsl x13, x12, #4 +; CHECK-BE-NEXT: lsr x10, x10, #8 +; CHECK-BE-NEXT: orr x13, x13, x14, lsl #37 +; CHECK-BE-NEXT: orr x10, x10, x12, lsl #60 +; CHECK-BE-NEXT: lsl x12, x15, #6 +; CHECK-BE-NEXT: lsr x13, x13, #8 +; CHECK-BE-NEXT: orr x11, x12, x11, lsl #39 +; CHECK-BE-NEXT: orr x12, x13, x15, lsl #62 +; CHECK-BE-NEXT: lsr x11, x11, #8 +; CHECK-BE-NEXT: stp x10, x9, [x1, #16] +; CHECK-BE-NEXT: stp x11, x12, [x1], #128 ; CHECK-BE-NEXT: b.ne .LBB22_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret @@ -2759,9 +2768,20 @@ ; CHECK-NEXT: ldr q0, [x8, x9] ; CHECK-NEXT: subs w3, w3, #1 ; CHECK-NEXT: ldr q1, [x1, x9] -; CHECK-NEXT: uabdl.8h v2, v0, v1 -; CHECK-NEXT: uabal2.8h v2, v0, v1 -; CHECK-NEXT: uaddlv.8h s0, v2 +; CHECK-NEXT: usubl.8h v2, v0, v1 +; CHECK-NEXT: usubl2.8h v0, v0, v1 +; CHECK-NEXT: sshll2.4s v1, v2, #0 +; CHECK-NEXT: sshll2.4s v3, v0, #0 +; CHECK-NEXT: sshll.4s v0, v0, #0 +; CHECK-NEXT: sshll.4s v2, v2, #0 +; CHECK-NEXT: abs.4s v0, v0 +; CHECK-NEXT: abs.4s v3, v3 +; CHECK-NEXT: abs.4s v1, v1 +; CHECK-NEXT: abs.4s v2, v2 +; CHECK-NEXT: add.4s v1, v1, v3 +; CHECK-NEXT: add.4s v0, v2, v0 +; CHECK-NEXT: add.4s v0, v0, v1 +; CHECK-NEXT: addv.4s s0, v0 ; CHECK-NEXT: fmov w10, s0 ; CHECK-NEXT: add w0, w10, w0 ; CHECK-NEXT: b.ne LBB23_1 @@ -2780,9 +2800,20 @@ ; CHECK-BE-NEXT: ld1 { v0.16b }, [x8] ; CHECK-BE-NEXT: subs w3, w3, #1 ; CHECK-BE-NEXT: ld1 { v1.16b }, [x9] -; CHECK-BE-NEXT: uabdl v2.8h, v0.8b, v1.8b -; CHECK-BE-NEXT: uabal2 v2.8h, v0.16b, v1.16b -; CHECK-BE-NEXT: uaddlv s0, v2.8h +; CHECK-BE-NEXT: usubl v2.8h, v0.8b, v1.8b +; CHECK-BE-NEXT: usubl2 v0.8h, v0.16b, v1.16b +; CHECK-BE-NEXT: sshll2 v1.4s, v2.8h, #0 +; CHECK-BE-NEXT: sshll2 v3.4s, v0.8h, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-BE-NEXT: abs v0.4s, v0.4s +; CHECK-BE-NEXT: abs v3.4s, v3.4s +; CHECK-BE-NEXT: abs v1.4s, v1.4s +; CHECK-BE-NEXT: abs v2.4s, v2.4s +; CHECK-BE-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-BE-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-BE-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-BE-NEXT: addv s0, v0.4s ; CHECK-BE-NEXT: fmov w10, s0 ; CHECK-BE-NEXT: add w0, w10, w0 ; CHECK-BE-NEXT: b.ne .LBB23_1 @@ -2893,22 +2924,22 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-LABEL: test_widening_instr_mull_64: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh42: +; CHECK-NEXT: Lloh44: ; CHECK-NEXT: adrp x8, lCPI25_0@PAGE -; CHECK-NEXT: Lloh43: +; CHECK-NEXT: Lloh45: ; CHECK-NEXT: adrp x9, lCPI25_1@PAGE -; CHECK-NEXT: Lloh44: +; CHECK-NEXT: Lloh46: ; CHECK-NEXT: adrp x10, lCPI25_2@PAGE -; CHECK-NEXT: Lloh45: +; CHECK-NEXT: Lloh47: ; CHECK-NEXT: adrp x11, lCPI25_3@PAGE -; CHECK-NEXT: Lloh46: +; CHECK-NEXT: Lloh48: ; CHECK-NEXT: ldr q0, [x8, lCPI25_0@PAGEOFF] ; CHECK-NEXT: mov x8, x1 -; CHECK-NEXT: Lloh47: +; CHECK-NEXT: Lloh49: ; 
CHECK-NEXT: ldr q1, [x9, lCPI25_1@PAGEOFF] -; CHECK-NEXT: Lloh48: +; CHECK-NEXT: Lloh50: ; CHECK-NEXT: ldr q2, [x10, lCPI25_2@PAGEOFF] -; CHECK-NEXT: Lloh49: +; CHECK-NEXT: Lloh51: ; CHECK-NEXT: ldr q3, [x11, lCPI25_3@PAGEOFF] ; CHECK-NEXT: LBB25_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2939,10 +2970,10 @@ ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh47, Lloh51 +; CHECK-NEXT: .loh AdrpLdr Lloh46, Lloh50 ; CHECK-NEXT: .loh AdrpLdr Lloh45, Lloh49 ; CHECK-NEXT: .loh AdrpLdr Lloh44, Lloh48 -; CHECK-NEXT: .loh AdrpLdr Lloh43, Lloh47 -; CHECK-NEXT: .loh AdrpLdr Lloh42, Lloh46 ; ; CHECK-BE-LABEL: test_widening_instr_mull_64: ; CHECK-BE: // %bb.0: // %entry @@ -3040,50 +3071,50 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-LABEL: test_widening_instr_mull_2: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: Lloh50: +; CHECK-NEXT: Lloh52: ; CHECK-NEXT: adrp x8, lCPI26_0@PAGE -; CHECK-NEXT: Lloh51: +; CHECK-NEXT: Lloh53: ; CHECK-NEXT: adrp x9, lCPI26_1@PAGE -; CHECK-NEXT: Lloh52: +; CHECK-NEXT: Lloh54: ; CHECK-NEXT: adrp x10, lCPI26_2@PAGE -; CHECK-NEXT: Lloh53: +; CHECK-NEXT: Lloh55: ; CHECK-NEXT: adrp x11, lCPI26_3@PAGE -; CHECK-NEXT: Lloh54: +; CHECK-NEXT: Lloh56: ; CHECK-NEXT: ldr q0, [x8, lCPI26_0@PAGEOFF] ; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: Lloh55: +; CHECK-NEXT: Lloh57: ; CHECK-NEXT: ldr q1, [x9, lCPI26_1@PAGEOFF] -; CHECK-NEXT: Lloh56: +; CHECK-NEXT: Lloh58: ; CHECK-NEXT: ldr q2, [x10, lCPI26_2@PAGEOFF] -; CHECK-NEXT: Lloh57: +; CHECK-NEXT: Lloh59: ; CHECK-NEXT: ldr q3, [x11, lCPI26_3@PAGEOFF] ; CHECK-NEXT: LBB26_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr q4, [x1], #16 -; CHECK-NEXT: ldp q5, q6, [x0, #32] +; CHECK-NEXT: ldp q6, q5, [x0, #32] ; CHECK-NEXT: subs w2, w2, #1 ; CHECK-NEXT: tbl.16b v16, { v4 }, v0 ; CHECK-NEXT: tbl.16b v18, { v4 }, v1 -; CHECK-NEXT: tbl.16b v19, { v4 }, v2 -; CHECK-NEXT: tbl.16b v4, { v4 }, v3 -; CHECK-NEXT: ldr q7, [x0] -; CHECK-NEXT: ldr q17, [x8, #16]! +; CHECK-NEXT: tbl.16b v19, { v4 }, v3 +; CHECK-NEXT: tbl.16b v4, { v4 }, v2 +; CHECK-NEXT: ldr q17, [x0] +; CHECK-NEXT: ldr q7, [x8, #16]! 
; CHECK-NEXT: mul.4s v5, v5, v16 ; CHECK-NEXT: mul.4s v6, v6, v18 -; CHECK-NEXT: mul.4s v7, v7, v19 -; CHECK-NEXT: mul.4s v4, v17, v4 -; CHECK-NEXT: stp q5, q6, [x0, #32] -; CHECK-NEXT: str q7, [x0] +; CHECK-NEXT: mul.4s v16, v17, v19 +; CHECK-NEXT: mul.4s v4, v7, v4 +; CHECK-NEXT: stp q6, q5, [x0, #32] +; CHECK-NEXT: str q16, [x0] ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: str q4, [x8] ; CHECK-NEXT: b.ne LBB26_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh55, Lloh59 +; CHECK-NEXT: .loh AdrpLdr Lloh54, Lloh58 ; CHECK-NEXT: .loh AdrpLdr Lloh53, Lloh57 ; CHECK-NEXT: .loh AdrpLdr Lloh52, Lloh56 -; CHECK-NEXT: .loh AdrpLdr Lloh51, Lloh55 -; CHECK-NEXT: .loh AdrpLdr Lloh50, Lloh54 ; ; CHECK-BE-LABEL: test_widening_instr_mull_2: ; CHECK-BE: // %bb.0: // %entry @@ -3102,15 +3133,15 @@ ; CHECK-BE-NEXT: .LBB26_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x1] -; CHECK-BE-NEXT: add x8, x0, #32 -; CHECK-BE-NEXT: add x9, x0, #48 +; CHECK-BE-NEXT: add x8, x0, #48 +; CHECK-BE-NEXT: add x9, x0, #32 ; CHECK-BE-NEXT: add x10, x0, #16 ; CHECK-BE-NEXT: ld1 { v6.4s }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: add x1, x1, #16 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x8] -; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v1.16b -; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v0.16b +; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v0.16b +; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] ; CHECK-BE-NEXT: tbl v17.16b, { v4.16b }, v3.16b ; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v2.16b diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -1929,15 +1929,15 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 +; VI-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] +; VI-NEXT: s_mov_b32 s8, 0 +; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 ; VI-NEXT: s_cbranch_scc0 .LBB16_4 ; VI-NEXT: ; %bb.1: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s2 ; VI-NEXT: v_cvt_f32_u32_e32 v1, s3 -; VI-NEXT: s_sub_u32 s8, 0, s2 -; VI-NEXT: s_subb_u32 s9, 0, s3 +; VI-NEXT: s_sub_u32 s9, 0, s2 +; VI-NEXT: s_subb_u32 s10, 0, s3 ; VI-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1946,9 +1946,9 @@ ; VI-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; VI-NEXT: v_cvt_u32_f32_e32 v4, v1 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v0 -; VI-NEXT: v_mul_lo_u32 v2, s8, v4 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 -; VI-NEXT: v_mul_lo_u32 v3, s9, v5 +; VI-NEXT: v_mul_lo_u32 v2, s9, v4 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s9, v5, 0 +; VI-NEXT: v_mul_lo_u32 v3, s10, v5 ; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 ; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 ; VI-NEXT: v_mul_hi_u32 v6, v5, v0 @@ -1964,9 +1964,9 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, v4, v1, vcc -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 -; VI-NEXT: v_mul_lo_u32 v4, s8, v7 -; VI-NEXT: v_mul_lo_u32 v5, s9, v6 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s9, v6, 0 +; VI-NEXT: v_mul_lo_u32 v4, s9, v7 +; VI-NEXT: v_mul_lo_u32 v5, s10, v6 ; VI-NEXT: v_mul_hi_u32 v8, v6, v0 ; VI-NEXT: v_mad_u64_u32 v[2:3], 
s[0:1], v7, v0, 0 ; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1 @@ -1984,30 +1984,30 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, v7, v1, vcc ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v3, 0 ; VI-NEXT: v_mul_hi_u32 v4, s6, v2 -; VI-NEXT: v_readfirstlane_b32 s8, v1 -; VI-NEXT: v_readfirstlane_b32 s9, v0 +; VI-NEXT: v_readfirstlane_b32 s9, v1 +; VI-NEXT: v_readfirstlane_b32 s10, v0 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v3, 0 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s7, v2, 0 -; VI-NEXT: v_readfirstlane_b32 s10, v4 -; VI-NEXT: s_add_u32 s0, s10, s9 -; VI-NEXT: s_addc_u32 s1, 0, s8 -; VI-NEXT: v_readfirstlane_b32 s10, v2 -; VI-NEXT: v_readfirstlane_b32 s9, v3 -; VI-NEXT: s_add_u32 s0, s0, s10 -; VI-NEXT: v_readfirstlane_b32 s8, v1 -; VI-NEXT: s_addc_u32 s0, s1, s9 -; VI-NEXT: s_addc_u32 s10, s8, 0 +; VI-NEXT: v_readfirstlane_b32 s11, v4 +; VI-NEXT: s_add_u32 s0, s11, s10 +; VI-NEXT: s_addc_u32 s1, 0, s9 +; VI-NEXT: v_readfirstlane_b32 s11, v2 +; VI-NEXT: v_readfirstlane_b32 s10, v3 +; VI-NEXT: s_add_u32 s0, s0, s11 +; VI-NEXT: v_readfirstlane_b32 s9, v1 +; VI-NEXT: s_addc_u32 s0, s1, s10 +; VI-NEXT: s_addc_u32 s9, s9, 0 ; VI-NEXT: v_readfirstlane_b32 s1, v0 -; VI-NEXT: s_add_u32 s11, s0, s1 -; VI-NEXT: v_mov_b32_e32 v2, s11 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, 0 -; VI-NEXT: s_addc_u32 s10, 0, s10 -; VI-NEXT: s_mul_i32 s0, s2, s10 +; VI-NEXT: s_add_u32 s12, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v0, 0 +; VI-NEXT: s_addc_u32 s9, 0, s9 +; VI-NEXT: s_mul_i32 s0, s2, s9 ; VI-NEXT: v_readfirstlane_b32 s1, v1 ; VI-NEXT: s_add_i32 s0, s1, s0 -; VI-NEXT: s_mul_i32 s1, s3, s11 -; VI-NEXT: s_add_i32 s12, s0, s1 -; VI-NEXT: s_sub_i32 s0, s7, s12 +; VI-NEXT: s_mul_i32 s1, s3, s12 +; VI-NEXT: s_add_i32 s14, s0, s1 +; VI-NEXT: s_sub_i32 s0, s7, s14 ; VI-NEXT: v_sub_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: s_cmp_lg_u64 vcc, 0 ; VI-NEXT: s_subb_u32 s13, s0, s3 @@ -2015,38 +2015,41 @@ ; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 ; VI-NEXT: s_subb_u32 s13, s13, 0 ; VI-NEXT: s_cmp_ge_u32 s13, s3 -; VI-NEXT: s_cselect_b32 s14, -1, 0 +; VI-NEXT: s_cselect_b32 s15, -1, 0 ; VI-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 ; VI-NEXT: s_cmp_eq_u32 s13, s3 +; VI-NEXT: s_mov_b32 s13, s8 ; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s14 +; VI-NEXT: v_mov_b32_e32 v2, s15 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] -; VI-NEXT: s_add_u32 s0, s11, 1 -; VI-NEXT: s_addc_u32 s13, s10, 0 -; VI-NEXT: s_add_u32 s1, s11, 2 -; VI-NEXT: s_addc_u32 s11, s10, 0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] +; VI-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] +; VI-NEXT: s_add_u32 s0, s8, 1 +; VI-NEXT: s_addc_u32 s12, s9, 0 +; VI-NEXT: s_add_u32 s1, s8, 2 +; VI-NEXT: s_addc_u32 s13, s9, 0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; VI-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: v_mov_b32_e32 v4, s11 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s12 +; VI-NEXT: v_mov_b32_e32 v3, s13 ; VI-NEXT: s_cmp_lg_u64 vcc, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; VI-NEXT: s_subb_u32 s0, s7, s12 +; VI-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; VI-NEXT: s_subb_u32 s0, s7, s14 ; VI-NEXT: s_cmp_ge_u32 s0, s3 ; VI-NEXT: s_cselect_b32 s1, -1, 0 ; VI-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 ; VI-NEXT: s_cmp_eq_u32 s0, s3 ; VI-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-NEXT: v_mov_b32_e32 v4, s10 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; VI-NEXT: v_mov_b32_e32 v3, s9 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; VI-NEXT: s_cbranch_execnz .LBB16_3 ; VI-NEXT: .LBB16_2: ; VI-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2086,9 +2089,9 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b64 s[0:1], s[6:7], s[2:3] -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[2:3] +; GFX9-NEXT: s_mov_b32 s10, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2103,78 +2106,78 @@ ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s10, v1 -; GFX9-NEXT: v_readfirstlane_b32 s11, v0 -; GFX9-NEXT: s_mul_i32 s12, s0, s10 -; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11 -; GFX9-NEXT: s_mul_i32 s13, s1, s11 -; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s15, s0, s11 -; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 -; GFX9-NEXT: s_mul_i32 s14, s11, s12 -; GFX9-NEXT: s_mul_hi_u32 s11, s11, s15 -; GFX9-NEXT: s_add_u32 s11, s11, s14 -; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s10, s15 -; GFX9-NEXT: s_mul_i32 s15, s10, s15 -; GFX9-NEXT: s_add_u32 s11, s11, s15 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s12 -; GFX9-NEXT: s_addc_u32 s11, s13, s16 -; GFX9-NEXT: s_addc_u32 s13, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s10, s12 -; GFX9-NEXT: s_add_u32 s11, s11, s12 -; GFX9-NEXT: s_addc_u32 s12, 0, s13 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s11, v0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s10, s10, s12 +; GFX9-NEXT: v_readfirstlane_b32 s11, v1 ; GFX9-NEXT: v_readfirstlane_b32 s12, v0 -; GFX9-NEXT: s_mul_i32 s11, s0, s10 -; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s11, s13, s11 -; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s11, s11, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 -; GFX9-NEXT: s_mul_i32 s14, s10, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s11 -; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 -; GFX9-NEXT: s_add_u32 s0, s0, s16 -; GFX9-NEXT: s_addc_u32 s12, 0, s15 -; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 -; GFX9-NEXT: s_addc_u32 s0, s12, s13 +; GFX9-NEXT: s_mul_i32 s13, s0, s11 +; GFX9-NEXT: s_mul_hi_u32 s15, s0, s12 +; GFX9-NEXT: s_mul_i32 s14, s1, s12 +; GFX9-NEXT: s_add_i32 s13, s15, s13 +; GFX9-NEXT: s_add_i32 s13, s13, s14 +; GFX9-NEXT: s_mul_i32 s16, s0, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s12, s13 +; GFX9-NEXT: s_mul_i32 s15, s12, s13 +; GFX9-NEXT: s_mul_hi_u32 s12, s12, s16 +; GFX9-NEXT: s_add_u32 s12, s12, s15 +; GFX9-NEXT: s_addc_u32 s14, 0, s14 +; GFX9-NEXT: s_mul_hi_u32 s17, s11, s16 +; GFX9-NEXT: s_mul_i32 s16, s11, s16 +; GFX9-NEXT: s_add_u32 s12, s12, s16 +; GFX9-NEXT: s_mul_hi_u32 s15, s11, s13 +; GFX9-NEXT: s_addc_u32 s12, s14, s17 
+; GFX9-NEXT: s_addc_u32 s14, s15, 0 +; GFX9-NEXT: s_mul_i32 s13, s11, s13 +; GFX9-NEXT: s_add_u32 s12, s12, s13 +; GFX9-NEXT: s_addc_u32 s13, 0, s14 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s12, v0 +; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 +; GFX9-NEXT: s_addc_u32 s11, s11, s13 +; GFX9-NEXT: v_readfirstlane_b32 s13, v0 +; GFX9-NEXT: s_mul_i32 s12, s0, s11 +; GFX9-NEXT: s_mul_hi_u32 s14, s0, s13 +; GFX9-NEXT: s_add_i32 s12, s14, s12 +; GFX9-NEXT: s_mul_i32 s1, s1, s13 +; GFX9-NEXT: s_add_i32 s12, s12, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s11, s0 +; GFX9-NEXT: s_mul_i32 s15, s11, s0 +; GFX9-NEXT: s_mul_i32 s17, s13, s12 +; GFX9-NEXT: s_mul_hi_u32 s0, s13, s0 +; GFX9-NEXT: s_mul_hi_u32 s16, s13, s12 +; GFX9-NEXT: s_add_u32 s0, s0, s17 +; GFX9-NEXT: s_addc_u32 s13, 0, s16 +; GFX9-NEXT: s_add_u32 s0, s0, s15 +; GFX9-NEXT: s_mul_hi_u32 s1, s11, s12 +; GFX9-NEXT: s_addc_u32 s0, s13, s14 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s11, s10, s11 -; GFX9-NEXT: s_add_u32 s0, s0, s11 +; GFX9-NEXT: s_mul_i32 s12, s11, s12 +; GFX9-NEXT: s_add_u32 s0, s0, s12 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s0, s10, s1 -; GFX9-NEXT: v_readfirstlane_b32 s11, v0 -; GFX9-NEXT: s_mul_i32 s10, s6, s0 -; GFX9-NEXT: s_mul_hi_u32 s12, s6, s11 +; GFX9-NEXT: s_addc_u32 s0, s11, s1 +; GFX9-NEXT: v_readfirstlane_b32 s12, v0 +; GFX9-NEXT: s_mul_i32 s11, s6, s0 +; GFX9-NEXT: s_mul_hi_u32 s13, s6, s12 ; GFX9-NEXT: s_mul_hi_u32 s1, s6, s0 -; GFX9-NEXT: s_add_u32 s10, s12, s10 +; GFX9-NEXT: s_add_u32 s11, s13, s11 ; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: s_mul_hi_u32 s13, s7, s11 -; GFX9-NEXT: s_mul_i32 s11, s7, s11 -; GFX9-NEXT: s_add_u32 s10, s10, s11 -; GFX9-NEXT: s_mul_hi_u32 s12, s7, s0 -; GFX9-NEXT: s_addc_u32 s1, s1, s13 -; GFX9-NEXT: s_addc_u32 s10, s12, 0 +; GFX9-NEXT: s_mul_hi_u32 s14, s7, s12 +; GFX9-NEXT: s_mul_i32 s12, s7, s12 +; GFX9-NEXT: s_add_u32 s11, s11, s12 +; GFX9-NEXT: s_mul_hi_u32 s13, s7, s0 +; GFX9-NEXT: s_addc_u32 s1, s1, s14 +; GFX9-NEXT: s_addc_u32 s11, s13, 0 ; GFX9-NEXT: s_mul_i32 s0, s7, s0 -; GFX9-NEXT: s_add_u32 s11, s1, s0 -; GFX9-NEXT: s_addc_u32 s10, 0, s10 -; GFX9-NEXT: s_mul_i32 s0, s2, s10 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s11 +; GFX9-NEXT: s_add_u32 s12, s1, s0 +; GFX9-NEXT: s_addc_u32 s11, 0, s11 +; GFX9-NEXT: s_mul_i32 s0, s2, s11 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s12 ; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s3, s11 -; GFX9-NEXT: s_add_i32 s12, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s2, s11 +; GFX9-NEXT: s_mul_i32 s1, s3, s12 +; GFX9-NEXT: s_add_i32 s14, s0, s1 +; GFX9-NEXT: s_mul_i32 s1, s2, s12 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_sub_i32 s0, s7, s12 +; GFX9-NEXT: s_sub_i32 s0, s7, s14 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: s_subb_u32 s13, s0, s3 @@ -2182,26 +2185,28 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_subb_u32 s13, s13, 0 ; GFX9-NEXT: s_cmp_ge_u32 s13, s3 -; GFX9-NEXT: s_cselect_b32 s14, -1, 0 +; GFX9-NEXT: s_cselect_b32 s15, -1, 0 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 ; GFX9-NEXT: s_cmp_eq_u32 s13, s3 +; GFX9-NEXT: s_mov_b32 s13, s10 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s11, 1 -; 
GFX9-NEXT: s_addc_u32 s13, s10, 0 -; GFX9-NEXT: s_add_u32 s1, s11, 2 -; GFX9-NEXT: s_addc_u32 s14, s10, 0 +; GFX9-NEXT: s_add_u32 s0, s10, 1 +; GFX9-NEXT: s_addc_u32 s12, s11, 0 +; GFX9-NEXT: s_add_u32 s1, s10, 2 +; GFX9-NEXT: s_addc_u32 s13, s11, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mov_b32_e32 v3, s13 ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s7, s12 +; GFX9-NEXT: s_subb_u32 s0, s7, s14 ; GFX9-NEXT: s_cmp_ge_u32 s0, s3 ; GFX9-NEXT: s_cselect_b32 s1, -1, 0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 @@ -2210,9 +2215,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 @@ -2333,49 +2338,51 @@ ; GFX1010-NEXT: s_add_u32 s1, s10, s1 ; GFX1010-NEXT: s_addc_u32 s1, s9, s13 ; GFX1010-NEXT: s_addc_u32 s9, s11, 0 -; GFX1010-NEXT: s_add_u32 s1, s1, s0 +; GFX1010-NEXT: s_add_u32 s0, s1, s0 ; GFX1010-NEXT: s_addc_u32 s9, 0, s9 -; GFX1010-NEXT: s_mul_hi_u32 s0, s2, s1 +; GFX1010-NEXT: s_mul_hi_u32 s1, s2, s0 ; GFX1010-NEXT: s_mul_i32 s11, s2, s9 -; GFX1010-NEXT: s_mul_i32 s12, s2, s1 -; GFX1010-NEXT: s_add_i32 s0, s0, s11 -; GFX1010-NEXT: v_sub_co_u32 v0, s11, s6, s12 -; GFX1010-NEXT: s_mul_i32 s10, s3, s1 -; GFX1010-NEXT: s_add_i32 s0, s0, s10 -; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s2 -; GFX1010-NEXT: s_sub_i32 s10, s7, s0 -; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1010-NEXT: s_subb_u32 s10, s10, s3 +; GFX1010-NEXT: s_mul_i32 s12, s2, s0 +; GFX1010-NEXT: s_mul_i32 s10, s3, s0 +; GFX1010-NEXT: s_add_i32 s1, s1, s11 +; GFX1010-NEXT: v_sub_co_u32 v0, s12, s6, s12 +; GFX1010-NEXT: s_add_i32 s13, s1, s10 +; GFX1010-NEXT: s_sub_i32 s1, s7, s13 ; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 +; GFX1010-NEXT: v_sub_co_u32 v1, s10, v0, s2 +; GFX1010-NEXT: s_subb_u32 s1, s1, s3 +; GFX1010-NEXT: s_cmp_lg_u32 s10, 0 +; GFX1010-NEXT: s_subb_u32 s1, s1, 0 ; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 -; GFX1010-NEXT: s_subb_u32 s10, s10, 0 -; GFX1010-NEXT: s_cmp_ge_u32 s10, s3 +; GFX1010-NEXT: s_cmp_ge_u32 s1, s3 +; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1010-NEXT: s_cmp_eq_u32 s1, s3 +; GFX1010-NEXT: s_mov_b32 s1, s8 ; GFX1010-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s10, s3 ; GFX1010-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1010-NEXT: s_add_u32 s10, s1, 1 -; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1010-NEXT: s_addc_u32 s12, s9, 0 -; GFX1010-NEXT: s_add_u32 s13, s1, 2 -; GFX1010-NEXT: s_addc_u32 s14, s9, 0 -; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1010-NEXT: s_or_b64 s[10:11], s[0:1], s[8:9] +; GFX1010-NEXT: s_add_u32 s1, s10, 1 +; GFX1010-NEXT: s_addc_u32 s9, s11, 0 +; GFX1010-NEXT: s_add_u32 s0, s10, 2 +; GFX1010-NEXT: v_cndmask_b32_e32 v1, s14, v1, vcc_lo +; GFX1010-NEXT: s_addc_u32 s14, s11, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 ; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 -; GFX1010-NEXT: s_subb_u32 s0, s7, s0 -; 
GFX1010-NEXT: v_mov_b32_e32 v2, s13 -; GFX1010-NEXT: s_cmp_ge_u32 s0, s3 +; GFX1010-NEXT: s_subb_u32 s7, s7, s13 +; GFX1010-NEXT: v_mov_b32_e32 v2, s0 +; GFX1010-NEXT: s_cmp_ge_u32 s7, s3 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX1010-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s0, s3 +; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1010-NEXT: s_cmp_eq_u32 s7, s3 ; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1010-NEXT: v_mov_b32_e32 v1, s14 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 -; GFX1010-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo -; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, s12, v0, s0 +; GFX1010-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo ; GFX1010-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo -; GFX1010-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc_lo ; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 ; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1010-NEXT: .LBB16_2: @@ -2495,49 +2502,51 @@ ; GFX1030W32-NEXT: s_add_u32 s1, s10, s1 ; GFX1030W32-NEXT: s_addc_u32 s1, s9, s13 ; GFX1030W32-NEXT: s_addc_u32 s9, s11, 0 -; GFX1030W32-NEXT: s_add_u32 s1, s1, s0 +; GFX1030W32-NEXT: s_add_u32 s0, s1, s0 ; GFX1030W32-NEXT: s_addc_u32 s9, 0, s9 -; GFX1030W32-NEXT: s_mul_hi_u32 s0, s2, s1 +; GFX1030W32-NEXT: s_mul_hi_u32 s1, s2, s0 ; GFX1030W32-NEXT: s_mul_i32 s11, s2, s9 -; GFX1030W32-NEXT: s_mul_i32 s12, s2, s1 -; GFX1030W32-NEXT: s_add_i32 s0, s0, s11 -; GFX1030W32-NEXT: v_sub_co_u32 v0, s11, s6, s12 -; GFX1030W32-NEXT: s_mul_i32 s10, s3, s1 -; GFX1030W32-NEXT: s_add_i32 s0, s0, s10 -; GFX1030W32-NEXT: v_sub_co_u32 v1, s12, v0, s2 -; GFX1030W32-NEXT: s_sub_i32 s10, s7, s0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1030W32-NEXT: s_subb_u32 s10, s10, s3 +; GFX1030W32-NEXT: s_mul_i32 s12, s2, s0 +; GFX1030W32-NEXT: s_mul_i32 s10, s3, s0 +; GFX1030W32-NEXT: s_add_i32 s1, s1, s11 +; GFX1030W32-NEXT: v_sub_co_u32 v0, s12, s6, s12 +; GFX1030W32-NEXT: s_add_i32 s13, s1, s10 +; GFX1030W32-NEXT: s_sub_i32 s1, s7, s13 ; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 +; GFX1030W32-NEXT: v_sub_co_u32 v1, s10, v0, s2 +; GFX1030W32-NEXT: s_subb_u32 s1, s1, s3 +; GFX1030W32-NEXT: s_cmp_lg_u32 s10, 0 +; GFX1030W32-NEXT: s_subb_u32 s1, s1, 0 ; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 -; GFX1030W32-NEXT: s_subb_u32 s10, s10, 0 -; GFX1030W32-NEXT: s_cmp_ge_u32 s10, s3 +; GFX1030W32-NEXT: s_cmp_ge_u32 s1, s3 +; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1030W32-NEXT: s_cmp_eq_u32 s1, s3 +; GFX1030W32-NEXT: s_mov_b32 s1, s8 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1030W32-NEXT: s_cmp_eq_u32 s10, s3 ; GFX1030W32-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1030W32-NEXT: s_add_u32 s10, s1, 1 -; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1030W32-NEXT: s_addc_u32 s12, s9, 0 -; GFX1030W32-NEXT: s_add_u32 s13, s1, 2 -; GFX1030W32-NEXT: s_addc_u32 s14, s9, 0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1030W32-NEXT: s_or_b64 s[10:11], s[0:1], s[8:9] +; GFX1030W32-NEXT: s_add_u32 s1, s10, 1 +; GFX1030W32-NEXT: s_addc_u32 s9, s11, 0 +; GFX1030W32-NEXT: s_add_u32 s0, s10, 2 +; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s14, v1, vcc_lo +; GFX1030W32-NEXT: s_addc_u32 s14, s11, 0 +; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 ; 
GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 -; GFX1030W32-NEXT: s_subb_u32 s0, s7, s0 -; GFX1030W32-NEXT: v_mov_b32_e32 v2, s13 -; GFX1030W32-NEXT: s_cmp_ge_u32 s0, s3 +; GFX1030W32-NEXT: s_subb_u32 s7, s7, s13 +; GFX1030W32-NEXT: v_mov_b32_e32 v2, s0 +; GFX1030W32-NEXT: s_cmp_ge_u32 s7, s3 ; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX1030W32-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1030W32-NEXT: s_cmp_eq_u32 s0, s3 +; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1030W32-NEXT: s_cmp_eq_u32 s7, s3 ; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX1030W32-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1030W32-NEXT: v_mov_b32_e32 v1, s14 -; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 -; GFX1030W32-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo -; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, s12, v0, s0 +; GFX1030W32-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo ; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo -; GFX1030W32-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo +; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc_lo +; GFX1030W32-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc_lo ; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 ; GFX1030W32-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1030W32-NEXT: .LBB16_2: @@ -2585,8 +2594,8 @@ ; GFX1030W64-NEXT: ; %bb.1: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX1030W64-NEXT: s_sub_u32 s9, 0, s2 -; GFX1030W64-NEXT: s_subb_u32 s10, 0, s3 +; GFX1030W64-NEXT: s_sub_u32 s10, 0, s2 +; GFX1030W64-NEXT: s_subb_u32 s11, 0, s3 ; GFX1030W64-NEXT: v_fmac_f32_e32 v0, 0x4f800000, v1 ; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2595,111 +2604,113 @@ ; GFX1030W64-NEXT: v_fmac_f32_e32 v0, 0xcf800000, v1 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1 -; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1030W64-NEXT: s_mul_i32 s1, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s0 -; GFX1030W64-NEXT: s_mul_i32 s11, s10, s0 -; GFX1030W64-NEXT: s_add_i32 s1, s12, s1 -; GFX1030W64-NEXT: s_mul_i32 s13, s9, s0 -; GFX1030W64-NEXT: s_add_i32 s1, s1, s11 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13 -; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s0, s1 -; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1 -; GFX1030W64-NEXT: s_mul_hi_u32 s15, s8, s1 -; GFX1030W64-NEXT: s_add_u32 s0, s12, s0 -; GFX1030W64-NEXT: s_addc_u32 s12, 0, s13 -; GFX1030W64-NEXT: s_add_u32 s0, s0, s11 -; GFX1030W64-NEXT: s_mul_i32 s1, s8, s1 -; GFX1030W64-NEXT: s_addc_u32 s0, s12, s14 -; GFX1030W64-NEXT: s_addc_u32 s11, s15, 0 -; GFX1030W64-NEXT: s_add_u32 s0, s0, s1 -; GFX1030W64-NEXT: s_addc_u32 s11, 0, s11 -; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11 -; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1030W64-NEXT: s_mul_i32 s1, s9, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s9, s0 -; GFX1030W64-NEXT: s_mul_i32 s10, s10, s0 -; GFX1030W64-NEXT: s_add_i32 s1, s11, s1 -; GFX1030W64-NEXT: s_mul_i32 s9, s9, s0 -; GFX1030W64-NEXT: s_add_i32 s1, s1, s10 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s9 -; GFX1030W64-NEXT: s_mul_i32 s12, s8, s9 -; GFX1030W64-NEXT: s_mul_hi_u32 s9, s0, s9 -; GFX1030W64-NEXT: 
s_mul_hi_u32 s13, s0, s1 -; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s1 -; GFX1030W64-NEXT: s_add_u32 s0, s9, s0 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s13 -; GFX1030W64-NEXT: s_add_u32 s0, s0, s12 -; GFX1030W64-NEXT: s_mul_i32 s1, s8, s1 -; GFX1030W64-NEXT: s_addc_u32 s0, s9, s11 -; GFX1030W64-NEXT: s_addc_u32 s9, s10, 0 -; GFX1030W64-NEXT: s_add_u32 s0, s0, s1 -; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9 -; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1030W64-NEXT: s_addc_u32 s0, s8, s9 -; GFX1030W64-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1030W64-NEXT: s_mul_i32 s9, s6, s0 -; GFX1030W64-NEXT: s_mul_hi_u32 s8, s6, s0 -; GFX1030W64-NEXT: s_mul_hi_u32 s10, s7, s0 -; GFX1030W64-NEXT: s_mul_i32 s0, s7, s0 -; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s1 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s1 -; GFX1030W64-NEXT: s_mul_i32 s1, s7, s1 -; GFX1030W64-NEXT: s_add_u32 s9, s11, s9 -; GFX1030W64-NEXT: s_addc_u32 s8, 0, s8 -; GFX1030W64-NEXT: s_add_u32 s1, s9, s1 -; GFX1030W64-NEXT: s_addc_u32 s1, s8, s12 -; GFX1030W64-NEXT: s_addc_u32 s8, s10, 0 -; GFX1030W64-NEXT: s_add_u32 s10, s1, s0 -; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8 -; GFX1030W64-NEXT: s_mul_hi_u32 s0, s2, s10 -; GFX1030W64-NEXT: s_mul_i32 s1, s2, s11 -; GFX1030W64-NEXT: s_mul_i32 s9, s2, s10 -; GFX1030W64-NEXT: s_add_i32 s12, s0, s1 -; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s6, s9 -; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10 -; GFX1030W64-NEXT: s_add_i32 s12, s12, s8 -; GFX1030W64-NEXT: v_sub_co_u32 v1, s[8:9], v0, s2 -; GFX1030W64-NEXT: s_sub_i32 s13, s7, s12 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1030W64-NEXT: s_subb_u32 s13, s13, s3 +; GFX1030W64-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1030W64-NEXT: s_mul_i32 s9, s10, s1 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s10, s8 +; GFX1030W64-NEXT: s_mul_i32 s12, s11, s8 +; GFX1030W64-NEXT: s_add_i32 s9, s13, s9 +; GFX1030W64-NEXT: s_mul_i32 s14, s10, s8 +; GFX1030W64-NEXT: s_add_i32 s9, s9, s12 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s8, s14 +; GFX1030W64-NEXT: s_mul_hi_u32 s15, s1, s14 +; GFX1030W64-NEXT: s_mul_i32 s12, s1, s14 +; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s9 +; GFX1030W64-NEXT: s_mul_i32 s8, s8, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s16, s1, s9 +; GFX1030W64-NEXT: s_add_u32 s8, s13, s8 +; GFX1030W64-NEXT: s_addc_u32 s13, 0, s14 +; GFX1030W64-NEXT: s_add_u32 s8, s8, s12 +; GFX1030W64-NEXT: s_mul_i32 s9, s1, s9 +; GFX1030W64-NEXT: s_addc_u32 s8, s13, s15 +; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0 +; GFX1030W64-NEXT: s_add_u32 s8, s8, s9 +; GFX1030W64-NEXT: s_addc_u32 s12, 0, s12 +; GFX1030W64-NEXT: v_add_co_u32 v0, s[8:9], v0, s8 ; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_addc_u32 s1, s1, s12 +; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1030W64-NEXT: s_mul_i32 s9, s10, s1 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s10, s8 +; GFX1030W64-NEXT: s_mul_i32 s11, s11, s8 +; GFX1030W64-NEXT: s_add_i32 s9, s12, s9 +; GFX1030W64-NEXT: s_mul_i32 s10, s10, s8 +; GFX1030W64-NEXT: s_add_i32 s9, s9, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s1, s10 +; GFX1030W64-NEXT: s_mul_i32 s13, s1, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s10 +; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s9 +; GFX1030W64-NEXT: s_mul_i32 s8, s8, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s1, s9 +; GFX1030W64-NEXT: s_add_u32 s8, s10, s8 +; GFX1030W64-NEXT: s_addc_u32 s10, 0, s14 +; GFX1030W64-NEXT: s_add_u32 s8, s8, s13 +; GFX1030W64-NEXT: s_mul_i32 s9, s1, s9 +; 
GFX1030W64-NEXT: s_addc_u32 s8, s10, s12 +; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0 +; GFX1030W64-NEXT: s_add_u32 s8, s8, s9 +; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10 +; GFX1030W64-NEXT: v_add_co_u32 v0, s[8:9], v0, s8 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_addc_u32 s1, s1, s10 +; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1030W64-NEXT: s_mul_i32 s10, s6, s1 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s6, s1 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s7, s1 +; GFX1030W64-NEXT: s_mul_i32 s1, s7, s1 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8 +; GFX1030W64-NEXT: s_mul_i32 s8, s7, s8 +; GFX1030W64-NEXT: s_add_u32 s10, s12, s10 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9 +; GFX1030W64-NEXT: s_add_u32 s8, s10, s8 +; GFX1030W64-NEXT: s_addc_u32 s8, s9, s13 +; GFX1030W64-NEXT: s_addc_u32 s9, s11, 0 +; GFX1030W64-NEXT: s_add_u32 s8, s8, s1 +; GFX1030W64-NEXT: s_addc_u32 s1, 0, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s8 +; GFX1030W64-NEXT: s_mul_i32 s10, s2, s1 +; GFX1030W64-NEXT: s_mul_i32 s11, s2, s8 +; GFX1030W64-NEXT: s_add_i32 s9, s9, s10 +; GFX1030W64-NEXT: v_sub_co_u32 v0, s[10:11], s6, s11 +; GFX1030W64-NEXT: s_mul_i32 s12, s3, s8 +; GFX1030W64-NEXT: s_add_i32 s14, s9, s12 +; GFX1030W64-NEXT: v_sub_co_u32 v1, s[12:13], v0, s2 +; GFX1030W64-NEXT: s_sub_i32 s9, s7, s14 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX1030W64-NEXT: s_subb_u32 s9, s9, s3 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GFX1030W64-NEXT: s_subb_u32 s8, s13, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s3 +; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s3 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s3 +; GFX1030W64-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s3 +; GFX1030W64-NEXT: s_mov_b32 s9, s0 ; GFX1030W64-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX1030W64-NEXT: s_add_u32 s8, s10, 1 -; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc -; GFX1030W64-NEXT: s_addc_u32 s9, s11, 0 -; GFX1030W64-NEXT: s_add_u32 s13, s10, 2 -; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1030W64-NEXT: s_or_b64 s[8:9], s[8:9], s[0:1] +; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc +; GFX1030W64-NEXT: s_add_u32 s12, s8, 1 +; GFX1030W64-NEXT: s_addc_u32 s13, s9, 0 +; GFX1030W64-NEXT: s_add_u32 s0, s8, 2 +; GFX1030W64-NEXT: s_addc_u32 s1, s9, 0 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[10:11], 0 ; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX1030W64-NEXT: s_subb_u32 s0, s7, s12 -; GFX1030W64-NEXT: v_mov_b32_e32 v2, s13 -; GFX1030W64-NEXT: s_cmp_ge_u32 s0, s3 +; GFX1030W64-NEXT: s_subb_u32 s7, s7, s14 +; GFX1030W64-NEXT: v_mov_b32_e32 v2, s0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s7, s3 ; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1030W64-NEXT: s_cmp_eq_u32 s0, s3 +; GFX1030W64-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1030W64-NEXT: s_cmp_eq_u32 s7, s3 ; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1 ; GFX1030W64-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX1030W64-NEXT: v_mov_b32_e32 v1, s14 -; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, s7, v0, s[0:1] -; GFX1030W64-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc -; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc +; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, s10, v0, s[0:1] +; GFX1030W64-NEXT: v_cndmask_b32_e32 v2, s12, v2, vcc 
+; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s13, v1, vcc ; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc -; GFX1030W64-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc +; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc +; GFX1030W64-NEXT: v_cndmask_b32_e32 v0, s8, v2, vcc ; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1030W64-NEXT: .LBB16_2: ; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -2828,54 +2839,57 @@ ; GFX11-NEXT: s_add_u32 s1, s10, s1 ; GFX11-NEXT: s_addc_u32 s1, s9, s13 ; GFX11-NEXT: s_addc_u32 s9, s11, 0 -; GFX11-NEXT: s_add_u32 s1, s1, s0 +; GFX11-NEXT: s_add_u32 s0, s1, s0 ; GFX11-NEXT: s_addc_u32 s9, 0, s9 -; GFX11-NEXT: s_mul_hi_u32 s0, s2, s1 +; GFX11-NEXT: s_mul_hi_u32 s1, s2, s0 ; GFX11-NEXT: s_mul_i32 s11, s2, s9 -; GFX11-NEXT: s_mul_i32 s12, s2, s1 -; GFX11-NEXT: s_add_i32 s0, s0, s11 -; GFX11-NEXT: v_sub_co_u32 v0, s11, s6, s12 -; GFX11-NEXT: s_mul_i32 s10, s3, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s0, s0, s10 -; GFX11-NEXT: v_sub_co_u32 v1, s12, v0, s2 -; GFX11-NEXT: s_sub_i32 s10, s7, s0 -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 -; GFX11-NEXT: s_subb_u32 s10, s10, s3 +; GFX11-NEXT: s_mul_i32 s12, s2, s0 +; GFX11-NEXT: s_mul_i32 s10, s3, s0 +; GFX11-NEXT: s_add_i32 s1, s1, s11 +; GFX11-NEXT: v_sub_co_u32 v0, s12, s6, s12 +; GFX11-NEXT: s_add_i32 s13, s1, s10 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_sub_i32 s1, s7, s13 ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: v_sub_co_u32 v1, s10, v0, s2 +; GFX11-NEXT: s_subb_u32 s1, s1, s3 +; GFX11-NEXT: s_cmp_lg_u32 s10, 0 +; GFX11-NEXT: s_subb_u32 s1, s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 -; GFX11-NEXT: s_subb_u32 s10, s10, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_ge_u32 s10, s3 +; GFX11-NEXT: s_cmp_ge_u32 s1, s3 +; GFX11-NEXT: s_cselect_b32 s14, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, s3 +; GFX11-NEXT: s_mov_b32 s1, s8 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: s_cselect_b32 s12, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, s3 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_add_u32 s10, s1, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX11-NEXT: s_addc_u32 s12, s9, 0 -; GFX11-NEXT: s_add_u32 s13, s1, 2 -; GFX11-NEXT: s_addc_u32 s14, s9, 0 -; GFX11-NEXT: v_mov_b32_e32 v2, s13 -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_or_b64 s[10:11], s[0:1], s[8:9] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s1, s10, 1 +; GFX11-NEXT: s_addc_u32 s9, s11, 0 +; GFX11-NEXT: s_add_u32 s0, s10, 2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, s14, v1, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_addc_u32 s14, s11, 0 +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 -; GFX11-NEXT: s_subb_u32 s0, s7, s0 +; GFX11-NEXT: s_subb_u32 s7, s7, s13 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_ge_u32 s0, s3 +; GFX11-NEXT: s_cmp_ge_u32 s7, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_cselect_b32 s7, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, s3 +; GFX11-NEXT: s_cselect_b32 s12, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s7, s3 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v2, s10, v2, 
vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, s12, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc_lo ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8 ; GFX11-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX11-NEXT: .LBB16_2: diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1850,24 +1850,24 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, 0xff00, v2 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v3, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v6, 0xff00, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; GFX7-NEXT: v_alignbit_b32 v6, 0, v6, 16 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v7, v6 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v8, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v9, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2557,49 +2557,55 @@ ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v7, v2, 0, 4 -; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 +; GFX7-NEXT: v_bfe_i32 v3, v2, 4, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4 -; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2 -; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4 -; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 ; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4 ; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 8, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 0, 4 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v10 ; GFX7-NEXT: v_ashrrev_i32_e32 v15, 28, v0 ; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4 +; 
GFX7-NEXT: v_bfe_i32 v17, v0, 8, 4 ; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v14, 0xff, v14 -; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v12 +; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v14 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 24, 4 +; GFX7-NEXT: v_bfe_i32 v6, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v8, 28, v2 +; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4 +; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v13, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 24, v16 +; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1 ; GFX7-NEXT: v_alignbit_b32 v9, 0, v9, 24 -; GFX7-NEXT: v_alignbit_b32 v16, 0, v16, 24 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX7-NEXT: v_alignbit_b32 v11, 0, v14, 24 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v16, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX7-NEXT: v_bfe_u32 v14, v10, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v11, v0 +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX7-NEXT: v_mad_u32_u24 v0, v7, v10, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v14, v0 ; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v15 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -2631,76 +2637,76 @@ ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 4, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 20, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 12, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 8, v2 ; GFX8-NEXT: v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshlrev_b16_e32 v18, 12, v2 ; GFX8-NEXT: v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshlrev_b16_e32 v5, 12, v10 +; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 +; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v16 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v17 ; GFX8-NEXT: v_lshlrev_b16_e32 v7, 12, v7 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 12, v6 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, 12, v15 +; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 +; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v18 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v19 ; GFX8-NEXT: v_lshlrev_b16_e32 v12, 12, v12 ; GFX8-NEXT: v_ashrrev_i16_e32 v19, 12, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 12, v11 -; GFX8-NEXT: v_lshlrev_b16_e32 v9, 12, v9 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, 12, v8 -; GFX8-NEXT: v_lshlrev_b16_e32 v14, 12, v14 -; GFX8-NEXT: v_lshlrev_b16_e32 v13, 12, v13 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 -; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 +; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v14 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX8-NEXT: v_mul_lo_u16_e32 v20, v16, v18 ; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19 ; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v8, v9, v11 -; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v9, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v5, v5, v6 +; GFX8-NEXT: v_mul_lo_u16_e32 v14, v17, v19 ; GFX8-NEXT: v_mul_lo_u16_e32 v10, v10, v15 -; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 -; GFX8-NEXT: v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v5, v5, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX8-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v10, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX8-NEXT: v_or_b32_sdwa v3, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v3, v8, v4 +; GFX8-NEXT: v_add_u16_e32 v3, v6, v4 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v8 ; GFX8-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v7 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 ; GFX8-NEXT: v_mad_u16 v2, v17, v19, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v6 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v7 ; GFX8-NEXT: v_mad_u16 v2, v16, v18, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v10 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v9 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2724,15 +2730,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 12, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 4, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 4, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 12, v1 ; GFX9-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2740,59 +2746,59 @@ ; GFX9-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 12, v9 -; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, 12, v7 +; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-NEXT: v_ashrrev_i16_e32 v15, 12, v16 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v16, 12, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 12, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v5, 12, v14 -; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v17 ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 12, v13 ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 12, v12 +; GFX9-NEXT: v_ashrrev_i16_e32 v14, 12, v17 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v18, 12, v0 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 12, v10 +; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-NEXT: v_ashrrev_i16_e32 v1, 12, v1 +; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; 
GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 12, v0 -; GFX9-NEXT: v_ashrrev_i16_e32 v2, 12, v2 -; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_lo_u16_e32 v13, v16, v18 ; GFX9-NEXT: v_mul_lo_u16_e32 v19, v15, v17 ; GFX9-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v8, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14 ; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v9, v9, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v2, v7, v4 -; GFX9-NEXT: v_add_u16_e32 v1, v2, v1 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-NEXT: v_add_u16_e32 v1, v6, v4 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v8 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 ; GFX9-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v5 ; GFX9-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v7 ; GFX9-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; @@ -2816,15 +2822,15 @@ ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 12, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 
v14, 4, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 4, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v15, 12, v1 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 @@ -2832,59 +2838,59 @@ ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v2, 12, v9 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v7, 12, v7 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v15 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v15, 12, v16 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v16, 12, v1 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v1, 12, v5 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v5, 12, v14 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v17 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v13, 12, v13 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v12, 12, v12 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v14, 12, v17 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v18 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v18, 12, v0 ; GFX9-DL-NEXT: v_lshlrev_b16_e32 v0, 12, v10 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v1, 12, v1 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v13 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v0, 12, v0 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v2, 12, v2 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v13, v16, v18 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v19, v15, v17 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v8, v10 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v8, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v2, v2, v5 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14 ; GFX9-DL-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, v9, v14 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX9-DL-NEXT: 
v_or_b32_sdwa v1, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX9-DL-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v2, v7, v4 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v2, v1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v6, v4 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v8 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v16, v18, v0 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v5 ; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v15, v17, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7 ; GFX9-DL-NEXT: global_store_byte v3, v0, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -2906,81 +2912,81 @@ ; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 12, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 4, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 4, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 8, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v2 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 +; 
GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v17 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v17 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v9 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v15 -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 8, v8 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v0, v11 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v12 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v0, v0, v11 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v5, v12 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v5, v9 ; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 8, v0 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v1, v3 -; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v9, v3, v10 
+; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v10, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v10, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 8, v0 +; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v10, v3, v11 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v10, v8 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v9, v8 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v12, v0 +; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v9, v0 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6 ; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0 @@ -3006,42 +3012,45 @@ ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 12, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 8, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v3 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; 
GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v9, v16 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v17 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 8, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 @@ -3050,37 +3059,34 @@ ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v0, v1, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 8, v8 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v12 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v11 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 8, v10 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v5, v12 +; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v5, v9 ; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v11, v7, v14 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v13 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v2, v1, v2 -; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v10 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v10, 
v11, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v10, v2, v11 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v10, v8 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v9, v8 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v12, v0 +; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v9, v0 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6 ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0 diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -79,33 +79,33 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b8 v0, v2 offset:4 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_lshr_b32 s0, s6, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: ds_write_b8 v0, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 +; GFX9-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: s_lshr_b32 s0, s4, 24 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s6, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s5, 8 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 +; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_lshr_b32 s0, s5, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 8 +; GFX9-NEXT: s_lshr_b32 s0, s5, 8 ; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 24 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 24 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 +; GFX9-NEXT: s_lshr_b32 s0, s6, 8 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align1: @@ -115,39 +115,39 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s3, s2, 8 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write_b8 v0, v1 +; GFX7-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_lshr_b32 s3, s0, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX7-NEXT: v_mov_b32_e32 
v1, s3 -; GFX7-NEXT: s_lshr_b32 s3, s2, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX7-NEXT: s_lshr_b32 s3, s0, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: s_lshr_b32 s2, s2, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s1, 8 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s2, s1, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 +; GFX7-NEXT: s_lshr_b32 s0, s0, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 8 ; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 +; GFX7-NEXT: s_lshr_b32 s0, s2, 24 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s2, 16 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s2, 8 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align1: @@ -157,39 +157,39 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b8 v0, v2 offset:4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s3, s2, 8 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: ds_write_b8 v0, v1 +; GFX6-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: s_lshr_b32 s3, s0, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: s_lshr_b32 s3, s2, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 +; GFX6-NEXT: s_lshr_b32 s3, s0, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 -; GFX6-NEXT: s_lshr_b32 s2, s2, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s1, 8 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s2, s1, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 +; GFX6-NEXT: s_lshr_b32 s0, s0, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 16 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_lshr_b32 s1, s0, 8 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 8 ; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_lshr_b32 s1, s0, 24 -; GFX6-NEXT: ds_write_b8 v0, v1 
offset:1 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 +; GFX6-NEXT: s_lshr_b32 s0, s2, 24 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s2, 8 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align1: @@ -199,33 +199,33 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: s_lshr_b32 s0, s6, 8 -; GFX10-NEXT: s_lshr_b32 s1, s6, 24 -; GFX10-NEXT: s_lshr_b32 s2, s5, 8 -; GFX10-NEXT: s_lshr_b32 s3, s5, 24 -; GFX10-NEXT: s_lshr_b32 s5, s4, 8 -; GFX10-NEXT: s_lshr_b32 s4, s4, 24 +; GFX10-NEXT: s_lshr_b32 s0, s4, 24 +; GFX10-NEXT: s_lshr_b32 s1, s4, 8 +; GFX10-NEXT: s_lshr_b32 s2, s5, 24 +; GFX10-NEXT: s_lshr_b32 s3, s5, 8 +; GFX10-NEXT: s_lshr_b32 s4, s6, 24 +; GFX10-NEXT: s_lshr_b32 s5, s6, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 -; GFX10-NEXT: v_mov_b32_e32 v8, s5 -; GFX10-NEXT: v_mov_b32_e32 v9, s4 -; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX10-NEXT: v_mov_b32_e32 v8, s4 +; GFX10-NEXT: v_mov_b32_e32 v9, s5 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX10-NEXT: ds_write_b8 v0, v1 ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 -; GFX10-NEXT: ds_write_b8 v0, v3 -; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:2 -; GFX10-NEXT: ds_write_b8 v0, v4 offset:9 -; GFX10-NEXT: ds_write_b8 v0, v5 offset:11 -; GFX10-NEXT: ds_write_b8 v0, v6 offset:5 -; GFX10-NEXT: ds_write_b8 v0, v7 offset:7 -; GFX10-NEXT: ds_write_b8 v0, v8 offset:1 -; GFX10-NEXT: ds_write_b8 v0, v9 offset:3 +; GFX10-NEXT: ds_write_b8 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:10 +; GFX10-NEXT: ds_write_b8 v0, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v0, v4 offset:3 +; GFX10-NEXT: ds_write_b8 v0, v5 offset:1 +; GFX10-NEXT: ds_write_b8 v0, v6 offset:7 +; GFX10-NEXT: ds_write_b8 v0, v7 offset:5 +; GFX10-NEXT: ds_write_b8 v0, v8 offset:11 +; GFX10-NEXT: ds_write_b8 v0, v9 offset:9 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align1: @@ -234,29 +234,29 @@ ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: s_lshr_b32 s3, s2, 8 -; GFX11-NEXT: s_lshr_b32 s2, s2, 24 -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_lshr_b32 s1, s1, 24 -; GFX11-NEXT: s_lshr_b32 s5, s0, 8 -; GFX11-NEXT: s_lshr_b32 s0, s0, 24 -; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-NEXT: s_lshr_b32 s0, s0, 8 +; GFX11-NEXT: s_lshr_b32 s4, s1, 24 +; 
GFX11-NEXT: s_lshr_b32 s1, s1, 8 +; GFX11-NEXT: s_lshr_b32 s5, s2, 24 +; GFX11-NEXT: s_lshr_b32 s2, s2, 8 +; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s0 ; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s1 -; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: ds_store_b8 v0, v1 offset:8 -; GFX11-NEXT: ds_store_b8 v0, v3 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:2 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: ds_store_b8 v0, v1 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:2 +; GFX11-NEXT: ds_store_b8 v0, v3 offset:8 ; GFX11-NEXT: ds_store_b8 v0, v2 offset:4 -; GFX11-NEXT: ds_store_b8 v0, v4 offset:9 -; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:10 -; GFX11-NEXT: ds_store_b8 v0, v5 offset:11 -; GFX11-NEXT: ds_store_b8 v0, v6 offset:5 +; GFX11-NEXT: ds_store_b8 v0, v4 offset:3 +; GFX11-NEXT: ds_store_b8 v0, v5 offset:1 +; GFX11-NEXT: ds_store_b8 v0, v6 offset:7 ; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:6 -; GFX11-NEXT: ds_store_b8 v0, v7 offset:7 -; GFX11-NEXT: ds_store_b8 v0, v8 offset:1 -; GFX11-NEXT: ds_store_b8 v0, v9 offset:3 +; GFX11-NEXT: ds_store_b8 v0, v7 offset:5 +; GFX11-NEXT: ds_store_b8 v0, v8 offset:11 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:10 +; GFX11-NEXT: ds_store_b8 v0, v9 offset:9 ; GFX11-NEXT: s_endpgm store <3 x i32> %x, ptr addrspace(3) %out, align 1 ret void @@ -269,15 +269,15 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 -; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 +; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX9-NEXT: s_endpgm ; ; GFX7-LABEL: store_lds_v3i32_align2: @@ -287,21 +287,21 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX7-NEXT: ds_write_b16 v0, v2 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s2, s2, 16 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_write_b16 v0, v1 +; GFX7-NEXT: ds_write_b16 v0, v2 offset:4 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: s_lshr_b32 s1, s1, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s1, 16 ; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_lshr_b32 s0, s2, 16 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 ; GFX7-NEXT: s_endpgm ; ; GFX6-LABEL: store_lds_v3i32_align2: @@ -311,21 +311,21 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; 
GFX6-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX6-NEXT: ds_write_b16 v0, v2 offset:4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: s_lshr_b32 s2, s2, 16 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: ds_write_b16 v0, v1 +; GFX6-NEXT: ds_write_b16 v0, v2 offset:4 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: s_lshr_b32 s1, s1, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: s_lshr_b32 s0, s0, 16 -; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s1, 16 ; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_lshr_b32 s0, s2, 16 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align2: @@ -335,15 +335,15 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 -; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 -; GFX10-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:2 +; GFX10-NEXT: ds_write_b16 v0, v1 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:6 -; GFX10-NEXT: ds_write_b16 v0, v3 -; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:2 +; GFX10-NEXT: ds_write_b16 v0, v2 offset:4 +; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:10 +; GFX10-NEXT: ds_write_b16 v0, v3 offset:8 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align2: @@ -352,14 +352,14 @@ ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:10 -; GFX11-NEXT: ds_store_b16 v0, v2 -; GFX11-NEXT: ds_store_b16 v0, v3 offset:4 -; GFX11-NEXT: ds_store_b16 v0, v1 offset:8 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6 -; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:2 +; GFX11-NEXT: ds_store_b16 v0, v1 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:6 +; GFX11-NEXT: ds_store_b16 v0, v2 offset:4 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:10 +; GFX11-NEXT: ds_store_b16 v0, v3 offset:8 ; GFX11-NEXT: s_endpgm store <3 x i32> %x, ptr addrspace(3) %out, align 2 ret void @@ -400,11 +400,11 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s2 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: ds_write_b32 v0, v1 offset:8 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_write2_b32 v0, v2, v1 offset1:1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: ds_write_b32 v0, v1 offset:8 ; GFX6-NEXT: s_endpgm ; ; GFX10-LABEL: store_lds_v3i32_align4: @@ -414,11 +414,11 @@ ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 
v2, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: ds_write_b32 v0, v1 offset:8 -; GFX10-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 +; GFX10-NEXT: v_mov_b32_e32 v3, s6 +; GFX10-NEXT: ds_write2_b32 v0, v2, v1 offset1:1 +; GFX10-NEXT: ds_write_b32 v0, v3 offset:8 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_lds_v3i32_align4: diff --git a/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll b/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll --- a/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll +++ b/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll @@ -22,10 +22,13 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r1, [r0, #-21] ; CHECK-NEXT: ldr r2, [r0, #-17] -; CHECK-NEXT: muls r1, r2, r1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: bxne lr +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: clz r2, r2 +; CHECK-NEXT: lsrs r1, r1, #5 +; CHECK-NEXT: lsrs r2, r2, #5 +; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: it eq +; CHECK-NEXT: bxeq lr ; CHECK-NEXT: LBB0_2: @ %_Z14printIsNotZeroi.exit17.for.body_crit_edge ; CHECK-NEXT: @ in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: adds r0, #12 diff --git a/llvm/test/CodeGen/ARM/2013-05-13-DAGCombiner-undef-mask.ll b/llvm/test/CodeGen/ARM/2013-05-13-DAGCombiner-undef-mask.ll --- a/llvm/test/CodeGen/ARM/2013-05-13-DAGCombiner-undef-mask.ll +++ b/llvm/test/CodeGen/ARM/2013-05-13-DAGCombiner-undef-mask.ll @@ -11,9 +11,10 @@ ; CHECK-NEXT: tst r2, #1 ; CHECK-NEXT: moveq r1, #0 ; CHECK-NEXT: vmoveq d18, r1, r1 +; CHECK-NEXT: mov r1, #20 ; CHECK-NEXT: vldrne d18, [sp, #8] ; CHECK-NEXT: vorr d17, d18, d18 -; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128], r1 ; CHECK-NEXT: bx lr entry: %.sink = select i1 %dec1, <3 x i64> %b, <3 x i64> zeroinitializer diff --git a/llvm/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll b/llvm/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll --- a/llvm/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll +++ b/llvm/test/CodeGen/ARM/2013-07-29-vector-or-combine.ll @@ -11,17 +11,21 @@ ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: bxne lr -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: .LBB0_1: @ %vector.body.preheader +; CHECK-NEXT: vmov.i32 q8, #0xff0000 +; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vld1.32 {d16, d17}, [r0] +; CHECK-NEXT: vld1.32 {d18, d19}, [r0] ; CHECK-NEXT: adr r0, .LCPI0_0 -; CHECK-NEXT: vbic.i32 q8, #0xff -; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128] -; CHECK-NEXT: vorr q8, q8, q9 -; CHECK-NEXT: vst1.32 {d16, d17}, [r0] -; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: vand q10, q9, q8 +; CHECK-NEXT: vbic.i16 q9, #0xff +; CHECK-NEXT: vorr q9, q9, q10 +; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128] +; CHECK-NEXT: vorr q9, q9, q10 +; CHECK-NEXT: vst1.32 {d18, d19}, [r0] +; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.2: +; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI0_0: ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 2 @ 0x2 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll @@ -354,14 +354,15 @@ ; CHECK-BE-NEXT: movs r1, #0 ; CHECK-BE-NEXT: .LBB3_2: @ %for.body ; CHECK-BE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: ldrh lr, [r3, #2]! -; CHECK-BE-NEXT: subs r0, #1 ; CHECK-BE-NEXT: ldrsh r4, [r2, #2]! 
+; CHECK-BE-NEXT: subs r0, #1 +; CHECK-BE-NEXT: ldrsh lr, [r3, #2]! ; CHECK-BE-NEXT: ldrsh.w r5, [r2, #2] -; CHECK-BE-NEXT: mul r1, lr, r1 ; CHECK-BE-NEXT: smlabb r12, r4, lr, r12 ; CHECK-BE-NEXT: ldrsh.w r4, [r3, #2] ; CHECK-BE-NEXT: smlabb r12, r5, r4, r12 +; CHECK-BE-NEXT: uxth.w r5, lr +; CHECK-BE-NEXT: mul r1, r5, r1 ; CHECK-BE-NEXT: bne .LBB3_2 ; CHECK-BE-NEXT: @ %bb.3: ; CHECK-BE-NEXT: pop.w {r4, r5, r7, lr} diff --git a/llvm/test/CodeGen/ARM/add-like-or.ll b/llvm/test/CodeGen/ARM/add-like-or.ll --- a/llvm/test/CodeGen/ARM/add-like-or.ll +++ b/llvm/test/CodeGen/ARM/add-like-or.ll @@ -215,7 +215,7 @@ ; CHECK-T1-LABEL: orgeps: ; CHECK-T1: @ %bb.0: @ %entry ; CHECK-T1-NEXT: lsls r0, r0, #3 -; CHECK-T1-NEXT: adds r0, r1, r0 +; CHECK-T1-NEXT: adds r0, r0, r1 ; CHECK-T1-NEXT: ldr r1, [r0, #4] ; CHECK-T1-NEXT: ldr r0, [r0, #8] ; CHECK-T1-NEXT: adds r0, r0, r1 diff --git a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll --- a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll +++ b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll @@ -1355,207 +1355,160 @@ ; CHECK-FIX-NOSCHED: @ %bb.0: ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-FIX-NOSCHED-NEXT: .pad #24 -; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: .pad #12 +; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #12 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1] -; CHECK-FIX-NOSCHED-NEXT: vmov r7, r6, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov r10, r12, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[1] ; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16] -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r7 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r6 -; CHECK-FIX-NOSCHED-NEXT: lsr r6, r6, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d16[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r10, #16 ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #20] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r12, #16 +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: lsr r3, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: str r6, [sp, #16] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r10, r5 +; CHECK-FIX-NOSCHED-NEXT: lsr r11, r7, #16 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: bne .LBB36_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB36_2: -; CHECK-FIX-NOSCHED-NEXT: vmov r4, r6, d1 -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r3, d0 -; CHECK-FIX-NOSCHED-NEXT: lsr r5, r4, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r1, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r6 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r9, r4 -; CHECK-FIX-NOSCHED-NEXT: uxth r6, r3 +; CHECK-FIX-NOSCHED-NEXT: vmov r6, r0, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov lr, r4, d0 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r6, #16 ; CHECK-FIX-NOSCHED-NEXT: b .LBB36_5 ; CHECK-FIX-NOSCHED-NEXT: .LBB36_3: ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #14] 
-; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #12] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #8] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #6] -; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2, #10] -; CHECK-FIX-NOSCHED-NEXT: ldrh r10, [r2] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #4] +; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #12] +; CHECK-FIX-NOSCHED-NEXT: ldrh r10, [r2, #8] +; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #4] +; CHECK-FIX-NOSCHED-NEXT: ldrh r11, [r2, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #10] +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #6] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_2 ; CHECK-FIX-NOSCHED-NEXT: .LBB36_4: -; CHECK-FIX-NOSCHED-NEXT: vmov r5, r3, d1 -; CHECK-FIX-NOSCHED-NEXT: mov r4, r7 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d0[1] +; CHECK-FIX-NOSCHED-NEXT: vmov r6, r0, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r4, d0[1] ; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d0[0]}, [r1:16] -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r0, d0[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r9, r5 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r3 -; CHECK-FIX-NOSCHED-NEXT: uxth r6, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r1, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r5, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: mov r7, r4 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 lr, d0[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r6, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r0, #16 ; CHECK-FIX-NOSCHED-NEXT: .LBB36_5: -; CHECK-FIX-NOSCHED-NEXT: uxth r8, r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: lsr r8, r4, #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r7, r7, r11, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r9, lr, #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r6, r3, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r7 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r7, lr, r9, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r7 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r7, r4, r8, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r7 +; CHECK-FIX-NOSCHED-NEXT: ldr r7, [sp] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r3 ; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r10, r0, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r8, lr, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r6, r12, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r3, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r9, r5, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r7, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r11, r1, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; 
CHECK-FIX-NOSCHED-NEXT: pkhbt r7, r5, r7, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r10, r3, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r7 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r12, r0, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r3 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[1], r0 ; CHECK-FIX-NOSCHED-NEXT: aese.8 q8, q9 ; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q8 ; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] -; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #12 ; CHECK-FIX-NOSCHED-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-CORTEX-FIX-LABEL: aese_setf16_cond_via_ptr: ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-CORTEX-FIX-NEXT: .pad #24 -; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #24 +; CHECK-CORTEX-FIX-NEXT: .pad #20 +; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #20 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB36_3 +; CHECK-CORTEX-FIX-NEXT: beq .LBB36_2 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-CORTEX-FIX-NEXT: vorr q9, q8, q8 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r5, d16[1] ; CHECK-CORTEX-FIX-NEXT: vld1.16 {d18[0]}, [r1:16] -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d18[0] -; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r7, d18[0] +; CHECK-CORTEX-FIX-NEXT: mov r3, r7 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r7, #16 +; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov r7, r4, d17 +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov r3, r6, d17 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r11, r6 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: bne .LBB36_4 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r7, #16 +; CHECK-CORTEX-FIX-NEXT: lsr lr, r4, #16 +; CHECK-CORTEX-FIX-NEXT: b .LBB36_3 ; CHECK-CORTEX-FIX-NEXT: .LBB36_2: -; CHECK-CORTEX-FIX-NEXT: vmov r1, r7, d0 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r1 -; CHECK-CORTEX-FIX-NEXT: uxth r6, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r7, #16 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r1, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: mov r0, r3 -; CHECK-CORTEX-FIX-NEXT: vmov r7, r3, d1 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r5, r7, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 -; CHECK-CORTEX-FIX-NEXT: mov r3, r0 -; CHECK-CORTEX-FIX-NEXT: b .LBB36_5 -; CHECK-CORTEX-FIX-NEXT: .LBB36_3: ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2] -; CHECK-CORTEX-FIX-NEXT: ldrh r11, [r2, #12] -; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r2, #14] 
-; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #2] +; CHECK-CORTEX-FIX-NEXT: ldrh r5, [r2, #4] +; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r2, #8] +; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r2, #12] +; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r2, #14] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #4] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #2] +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #6] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #8] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #10] +; CHECK-CORTEX-FIX-NEXT: .LBB36_3: +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB36_2 -; CHECK-CORTEX-FIX-NEXT: .LBB36_4: +; CHECK-CORTEX-FIX-NEXT: beq .LBB36_5 +; CHECK-CORTEX-FIX-NEXT: @ %bb.4: ; CHECK-CORTEX-FIX-NEXT: vorr q8, q0, q0 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r5, d0[1] +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d0[1] ; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16] -; CHECK-CORTEX-FIX-NEXT: uxth r6, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r5, #16 -; CHECK-CORTEX-FIX-NEXT: vmov r5, r7, d1 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r1, d16[0] -; CHECK-CORTEX-FIX-NEXT: uxth r10, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r5, r5, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r7, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r1 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r1, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov.32 r11, d16[0] +; CHECK-CORTEX-FIX-NEXT: b .LBB36_6 ; CHECK-CORTEX-FIX-NEXT: .LBB36_5: +; CHECK-CORTEX-FIX-NEXT: vmov r11, r3, d0 +; CHECK-CORTEX-FIX-NEXT: .LBB36_6: +; CHECK-CORTEX-FIX-NEXT: lsr r0, r11, #16 +; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r4, lr, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r6, r8, d1 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r12, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r0, lsl #16 ; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r11, r11, r4, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r4, [sp, #16] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r12, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r10, r5, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r0, r1, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r0, lr, r8, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r1, r3, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r3, r4, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r4 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r1 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r7 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r11 -; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r9, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r3 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r5 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r6 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r0 +; CHECK-CORTEX-FIX-NEXT: lsr r10, r6, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r9, r8, #16 +; CHECK-CORTEX-FIX-NEXT: 
pkhbt r1, r8, r9, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r10, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r7, r0, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r7, [sp, #12] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r0, r7, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r7, [sp] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r5 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], lr +; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r11, r7, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r7 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r6 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r3 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r1 ; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] -; CHECK-CORTEX-FIX-NEXT: add sp, sp, #24 +; CHECK-CORTEX-FIX-NEXT: add sp, sp, #20 ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} br i1 %0, label %5, label %12 @@ -1604,210 +1557,161 @@ ; CHECK-FIX-NOSCHED: @ %bb.0: ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-FIX-NOSCHED-NEXT: .pad #24 -; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #24 -; CHECK-FIX-NOSCHED-NEXT: vmov r12, s0 +; CHECK-FIX-NOSCHED-NEXT: .pad #12 +; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #12 +; CHECK-FIX-NOSCHED-NEXT: vmov r2, s0 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_2 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1] -; CHECK-FIX-NOSCHED-NEXT: vmov r7, r6, d17 -; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r12 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r7 -; CHECK-FIX-NOSCHED-NEXT: uxth r2, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r6 -; CHECK-FIX-NOSCHED-NEXT: lsr r6, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #20] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: vmov r10, r9, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r4, d16[1] +; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r2 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r6, d16[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r10, #16 ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r3, r5 -; CHECK-FIX-NOSCHED-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: b .LBB37_3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r9, #16 +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r4, #16 +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: lsr r11, r6, #16 +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB37_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB37_2: +; CHECK-FIX-NOSCHED-NEXT: vmov r5, r0, d3 +; CHECK-FIX-NOSCHED-NEXT: vmov r12, r3, d2 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r5, #16 +; CHECK-FIX-NOSCHED-NEXT: b .LBB37_5 +; CHECK-FIX-NOSCHED-NEXT: .LBB37_3: ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #14] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #12] -; 
CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #8] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #6] -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #2] -; CHECK-FIX-NOSCHED-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r9, [r1, #12] +; CHECK-FIX-NOSCHED-NEXT: ldrh r10, [r1, #8] +; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #4] +; CHECK-FIX-NOSCHED-NEXT: ldrh r11, [r1, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r1] +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #10] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r1, #10] -; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #4] -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1] -; CHECK-FIX-NOSCHED-NEXT: .LBB37_3: -; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #6] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_5 -; CHECK-FIX-NOSCHED-NEXT: @ %bb.4: -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r6, d2[1] -; CHECK-FIX-NOSCHED-NEXT: mov r3, r2 -; CHECK-FIX-NOSCHED-NEXT: mov r2, r7 -; CHECK-FIX-NOSCHED-NEXT: vmov r4, r7, d3 -; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r12 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r0, d2[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r5, r6 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r10, r4 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r9, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: mov r7, r2 -; CHECK-FIX-NOSCHED-NEXT: mov r2, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r4, r4, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: b .LBB37_6 +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_2 +; CHECK-FIX-NOSCHED-NEXT: .LBB37_4: +; CHECK-FIX-NOSCHED-NEXT: vmov r5, r0, d3 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d2[1] +; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r2 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r12, d2[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r5, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r0, #16 ; CHECK-FIX-NOSCHED-NEXT: .LBB37_5: -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r6, d3 -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r5, d2 -; CHECK-FIX-NOSCHED-NEXT: lsr r4, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r9, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r6 -; CHECK-FIX-NOSCHED-NEXT: uxth r10, r3 -; CHECK-FIX-NOSCHED-NEXT: uxth r5, r5 -; CHECK-FIX-NOSCHED-NEXT: .LBB37_6: -; CHECK-FIX-NOSCHED-NEXT: uxth r8, r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: lsr lr, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r6, r6, r11, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r8, r12, #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r3, lr, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r6 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r6, r12, r8, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r6 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r3 ; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r3, r0, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r8, lr, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r5, r12, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: 
pkhbt r0, r2, r0, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r10, r4, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r7, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r11, r9, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r4, r3, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r3 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r5, r7, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r3 +; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r10, r3, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r3 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r9, r0, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[1], r0 ; CHECK-FIX-NOSCHED-NEXT: aese.8 q8, q9 ; CHECK-FIX-NOSCHED-NEXT: aesmc.8 q8, q8 ; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r1] -; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #12 ; CHECK-FIX-NOSCHED-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-CORTEX-FIX-LABEL: aese_setf16_cond_via_val: ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-CORTEX-FIX-NEXT: .pad #28 -; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #28 -; CHECK-CORTEX-FIX-NEXT: vmov r2, s0 +; CHECK-CORTEX-FIX-NEXT: .pad #20 +; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #20 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s0 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB37_2 +; CHECK-CORTEX-FIX-NEXT: beq .LBB37_3 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] -; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r7, d16[0] -; CHECK-CORTEX-FIX-NEXT: uxth r6, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r7, #16 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #24] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov.32 r6, d16[1] +; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[0] +; CHECK-CORTEX-FIX-NEXT: mov r2, r3 +; CHECK-CORTEX-FIX-NEXT: lsr r2, r3, #16 +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: lsr r2, r6, #16 +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov r3, r2, d17 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r3, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r10, r2, #16 ; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov r3, r7, d17 -; CHECK-CORTEX-FIX-NEXT: uxth r6, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r11, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r7, #16 -; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: b .LBB37_3 +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bne .LBB37_4 ; CHECK-CORTEX-FIX-NEXT: 
.LBB37_2: -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1] -; CHECK-CORTEX-FIX-NEXT: ldrh r11, [r1, #12] -; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #14] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #24] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #2] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #4] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #6] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #8] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #10] +; CHECK-CORTEX-FIX-NEXT: vmov lr, r5, d2 +; CHECK-CORTEX-FIX-NEXT: b .LBB37_5 ; CHECK-CORTEX-FIX-NEXT: .LBB37_3: -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1] +; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r1, #4] +; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #8] +; CHECK-CORTEX-FIX-NEXT: ldrh r10, [r1, #14] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #2] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #6] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #10] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12] ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB37_5 -; CHECK-CORTEX-FIX-NEXT: @ %bb.4: -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d2[1] -; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov r4, r6, d3 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r4 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r4, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r6 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r6, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r5, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d2[0] -; CHECK-CORTEX-FIX-NEXT: uxth r0, r2 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r2, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: b .LBB37_6 +; CHECK-CORTEX-FIX-NEXT: beq .LBB37_2 +; CHECK-CORTEX-FIX-NEXT: .LBB37_4: +; CHECK-CORTEX-FIX-NEXT: vmov.32 r5, d2[1] +; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 lr, d2[0] ; CHECK-CORTEX-FIX-NEXT: .LBB37_5: -; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d2 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r2 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r2, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r5, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r0, lr, #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r10, r2, r10, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r7, r8, d3 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r5, #16 ; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: mov r0, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r6, r7, d3 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r6 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r7, #16 -; CHECK-CORTEX-FIX-NEXT: mov r7, r0 -; CHECK-CORTEX-FIX-NEXT: .LBB37_6: -; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r11, r11, r7, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r6, [sp, #20] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r12, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r10, r4, lsl 
#16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r0, r2, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r0, lr, r8, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r2, r3, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #24] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r6, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r6, r0, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: lsr r11, r7, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r9, r8, #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r8, r9, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r11, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r0, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r0, r6, lsl #16 ; CHECK-CORTEX-FIX-NEXT: ldr r6, [sp] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r3 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r7 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r11 -; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r9, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r3 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r2 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r10 +; CHECK-CORTEX-FIX-NEXT: pkhbt r6, lr, r6, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r6 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r7 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r5 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r4 ; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1] -; CHECK-CORTEX-FIX-NEXT: add sp, sp, #28 +; CHECK-CORTEX-FIX-NEXT: add sp, sp, #20 ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} br i1 %0, label %5, label %11 @@ -3567,207 +3471,160 @@ ; CHECK-FIX-NOSCHED: @ %bb.0: ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-FIX-NOSCHED-NEXT: .pad #24 -; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: .pad #12 +; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #12 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1] -; CHECK-FIX-NOSCHED-NEXT: vmov r7, r6, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov r10, r12, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[1] ; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16] -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r7 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r6 -; CHECK-FIX-NOSCHED-NEXT: lsr r6, r6, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d16[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r10, #16 ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #20] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r12, #16 +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: lsr r3, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: str r6, [sp, #16] @ 4-byte Spill ; 
CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r10, r5 +; CHECK-FIX-NOSCHED-NEXT: lsr r11, r7, #16 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: bne .LBB82_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB82_2: -; CHECK-FIX-NOSCHED-NEXT: vmov r4, r6, d1 -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r3, d0 -; CHECK-FIX-NOSCHED-NEXT: lsr r5, r4, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r1, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r6 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r9, r4 -; CHECK-FIX-NOSCHED-NEXT: uxth r6, r3 +; CHECK-FIX-NOSCHED-NEXT: vmov r6, r0, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov lr, r4, d0 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r6, #16 ; CHECK-FIX-NOSCHED-NEXT: b .LBB82_5 ; CHECK-FIX-NOSCHED-NEXT: .LBB82_3: ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #14] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #12] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #8] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #6] -; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2, #10] -; CHECK-FIX-NOSCHED-NEXT: ldrh r10, [r2] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #4] +; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #12] +; CHECK-FIX-NOSCHED-NEXT: ldrh r10, [r2, #8] +; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #4] +; CHECK-FIX-NOSCHED-NEXT: ldrh r11, [r2, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #10] +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #6] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 ; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_2 ; CHECK-FIX-NOSCHED-NEXT: .LBB82_4: -; CHECK-FIX-NOSCHED-NEXT: vmov r5, r3, d1 -; CHECK-FIX-NOSCHED-NEXT: mov r4, r7 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r7, d0[1] +; CHECK-FIX-NOSCHED-NEXT: vmov r6, r0, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r4, d0[1] ; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d0[0]}, [r1:16] -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r0, d0[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r9, r5 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r3 -; CHECK-FIX-NOSCHED-NEXT: uxth r6, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r1, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r5, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: mov r7, r4 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 lr, d0[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r6, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r0, #16 ; CHECK-FIX-NOSCHED-NEXT: .LBB82_5: -; CHECK-FIX-NOSCHED-NEXT: uxth r8, r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: lsr r8, r4, #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r7, r7, r11, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r9, lr, #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r6, r3, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r7 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r7, lr, r9, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r7 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r7, r4, r8, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r7 +; CHECK-FIX-NOSCHED-NEXT: ldr r7, [sp] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: vmov.32 
d19[0], r3 ; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r10, r0, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r8, lr, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r6, r12, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r3, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r9, r5, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r7, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r11, r1, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: pkhbt r7, r5, r7, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r10, r3, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r7 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r12, r0, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r3 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[1], r0 ; CHECK-FIX-NOSCHED-NEXT: aesd.8 q8, q9 ; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q8 ; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r2] -; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #12 ; CHECK-FIX-NOSCHED-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-CORTEX-FIX-LABEL: aesd_setf16_cond_via_ptr: ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-CORTEX-FIX-NEXT: .pad #24 -; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #24 +; CHECK-CORTEX-FIX-NEXT: .pad #20 +; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #20 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB82_3 +; CHECK-CORTEX-FIX-NEXT: beq .LBB82_2 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-CORTEX-FIX-NEXT: vorr q9, q8, q8 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r5, d16[1] ; CHECK-CORTEX-FIX-NEXT: vld1.16 {d18[0]}, [r1:16] -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d18[0] -; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r7, d18[0] +; CHECK-CORTEX-FIX-NEXT: mov r3, r7 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r7, #16 +; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov r7, r4, d17 +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov r3, r6, d17 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r11, r6 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: 
bne .LBB82_4 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r7, #16 +; CHECK-CORTEX-FIX-NEXT: lsr lr, r4, #16 +; CHECK-CORTEX-FIX-NEXT: b .LBB82_3 ; CHECK-CORTEX-FIX-NEXT: .LBB82_2: -; CHECK-CORTEX-FIX-NEXT: vmov r1, r7, d0 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r1 -; CHECK-CORTEX-FIX-NEXT: uxth r6, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r7, #16 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r1, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: mov r0, r3 -; CHECK-CORTEX-FIX-NEXT: vmov r7, r3, d1 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r5, r7, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 -; CHECK-CORTEX-FIX-NEXT: mov r3, r0 -; CHECK-CORTEX-FIX-NEXT: b .LBB82_5 -; CHECK-CORTEX-FIX-NEXT: .LBB82_3: ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2] -; CHECK-CORTEX-FIX-NEXT: ldrh r11, [r2, #12] -; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r2, #14] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #2] +; CHECK-CORTEX-FIX-NEXT: ldrh r5, [r2, #4] +; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r2, #8] +; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r2, #12] +; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r2, #14] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #4] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #2] +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #6] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #8] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #10] +; CHECK-CORTEX-FIX-NEXT: .LBB82_3: +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB82_2 -; CHECK-CORTEX-FIX-NEXT: .LBB82_4: +; CHECK-CORTEX-FIX-NEXT: beq .LBB82_5 +; CHECK-CORTEX-FIX-NEXT: @ %bb.4: ; CHECK-CORTEX-FIX-NEXT: vorr q8, q0, q0 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r5, d0[1] +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d0[1] ; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16] -; CHECK-CORTEX-FIX-NEXT: uxth r6, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r5, #16 -; CHECK-CORTEX-FIX-NEXT: vmov r5, r7, d1 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r1, d16[0] -; CHECK-CORTEX-FIX-NEXT: uxth r10, r5 -; CHECK-CORTEX-FIX-NEXT: lsr r5, r5, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r7, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r1 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r1, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov.32 r11, d16[0] +; CHECK-CORTEX-FIX-NEXT: b .LBB82_6 ; CHECK-CORTEX-FIX-NEXT: .LBB82_5: +; CHECK-CORTEX-FIX-NEXT: vmov r11, r3, d0 +; CHECK-CORTEX-FIX-NEXT: .LBB82_6: +; CHECK-CORTEX-FIX-NEXT: lsr r0, r11, #16 +; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r4, lr, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r6, r8, d1 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r12, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r0, lsl #16 ; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r11, r11, r4, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r4, [sp, #16] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r12, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r10, r5, lsl #16 -; 
CHECK-CORTEX-FIX-NEXT: pkhbt r7, r0, r1, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r0, lr, r8, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r1, r3, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r3, r4, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r4 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r1 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r7 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r11 -; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r9, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r3 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r5 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r6 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r0 +; CHECK-CORTEX-FIX-NEXT: lsr r10, r6, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r9, r8, #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r8, r9, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r10, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r7, r0, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r7, [sp, #12] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r0, r7, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r7, [sp] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r5 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], lr +; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r11, r7, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r7 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r6 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r3 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r1 ; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] -; CHECK-CORTEX-FIX-NEXT: add sp, sp, #24 +; CHECK-CORTEX-FIX-NEXT: add sp, sp, #20 ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} br i1 %0, label %5, label %12 @@ -3816,210 +3673,161 @@ ; CHECK-FIX-NOSCHED: @ %bb.0: ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-FIX-NOSCHED-NEXT: .pad #24 -; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #24 -; CHECK-FIX-NOSCHED-NEXT: vmov r12, s0 +; CHECK-FIX-NOSCHED-NEXT: .pad #12 +; CHECK-FIX-NOSCHED-NEXT: sub sp, sp, #12 +; CHECK-FIX-NOSCHED-NEXT: vmov r2, s0 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_2 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d16[1] -; CHECK-FIX-NOSCHED-NEXT: vmov r7, r6, d17 -; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r12 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r7 -; CHECK-FIX-NOSCHED-NEXT: uxth r2, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r4, r6 -; CHECK-FIX-NOSCHED-NEXT: lsr r6, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: str r4, [sp, #20] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: vmov r10, r9, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r4, d16[1] +; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r2 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r6, d16[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r10, #16 ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte 
Spill -; CHECK-FIX-NOSCHED-NEXT: uxth r3, r5 -; CHECK-FIX-NOSCHED-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: b .LBB83_3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r9, #16 +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r4, #16 +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: lsr r11, r6, #16 +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB83_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB83_2: +; CHECK-FIX-NOSCHED-NEXT: vmov r5, r0, d3 +; CHECK-FIX-NOSCHED-NEXT: vmov r12, r3, d2 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r5, #16 +; CHECK-FIX-NOSCHED-NEXT: b .LBB83_5 +; CHECK-FIX-NOSCHED-NEXT: .LBB83_3: ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #14] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #12] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #8] -; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #6] -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #2] -; CHECK-FIX-NOSCHED-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r9, [r1, #12] +; CHECK-FIX-NOSCHED-NEXT: ldrh r10, [r1, #8] +; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #4] +; CHECK-FIX-NOSCHED-NEXT: ldrh r11, [r1, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r1] +; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #10] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r1, #10] -; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #4] -; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1] -; CHECK-FIX-NOSCHED-NEXT: .LBB83_3: -; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #6] ; CHECK-FIX-NOSCHED-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_5 -; CHECK-FIX-NOSCHED-NEXT: @ %bb.4: -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r6, d2[1] -; CHECK-FIX-NOSCHED-NEXT: mov r3, r2 -; CHECK-FIX-NOSCHED-NEXT: mov r2, r7 -; CHECK-FIX-NOSCHED-NEXT: vmov r4, r7, d3 -; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r12 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 r0, d2[0] -; CHECK-FIX-NOSCHED-NEXT: uxth r5, r6 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r10, r4 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r9, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: mov r7, r2 -; CHECK-FIX-NOSCHED-NEXT: mov r2, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r4, r4, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: b .LBB83_6 +; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_2 +; CHECK-FIX-NOSCHED-NEXT: .LBB83_4: +; CHECK-FIX-NOSCHED-NEXT: vmov r5, r0, d3 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r3, d2[1] +; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r2 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r12, d2[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r5, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r0, #16 ; CHECK-FIX-NOSCHED-NEXT: .LBB83_5: -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r6, d3 -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r5, d2 -; CHECK-FIX-NOSCHED-NEXT: lsr r4, r3, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r9, r6, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r5, #16 -; CHECK-FIX-NOSCHED-NEXT: lsr lr, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: uxth r11, r6 -; CHECK-FIX-NOSCHED-NEXT: uxth r10, r3 -; CHECK-FIX-NOSCHED-NEXT: uxth r5, r5 -; CHECK-FIX-NOSCHED-NEXT: .LBB83_6: -; CHECK-FIX-NOSCHED-NEXT: uxth r8, r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, 
#4] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: lsr lr, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r6, r6, r11, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r8, r12, #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r3, lr, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r6 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r6, r12, r8, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r6 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r3 ; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r3, r0, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r8, lr, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r5, r12, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d18[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r2, r0, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r10, r4, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r7, lsl #16 -; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r0 -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r11, r9, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r4, r3, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[1], r3 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r5, r7, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[0], r3 +; CHECK-FIX-NOSCHED-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: pkhbt r3, r10, r3, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d19[1], r0 -; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16 +; CHECK-FIX-NOSCHED-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[0], r3 +; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r9, r0, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d17[1], r0 ; CHECK-FIX-NOSCHED-NEXT: aesd.8 q8, q9 ; CHECK-FIX-NOSCHED-NEXT: aesimc.8 q8, q8 ; CHECK-FIX-NOSCHED-NEXT: vst1.64 {d16, d17}, [r1] -; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #24 +; CHECK-FIX-NOSCHED-NEXT: add sp, sp, #12 ; CHECK-FIX-NOSCHED-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-CORTEX-FIX-LABEL: aesd_setf16_cond_via_val: ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-CORTEX-FIX-NEXT: .pad #28 -; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #28 -; CHECK-CORTEX-FIX-NEXT: vmov r2, s0 +; CHECK-CORTEX-FIX-NEXT: .pad #20 +; CHECK-CORTEX-FIX-NEXT: sub sp, sp, #20 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s0 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB83_2 +; CHECK-CORTEX-FIX-NEXT: beq .LBB83_3 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[1] -; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r7, d16[0] -; CHECK-CORTEX-FIX-NEXT: uxth r6, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r7, #16 -; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: uxth r7, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #24] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov.32 r6, d16[1] +; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d16[0] +; CHECK-CORTEX-FIX-NEXT: mov r2, r3 +; 
CHECK-CORTEX-FIX-NEXT: lsr r2, r3, #16 +; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: lsr r2, r6, #16 +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: vmov r3, r2, d17 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r3, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r10, r2, #16 ; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov r3, r7, d17 -; CHECK-CORTEX-FIX-NEXT: uxth r6, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r3, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r11, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r7, r7, #16 -; CHECK-CORTEX-FIX-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: b .LBB83_3 +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: bne .LBB83_4 ; CHECK-CORTEX-FIX-NEXT: .LBB83_2: -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1] -; CHECK-CORTEX-FIX-NEXT: ldrh r11, [r1, #12] -; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #14] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #24] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #2] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #4] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #6] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #8] -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #10] +; CHECK-CORTEX-FIX-NEXT: vmov lr, r5, d2 +; CHECK-CORTEX-FIX-NEXT: b .LBB83_5 ; CHECK-CORTEX-FIX-NEXT: .LBB83_3: -; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1] +; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r1, #4] +; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #8] +; CHECK-CORTEX-FIX-NEXT: ldrh r10, [r1, #14] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #2] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #6] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #4] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #10] +; CHECK-CORTEX-FIX-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12] ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB83_5 -; CHECK-CORTEX-FIX-NEXT: @ %bb.4: -; CHECK-CORTEX-FIX-NEXT: vmov.32 r3, d2[1] -; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov r4, r6, d3 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r4 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r4, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r6 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r6, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r5, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 -; CHECK-CORTEX-FIX-NEXT: vmov.32 r2, d2[0] -; CHECK-CORTEX-FIX-NEXT: uxth r0, r2 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r2, #16 -; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: b .LBB83_6 +; CHECK-CORTEX-FIX-NEXT: beq .LBB83_2 +; CHECK-CORTEX-FIX-NEXT: .LBB83_4: +; CHECK-CORTEX-FIX-NEXT: vmov.32 r5, d2[1] +; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 lr, d2[0] ; CHECK-CORTEX-FIX-NEXT: .LBB83_5: -; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d2 -; CHECK-CORTEX-FIX-NEXT: uxth r0, r2 -; CHECK-CORTEX-FIX-NEXT: lsr r9, r2, #16 -; CHECK-CORTEX-FIX-NEXT: uxth r5, r3 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r3, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r0, lr, #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r10, r2, r10, lsl #16 
+; CHECK-CORTEX-FIX-NEXT: vmov r7, r8, d3 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r5, #16 ; CHECK-CORTEX-FIX-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: mov r0, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r6, r7, d3 -; CHECK-CORTEX-FIX-NEXT: uxth r10, r6 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: uxth lr, r7 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r7, #16 -; CHECK-CORTEX-FIX-NEXT: mov r7, r0 -; CHECK-CORTEX-FIX-NEXT: .LBB83_6: -; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r11, r11, r7, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: ldr r6, [sp, #20] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r12, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r10, r4, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r0, r2, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r0, lr, r8, lsl #16 -; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r2, r3, lsl #16 -; CHECK-CORTEX-FIX-NEXT: ldr r3, [sp, #24] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r6, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r6, r0, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: lsr r11, r7, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r9, r8, #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r4, r8, r9, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r11, lsl #16 +; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r0, lsl #16 +; CHECK-CORTEX-FIX-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r0, r6, lsl #16 ; CHECK-CORTEX-FIX-NEXT: ldr r6, [sp] @ 4-byte Reload -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r3 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r2 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r7 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r11 -; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r9, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[0], r3 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], r2 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r10 +; CHECK-CORTEX-FIX-NEXT: pkhbt r6, lr, r6, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r6 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r4 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r7 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[1], r5 -; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r0 +; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[1], r4 ; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1] -; CHECK-CORTEX-FIX-NEXT: add sp, sp, #28 +; CHECK-CORTEX-FIX-NEXT: add sp, sp, #20 ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} br i1 %0, label %5, label %11 diff --git a/llvm/test/CodeGen/ARM/and-load-combine.ll b/llvm/test/CodeGen/ARM/and-load-combine.ll --- a/llvm/test/CodeGen/ARM/and-load-combine.ll +++ b/llvm/test/CodeGen/ARM/and-load-combine.ll @@ -369,7 +369,7 @@ define arm_aapcscc zeroext i1 @cmp_and8_short_short(ptr nocapture readonly %a, ptr nocapture readonly %b) { ; ARM-LABEL: cmp_and8_short_short: ; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldrb r1, [r1] +; ARM-NEXT: ldrh r1, [r1] ; ARM-NEXT: ldrb r0, [r0] ; ARM-NEXT: and r0, r0, r1 ; ARM-NEXT: clz r0, r0 @@ -378,7 +378,7 @@ ; ; ARMEB-LABEL: cmp_and8_short_short: ; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldrb r1, [r1, #1] +; ARMEB-NEXT: ldrh r1, [r1] ; ARMEB-NEXT: ldrb r0, [r0, #1] ; 
ARMEB-NEXT: and r0, r0, r1 ; ARMEB-NEXT: clz r0, r0 @@ -387,7 +387,7 @@ ; ; THUMB1-LABEL: cmp_and8_short_short: ; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldrb r1, [r1] +; THUMB1-NEXT: ldrh r1, [r1] ; THUMB1-NEXT: ldrb r2, [r0] ; THUMB1-NEXT: ands r2, r1 ; THUMB1-NEXT: rsbs r0, r2, #0 @@ -396,7 +396,7 @@ ; ; THUMB2-LABEL: cmp_and8_short_short: ; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldrb r1, [r1] +; THUMB2-NEXT: ldrh r1, [r1] ; THUMB2-NEXT: ldrb r0, [r0] ; THUMB2-NEXT: ands r0, r1 ; THUMB2-NEXT: clz r0, r0 @@ -460,7 +460,7 @@ define arm_aapcscc zeroext i1 @cmp_and8_int_int(ptr nocapture readonly %a, ptr nocapture readonly %b) { ; ARM-LABEL: cmp_and8_int_int: ; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldrb r1, [r1] +; ARM-NEXT: ldr r1, [r1] ; ARM-NEXT: ldrb r0, [r0] ; ARM-NEXT: and r0, r0, r1 ; ARM-NEXT: clz r0, r0 @@ -469,7 +469,7 @@ ; ; ARMEB-LABEL: cmp_and8_int_int: ; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldrb r1, [r1, #3] +; ARMEB-NEXT: ldr r1, [r1] ; ARMEB-NEXT: ldrb r0, [r0, #3] ; ARMEB-NEXT: and r0, r0, r1 ; ARMEB-NEXT: clz r0, r0 @@ -478,7 +478,7 @@ ; ; THUMB1-LABEL: cmp_and8_int_int: ; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldrb r1, [r1] +; THUMB1-NEXT: ldr r1, [r1] ; THUMB1-NEXT: ldrb r2, [r0] ; THUMB1-NEXT: ands r2, r1 ; THUMB1-NEXT: rsbs r0, r2, #0 @@ -487,7 +487,7 @@ ; ; THUMB2-LABEL: cmp_and8_int_int: ; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldrb r1, [r1] +; THUMB2-NEXT: ldr r1, [r1] ; THUMB2-NEXT: ldrb r0, [r0] ; THUMB2-NEXT: ands r0, r1 ; THUMB2-NEXT: clz r0, r0 @@ -505,7 +505,7 @@ define arm_aapcscc zeroext i1 @cmp_and16(ptr nocapture readonly %a, ptr nocapture readonly %b) { ; ARM-LABEL: cmp_and16: ; ARM: @ %bb.0: @ %entry -; ARM-NEXT: ldrh r1, [r1] +; ARM-NEXT: ldr r1, [r1] ; ARM-NEXT: ldrh r0, [r0] ; ARM-NEXT: and r0, r0, r1 ; ARM-NEXT: clz r0, r0 @@ -514,7 +514,7 @@ ; ; ARMEB-LABEL: cmp_and16: ; ARMEB: @ %bb.0: @ %entry -; ARMEB-NEXT: ldrh r1, [r1, #2] +; ARMEB-NEXT: ldr r1, [r1] ; ARMEB-NEXT: ldrh r0, [r0, #2] ; ARMEB-NEXT: and r0, r0, r1 ; ARMEB-NEXT: clz r0, r0 @@ -523,7 +523,7 @@ ; ; THUMB1-LABEL: cmp_and16: ; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: ldrh r1, [r1] +; THUMB1-NEXT: ldr r1, [r1] ; THUMB1-NEXT: ldrh r2, [r0] ; THUMB1-NEXT: ands r2, r1 ; THUMB1-NEXT: rsbs r0, r2, #0 @@ -532,7 +532,7 @@ ; ; THUMB2-LABEL: cmp_and16: ; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: ldrh r1, [r1] +; THUMB2-NEXT: ldr r1, [r1] ; THUMB2-NEXT: ldrh r0, [r0] ; THUMB2-NEXT: ands r0, r1 ; THUMB2-NEXT: clz r0, r0 diff --git a/llvm/test/CodeGen/ARM/and-sext-combine.ll b/llvm/test/CodeGen/ARM/and-sext-combine.ll --- a/llvm/test/CodeGen/ARM/and-sext-combine.ll +++ b/llvm/test/CodeGen/ARM/and-sext-combine.ll @@ -13,9 +13,10 @@ define i32 @f_i16_i32(ptr %a, ptr %b) { ; CHECK-LABEL: f_i16_i32: -; CHECK: ldrh r1, [r1] -; CHECK-NEXT: ldrsh r0, [r0] -; CHECK-NEXT: smulbb r0, r0, r1 +; CHECK: ldrsh r0, [r0] +; CHECK-NEXT: ldrsh r1, [r1] +; CHECK-NEXT: smulbb r0, r1, r0 +; CHECK-NEXT: uxth r1, r1 ; CHECK-NEXT: mul r0, r0, r1 ; CHECK-NEXT: bx lr %1 = load i16, ptr %a, align 2 diff --git a/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll --- a/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll +++ b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll @@ -88,9 +88,10 @@ define <4 x float> @test_positive_initial_offset(ptr %A) { ; CHECK-LABEL: test_positive_initial_offset: ; CHECK: @ %bb.0: -; CHECK-NEXT: add r0, r0, #32 -; CHECK-NEXT: vld1.32 {d16, d17}, [r0]! -; CHECK-NEXT: vld1.32 {d18, d19}, [r0]! 
+; CHECK-NEXT: add r1, r0, #32 +; CHECK-NEXT: add r0, r0, #64 +; CHECK-NEXT: vld1.32 {d16, d17}, [r1]! +; CHECK-NEXT: vld1.32 {d18, d19}, [r1] ; CHECK-NEXT: vadd.f32 q8, q8, q9 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0] ; CHECK-NEXT: vadd.f32 q0, q8, q9 @@ -133,9 +134,10 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: movw r0, :lower16:global_float_array ; CHECK-NEXT: movt r0, :upper16:global_float_array -; CHECK-NEXT: add r0, r0, #32 -; CHECK-NEXT: vld1.32 {d16, d17}, [r0]! -; CHECK-NEXT: vld1.32 {d18, d19}, [r0]! +; CHECK-NEXT: add r1, r0, #32 +; CHECK-NEXT: vld1.32 {d16, d17}, [r1]! +; CHECK-NEXT: add r0, r0, #64 +; CHECK-NEXT: vld1.32 {d18, d19}, [r1] ; CHECK-NEXT: vadd.f32 q8, q8, q9 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0] ; CHECK-NEXT: vadd.f32 q0, q8, q9 @@ -184,9 +186,10 @@ define <2 x double> @test_double(ptr %A) { ; CHECK-LABEL: test_double: ; CHECK: @ %bb.0: -; CHECK-NEXT: add r0, r0, #64 -; CHECK-NEXT: vld1.64 {d16, d17}, [r0]! -; CHECK-NEXT: vld1.64 {d18, d19}, [r0]! +; CHECK-NEXT: add r1, r0, #64 +; CHECK-NEXT: add r0, r0, #96 +; CHECK-NEXT: vld1.64 {d16, d17}, [r1]! +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] ; CHECK-NEXT: vadd.f64 d20, d17, d19 ; CHECK-NEXT: vadd.f64 d16, d16, d18 ; CHECK-NEXT: vld1.64 {d22, d23}, [r0] @@ -227,6 +230,8 @@ ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: bxlt lr ; CHECK-NEXT: .LBB10_1: @ %for.body.preheader +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: mov r12, #0 ; CHECK-NEXT: .LBB10_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -237,13 +242,15 @@ ; CHECK-NEXT: vld1.32 {d20, d21}, [r3]! ; CHECK-NEXT: vld1.32 {d22, d23}, [r3] ; CHECK-NEXT: add r3, r1, r12 +; CHECK-NEXT: add lr, r3, #32 ; CHECK-NEXT: add r12, r12, #64 ; CHECK-NEXT: vst1.32 {d16, d17}, [r3]! -; CHECK-NEXT: vst1.32 {d18, d19}, [r3]! -; CHECK-NEXT: vst1.32 {d20, d21}, [r3]! -; CHECK-NEXT: vst1.32 {d22, d23}, [r3] +; CHECK-NEXT: vst1.32 {d20, d21}, [lr]! 
+; CHECK-NEXT: vst1.32 {d18, d19}, [r3] +; CHECK-NEXT: vst1.32 {d22, d23}, [lr] ; CHECK-NEXT: bne .LBB10_2 -; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: bx lr entry: %cmp61 = icmp sgt i32 %n, 0 diff --git a/llvm/test/CodeGen/ARM/arm-storebytesmerge.ll b/llvm/test/CodeGen/ARM/arm-storebytesmerge.ll --- a/llvm/test/CodeGen/ARM/arm-storebytesmerge.ll +++ b/llvm/test/CodeGen/ARM/arm-storebytesmerge.ll @@ -4,95 +4,98 @@ define arm_aapcs_vfpcc void @test(ptr %v50) { ; CHECK-LABEL: test: ; CHECK: @ %bb.0: -; CHECK-NEXT: movw r1, #65534 -; CHECK-NEXT: strh.w r1, [r0, #510] -; CHECK-NEXT: movw r1, #64506 -; CHECK-NEXT: movt r1, #65020 -; CHECK-NEXT: str.w r1, [r0, #506] -; CHECK-NEXT: movw r1, #63478 -; CHECK-NEXT: movt r1, #63992 -; CHECK-NEXT: str.w r1, [r0, #502] -; CHECK-NEXT: movw r1, #62450 -; CHECK-NEXT: movt r1, #62964 -; CHECK-NEXT: str.w r1, [r0, #498] -; CHECK-NEXT: movw r1, #61422 -; CHECK-NEXT: movt r1, #61936 -; CHECK-NEXT: str.w r1, [r0, #494] -; CHECK-NEXT: movw r1, #60394 -; CHECK-NEXT: movt r1, #60908 -; CHECK-NEXT: str.w r1, [r0, #490] -; CHECK-NEXT: movw r1, #59366 -; CHECK-NEXT: movt r1, #59880 -; CHECK-NEXT: str.w r1, [r0, #486] -; CHECK-NEXT: movw r1, #58338 -; CHECK-NEXT: movt r1, #58852 -; CHECK-NEXT: str.w r1, [r0, #482] -; CHECK-NEXT: movw r1, #57310 -; CHECK-NEXT: movt r1, #57824 -; CHECK-NEXT: str.w r1, [r0, #478] -; CHECK-NEXT: movw r1, #56282 -; CHECK-NEXT: movt r1, #56796 -; CHECK-NEXT: str.w r1, [r0, #474] -; CHECK-NEXT: movw r1, #55254 -; CHECK-NEXT: movt r1, #55768 -; CHECK-NEXT: str.w r1, [r0, #470] -; CHECK-NEXT: movw r1, #54226 -; CHECK-NEXT: movt r1, #54740 -; CHECK-NEXT: str.w r1, [r0, #466] -; CHECK-NEXT: movw r1, #53198 -; CHECK-NEXT: movt r1, #53712 -; CHECK-NEXT: str.w r1, [r0, #462] -; CHECK-NEXT: movw r1, #52170 -; CHECK-NEXT: movt r1, #52684 -; CHECK-NEXT: str.w r1, [r0, #458] -; CHECK-NEXT: movw r1, #51142 -; CHECK-NEXT: movt r1, #51656 -; CHECK-NEXT: str.w r1, [r0, #454] -; CHECK-NEXT: movw r1, #50114 -; CHECK-NEXT: movt r1, #50628 -; CHECK-NEXT: str.w r1, [r0, #450] -; CHECK-NEXT: movw r1, #49086 -; CHECK-NEXT: movt r1, #49600 -; CHECK-NEXT: str.w r1, [r0, #446] -; CHECK-NEXT: movw r1, #48058 -; CHECK-NEXT: movt r1, #48572 -; CHECK-NEXT: str.w r1, [r0, #442] -; CHECK-NEXT: movw r1, #47030 -; CHECK-NEXT: movt r1, #47544 -; CHECK-NEXT: str.w r1, [r0, #438] -; CHECK-NEXT: movw r1, #46002 -; CHECK-NEXT: movt r1, #46516 -; CHECK-NEXT: str.w r1, [r0, #434] -; CHECK-NEXT: movw r1, #44974 -; CHECK-NEXT: movt r1, #45488 -; CHECK-NEXT: str.w r1, [r0, #430] -; CHECK-NEXT: movw r1, #43946 -; CHECK-NEXT: movt r1, #44460 -; CHECK-NEXT: str.w r1, [r0, #426] -; CHECK-NEXT: movw r1, #42918 -; CHECK-NEXT: movt r1, #43432 -; CHECK-NEXT: str.w r1, [r0, #422] -; CHECK-NEXT: movw r1, #41890 -; CHECK-NEXT: movt r1, #42404 -; CHECK-NEXT: str.w r1, [r0, #418] -; CHECK-NEXT: movw r1, #40862 -; CHECK-NEXT: movt r1, #41376 -; CHECK-NEXT: str.w r1, [r0, #414] -; CHECK-NEXT: movw r1, #39834 -; CHECK-NEXT: movt r1, #40348 -; CHECK-NEXT: str.w r1, [r0, #410] -; CHECK-NEXT: movw r1, #38806 -; CHECK-NEXT: movt r1, #39320 -; CHECK-NEXT: str.w r1, [r0, #406] -; CHECK-NEXT: movw r1, #37778 -; CHECK-NEXT: movt r1, #38292 -; CHECK-NEXT: str.w r1, [r0, #402] -; CHECK-NEXT: movw r1, #36750 -; CHECK-NEXT: movt r1, #37264 -; CHECK-NEXT: str.w r1, [r0, #398] ; CHECK-NEXT: movw r1, #35722 ; CHECK-NEXT: movt r1, #36236 ; CHECK-NEXT: str.w r1, [r0, #394] +; CHECK-NEXT: movw r1, #36750 +; CHECK-NEXT: movt r1, #37264 +; CHECK-NEXT: 
str.w r1, [r0, #398] +; CHECK-NEXT: movw r1, #37778 +; CHECK-NEXT: movt r1, #38292 +; CHECK-NEXT: str.w r1, [r0, #402] +; CHECK-NEXT: movw r1, #38806 +; CHECK-NEXT: movt r1, #39320 +; CHECK-NEXT: str.w r1, [r0, #406] +; CHECK-NEXT: movw r1, #39834 +; CHECK-NEXT: movt r1, #40348 +; CHECK-NEXT: str.w r1, [r0, #410] +; CHECK-NEXT: movw r1, #40862 +; CHECK-NEXT: movt r1, #41376 +; CHECK-NEXT: str.w r1, [r0, #414] +; CHECK-NEXT: movw r1, #41890 +; CHECK-NEXT: movt r1, #42404 +; CHECK-NEXT: str.w r1, [r0, #418] +; CHECK-NEXT: movw r1, #42918 +; CHECK-NEXT: movt r1, #43432 +; CHECK-NEXT: str.w r1, [r0, #422] +; CHECK-NEXT: movw r1, #43946 +; CHECK-NEXT: movt r1, #44460 +; CHECK-NEXT: str.w r1, [r0, #426] +; CHECK-NEXT: movw r1, #44974 +; CHECK-NEXT: movt r1, #45488 +; CHECK-NEXT: str.w r1, [r0, #430] +; CHECK-NEXT: movw r1, #46002 +; CHECK-NEXT: movt r1, #46516 +; CHECK-NEXT: str.w r1, [r0, #434] +; CHECK-NEXT: movw r1, #47030 +; CHECK-NEXT: movt r1, #47544 +; CHECK-NEXT: str.w r1, [r0, #438] +; CHECK-NEXT: movw r1, #48058 +; CHECK-NEXT: movt r1, #48572 +; CHECK-NEXT: str.w r1, [r0, #442] +; CHECK-NEXT: movw r1, #49086 +; CHECK-NEXT: movt r1, #49600 +; CHECK-NEXT: str.w r1, [r0, #446] +; CHECK-NEXT: movw r1, #50114 +; CHECK-NEXT: movt r1, #50628 +; CHECK-NEXT: str.w r1, [r0, #450] +; CHECK-NEXT: movw r1, #51142 +; CHECK-NEXT: strh.w r1, [r0, #454] +; CHECK-NEXT: movs r1, #200 +; CHECK-NEXT: strb.w r1, [r0, #456] +; CHECK-NEXT: movw r1, #51913 +; CHECK-NEXT: movt r1, #52427 +; CHECK-NEXT: str.w r1, [r0, #457] +; CHECK-NEXT: movw r1, #52941 +; CHECK-NEXT: movt r1, #53455 +; CHECK-NEXT: str.w r1, [r0, #461] +; CHECK-NEXT: movw r1, #53969 +; CHECK-NEXT: movt r1, #54483 +; CHECK-NEXT: str.w r1, [r0, #465] +; CHECK-NEXT: movw r1, #54997 +; CHECK-NEXT: movt r1, #55511 +; CHECK-NEXT: str.w r1, [r0, #469] +; CHECK-NEXT: movw r1, #56025 +; CHECK-NEXT: movt r1, #56539 +; CHECK-NEXT: str.w r1, [r0, #473] +; CHECK-NEXT: movw r1, #57053 +; CHECK-NEXT: movt r1, #57567 +; CHECK-NEXT: str.w r1, [r0, #477] +; CHECK-NEXT: movw r1, #58081 +; CHECK-NEXT: movt r1, #58595 +; CHECK-NEXT: str.w r1, [r0, #481] +; CHECK-NEXT: movw r1, #59109 +; CHECK-NEXT: movt r1, #59623 +; CHECK-NEXT: str.w r1, [r0, #485] +; CHECK-NEXT: movw r1, #60137 +; CHECK-NEXT: movt r1, #60651 +; CHECK-NEXT: str.w r1, [r0, #489] +; CHECK-NEXT: movw r1, #61165 +; CHECK-NEXT: movt r1, #61679 +; CHECK-NEXT: str.w r1, [r0, #493] +; CHECK-NEXT: movw r1, #62193 +; CHECK-NEXT: movt r1, #62707 +; CHECK-NEXT: str.w r1, [r0, #497] +; CHECK-NEXT: movw r1, #63221 +; CHECK-NEXT: movt r1, #63735 +; CHECK-NEXT: str.w r1, [r0, #501] +; CHECK-NEXT: movw r1, #64249 +; CHECK-NEXT: movt r1, #64763 +; CHECK-NEXT: str.w r1, [r0, #505] +; CHECK-NEXT: movw r1, #65277 +; CHECK-NEXT: strh.w r1, [r0, #509] +; CHECK-NEXT: movs r1, #255 +; CHECK-NEXT: strb.w r1, [r0, #511] ; CHECK-NEXT: bx lr %v190 = getelementptr inbounds i8, ptr %v50, i32 394 store i8 -118, ptr %v190, align 1 diff --git a/llvm/test/CodeGen/ARM/bfi.ll b/llvm/test/CodeGen/ARM/bfi.ll --- a/llvm/test/CodeGen/ARM/bfi.ll +++ b/llvm/test/CodeGen/ARM/bfi.ll @@ -220,13 +220,10 @@ define i32 @bfi1(i32 %a, i32 %b) { ; CHECK-LABEL: bfi1: ; CHECK: @ %bb.0: -; CHECK-NEXT: and r2, r0, #1 -; CHECK-NEXT: bic r1, r1, #19 -; CHECK-NEXT: orr r1, r1, r2 -; CHECK-NEXT: and r2, r0, #16 -; CHECK-NEXT: orr r1, r1, r2 -; CHECK-NEXT: and r0, r0, #2 -; CHECK-NEXT: orr r0, r1, r0 +; CHECK-NEXT: bfi r1, r0, #0, #2 +; CHECK-NEXT: lsr r0, r0, #4 +; CHECK-NEXT: bfi r1, r0, #4, #1 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bx lr %x1 = and i32 
%a, 1 %y1 = and i32 %b, 4294967294 @@ -272,17 +269,10 @@ define i32 @bfi2(i32 %a, i32 %b) { ; CHECK-LABEL: bfi2: ; CHECK: @ %bb.0: -; CHECK-NEXT: movw r2, #65148 -; CHECK-NEXT: movt r2, #65535 -; CHECK-NEXT: and r1, r1, r2 -; CHECK-NEXT: and r2, r0, #1 -; CHECK-NEXT: orr r1, r1, r2 -; CHECK-NEXT: and r2, r0, #2 -; CHECK-NEXT: orr r1, r1, r2 -; CHECK-NEXT: and r2, r0, #128 -; CHECK-NEXT: orr r1, r1, r2 -; CHECK-NEXT: and r0, r0, #256 -; CHECK-NEXT: orr r0, r1, r0 +; CHECK-NEXT: bfi r1, r0, #0, #2 +; CHECK-NEXT: lsr r0, r0, #7 +; CHECK-NEXT: bfi r1, r0, #7, #2 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bx lr %x1 = and i32 %a, 1 %y1 = and i32 %b, 4294967294 @@ -333,17 +323,10 @@ define i32 @bfi3(i32 %a, i32 %b) { ; CHECK-LABEL: bfi3: ; CHECK: @ %bb.0: -; CHECK-NEXT: movw r2, #65148 -; CHECK-NEXT: movt r2, #65535 -; CHECK-NEXT: and r1, r1, r2 -; CHECK-NEXT: and r2, r0, #1 -; CHECK-NEXT: orr r1, r1, r2 -; CHECK-NEXT: and r2, r0, #128 -; CHECK-NEXT: orr r1, r1, r2 -; CHECK-NEXT: and r2, r0, #2 -; CHECK-NEXT: orr r1, r1, r2 -; CHECK-NEXT: and r0, r0, #256 -; CHECK-NEXT: orr r0, r1, r0 +; CHECK-NEXT: bfi r1, r0, #0, #2 +; CHECK-NEXT: lsr r0, r0, #7 +; CHECK-NEXT: bfi r1, r0, #7, #2 +; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bx lr %x1 = and i32 %a, 1 %y1 = and i32 %b, 4294967294 diff --git a/llvm/test/CodeGen/ARM/bool-ext-inc.ll b/llvm/test/CodeGen/ARM/bool-ext-inc.ll --- a/llvm/test/CodeGen/ARM/bool-ext-inc.ll +++ b/llvm/test/CodeGen/ARM/bool-ext-inc.ll @@ -14,14 +14,13 @@ define <4 x i32> @sext_inc_vec(<4 x i1> %x) { ; CHECK-LABEL: sext_inc_vec: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov.i16 d16, #0x1 -; CHECK-NEXT: vmov d17, r0, r1 -; CHECK-NEXT: veor d16, d17, d16 +; CHECK-NEXT: vmov d16, r0, r1 ; CHECK-NEXT: vmov.i32 q9, #0x1 ; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vand q8, q8, q9 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: vshl.i32 q8, q8, #31 +; CHECK-NEXT: vsra.s32 q9, q8, #31 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vmov r2, r3, d19 ; CHECK-NEXT: mov pc, lr %ext = sext <4 x i1> %x to <4 x i32> %add = add <4 x i32> %ext, @@ -32,12 +31,12 @@ ; CHECK-LABEL: cmpgt_sext_inc_vec: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] ; CHECK-NEXT: vmov d16, r0, r1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vld1.64 {d18, d19}, [r0] -; CHECK-NEXT: vcge.s32 q8, q9, q8 +; CHECK-NEXT: vcgt.s32 q8, q8, q9 ; CHECK-NEXT: vmov.i32 q9, #0x1 -; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vadd.i32 q8, q8, q9 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -55,8 +54,7 @@ ; CHECK-NEXT: vld1.64 {d18, d19}, [r12] ; CHECK-NEXT: vmov d16, r0, r1 ; CHECK-NEXT: vceq.i32 q8, q8, q9 -; CHECK-NEXT: vmov.i32 q9, #0x1 -; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vneg.s32 q8, q8 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr diff --git a/llvm/test/CodeGen/ARM/cmp-bool.ll b/llvm/test/CodeGen/ARM/cmp-bool.ll --- a/llvm/test/CodeGen/ARM/cmp-bool.ll +++ b/llvm/test/CodeGen/ARM/cmp-bool.ll @@ -43,16 +43,18 @@ define void @bool_ne(i1 zeroext %a, i1 zeroext %b, ptr nocapture %c) nounwind { ; ARM-LABEL: bool_ne: ; ARM: @ %bb.0: @ %entry -; ARM-NEXT: cmp r0, r1 -; ARM-NEXT: bxeq lr +; ARM-NEXT: eor r0, r0, r1 +; ARM-NEXT: cmp r0, #1 +; ARM-NEXT: bxne lr ; ARM-NEXT: .LBB1_1: @ %if.then ; ARM-NEXT: bx r2 ; ; THUMB-LABEL: bool_ne: ; THUMB: @ %bb.0: @ %entry ; THUMB-NEXT: push {r7, lr} -; THUMB-NEXT: cmp r0, r1 -; THUMB-NEXT: beq .LBB1_2 +; THUMB-NEXT: eors 
r0, r1 +; THUMB-NEXT: cmp r0, #1 +; THUMB-NEXT: bne .LBB1_2 ; THUMB-NEXT: @ %bb.1: @ %if.then ; THUMB-NEXT: blx r2 ; THUMB-NEXT: .LBB1_2: @ %if.end @@ -60,9 +62,10 @@ ; ; THUMB2-LABEL: bool_ne: ; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: cmp r0, r1 -; THUMB2-NEXT: it eq -; THUMB2-NEXT: bxeq lr +; THUMB2-NEXT: eors r0, r1 +; THUMB2-NEXT: cmp r0, #1 +; THUMB2-NEXT: it ne +; THUMB2-NEXT: bxne lr ; THUMB2-NEXT: .LBB1_1: @ %if.then ; THUMB2-NEXT: bx r2 entry: diff --git a/llvm/test/CodeGen/ARM/cmp-peephole.ll b/llvm/test/CodeGen/ARM/cmp-peephole.ll --- a/llvm/test/CodeGen/ARM/cmp-peephole.ll +++ b/llvm/test/CodeGen/ARM/cmp-peephole.ll @@ -1265,9 +1265,9 @@ define void @br_on_binop_lt_zero(i32 %a, i32 %b) { ; ARM-LABEL: br_on_binop_lt_zero: ; ARM: @ %bb.0: -; ARM-NEXT: orr r1, r0, r1 +; ARM-NEXT: mov r1, #1 ; ARM-NEXT: cmp r1, #0 -; ARM-NEXT: bxhs lr +; ARM-NEXT: bxne lr ; ARM-NEXT: .LBB46_1: @ %true_br ; ARM-NEXT: push {r11, lr} ; ARM-NEXT: bl consume @@ -1277,9 +1277,9 @@ ; THUMB-LABEL: br_on_binop_lt_zero: ; THUMB: @ %bb.0: ; THUMB-NEXT: push {r7, lr} -; THUMB-NEXT: orrs r1, r0 +; THUMB-NEXT: movs r1, #1 ; THUMB-NEXT: cmp r1, #0 -; THUMB-NEXT: bhs .LBB46_2 +; THUMB-NEXT: bne .LBB46_2 ; THUMB-NEXT: @ %bb.1: @ %true_br ; THUMB-NEXT: bl consume ; THUMB-NEXT: .LBB46_2: @ %exit @@ -1287,10 +1287,10 @@ ; ; THUMB2-LABEL: br_on_binop_lt_zero: ; THUMB2: @ %bb.0: -; THUMB2-NEXT: orrs r1, r0 +; THUMB2-NEXT: movs r1, #1 ; THUMB2-NEXT: cmp r1, #0 -; THUMB2-NEXT: it hs -; THUMB2-NEXT: bxhs lr +; THUMB2-NEXT: it ne +; THUMB2-NEXT: bxne lr ; THUMB2-NEXT: .LBB46_1: @ %true_br ; THUMB2-NEXT: push {r7, lr} ; THUMB2-NEXT: bl consume @@ -1570,9 +1570,9 @@ define void @br_on_shift_lt_zero(i32 %a, i32 %b) { ; ARM-LABEL: br_on_shift_lt_zero: ; ARM: @ %bb.0: -; ARM-NEXT: asr r1, r0, r1 +; ARM-NEXT: mov r1, #1 ; ARM-NEXT: cmp r1, #0 -; ARM-NEXT: bxhs lr +; ARM-NEXT: bxne lr ; ARM-NEXT: .LBB53_1: @ %true_br ; ARM-NEXT: push {r11, lr} ; ARM-NEXT: bl consume @@ -1582,10 +1582,9 @@ ; THUMB-LABEL: br_on_shift_lt_zero: ; THUMB: @ %bb.0: ; THUMB-NEXT: push {r7, lr} -; THUMB-NEXT: mov r2, r0 -; THUMB-NEXT: asrs r2, r1 -; THUMB-NEXT: cmp r2, #0 -; THUMB-NEXT: bhs .LBB53_2 +; THUMB-NEXT: movs r1, #1 +; THUMB-NEXT: cmp r1, #0 +; THUMB-NEXT: bne .LBB53_2 ; THUMB-NEXT: @ %bb.1: @ %true_br ; THUMB-NEXT: bl consume ; THUMB-NEXT: .LBB53_2: @ %exit @@ -1593,10 +1592,10 @@ ; ; THUMB2-LABEL: br_on_shift_lt_zero: ; THUMB2: @ %bb.0: -; THUMB2-NEXT: asr.w r1, r0, r1 +; THUMB2-NEXT: movs r1, #1 ; THUMB2-NEXT: cmp r1, #0 -; THUMB2-NEXT: it hs -; THUMB2-NEXT: bxhs lr +; THUMB2-NEXT: it ne +; THUMB2-NEXT: bxne lr ; THUMB2-NEXT: .LBB53_1: @ %true_br ; THUMB2-NEXT: push {r7, lr} ; THUMB2-NEXT: bl consume diff --git a/llvm/test/CodeGen/ARM/cttz.ll b/llvm/test/CodeGen/ARM/cttz.ll --- a/llvm/test/CodeGen/ARM/cttz.ll +++ b/llvm/test/CodeGen/ARM/cttz.ll @@ -26,14 +26,15 @@ ; CHECK-6M-NEXT: lsls r1, r0, #24 ; CHECK-6M-NEXT: beq .LBB0_3 ; CHECK-6M-NEXT: @ %bb.1: @ %cond.false -; CHECK-6M-NEXT: cmp r0, #0 -; CHECK-6M-NEXT: beq .LBB0_4 -; CHECK-6M-NEXT: @ %bb.2: @ %cond.false ; CHECK-6M-NEXT: rsbs r1, r0, #0 -; CHECK-6M-NEXT: ands r1, r0 -; CHECK-6M-NEXT: ldr r0, .LCPI0_0 -; CHECK-6M-NEXT: muls r0, r1, r0 -; CHECK-6M-NEXT: lsrs r0, r0, #27 +; CHECK-6M-NEXT: mov r2, r0 +; CHECK-6M-NEXT: adcs r2, r1 +; CHECK-6M-NEXT: bne .LBB0_4 +; CHECK-6M-NEXT: @ %bb.2: @ %cond.false +; CHECK-6M-NEXT: ands r0, r1 +; CHECK-6M-NEXT: ldr r1, .LCPI0_0 +; CHECK-6M-NEXT: muls r1, r0, r1 +; CHECK-6M-NEXT: lsrs r0, r1, #27 ; CHECK-6M-NEXT: adr r1, .LCPI0_1 ; CHECK-6M-NEXT: ldrb 
r0, [r1, r0] ; CHECK-6M-NEXT: bx lr @@ -55,14 +56,16 @@ ; CHECK-8MBASE-NEXT: lsls r1, r0, #24 ; CHECK-8MBASE-NEXT: beq .LBB0_3 ; CHECK-8MBASE-NEXT: @ %bb.1: @ %cond.false -; CHECK-8MBASE-NEXT: cbz r0, .LBB0_4 -; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false ; CHECK-8MBASE-NEXT: rsbs r1, r0, #0 -; CHECK-8MBASE-NEXT: ands r1, r0 -; CHECK-8MBASE-NEXT: movw r0, #46385 -; CHECK-8MBASE-NEXT: movt r0, #1916 -; CHECK-8MBASE-NEXT: muls r0, r1, r0 -; CHECK-8MBASE-NEXT: lsrs r0, r0, #27 +; CHECK-8MBASE-NEXT: mov r2, r0 +; CHECK-8MBASE-NEXT: adcs r2, r1 +; CHECK-8MBASE-NEXT: bne .LBB0_4 +; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false +; CHECK-8MBASE-NEXT: ands r0, r1 +; CHECK-8MBASE-NEXT: movw r1, #46385 +; CHECK-8MBASE-NEXT: movt r1, #1916 +; CHECK-8MBASE-NEXT: muls r1, r0, r1 +; CHECK-8MBASE-NEXT: lsrs r0, r1, #27 ; CHECK-8MBASE-NEXT: adr r1, .LCPI0_0 ; CHECK-8MBASE-NEXT: ldrb r0, [r1, r0] ; CHECK-8MBASE-NEXT: bx lr @@ -93,14 +96,15 @@ ; CHECK-6M-NEXT: lsls r1, r0, #16 ; CHECK-6M-NEXT: beq .LBB1_3 ; CHECK-6M-NEXT: @ %bb.1: @ %cond.false -; CHECK-6M-NEXT: cmp r0, #0 -; CHECK-6M-NEXT: beq .LBB1_4 -; CHECK-6M-NEXT: @ %bb.2: @ %cond.false ; CHECK-6M-NEXT: rsbs r1, r0, #0 -; CHECK-6M-NEXT: ands r1, r0 -; CHECK-6M-NEXT: ldr r0, .LCPI1_0 -; CHECK-6M-NEXT: muls r0, r1, r0 -; CHECK-6M-NEXT: lsrs r0, r0, #27 +; CHECK-6M-NEXT: mov r2, r0 +; CHECK-6M-NEXT: adcs r2, r1 +; CHECK-6M-NEXT: bne .LBB1_4 +; CHECK-6M-NEXT: @ %bb.2: @ %cond.false +; CHECK-6M-NEXT: ands r0, r1 +; CHECK-6M-NEXT: ldr r1, .LCPI1_0 +; CHECK-6M-NEXT: muls r1, r0, r1 +; CHECK-6M-NEXT: lsrs r0, r1, #27 ; CHECK-6M-NEXT: adr r1, .LCPI1_1 ; CHECK-6M-NEXT: ldrb r0, [r1, r0] ; CHECK-6M-NEXT: bx lr @@ -122,14 +126,16 @@ ; CHECK-8MBASE-NEXT: lsls r1, r0, #16 ; CHECK-8MBASE-NEXT: beq .LBB1_3 ; CHECK-8MBASE-NEXT: @ %bb.1: @ %cond.false -; CHECK-8MBASE-NEXT: cbz r0, .LBB1_4 -; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false ; CHECK-8MBASE-NEXT: rsbs r1, r0, #0 -; CHECK-8MBASE-NEXT: ands r1, r0 -; CHECK-8MBASE-NEXT: movw r0, #46385 -; CHECK-8MBASE-NEXT: movt r0, #1916 -; CHECK-8MBASE-NEXT: muls r0, r1, r0 -; CHECK-8MBASE-NEXT: lsrs r0, r0, #27 +; CHECK-8MBASE-NEXT: mov r2, r0 +; CHECK-8MBASE-NEXT: adcs r2, r1 +; CHECK-8MBASE-NEXT: bne .LBB1_4 +; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false +; CHECK-8MBASE-NEXT: ands r0, r1 +; CHECK-8MBASE-NEXT: movw r1, #46385 +; CHECK-8MBASE-NEXT: movt r1, #1916 +; CHECK-8MBASE-NEXT: muls r1, r0, r1 +; CHECK-8MBASE-NEXT: lsrs r0, r1, #27 ; CHECK-8MBASE-NEXT: adr r1, .LCPI1_0 ; CHECK-8MBASE-NEXT: ldrb r0, [r1, r0] ; CHECK-8MBASE-NEXT: bx lr @@ -159,14 +165,15 @@ ; CHECK-6M-NEXT: cmp r0, #0 ; CHECK-6M-NEXT: beq .LBB2_3 ; CHECK-6M-NEXT: @ %bb.1: @ %cond.false -; CHECK-6M-NEXT: cmp r0, #0 -; CHECK-6M-NEXT: beq .LBB2_3 -; CHECK-6M-NEXT: @ %bb.2: @ %cond.false ; CHECK-6M-NEXT: rsbs r1, r0, #0 -; CHECK-6M-NEXT: ands r1, r0 -; CHECK-6M-NEXT: ldr r0, .LCPI2_0 -; CHECK-6M-NEXT: muls r0, r1, r0 -; CHECK-6M-NEXT: lsrs r0, r0, #27 +; CHECK-6M-NEXT: mov r2, r0 +; CHECK-6M-NEXT: adcs r2, r1 +; CHECK-6M-NEXT: bne .LBB2_3 +; CHECK-6M-NEXT: @ %bb.2: @ %cond.false +; CHECK-6M-NEXT: ands r0, r1 +; CHECK-6M-NEXT: ldr r1, .LCPI2_0 +; CHECK-6M-NEXT: muls r1, r0, r1 +; CHECK-6M-NEXT: lsrs r0, r1, #27 ; CHECK-6M-NEXT: adr r1, .LCPI2_1 ; CHECK-6M-NEXT: ldrb r0, [r1, r0] ; CHECK-6M-NEXT: bx lr @@ -184,14 +191,16 @@ ; CHECK-8MBASE: @ %bb.0: ; CHECK-8MBASE-NEXT: cbz r0, .LBB2_3 ; CHECK-8MBASE-NEXT: @ %bb.1: @ %cond.false -; CHECK-8MBASE-NEXT: cbz r0, .LBB2_3 -; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false ; CHECK-8MBASE-NEXT: rsbs r1, r0, #0 -; CHECK-8MBASE-NEXT: ands 
r1, r0 -; CHECK-8MBASE-NEXT: movw r0, #46385 -; CHECK-8MBASE-NEXT: movt r0, #1916 -; CHECK-8MBASE-NEXT: muls r0, r1, r0 -; CHECK-8MBASE-NEXT: lsrs r0, r0, #27 +; CHECK-8MBASE-NEXT: mov r2, r0 +; CHECK-8MBASE-NEXT: adcs r2, r1 +; CHECK-8MBASE-NEXT: bne .LBB2_3 +; CHECK-8MBASE-NEXT: @ %bb.2: @ %cond.false +; CHECK-8MBASE-NEXT: ands r0, r1 +; CHECK-8MBASE-NEXT: movw r1, #46385 +; CHECK-8MBASE-NEXT: movt r1, #1916 +; CHECK-8MBASE-NEXT: muls r1, r0, r1 +; CHECK-8MBASE-NEXT: lsrs r0, r1, #27 ; CHECK-8MBASE-NEXT: adr r1, .LCPI2_0 ; CHECK-8MBASE-NEXT: ldrb r0, [r1, r0] ; CHECK-8MBASE-NEXT: bx lr @@ -221,44 +230,42 @@ ; ; CHECK-6M-LABEL: test_i64: ; CHECK-6M: @ %bb.0: -; CHECK-6M-NEXT: .save {r4, r5, r7, lr} -; CHECK-6M-NEXT: push {r4, r5, r7, lr} +; CHECK-6M-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-6M-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-6M-NEXT: rsbs r6, r0, #0 +; CHECK-6M-NEXT: mov r2, r0 +; CHECK-6M-NEXT: adcs r2, r6 ; CHECK-6M-NEXT: ldr r5, .LCPI3_0 ; CHECK-6M-NEXT: adr r4, .LCPI3_1 ; CHECK-6M-NEXT: movs r3, #32 -; CHECK-6M-NEXT: cmp r0, #0 +; CHECK-6M-NEXT: cmp r2, #0 ; CHECK-6M-NEXT: mov r2, r3 -; CHECK-6M-NEXT: bne .LBB3_5 +; CHECK-6M-NEXT: bne .LBB3_2 ; CHECK-6M-NEXT: @ %bb.1: -; CHECK-6M-NEXT: cmp r1, #0 -; CHECK-6M-NEXT: bne .LBB3_6 +; CHECK-6M-NEXT: ands r6, r0 +; CHECK-6M-NEXT: muls r6, r5, r6 +; CHECK-6M-NEXT: lsrs r2, r6, #27 +; CHECK-6M-NEXT: ldrb r2, [r4, r2] ; CHECK-6M-NEXT: .LBB3_2: -; CHECK-6M-NEXT: cmp r0, #0 +; CHECK-6M-NEXT: rsbs r6, r1, #0 +; CHECK-6M-NEXT: mov r7, r1 +; CHECK-6M-NEXT: adcs r7, r6 ; CHECK-6M-NEXT: bne .LBB3_4 -; CHECK-6M-NEXT: .LBB3_3: +; CHECK-6M-NEXT: @ %bb.3: +; CHECK-6M-NEXT: ands r1, r6 +; CHECK-6M-NEXT: muls r5, r1, r5 +; CHECK-6M-NEXT: lsrs r1, r5, #27 +; CHECK-6M-NEXT: ldrb r3, [r4, r1] +; CHECK-6M-NEXT: .LBB3_4: +; CHECK-6M-NEXT: cmp r0, #0 +; CHECK-6M-NEXT: bne .LBB3_6 +; CHECK-6M-NEXT: @ %bb.5: ; CHECK-6M-NEXT: adds r3, #32 ; CHECK-6M-NEXT: mov r2, r3 -; CHECK-6M-NEXT: .LBB3_4: +; CHECK-6M-NEXT: .LBB3_6: ; CHECK-6M-NEXT: movs r1, #0 ; CHECK-6M-NEXT: mov r0, r2 -; CHECK-6M-NEXT: pop {r4, r5, r7, pc} -; CHECK-6M-NEXT: .LBB3_5: -; CHECK-6M-NEXT: rsbs r2, r0, #0 -; CHECK-6M-NEXT: ands r2, r0 -; CHECK-6M-NEXT: muls r2, r5, r2 -; CHECK-6M-NEXT: lsrs r2, r2, #27 -; CHECK-6M-NEXT: ldrb r2, [r4, r2] -; CHECK-6M-NEXT: cmp r1, #0 -; CHECK-6M-NEXT: beq .LBB3_2 -; CHECK-6M-NEXT: .LBB3_6: -; CHECK-6M-NEXT: rsbs r3, r1, #0 -; CHECK-6M-NEXT: ands r3, r1 -; CHECK-6M-NEXT: muls r5, r3, r5 -; CHECK-6M-NEXT: lsrs r1, r5, #27 -; CHECK-6M-NEXT: ldrb r3, [r4, r1] -; CHECK-6M-NEXT: cmp r0, #0 -; CHECK-6M-NEXT: beq .LBB3_3 -; CHECK-6M-NEXT: b .LBB3_4 +; CHECK-6M-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-6M-NEXT: .p2align 2 ; CHECK-6M-NEXT: @ %bb.7: ; CHECK-6M-NEXT: .LCPI3_0: @@ -268,42 +275,42 @@ ; ; CHECK-8MBASE-LABEL: test_i64: ; CHECK-8MBASE: @ %bb.0: -; CHECK-8MBASE-NEXT: .save {r4, r5, r7, lr} -; CHECK-8MBASE-NEXT: push {r4, r5, r7, lr} +; CHECK-8MBASE-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-8MBASE-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-8MBASE-NEXT: rsbs r6, r0, #0 +; CHECK-8MBASE-NEXT: mov r2, r0 +; CHECK-8MBASE-NEXT: adcs r2, r6 ; CHECK-8MBASE-NEXT: movw r5, #46385 ; CHECK-8MBASE-NEXT: movt r5, #1916 ; CHECK-8MBASE-NEXT: adr r4, .LCPI3_0 ; CHECK-8MBASE-NEXT: movs r3, #32 +; CHECK-8MBASE-NEXT: cmp r2, #0 ; CHECK-8MBASE-NEXT: mov r2, r3 -; CHECK-8MBASE-NEXT: cbnz r0, .LBB3_5 +; CHECK-8MBASE-NEXT: bne .LBB3_2 ; CHECK-8MBASE-NEXT: @ %bb.1: -; CHECK-8MBASE-NEXT: cbnz r1, .LBB3_6 +; CHECK-8MBASE-NEXT: ands r6, r0 +; CHECK-8MBASE-NEXT: muls r6, r5, r6 +; 
CHECK-8MBASE-NEXT: lsrs r2, r6, #27 +; CHECK-8MBASE-NEXT: ldrb r2, [r4, r2] ; CHECK-8MBASE-NEXT: .LBB3_2: -; CHECK-8MBASE-NEXT: cbnz r0, .LBB3_4 -; CHECK-8MBASE-NEXT: .LBB3_3: +; CHECK-8MBASE-NEXT: rsbs r6, r1, #0 +; CHECK-8MBASE-NEXT: mov r7, r1 +; CHECK-8MBASE-NEXT: adcs r7, r6 +; CHECK-8MBASE-NEXT: bne .LBB3_4 +; CHECK-8MBASE-NEXT: @ %bb.3: +; CHECK-8MBASE-NEXT: ands r1, r6 +; CHECK-8MBASE-NEXT: muls r5, r1, r5 +; CHECK-8MBASE-NEXT: lsrs r1, r5, #27 +; CHECK-8MBASE-NEXT: ldrb r3, [r4, r1] +; CHECK-8MBASE-NEXT: .LBB3_4: +; CHECK-8MBASE-NEXT: cbnz r0, .LBB3_6 +; CHECK-8MBASE-NEXT: @ %bb.5: ; CHECK-8MBASE-NEXT: adds r3, #32 ; CHECK-8MBASE-NEXT: mov r2, r3 -; CHECK-8MBASE-NEXT: .LBB3_4: +; CHECK-8MBASE-NEXT: .LBB3_6: ; CHECK-8MBASE-NEXT: movs r1, #0 ; CHECK-8MBASE-NEXT: mov r0, r2 -; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc} -; CHECK-8MBASE-NEXT: .LBB3_5: -; CHECK-8MBASE-NEXT: rsbs r2, r0, #0 -; CHECK-8MBASE-NEXT: ands r2, r0 -; CHECK-8MBASE-NEXT: muls r2, r5, r2 -; CHECK-8MBASE-NEXT: lsrs r2, r2, #27 -; CHECK-8MBASE-NEXT: ldrb r2, [r4, r2] -; CHECK-8MBASE-NEXT: cmp r1, #0 -; CHECK-8MBASE-NEXT: beq .LBB3_2 -; CHECK-8MBASE-NEXT: .LBB3_6: -; CHECK-8MBASE-NEXT: rsbs r3, r1, #0 -; CHECK-8MBASE-NEXT: ands r3, r1 -; CHECK-8MBASE-NEXT: muls r5, r3, r5 -; CHECK-8MBASE-NEXT: lsrs r1, r5, #27 -; CHECK-8MBASE-NEXT: ldrb r3, [r4, r1] -; CHECK-8MBASE-NEXT: cmp r0, #0 -; CHECK-8MBASE-NEXT: beq .LBB3_3 -; CHECK-8MBASE-NEXT: b .LBB3_4 +; CHECK-8MBASE-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-8MBASE-NEXT: .p2align 2 ; CHECK-8MBASE-NEXT: @ %bb.7: ; CHECK-8MBASE-NEXT: .LCPI3_0: @@ -323,14 +330,15 @@ ; ; CHECK-6M-LABEL: test_i8_zero_undef: ; CHECK-6M: @ %bb.0: -; CHECK-6M-NEXT: cmp r0, #0 -; CHECK-6M-NEXT: beq .LBB4_2 -; CHECK-6M-NEXT: @ %bb.1: ; CHECK-6M-NEXT: rsbs r1, r0, #0 -; CHECK-6M-NEXT: ands r1, r0 -; CHECK-6M-NEXT: ldr r0, .LCPI4_0 -; CHECK-6M-NEXT: muls r0, r1, r0 -; CHECK-6M-NEXT: lsrs r0, r0, #27 +; CHECK-6M-NEXT: mov r2, r0 +; CHECK-6M-NEXT: adcs r2, r1 +; CHECK-6M-NEXT: bne .LBB4_2 +; CHECK-6M-NEXT: @ %bb.1: +; CHECK-6M-NEXT: ands r0, r1 +; CHECK-6M-NEXT: ldr r1, .LCPI4_0 +; CHECK-6M-NEXT: muls r1, r0, r1 +; CHECK-6M-NEXT: lsrs r0, r1, #27 ; CHECK-6M-NEXT: adr r1, .LCPI4_1 ; CHECK-6M-NEXT: ldrb r0, [r1, r0] ; CHECK-6M-NEXT: bx lr @@ -346,14 +354,16 @@ ; ; CHECK-8MBASE-LABEL: test_i8_zero_undef: ; CHECK-8MBASE: @ %bb.0: -; CHECK-8MBASE-NEXT: cbz r0, .LBB4_2 -; CHECK-8MBASE-NEXT: @ %bb.1: ; CHECK-8MBASE-NEXT: rsbs r1, r0, #0 -; CHECK-8MBASE-NEXT: ands r1, r0 -; CHECK-8MBASE-NEXT: movw r0, #46385 -; CHECK-8MBASE-NEXT: movt r0, #1916 -; CHECK-8MBASE-NEXT: muls r0, r1, r0 -; CHECK-8MBASE-NEXT: lsrs r0, r0, #27 +; CHECK-8MBASE-NEXT: mov r2, r0 +; CHECK-8MBASE-NEXT: adcs r2, r1 +; CHECK-8MBASE-NEXT: bne .LBB4_2 +; CHECK-8MBASE-NEXT: @ %bb.1: +; CHECK-8MBASE-NEXT: ands r0, r1 +; CHECK-8MBASE-NEXT: movw r1, #46385 +; CHECK-8MBASE-NEXT: movt r1, #1916 +; CHECK-8MBASE-NEXT: muls r1, r0, r1 +; CHECK-8MBASE-NEXT: lsrs r0, r1, #27 ; CHECK-8MBASE-NEXT: adr r1, .LCPI4_0 ; CHECK-8MBASE-NEXT: ldrb r0, [r1, r0] ; CHECK-8MBASE-NEXT: bx lr @@ -377,14 +387,15 @@ ; ; CHECK-6M-LABEL: test_i16_zero_undef: ; CHECK-6M: @ %bb.0: -; CHECK-6M-NEXT: cmp r0, #0 -; CHECK-6M-NEXT: beq .LBB5_2 -; CHECK-6M-NEXT: @ %bb.1: ; CHECK-6M-NEXT: rsbs r1, r0, #0 -; CHECK-6M-NEXT: ands r1, r0 -; CHECK-6M-NEXT: ldr r0, .LCPI5_0 -; CHECK-6M-NEXT: muls r0, r1, r0 -; CHECK-6M-NEXT: lsrs r0, r0, #27 +; CHECK-6M-NEXT: mov r2, r0 +; CHECK-6M-NEXT: adcs r2, r1 +; CHECK-6M-NEXT: bne .LBB5_2 +; CHECK-6M-NEXT: @ %bb.1: +; CHECK-6M-NEXT: ands r0, r1 +; 
CHECK-6M-NEXT: ldr r1, .LCPI5_0 +; CHECK-6M-NEXT: muls r1, r0, r1 +; CHECK-6M-NEXT: lsrs r0, r1, #27 ; CHECK-6M-NEXT: adr r1, .LCPI5_1 ; CHECK-6M-NEXT: ldrb r0, [r1, r0] ; CHECK-6M-NEXT: bx lr @@ -400,14 +411,16 @@ ; ; CHECK-8MBASE-LABEL: test_i16_zero_undef: ; CHECK-8MBASE: @ %bb.0: -; CHECK-8MBASE-NEXT: cbz r0, .LBB5_2 -; CHECK-8MBASE-NEXT: @ %bb.1: ; CHECK-8MBASE-NEXT: rsbs r1, r0, #0 -; CHECK-8MBASE-NEXT: ands r1, r0 -; CHECK-8MBASE-NEXT: movw r0, #46385 -; CHECK-8MBASE-NEXT: movt r0, #1916 -; CHECK-8MBASE-NEXT: muls r0, r1, r0 -; CHECK-8MBASE-NEXT: lsrs r0, r0, #27 +; CHECK-8MBASE-NEXT: mov r2, r0 +; CHECK-8MBASE-NEXT: adcs r2, r1 +; CHECK-8MBASE-NEXT: bne .LBB5_2 +; CHECK-8MBASE-NEXT: @ %bb.1: +; CHECK-8MBASE-NEXT: ands r0, r1 +; CHECK-8MBASE-NEXT: movw r1, #46385 +; CHECK-8MBASE-NEXT: movt r1, #1916 +; CHECK-8MBASE-NEXT: muls r1, r0, r1 +; CHECK-8MBASE-NEXT: lsrs r0, r1, #27 ; CHECK-8MBASE-NEXT: adr r1, .LCPI5_0 ; CHECK-8MBASE-NEXT: ldrb r0, [r1, r0] ; CHECK-8MBASE-NEXT: bx lr @@ -432,14 +445,15 @@ ; ; CHECK-6M-LABEL: test_i32_zero_undef: ; CHECK-6M: @ %bb.0: -; CHECK-6M-NEXT: cmp r0, #0 -; CHECK-6M-NEXT: beq .LBB6_2 -; CHECK-6M-NEXT: @ %bb.1: ; CHECK-6M-NEXT: rsbs r1, r0, #0 -; CHECK-6M-NEXT: ands r1, r0 -; CHECK-6M-NEXT: ldr r0, .LCPI6_0 -; CHECK-6M-NEXT: muls r0, r1, r0 -; CHECK-6M-NEXT: lsrs r0, r0, #27 +; CHECK-6M-NEXT: mov r2, r0 +; CHECK-6M-NEXT: adcs r2, r1 +; CHECK-6M-NEXT: bne .LBB6_2 +; CHECK-6M-NEXT: @ %bb.1: +; CHECK-6M-NEXT: ands r0, r1 +; CHECK-6M-NEXT: ldr r1, .LCPI6_0 +; CHECK-6M-NEXT: muls r1, r0, r1 +; CHECK-6M-NEXT: lsrs r0, r1, #27 ; CHECK-6M-NEXT: adr r1, .LCPI6_1 ; CHECK-6M-NEXT: ldrb r0, [r1, r0] ; CHECK-6M-NEXT: bx lr @@ -455,14 +469,16 @@ ; ; CHECK-8MBASE-LABEL: test_i32_zero_undef: ; CHECK-8MBASE: @ %bb.0: -; CHECK-8MBASE-NEXT: cbz r0, .LBB6_2 -; CHECK-8MBASE-NEXT: @ %bb.1: ; CHECK-8MBASE-NEXT: rsbs r1, r0, #0 -; CHECK-8MBASE-NEXT: ands r1, r0 -; CHECK-8MBASE-NEXT: movw r0, #46385 -; CHECK-8MBASE-NEXT: movt r0, #1916 -; CHECK-8MBASE-NEXT: muls r0, r1, r0 -; CHECK-8MBASE-NEXT: lsrs r0, r0, #27 +; CHECK-8MBASE-NEXT: mov r2, r0 +; CHECK-8MBASE-NEXT: adcs r2, r1 +; CHECK-8MBASE-NEXT: bne .LBB6_2 +; CHECK-8MBASE-NEXT: @ %bb.1: +; CHECK-8MBASE-NEXT: ands r0, r1 +; CHECK-8MBASE-NEXT: movw r1, #46385 +; CHECK-8MBASE-NEXT: movt r1, #1916 +; CHECK-8MBASE-NEXT: muls r1, r0, r1 +; CHECK-8MBASE-NEXT: lsrs r0, r1, #27 ; CHECK-8MBASE-NEXT: adr r1, .LCPI6_0 ; CHECK-8MBASE-NEXT: ldrb r0, [r1, r0] ; CHECK-8MBASE-NEXT: bx lr @@ -492,44 +508,42 @@ ; ; CHECK-6M-LABEL: test_i64_zero_undef: ; CHECK-6M: @ %bb.0: -; CHECK-6M-NEXT: .save {r4, r5, r7, lr} -; CHECK-6M-NEXT: push {r4, r5, r7, lr} +; CHECK-6M-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-6M-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-6M-NEXT: rsbs r6, r0, #0 +; CHECK-6M-NEXT: mov r2, r0 +; CHECK-6M-NEXT: adcs r2, r6 ; CHECK-6M-NEXT: ldr r5, .LCPI7_0 ; CHECK-6M-NEXT: adr r4, .LCPI7_1 ; CHECK-6M-NEXT: movs r3, #32 -; CHECK-6M-NEXT: cmp r0, #0 +; CHECK-6M-NEXT: cmp r2, #0 ; CHECK-6M-NEXT: mov r2, r3 -; CHECK-6M-NEXT: bne .LBB7_5 +; CHECK-6M-NEXT: bne .LBB7_2 ; CHECK-6M-NEXT: @ %bb.1: -; CHECK-6M-NEXT: cmp r1, #0 -; CHECK-6M-NEXT: bne .LBB7_6 +; CHECK-6M-NEXT: ands r6, r0 +; CHECK-6M-NEXT: muls r6, r5, r6 +; CHECK-6M-NEXT: lsrs r2, r6, #27 +; CHECK-6M-NEXT: ldrb r2, [r4, r2] ; CHECK-6M-NEXT: .LBB7_2: -; CHECK-6M-NEXT: cmp r0, #0 +; CHECK-6M-NEXT: rsbs r6, r1, #0 +; CHECK-6M-NEXT: mov r7, r1 +; CHECK-6M-NEXT: adcs r7, r6 ; CHECK-6M-NEXT: bne .LBB7_4 -; CHECK-6M-NEXT: .LBB7_3: +; CHECK-6M-NEXT: @ %bb.3: +; CHECK-6M-NEXT: ands r1, 
r6 +; CHECK-6M-NEXT: muls r5, r1, r5 +; CHECK-6M-NEXT: lsrs r1, r5, #27 +; CHECK-6M-NEXT: ldrb r3, [r4, r1] +; CHECK-6M-NEXT: .LBB7_4: +; CHECK-6M-NEXT: cmp r0, #0 +; CHECK-6M-NEXT: bne .LBB7_6 +; CHECK-6M-NEXT: @ %bb.5: ; CHECK-6M-NEXT: adds r3, #32 ; CHECK-6M-NEXT: mov r2, r3 -; CHECK-6M-NEXT: .LBB7_4: +; CHECK-6M-NEXT: .LBB7_6: ; CHECK-6M-NEXT: movs r1, #0 ; CHECK-6M-NEXT: mov r0, r2 -; CHECK-6M-NEXT: pop {r4, r5, r7, pc} -; CHECK-6M-NEXT: .LBB7_5: -; CHECK-6M-NEXT: rsbs r2, r0, #0 -; CHECK-6M-NEXT: ands r2, r0 -; CHECK-6M-NEXT: muls r2, r5, r2 -; CHECK-6M-NEXT: lsrs r2, r2, #27 -; CHECK-6M-NEXT: ldrb r2, [r4, r2] -; CHECK-6M-NEXT: cmp r1, #0 -; CHECK-6M-NEXT: beq .LBB7_2 -; CHECK-6M-NEXT: .LBB7_6: -; CHECK-6M-NEXT: rsbs r3, r1, #0 -; CHECK-6M-NEXT: ands r3, r1 -; CHECK-6M-NEXT: muls r5, r3, r5 -; CHECK-6M-NEXT: lsrs r1, r5, #27 -; CHECK-6M-NEXT: ldrb r3, [r4, r1] -; CHECK-6M-NEXT: cmp r0, #0 -; CHECK-6M-NEXT: beq .LBB7_3 -; CHECK-6M-NEXT: b .LBB7_4 +; CHECK-6M-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-6M-NEXT: .p2align 2 ; CHECK-6M-NEXT: @ %bb.7: ; CHECK-6M-NEXT: .LCPI7_0: @@ -539,42 +553,42 @@ ; ; CHECK-8MBASE-LABEL: test_i64_zero_undef: ; CHECK-8MBASE: @ %bb.0: -; CHECK-8MBASE-NEXT: .save {r4, r5, r7, lr} -; CHECK-8MBASE-NEXT: push {r4, r5, r7, lr} +; CHECK-8MBASE-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-8MBASE-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-8MBASE-NEXT: rsbs r6, r0, #0 +; CHECK-8MBASE-NEXT: mov r2, r0 +; CHECK-8MBASE-NEXT: adcs r2, r6 ; CHECK-8MBASE-NEXT: movw r5, #46385 ; CHECK-8MBASE-NEXT: movt r5, #1916 ; CHECK-8MBASE-NEXT: adr r4, .LCPI7_0 ; CHECK-8MBASE-NEXT: movs r3, #32 +; CHECK-8MBASE-NEXT: cmp r2, #0 ; CHECK-8MBASE-NEXT: mov r2, r3 -; CHECK-8MBASE-NEXT: cbnz r0, .LBB7_5 +; CHECK-8MBASE-NEXT: bne .LBB7_2 ; CHECK-8MBASE-NEXT: @ %bb.1: -; CHECK-8MBASE-NEXT: cbnz r1, .LBB7_6 +; CHECK-8MBASE-NEXT: ands r6, r0 +; CHECK-8MBASE-NEXT: muls r6, r5, r6 +; CHECK-8MBASE-NEXT: lsrs r2, r6, #27 +; CHECK-8MBASE-NEXT: ldrb r2, [r4, r2] ; CHECK-8MBASE-NEXT: .LBB7_2: -; CHECK-8MBASE-NEXT: cbnz r0, .LBB7_4 -; CHECK-8MBASE-NEXT: .LBB7_3: +; CHECK-8MBASE-NEXT: rsbs r6, r1, #0 +; CHECK-8MBASE-NEXT: mov r7, r1 +; CHECK-8MBASE-NEXT: adcs r7, r6 +; CHECK-8MBASE-NEXT: bne .LBB7_4 +; CHECK-8MBASE-NEXT: @ %bb.3: +; CHECK-8MBASE-NEXT: ands r1, r6 +; CHECK-8MBASE-NEXT: muls r5, r1, r5 +; CHECK-8MBASE-NEXT: lsrs r1, r5, #27 +; CHECK-8MBASE-NEXT: ldrb r3, [r4, r1] +; CHECK-8MBASE-NEXT: .LBB7_4: +; CHECK-8MBASE-NEXT: cbnz r0, .LBB7_6 +; CHECK-8MBASE-NEXT: @ %bb.5: ; CHECK-8MBASE-NEXT: adds r3, #32 ; CHECK-8MBASE-NEXT: mov r2, r3 -; CHECK-8MBASE-NEXT: .LBB7_4: +; CHECK-8MBASE-NEXT: .LBB7_6: ; CHECK-8MBASE-NEXT: movs r1, #0 ; CHECK-8MBASE-NEXT: mov r0, r2 -; CHECK-8MBASE-NEXT: pop {r4, r5, r7, pc} -; CHECK-8MBASE-NEXT: .LBB7_5: -; CHECK-8MBASE-NEXT: rsbs r2, r0, #0 -; CHECK-8MBASE-NEXT: ands r2, r0 -; CHECK-8MBASE-NEXT: muls r2, r5, r2 -; CHECK-8MBASE-NEXT: lsrs r2, r2, #27 -; CHECK-8MBASE-NEXT: ldrb r2, [r4, r2] -; CHECK-8MBASE-NEXT: cmp r1, #0 -; CHECK-8MBASE-NEXT: beq .LBB7_2 -; CHECK-8MBASE-NEXT: .LBB7_6: -; CHECK-8MBASE-NEXT: rsbs r3, r1, #0 -; CHECK-8MBASE-NEXT: ands r3, r1 -; CHECK-8MBASE-NEXT: muls r5, r3, r5 -; CHECK-8MBASE-NEXT: lsrs r1, r5, #27 -; CHECK-8MBASE-NEXT: ldrb r3, [r4, r1] -; CHECK-8MBASE-NEXT: cmp r0, #0 -; CHECK-8MBASE-NEXT: beq .LBB7_3 -; CHECK-8MBASE-NEXT: b .LBB7_4 +; CHECK-8MBASE-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-8MBASE-NEXT: .p2align 2 ; CHECK-8MBASE-NEXT: @ %bb.7: ; CHECK-8MBASE-NEXT: .LCPI7_0: diff --git a/llvm/test/CodeGen/ARM/cttz_vector.ll 
b/llvm/test/CodeGen/ARM/cttz_vector.ll --- a/llvm/test/CodeGen/ARM/cttz_vector.ll +++ b/llvm/test/CodeGen/ARM/cttz_vector.ll @@ -42,14 +42,16 @@ ; CHECK-LABEL: test_v2i8: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.16 {d16[0]}, [r0:16] +; CHECK-NEXT: vmov.i32 d19, #0x1 ; CHECK-NEXT: vmovl.u8 q8, d16 ; CHECK-NEXT: vmovl.u16 q8, d16 ; CHECK-NEXT: vorr.i32 d16, #0x100 ; CHECK-NEXT: vneg.s32 d18, d16 ; CHECK-NEXT: vand d16, d16, d18 -; CHECK-NEXT: vmov.i32 d17, #0x1f -; CHECK-NEXT: vclz.i32 d16, d16 -; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vsub.i32 d16, d16, d19 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vpaddl.u16 d16, d16 ; CHECK-NEXT: vmov.32 r1, d16[1] ; CHECK-NEXT: vmov.32 r2, d16[0] ; CHECK-NEXT: strb r1, [r0, #1] @@ -65,13 +67,14 @@ ; CHECK-LABEL: test_v4i8: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmov.i16 d19, #0x1 ; CHECK-NEXT: vmovl.u8 q8, d16 ; CHECK-NEXT: vorr.i16 d16, #0x100 ; CHECK-NEXT: vneg.s16 d18, d16 ; CHECK-NEXT: vand d16, d16, d18 -; CHECK-NEXT: vmov.i16 d17, #0xf -; CHECK-NEXT: vclz.i16 d16, d16 -; CHECK-NEXT: vsub.i16 d16, d17, d16 +; CHECK-NEXT: vsub.i16 d16, d16, d19 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 ; CHECK-NEXT: vuzp.8 d16, d17 ; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32] ; CHECK-NEXT: bx lr @@ -134,13 +137,15 @@ ; CHECK-LABEL: test_v2i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmov.i32 d19, #0x1 ; CHECK-NEXT: vmovl.u16 q8, d16 ; CHECK-NEXT: vorr.i32 d16, #0x10000 ; CHECK-NEXT: vneg.s32 d18, d16 ; CHECK-NEXT: vand d16, d16, d18 -; CHECK-NEXT: vmov.i32 d17, #0x1f -; CHECK-NEXT: vclz.i32 d16, d16 -; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vsub.i32 d16, d16, d19 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vpaddl.u16 d16, d16 ; CHECK-NEXT: vuzp.16 d16, d17 ; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll b/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll --- a/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll +++ b/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll @@ -44,14 +44,9 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] ; CHECK-NEXT: vmovl.u8 q8, d16 -; CHECK-NEXT: vmov.u16 r0, d16[0] -; CHECK-NEXT: vmov.u16 r1, d16[1] -; CHECK-NEXT: vmov.u16 r2, d16[2] -; CHECK-NEXT: vmov.u16 r3, d16[3] -; CHECK-NEXT: uxtb r0, r0 -; CHECK-NEXT: uxtb r1, r1 -; CHECK-NEXT: uxtb r2, r2 -; CHECK-NEXT: uxtb r3, r3 +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: bx lr %1 = load <4 x i8>, ptr %in, align 4 %2 = extractelement <4 x i8> %1, i32 0 diff --git a/llvm/test/CodeGen/ARM/dsp-mlal.ll b/llvm/test/CodeGen/ARM/dsp-mlal.ll --- a/llvm/test/CodeGen/ARM/dsp-mlal.ll +++ b/llvm/test/CodeGen/ARM/dsp-mlal.ll @@ -6,28 +6,36 @@ define hidden i32 @SMMULR_SMMLAR(i32 %a, i32 %b0, i32 %b1, i32 %Xn, i32 %Xn1) local_unnamed_addr { ; DSP-LABEL: SMMULR_SMMLAR: ; DSP: @ %bb.0: @ %entry -; DSP-NEXT: ldr r0, [sp] -; DSP-NEXT: smmulr r0, r0, r2 -; DSP-NEXT: smmlar r0, r3, r1, r0 +; DSP-NEXT: smull r0, r1, r3, r1 +; DSP-NEXT: ldr r3, [sp] +; DSP-NEXT: smull r2, r3, r3, r2 +; DSP-NEXT: adds.w r2, r2, #-2147483648 +; DSP-NEXT: adcs r1, r3 +; DSP-NEXT: adds.w r0, r0, #-2147483648 +; DSP-NEXT: adc r0, r1, #0 ; DSP-NEXT: bx lr ; ; ARM7-LABEL: SMMULR_SMMLAR: ; ARM7: @ %bb.0: @ %entry ; ARM7-NEXT: ldr r0, [sp] -; ARM7-NEXT: smmulr r0, r0, r2 -; ARM7-NEXT: smmlar r0, r3, r1, r0 +; 
ARM7-NEXT: smull r1, r3, r3, r1 +; ARM7-NEXT: smull r0, r2, r0, r2 +; ARM7-NEXT: adds r0, r0, #-2147483648 +; ARM7-NEXT: adc r0, r3, r2 +; ARM7-NEXT: adds r1, r1, #-2147483648 +; ARM7-NEXT: adc r0, r0, #0 ; ARM7-NEXT: bx lr ; ; NODSP-LABEL: SMMULR_SMMLAR: ; NODSP: @ %bb.0: @ %entry -; NODSP-NEXT: push {r4, lr} -; NODSP-NEXT: ldr.w lr, [sp, #8] -; NODSP-NEXT: movs r0, #0 -; NODSP-NEXT: mov.w r4, #-2147483648 -; NODSP-NEXT: mov.w r12, #-2147483648 -; NODSP-NEXT: smlal r4, r0, lr, r2 -; NODSP-NEXT: smlal r12, r0, r3, r1 -; NODSP-NEXT: pop {r4, pc} +; NODSP-NEXT: smull r0, r1, r3, r1 +; NODSP-NEXT: ldr r3, [sp] +; NODSP-NEXT: smull r2, r3, r3, r2 +; NODSP-NEXT: adds.w r2, r2, #-2147483648 +; NODSP-NEXT: adcs r1, r3 +; NODSP-NEXT: adds.w r0, r0, #-2147483648 +; NODSP-NEXT: adc r0, r1, #0 +; NODSP-NEXT: bx lr entry: %conv = sext i32 %b1 to i64 %conv1 = sext i32 %Xn1 to i64 diff --git a/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll b/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll --- a/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll +++ b/llvm/test/CodeGen/ARM/fadd-select-fneg-combine.ll @@ -4,12 +4,14 @@ define float @fadd_select_fneg_fneg_f32(i32 %arg0, float %x, float %y, float %z) { ; CHECK-LABEL: fadd_select_fneg_fneg_f32: ; CHECK: @ %bb.0: +; CHECK-NEXT: eor r2, r2, #-2147483648 +; CHECK-NEXT: eor r1, r1, #-2147483648 ; CHECK-NEXT: vmov s0, r3 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: vmov s2, r2 ; CHECK-NEXT: vmov s4, r1 ; CHECK-NEXT: vseleq.f32 s2, s4, s2 -; CHECK-NEXT: vsub.f32 s0, s0, s2 +; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr %cmp = icmp eq i32 %arg0, 0 @@ -26,9 +28,11 @@ ; CHECK-NEXT: vmov.f16 s0, r2 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: vmov.f16 s2, r1 +; CHECK-NEXT: vneg.f16 s0, s0 +; CHECK-NEXT: vneg.f16 s2, s2 ; CHECK-NEXT: vseleq.f16 s0, s2, s0 ; CHECK-NEXT: vmov.f16 s2, r3 -; CHECK-NEXT: vsub.f16 s0, s2, s0 +; CHECK-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr %cmp = icmp eq i32 %arg0, 0 @@ -249,11 +253,12 @@ ; CHECK-LABEL: fadd_select_fneg_negk_f16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov.f16 s0, r1 -; CHECK-NEXT: vmov.f16 s2, #4.000000e+00 +; CHECK-NEXT: vmov.f16 s2, #-4.000000e+00 +; CHECK-NEXT: vneg.f16 s0, s0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: vseleq.f16 s0, s0, s2 ; CHECK-NEXT: vmov.f16 s2, r2 -; CHECK-NEXT: vsub.f16 s0, s2, s0 +; CHECK-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr %cmp = icmp eq i32 %arg0, 0 @@ -267,11 +272,12 @@ ; CHECK-LABEL: fadd_select_fneg_posk_f16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov.f16 s0, r1 -; CHECK-NEXT: vmov.f16 s2, #-4.000000e+00 +; CHECK-NEXT: vmov.f16 s2, #4.000000e+00 +; CHECK-NEXT: vneg.f16 s0, s0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: vseleq.f16 s0, s0, s2 ; CHECK-NEXT: vmov.f16 s2, r2 -; CHECK-NEXT: vsub.f16 s0, s2, s0 +; CHECK-NEXT: vadd.f16 s0, s0, s2 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: bx lr %cmp = icmp eq i32 %arg0, 0 diff --git a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll --- a/llvm/test/CodeGen/ARM/fp16-fullfp16.ll +++ b/llvm/test/CodeGen/ARM/fp16-fullfp16.ll @@ -482,9 +482,11 @@ ; CHECK-NEXT: vstr.16 s0, [sp] ; CHECK-NEXT: vldr.16 s0, [r0] ; CHECK-NEXT: ldrb r1, [sp, #1] +; CHECK-NEXT: ands r1, r1, #128 ; CHECK-NEXT: vabs.f16 s0, s0 -; CHECK-NEXT: tst r1, #128 +; CHECK-NEXT: movwne r1, #1 ; CHECK-NEXT: vneg.f16 s2, s0 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: vseleq.f16 s0, s0, s2 ; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: add sp, sp, #4 diff --git 
a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll --- a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll @@ -2380,41 +2380,51 @@ ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vmov r0, r2, d9 +; CHECK-NEXT: mov r2, r1 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: cmn r4, #-2147483647 ; CHECK-NEXT: mvn r3, #-2147483648 +; CHECK-NEXT: clz r7, r2 ; CHECK-NEXT: movlo r3, r4 ; CHECK-NEXT: mvn r5, #-2147483648 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mov r6, #0 ; CHECK-NEXT: movpl r4, r5 -; CHECK-NEXT: movpl r1, r6 -; CHECK-NEXT: moveq r4, r3 -; CHECK-NEXT: cmn r1, #1 +; CHECK-NEXT: lsrs r7, r7, #5 +; CHECK-NEXT: movne r4, r3 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: movpl r2, r6 +; CHECK-NEXT: cmn r2, #1 +; CHECK-NEXT: add r2, r2, #1 ; CHECK-NEXT: mov r3, #-2147483648 -; CHECK-NEXT: mov r7, #-2147483648 +; CHECK-NEXT: clz r2, r2 ; CHECK-NEXT: movgt r3, r4 +; CHECK-NEXT: mov r7, #-2147483648 ; CHECK-NEXT: cmp r4, #-2147483648 ; CHECK-NEXT: movls r4, r7 -; CHECK-NEXT: cmn r1, #1 -; CHECK-NEXT: movne r4, r3 -; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: lsrs r2, r2, #5 +; CHECK-NEXT: moveq r4, r3 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: cmn r0, #-2147483647 ; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: movlo r2, r0 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: movmi r5, r0 +; CHECK-NEXT: clz r0, r1 +; CHECK-NEXT: vmov.32 d0[0], r4 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: movne r5, r2 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: movmi r6, r1 -; CHECK-NEXT: moveq r5, r2 +; CHECK-NEXT: add r1, r6, #1 ; CHECK-NEXT: cmn r6, #1 ; CHECK-NEXT: mov r0, #-2147483648 -; CHECK-NEXT: vmov.32 d0[0], r4 +; CHECK-NEXT: clz r1, r1 ; CHECK-NEXT: movgt r0, r5 ; CHECK-NEXT: cmp r5, #-2147483648 ; CHECK-NEXT: movls r5, r7 -; CHECK-NEXT: cmn r6, #1 -; CHECK-NEXT: movne r5, r0 +; CHECK-NEXT: lsrs r1, r1, #5 +; CHECK-NEXT: moveq r5, r0 ; CHECK-NEXT: vmov.32 d0[1], r5 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc} @@ -2460,45 +2470,53 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i32_mm: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vorr q4, q0, q0 -; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: vmov r2, r12, d9 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvn r3, #0 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: vmov r2, r1, d8 +; CHECK-NEXT: clz r3, r4 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mvn r8, #0 ; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: movmi r3, r0 -; CHECK-NEXT: movpl r1, r5 -; CHECK-NEXT: moveq r3, r0 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: mvn r4, #0 -; CHECK-NEXT: movwgt r6, #1 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: movne r6, r3 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: moveq r6, r3 +; CHECK-NEXT: movmi r8, r0 +; CHECK-NEXT: lsrs r3, r3, #5 +; CHECK-NEXT: movne r8, r0 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: movpl r4, r5 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mov r7, #0 +; CHECK-NEXT: mvn r6, #0 +; CHECK-NEXT: movwgt r7, #1 +; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: movne r7, r8 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r12 ; CHECK-NEXT: bl __aeabi_d2lz +; CHECK-NEXT: clz r2, r1 +; CHECK-NEXT: cmp r1, #0 +; 
CHECK-NEXT: movmi r6, r0 +; CHECK-NEXT: lsrs r2, r2, #5 +; CHECK-NEXT: movne r6, r0 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov.32 d0[0], r6 -; CHECK-NEXT: movmi r4, r0 ; CHECK-NEXT: movpl r1, r5 -; CHECK-NEXT: moveq r4, r0 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: clz r0, r1 ; CHECK-NEXT: movwgt r5, #1 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: movne r5, r4 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: moveq r5, r4 -; CHECK-NEXT: vmov.32 d0[1], r5 +; CHECK-NEXT: movne r5, r6 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: movne r5, r6 +; CHECK-NEXT: clz r0, r4 +; CHECK-NEXT: vmov.32 d0[0], r5 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: movne r7, r8 +; CHECK-NEXT: vmov.32 d0[1], r7 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, pc} entry: %conv = fptosi <2 x double> %x to <2 x i64> %spec.store.select = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %conv, <2 x i64> ) @@ -2518,77 +2536,99 @@ ; CHECK-NEXT: mov r8, #-2147483648 ; CHECK-NEXT: mvn r7, #-2147483648 ; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r5, s18 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: cmn r0, #-2147483647 ; CHECK-NEXT: mvn r0, #-2147483648 -; CHECK-NEXT: mov r9, #0 +; CHECK-NEXT: clz r2, r1 ; CHECK-NEXT: movlo r0, r4 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: movpl r4, r7 +; CHECK-NEXT: lsrs r2, r2, #5 +; CHECK-NEXT: movne r4, r0 +; CHECK-NEXT: mov r9, #0 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mov r0, #-2147483648 ; CHECK-NEXT: movpl r1, r9 -; CHECK-NEXT: moveq r4, r0 ; CHECK-NEXT: cmn r1, #1 -; CHECK-NEXT: mov r0, #-2147483648 +; CHECK-NEXT: add r1, r1, #1 ; CHECK-NEXT: movgt r0, r4 +; CHECK-NEXT: clz r1, r1 ; CHECK-NEXT: cmp r4, #-2147483648 ; CHECK-NEXT: movls r4, r8 -; CHECK-NEXT: cmn r1, #1 -; CHECK-NEXT: movne r4, r0 +; CHECK-NEXT: lsrs r1, r1, #5 +; CHECK-NEXT: moveq r4, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: clz r2, r1 ; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: cmn r0, #-2147483647 ; CHECK-NEXT: mvn r0, #-2147483648 -; CHECK-NEXT: mov r2, #-2147483648 ; CHECK-NEXT: movlo r0, r5 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: lsr r2, r2, #5 ; CHECK-NEXT: movpl r5, r7 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: mov r2, #-2147483648 +; CHECK-NEXT: movne r5, r0 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: movpl r1, r9 -; CHECK-NEXT: moveq r5, r0 -; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: add r0, r1, #1 ; CHECK-NEXT: cmn r1, #1 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: movgt r2, r5 ; CHECK-NEXT: cmp r5, #-2147483648 ; CHECK-NEXT: movls r5, r8 -; CHECK-NEXT: cmn r1, #1 -; CHECK-NEXT: movne r5, r2 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: moveq r5, r2 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: cmn r0, #-2147483647 -; CHECK-NEXT: mvn r0, #-2147483648 -; CHECK-NEXT: mov r2, #-2147483648 -; CHECK-NEXT: movlo r0, r6 +; CHECK-NEXT: clz r0, r1 +; CHECK-NEXT: cmn r6, #-2147483647 +; CHECK-NEXT: mvn r2, #-2147483648 +; CHECK-NEXT: movlo r2, r6 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: lsr r0, r0, #5 ; CHECK-NEXT: movpl r6, r7 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: movne r6, r2 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: movpl r1, r9 -; CHECK-NEXT: moveq r6, r0 -; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: add r0, r1, #1 ; CHECK-NEXT: cmn r1, #1 +; CHECK-NEXT: mov r2, #-2147483648 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: movgt r2, r6 ; CHECK-NEXT: cmp r6, #-2147483648 ; CHECK-NEXT: movls r6, r8 -; CHECK-NEXT: cmn r1, #1 -; CHECK-NEXT: movne r6, r2 +; CHECK-NEXT: lsrs r0, 
r0, #5 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: moveq r6, r2 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: cmn r0, #-2147483647 ; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: movlo r2, r0 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: movmi r7, r0 +; CHECK-NEXT: clz r0, r1 +; CHECK-NEXT: vmov.32 d0[0], r6 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: movne r7, r2 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: movmi r9, r1 -; CHECK-NEXT: moveq r7, r2 +; CHECK-NEXT: add r1, r9, #1 +; CHECK-NEXT: vmov.32 d1[0], r5 ; CHECK-NEXT: cmn r9, #1 ; CHECK-NEXT: mov r0, #-2147483648 -; CHECK-NEXT: vmov.32 d1[0], r6 +; CHECK-NEXT: clz r1, r1 ; CHECK-NEXT: movgt r0, r7 ; CHECK-NEXT: cmp r7, #-2147483648 -; CHECK-NEXT: vmov.32 d0[0], r5 ; CHECK-NEXT: movls r7, r8 -; CHECK-NEXT: cmn r9, #1 +; CHECK-NEXT: lsrs r1, r1, #5 ; CHECK-NEXT: vmov.32 d1[1], r4 -; CHECK-NEXT: movne r7, r0 +; CHECK-NEXT: moveq r7, r0 ; CHECK-NEXT: vmov.32 d0[1], r7 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} @@ -2649,75 +2689,96 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i32_mm: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vorr q4, q0, q0 -; CHECK-NEXT: mvn r9, #0 ; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov r8, s18 ; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: clz r7, r1 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvn r2, #0 -; CHECK-NEXT: movmi r2, r0 +; CHECK-NEXT: mvn r3, #0 +; CHECK-NEXT: movmi r3, r0 +; CHECK-NEXT: lsrs r7, r7, #5 +; CHECK-NEXT: movne r3, r0 ; CHECK-NEXT: mov r7, #0 -; CHECK-NEXT: moveq r2, r0 -; CHECK-NEXT: movpl r1, r7 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r4, #0 -; CHECK-NEXT: movwgt r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: movne r4, r2 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: moveq r4, r2 -; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvn r2, #0 -; CHECK-NEXT: movmi r2, r0 +; CHECK-NEXT: mov r11, #0 ; CHECK-NEXT: movpl r1, r7 -; CHECK-NEXT: moveq r2, r0 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mov r5, #0 -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: movwgt r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: movne r5, r2 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: moveq r5, r2 +; CHECK-NEXT: clz r0, r1 +; CHECK-NEXT: movwgt r11, #1 +; CHECK-NEXT: cmp r11, #0 +; CHECK-NEXT: mvn r9, #0 +; CHECK-NEXT: movne r11, r3 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: vmov r8, s16 +; CHECK-NEXT: movne r11, r3 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: mvn r2, #0 -; CHECK-NEXT: movmi r2, r0 -; CHECK-NEXT: movpl r1, r7 -; CHECK-NEXT: moveq r2, r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: mvn r10, #0 +; CHECK-NEXT: movmi r10, r0 ; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: lsrs r1, r1, #5 +; CHECK-NEXT: movne r10, r0 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: movpl r4, r7 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: movwgt r6, #1 ; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: movne r6, r2 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: movne r6, r10 +; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: clz r2, r1 ; CHECK-NEXT: cmp r1, 
#0 -; CHECK-NEXT: moveq r6, r2 +; CHECK-NEXT: mvn r3, #0 +; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: lsr r2, r2, #5 +; CHECK-NEXT: movmi r3, r0 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: movne r3, r0 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: movpl r1, r7 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: clz r0, r1 +; CHECK-NEXT: movwgt r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: movne r5, r3 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: movne r5, r3 ; CHECK-NEXT: bl __aeabi_f2lz +; CHECK-NEXT: clz r2, r1 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov.32 d1[0], r6 ; CHECK-NEXT: movmi r9, r0 +; CHECK-NEXT: vmov.32 d0[0], r5 +; CHECK-NEXT: lsrs r2, r2, #5 +; CHECK-NEXT: movne r9, r0 +; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: movpl r1, r7 -; CHECK-NEXT: moveq r9, r0 ; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: clz r0, r1 ; CHECK-NEXT: movwgt r7, #1 ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: vmov.32 d0[0], r5 ; CHECK-NEXT: movne r7, r9 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: vmov.32 d1[1], r4 -; CHECK-NEXT: moveq r7, r9 -; CHECK-NEXT: vmov.32 d0[1], r7 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: movne r7, r9 +; CHECK-NEXT: clz r0, r4 +; CHECK-NEXT: vmov.32 d1[0], r7 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: movne r6, r10 +; CHECK-NEXT: vmov.32 d1[1], r11 +; CHECK-NEXT: vmov.32 d0[1], r6 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-NEXT: add sp, sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %conv = fptosi <4 x float> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -2734,66 +2795,83 @@ ; CHECK-NEON-NEXT: .vsave {d8, d9, d10} ; CHECK-NEON-NEXT: vpush {d8, d9, d10} ; CHECK-NEON-NEXT: vmov r0, s3 -; CHECK-NEON-NEXT: vmov.f32 s18, s2 +; CHECK-NEON-NEXT: vmov.f32 s20, s2 ; CHECK-NEON-NEXT: vmov.f32 s16, s1 -; CHECK-NEON-NEXT: vmov.f32 s20, s0 +; CHECK-NEON-NEXT: vmov.f32 s18, s0 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz ; CHECK-NEON-NEXT: mov r4, r0 ; CHECK-NEON-NEXT: vmov r0, s20 ; CHECK-NEON-NEXT: cmn r4, #-2147483647 ; CHECK-NEON-NEXT: mvn r2, #-2147483648 +; CHECK-NEON-NEXT: clz r3, r1 ; CHECK-NEON-NEXT: movlo r2, r4 ; CHECK-NEON-NEXT: mvn r7, #-2147483648 ; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: mov r9, #0 ; CHECK-NEON-NEXT: movpl r4, r7 +; CHECK-NEON-NEXT: lsrs r3, r3, #5 +; CHECK-NEON-NEXT: movne r4, r2 +; CHECK-NEON-NEXT: mov r9, #0 +; CHECK-NEON-NEXT: cmp r1, #0 +; CHECK-NEON-NEXT: mov r2, #-2147483648 ; CHECK-NEON-NEXT: movpl r1, r9 -; CHECK-NEON-NEXT: moveq r4, r2 ; CHECK-NEON-NEXT: cmn r1, #1 -; CHECK-NEON-NEXT: mov r2, #-2147483648 -; CHECK-NEON-NEXT: mov r8, #-2147483648 +; CHECK-NEON-NEXT: add r1, r1, #1 ; CHECK-NEON-NEXT: movgt r2, r4 +; CHECK-NEON-NEXT: clz r1, r1 +; CHECK-NEON-NEXT: mov r8, #-2147483648 ; CHECK-NEON-NEXT: cmp r4, #-2147483648 ; CHECK-NEON-NEXT: movls r4, r8 -; CHECK-NEON-NEXT: cmn r1, #1 -; CHECK-NEON-NEXT: movne r4, r2 +; CHECK-NEON-NEXT: lsrs r1, r1, #5 +; CHECK-NEON-NEXT: moveq r4, r2 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz +; CHECK-NEON-NEXT: clz r2, r1 ; CHECK-NEON-NEXT: mov r5, r0 ; CHECK-NEON-NEXT: cmn r0, #-2147483647 ; CHECK-NEON-NEXT: mvn r0, #-2147483648 -; CHECK-NEON-NEXT: mov r2, #-2147483648 ; CHECK-NEON-NEXT: movlo r0, r5 ; CHECK-NEON-NEXT: cmp r1, #0 +; CHECK-NEON-NEXT: lsr r2, r2, #5 ; CHECK-NEON-NEXT: movpl r5, r7 +; CHECK-NEON-NEXT: cmp r2, #0 +; CHECK-NEON-NEXT: mov r2, #-2147483648 +; CHECK-NEON-NEXT: movne r5, r0 +; CHECK-NEON-NEXT: cmp r1, #0 ; 
CHECK-NEON-NEXT: movpl r1, r9 -; CHECK-NEON-NEXT: moveq r5, r0 -; CHECK-NEON-NEXT: vmov r0, s18 +; CHECK-NEON-NEXT: add r0, r1, #1 ; CHECK-NEON-NEXT: cmn r1, #1 +; CHECK-NEON-NEXT: clz r0, r0 ; CHECK-NEON-NEXT: movgt r2, r5 ; CHECK-NEON-NEXT: cmp r5, #-2147483648 ; CHECK-NEON-NEXT: movls r5, r8 -; CHECK-NEON-NEXT: cmn r1, #1 -; CHECK-NEON-NEXT: movne r5, r2 +; CHECK-NEON-NEXT: lsrs r0, r0, #5 +; CHECK-NEON-NEXT: vmov r0, s18 +; CHECK-NEON-NEXT: moveq r5, r2 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz ; CHECK-NEON-NEXT: mov r6, r0 -; CHECK-NEON-NEXT: cmn r0, #-2147483647 -; CHECK-NEON-NEXT: mvn r0, #-2147483648 -; CHECK-NEON-NEXT: mov r2, #-2147483648 -; CHECK-NEON-NEXT: movlo r0, r6 +; CHECK-NEON-NEXT: clz r0, r1 +; CHECK-NEON-NEXT: cmn r6, #-2147483647 +; CHECK-NEON-NEXT: mvn r2, #-2147483648 +; CHECK-NEON-NEXT: movlo r2, r6 ; CHECK-NEON-NEXT: cmp r1, #0 +; CHECK-NEON-NEXT: lsr r0, r0, #5 ; CHECK-NEON-NEXT: movpl r6, r7 +; CHECK-NEON-NEXT: cmp r0, #0 +; CHECK-NEON-NEXT: movne r6, r2 +; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: movpl r1, r9 -; CHECK-NEON-NEXT: moveq r6, r0 -; CHECK-NEON-NEXT: vmov r0, s16 +; CHECK-NEON-NEXT: add r0, r1, #1 ; CHECK-NEON-NEXT: cmn r1, #1 +; CHECK-NEON-NEXT: mov r2, #-2147483648 +; CHECK-NEON-NEXT: clz r0, r0 ; CHECK-NEON-NEXT: movgt r2, r6 ; CHECK-NEON-NEXT: cmp r6, #-2147483648 ; CHECK-NEON-NEXT: movls r6, r8 -; CHECK-NEON-NEXT: cmn r1, #1 -; CHECK-NEON-NEXT: movne r6, r2 +; CHECK-NEON-NEXT: lsrs r0, r0, #5 +; CHECK-NEON-NEXT: vmov r0, s16 +; CHECK-NEON-NEXT: moveq r6, r2 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz ; CHECK-NEON-NEXT: cmn r0, #-2147483647 @@ -2801,18 +2879,23 @@ ; CHECK-NEON-NEXT: movlo r2, r0 ; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: movmi r7, r0 +; CHECK-NEON-NEXT: clz r0, r1 +; CHECK-NEON-NEXT: vmov.32 d0[0], r6 +; CHECK-NEON-NEXT: lsrs r0, r0, #5 +; CHECK-NEON-NEXT: movne r7, r2 +; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: movmi r9, r1 -; CHECK-NEON-NEXT: moveq r7, r2 +; CHECK-NEON-NEXT: add r1, r9, #1 +; CHECK-NEON-NEXT: vmov.32 d1[0], r5 ; CHECK-NEON-NEXT: cmn r9, #1 ; CHECK-NEON-NEXT: mov r0, #-2147483648 -; CHECK-NEON-NEXT: vmov.32 d1[0], r6 +; CHECK-NEON-NEXT: clz r1, r1 ; CHECK-NEON-NEXT: movgt r0, r7 ; CHECK-NEON-NEXT: cmp r7, #-2147483648 -; CHECK-NEON-NEXT: vmov.32 d0[0], r5 ; CHECK-NEON-NEXT: movls r7, r8 -; CHECK-NEON-NEXT: cmn r9, #1 +; CHECK-NEON-NEXT: lsrs r1, r1, #5 ; CHECK-NEON-NEXT: vmov.32 d1[1], r4 -; CHECK-NEON-NEXT: movne r7, r0 +; CHECK-NEON-NEXT: moveq r7, r0 ; CHECK-NEON-NEXT: vmov.32 d0[1], r7 ; CHECK-NEON-NEXT: vpop {d8, d9, d10} ; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} @@ -2825,83 +2908,105 @@ ; CHECK-FP16-NEXT: vpush {d8, d9} ; CHECK-FP16-NEXT: vmov.u16 r0, d0[3] ; CHECK-FP16-NEXT: vorr d8, d0, d0 +; CHECK-FP16-NEXT: vmov.u16 r5, d0[2] ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi ; CHECK-FP16-NEXT: mov r4, r0 -; CHECK-FP16-NEXT: vmov.u16 r0, d8[2] -; CHECK-FP16-NEXT: vmov.u16 r2, d8[0] +; CHECK-FP16-NEXT: vmov.u16 r0, d8[0] ; CHECK-FP16-NEXT: cmn r4, #-2147483647 +; CHECK-FP16-NEXT: clz r2, r1 ; CHECK-FP16-NEXT: mvn r7, #-2147483648 ; CHECK-FP16-NEXT: mov r9, #0 +; CHECK-FP16-NEXT: vmov s0, r5 ; CHECK-FP16-NEXT: mov r8, #-2147483648 ; CHECK-FP16-NEXT: vmov s18, r0 ; CHECK-FP16-NEXT: mvn r0, #-2147483648 ; CHECK-FP16-NEXT: movlo r0, r4 ; CHECK-FP16-NEXT: cmp r1, #0 ; CHECK-FP16-NEXT: movpl r4, r7 +; CHECK-FP16-NEXT: lsrs r2, r2, #5 +; CHECK-FP16-NEXT: movne r4, r0 +; CHECK-FP16-NEXT: cmp r1, #0 ; 
CHECK-FP16-NEXT: movpl r1, r9 -; CHECK-FP16-NEXT: moveq r4, r0 ; CHECK-FP16-NEXT: cmn r1, #1 +; CHECK-FP16-NEXT: add r1, r1, #1 ; CHECK-FP16-NEXT: mov r0, #-2147483648 -; CHECK-FP16-NEXT: vmov s0, r2 +; CHECK-FP16-NEXT: clz r1, r1 ; CHECK-FP16-NEXT: movgt r0, r4 ; CHECK-FP16-NEXT: cmp r4, #-2147483648 ; CHECK-FP16-NEXT: movls r4, r8 -; CHECK-FP16-NEXT: cmn r1, #1 -; CHECK-FP16-NEXT: movne r4, r0 +; CHECK-FP16-NEXT: lsrs r1, r1, #5 +; CHECK-FP16-NEXT: moveq r4, r0 ; CHECK-FP16-NEXT: bl __fixhfdi -; CHECK-FP16-NEXT: vmov.f32 s0, s18 +; CHECK-FP16-NEXT: clz r2, r1 ; CHECK-FP16-NEXT: mov r5, r0 ; CHECK-FP16-NEXT: cmn r0, #-2147483647 ; CHECK-FP16-NEXT: mvn r0, #-2147483648 ; CHECK-FP16-NEXT: movlo r0, r5 ; CHECK-FP16-NEXT: cmp r1, #0 +; CHECK-FP16-NEXT: vmov.f32 s0, s18 +; CHECK-FP16-NEXT: lsr r2, r2, #5 ; CHECK-FP16-NEXT: movpl r5, r7 +; CHECK-FP16-NEXT: cmp r2, #0 +; CHECK-FP16-NEXT: movne r5, r0 +; CHECK-FP16-NEXT: cmp r1, #0 ; CHECK-FP16-NEXT: movpl r1, r9 -; CHECK-FP16-NEXT: moveq r5, r0 ; CHECK-FP16-NEXT: cmn r1, #1 +; CHECK-FP16-NEXT: add r1, r1, #1 ; CHECK-FP16-NEXT: mov r0, #-2147483648 +; CHECK-FP16-NEXT: clz r1, r1 ; CHECK-FP16-NEXT: movgt r0, r5 ; CHECK-FP16-NEXT: cmp r5, #-2147483648 ; CHECK-FP16-NEXT: movls r5, r8 -; CHECK-FP16-NEXT: cmn r1, #1 -; CHECK-FP16-NEXT: movne r5, r0 +; CHECK-FP16-NEXT: lsrs r1, r1, #5 +; CHECK-FP16-NEXT: moveq r5, r0 ; CHECK-FP16-NEXT: bl __fixhfdi -; CHECK-FP16-NEXT: vmov.u16 r2, d8[1] ; CHECK-FP16-NEXT: mov r6, r0 -; CHECK-FP16-NEXT: cmn r0, #-2147483647 -; CHECK-FP16-NEXT: mvn r0, #-2147483648 -; CHECK-FP16-NEXT: movlo r0, r6 +; CHECK-FP16-NEXT: clz r0, r1 +; CHECK-FP16-NEXT: cmn r6, #-2147483647 +; CHECK-FP16-NEXT: mvn r2, #-2147483648 +; CHECK-FP16-NEXT: movlo r2, r6 ; CHECK-FP16-NEXT: cmp r1, #0 +; CHECK-FP16-NEXT: lsr r0, r0, #5 ; CHECK-FP16-NEXT: movpl r6, r7 +; CHECK-FP16-NEXT: cmp r0, #0 +; CHECK-FP16-NEXT: mov r0, #-2147483648 +; CHECK-FP16-NEXT: movne r6, r2 +; CHECK-FP16-NEXT: cmp r1, #0 ; CHECK-FP16-NEXT: movpl r1, r9 -; CHECK-FP16-NEXT: moveq r6, r0 ; CHECK-FP16-NEXT: cmn r1, #1 -; CHECK-FP16-NEXT: mov r0, #-2147483648 +; CHECK-FP16-NEXT: add r1, r1, #1 ; CHECK-FP16-NEXT: movgt r0, r6 +; CHECK-FP16-NEXT: clz r1, r1 ; CHECK-FP16-NEXT: cmp r6, #-2147483648 ; CHECK-FP16-NEXT: movls r6, r8 -; CHECK-FP16-NEXT: cmn r1, #1 -; CHECK-FP16-NEXT: movne r6, r0 -; CHECK-FP16-NEXT: vmov s0, r2 +; CHECK-FP16-NEXT: lsrs r1, r1, #5 +; CHECK-FP16-NEXT: vmov.u16 r1, d8[1] +; CHECK-FP16-NEXT: moveq r6, r0 +; CHECK-FP16-NEXT: vmov s0, r1 ; CHECK-FP16-NEXT: bl __fixhfdi ; CHECK-FP16-NEXT: cmn r0, #-2147483647 ; CHECK-FP16-NEXT: mvn r2, #-2147483648 ; CHECK-FP16-NEXT: movlo r2, r0 ; CHECK-FP16-NEXT: cmp r1, #0 ; CHECK-FP16-NEXT: movmi r7, r0 +; CHECK-FP16-NEXT: clz r0, r1 +; CHECK-FP16-NEXT: vmov.32 d0[0], r6 +; CHECK-FP16-NEXT: lsrs r0, r0, #5 +; CHECK-FP16-NEXT: movne r7, r2 +; CHECK-FP16-NEXT: cmp r1, #0 ; CHECK-FP16-NEXT: movmi r9, r1 -; CHECK-FP16-NEXT: moveq r7, r2 +; CHECK-FP16-NEXT: add r1, r9, #1 +; CHECK-FP16-NEXT: vmov.32 d1[0], r5 ; CHECK-FP16-NEXT: cmn r9, #1 ; CHECK-FP16-NEXT: mov r0, #-2147483648 -; CHECK-FP16-NEXT: vmov.32 d1[0], r6 +; CHECK-FP16-NEXT: clz r1, r1 ; CHECK-FP16-NEXT: movgt r0, r7 ; CHECK-FP16-NEXT: cmp r7, #-2147483648 -; CHECK-FP16-NEXT: vmov.32 d0[0], r5 ; CHECK-FP16-NEXT: movls r7, r8 -; CHECK-FP16-NEXT: cmn r9, #1 +; CHECK-FP16-NEXT: lsrs r1, r1, #5 ; CHECK-FP16-NEXT: vmov.32 d1[1], r4 -; CHECK-FP16-NEXT: movne r7, r0 +; CHECK-FP16-NEXT: moveq r7, r0 ; CHECK-FP16-NEXT: vmov.32 d0[1], r7 ; CHECK-FP16-NEXT: vpop {d8, d9} ; 
CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} @@ -3011,157 +3116,195 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NEON-LABEL: ustest_f16i32_mm: ; CHECK-NEON: @ %bb.0: @ %entry -; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEON-NEXT: .pad #4 +; CHECK-NEON-NEXT: sub sp, sp, #4 ; CHECK-NEON-NEXT: .vsave {d8, d9, d10} ; CHECK-NEON-NEXT: vpush {d8, d9, d10} ; CHECK-NEON-NEXT: vmov r0, s3 -; CHECK-NEON-NEXT: vmov.f32 s18, s2 -; CHECK-NEON-NEXT: vmov.f32 s16, s1 +; CHECK-NEON-NEXT: vmov.f32 s16, s2 +; CHECK-NEON-NEXT: vmov.f32 s18, s1 ; CHECK-NEON-NEXT: vmov.f32 s20, s0 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz -; CHECK-NEON-NEXT: vmov r2, s20 +; CHECK-NEON-NEXT: vmov r2, s18 +; CHECK-NEON-NEXT: clz r7, r1 ; CHECK-NEON-NEXT: cmp r1, #0 ; CHECK-NEON-NEXT: mvn r3, #0 -; CHECK-NEON-NEXT: mov r6, #0 ; CHECK-NEON-NEXT: movmi r3, r0 -; CHECK-NEON-NEXT: movpl r1, r6 -; CHECK-NEON-NEXT: moveq r3, r0 -; CHECK-NEON-NEXT: cmp r1, #0 +; CHECK-NEON-NEXT: lsrs r7, r7, #5 +; CHECK-NEON-NEXT: movne r3, r0 ; CHECK-NEON-NEXT: mov r7, #0 -; CHECK-NEON-NEXT: vmov r8, s18 -; CHECK-NEON-NEXT: movwgt r7, #1 -; CHECK-NEON-NEXT: cmp r7, #0 -; CHECK-NEON-NEXT: movne r7, r3 ; CHECK-NEON-NEXT: cmp r1, #0 +; CHECK-NEON-NEXT: mov r11, #0 +; CHECK-NEON-NEXT: movpl r1, r7 +; CHECK-NEON-NEXT: cmp r1, #0 +; CHECK-NEON-NEXT: clz r0, r1 +; CHECK-NEON-NEXT: movwgt r11, #1 +; CHECK-NEON-NEXT: cmp r11, #0 ; CHECK-NEON-NEXT: mvn r9, #0 -; CHECK-NEON-NEXT: moveq r7, r3 +; CHECK-NEON-NEXT: movne r11, r3 +; CHECK-NEON-NEXT: lsrs r0, r0, #5 +; CHECK-NEON-NEXT: vmov r8, s20 +; CHECK-NEON-NEXT: movne r11, r3 ; CHECK-NEON-NEXT: mov r0, r2 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz +; CHECK-NEON-NEXT: mov r4, r1 ; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: mvn r2, #0 -; CHECK-NEON-NEXT: movmi r2, r0 -; CHECK-NEON-NEXT: movpl r1, r6 -; CHECK-NEON-NEXT: moveq r2, r0 -; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: mov r4, #0 -; CHECK-NEON-NEXT: mov r0, r8 -; CHECK-NEON-NEXT: movwgt r4, #1 +; CHECK-NEON-NEXT: clz r1, r1 +; CHECK-NEON-NEXT: mvn r10, #0 +; CHECK-NEON-NEXT: movmi r10, r0 +; CHECK-NEON-NEXT: mov r6, #0 +; CHECK-NEON-NEXT: lsrs r1, r1, #5 +; CHECK-NEON-NEXT: movne r10, r0 ; CHECK-NEON-NEXT: cmp r4, #0 -; CHECK-NEON-NEXT: movne r4, r2 -; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: moveq r4, r2 +; CHECK-NEON-NEXT: movpl r4, r7 +; CHECK-NEON-NEXT: cmp r4, #0 +; CHECK-NEON-NEXT: movwgt r6, #1 +; CHECK-NEON-NEXT: cmp r6, #0 +; CHECK-NEON-NEXT: mov r0, r8 +; CHECK-NEON-NEXT: movne r6, r10 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz +; CHECK-NEON-NEXT: clz r2, r1 ; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: mvn r2, #0 -; CHECK-NEON-NEXT: movmi r2, r0 -; CHECK-NEON-NEXT: movpl r1, r6 -; CHECK-NEON-NEXT: moveq r2, r0 -; CHECK-NEON-NEXT: vmov r0, s16 -; CHECK-NEON-NEXT: cmp r1, #0 +; CHECK-NEON-NEXT: mvn r3, #0 ; CHECK-NEON-NEXT: mov r5, #0 +; CHECK-NEON-NEXT: lsr r2, r2, #5 +; CHECK-NEON-NEXT: movmi r3, r0 +; CHECK-NEON-NEXT: cmp r2, #0 +; CHECK-NEON-NEXT: movne r3, r0 +; CHECK-NEON-NEXT: cmp r1, #0 +; CHECK-NEON-NEXT: movpl r1, r7 +; CHECK-NEON-NEXT: cmp r1, #0 +; CHECK-NEON-NEXT: clz r0, r1 ; CHECK-NEON-NEXT: movwgt r5, #1 ; CHECK-NEON-NEXT: cmp r5, #0 -; CHECK-NEON-NEXT: movne r5, r2 -; CHECK-NEON-NEXT: cmp r1, #0 
-; CHECK-NEON-NEXT: moveq r5, r2 +; CHECK-NEON-NEXT: movne r5, r3 +; CHECK-NEON-NEXT: lsrs r0, r0, #5 +; CHECK-NEON-NEXT: vmov r0, s16 +; CHECK-NEON-NEXT: movne r5, r3 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz +; CHECK-NEON-NEXT: clz r2, r1 ; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: vmov.32 d1[0], r5 ; CHECK-NEON-NEXT: movmi r9, r0 -; CHECK-NEON-NEXT: movpl r1, r6 -; CHECK-NEON-NEXT: moveq r9, r0 +; CHECK-NEON-NEXT: vmov.32 d0[0], r5 +; CHECK-NEON-NEXT: lsrs r2, r2, #5 +; CHECK-NEON-NEXT: movne r9, r0 ; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: movwgt r6, #1 -; CHECK-NEON-NEXT: cmp r6, #0 -; CHECK-NEON-NEXT: vmov.32 d0[0], r4 -; CHECK-NEON-NEXT: movne r6, r9 +; CHECK-NEON-NEXT: movpl r1, r7 ; CHECK-NEON-NEXT: cmp r1, #0 -; CHECK-NEON-NEXT: vmov.32 d1[1], r7 -; CHECK-NEON-NEXT: moveq r6, r9 +; CHECK-NEON-NEXT: clz r0, r1 +; CHECK-NEON-NEXT: movwgt r7, #1 +; CHECK-NEON-NEXT: cmp r7, #0 +; CHECK-NEON-NEXT: movne r7, r9 +; CHECK-NEON-NEXT: lsrs r0, r0, #5 +; CHECK-NEON-NEXT: movne r7, r9 +; CHECK-NEON-NEXT: clz r0, r4 +; CHECK-NEON-NEXT: vmov.32 d1[0], r7 +; CHECK-NEON-NEXT: lsrs r0, r0, #5 +; CHECK-NEON-NEXT: movne r6, r10 +; CHECK-NEON-NEXT: vmov.32 d1[1], r11 ; CHECK-NEON-NEXT: vmov.32 d0[1], r6 ; CHECK-NEON-NEXT: vpop {d8, d9, d10} -; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-NEON-NEXT: add sp, sp, #4 +; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; CHECK-FP16-LABEL: ustest_f16i32_mm: ; CHECK-FP16: @ %bb.0: @ %entry -; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, lr} +; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-FP16-NEXT: .vsave {d8, d9} ; CHECK-FP16-NEXT: vpush {d8, d9} ; CHECK-FP16-NEXT: vmov.u16 r0, d0[3] ; CHECK-FP16-NEXT: vorr d8, d0, d0 ; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi -; CHECK-FP16-NEXT: vmov.u16 r2, d8[1] +; CHECK-FP16-NEXT: vmov.u16 r3, d8[1] ; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: vmov.u16 r7, d8[0] -; CHECK-FP16-NEXT: mov r5, #0 -; CHECK-FP16-NEXT: vmov.u16 r3, d8[2] -; CHECK-FP16-NEXT: movpl r1, r5 +; CHECK-FP16-NEXT: vmov.u16 r2, d8[0] ; CHECK-FP16-NEXT: mov r6, #0 +; CHECK-FP16-NEXT: mov r10, #0 ; CHECK-FP16-NEXT: mvn r8, #0 -; CHECK-FP16-NEXT: vmov s16, r2 +; CHECK-FP16-NEXT: vmov s0, r3 +; CHECK-FP16-NEXT: clz r3, r1 +; CHECK-FP16-NEXT: vmov s18, r2 ; CHECK-FP16-NEXT: mvn r2, #0 ; CHECK-FP16-NEXT: movmi r2, r0 -; CHECK-FP16-NEXT: vmov s0, r7 -; CHECK-FP16-NEXT: moveq r2, r0 +; CHECK-FP16-NEXT: lsrs r3, r3, #5 +; CHECK-FP16-NEXT: movne r2, r0 ; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: movwgt r6, #1 -; CHECK-FP16-NEXT: cmp r6, #0 -; CHECK-FP16-NEXT: movne r6, r2 +; CHECK-FP16-NEXT: movpl r1, r6 ; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: vmov s18, r3 -; CHECK-FP16-NEXT: moveq r6, r2 +; CHECK-FP16-NEXT: clz r0, r1 +; CHECK-FP16-NEXT: movwgt r10, #1 +; CHECK-FP16-NEXT: cmp r10, #0 +; CHECK-FP16-NEXT: movne r10, r2 +; CHECK-FP16-NEXT: lsrs r0, r0, #5 +; CHECK-FP16-NEXT: movne r10, r2 ; CHECK-FP16-NEXT: bl __fixhfdi ; CHECK-FP16-NEXT: vmov.f32 s0, s18 +; CHECK-FP16-NEXT: mov r4, r1 ; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: mvn r2, #0 -; CHECK-FP16-NEXT: movpl r1, r5 -; CHECK-FP16-NEXT: movmi r2, r0 -; CHECK-FP16-NEXT: mov r7, #0 -; CHECK-FP16-NEXT: moveq r2, r0 -; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: movwgt r7, #1 -; CHECK-FP16-NEXT: cmp r7, #0 -; CHECK-FP16-NEXT: movne r7, r2 -; CHECK-FP16-NEXT: cmp 
r1, #0 -; CHECK-FP16-NEXT: moveq r7, r2 +; CHECK-FP16-NEXT: clz r1, r1 +; CHECK-FP16-NEXT: mvn r9, #0 +; CHECK-FP16-NEXT: mov r5, #0 +; CHECK-FP16-NEXT: movmi r9, r0 +; CHECK-FP16-NEXT: lsrs r1, r1, #5 +; CHECK-FP16-NEXT: movne r9, r0 +; CHECK-FP16-NEXT: cmp r4, #0 +; CHECK-FP16-NEXT: movpl r4, r6 +; CHECK-FP16-NEXT: cmp r4, #0 +; CHECK-FP16-NEXT: movwgt r5, #1 +; CHECK-FP16-NEXT: cmp r5, #0 +; CHECK-FP16-NEXT: movne r5, r9 ; CHECK-FP16-NEXT: bl __fixhfdi -; CHECK-FP16-NEXT: vmov.f32 s0, s16 +; CHECK-FP16-NEXT: clz r2, r1 ; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: mvn r2, #0 -; CHECK-FP16-NEXT: movpl r1, r5 -; CHECK-FP16-NEXT: movmi r2, r0 -; CHECK-FP16-NEXT: mov r4, #0 -; CHECK-FP16-NEXT: moveq r2, r0 +; CHECK-FP16-NEXT: mvn r3, #0 +; CHECK-FP16-NEXT: mov r7, #0 +; CHECK-FP16-NEXT: lsr r2, r2, #5 +; CHECK-FP16-NEXT: movmi r3, r0 +; CHECK-FP16-NEXT: cmp r2, #0 +; CHECK-FP16-NEXT: movne r3, r0 ; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: movwgt r4, #1 -; CHECK-FP16-NEXT: cmp r4, #0 -; CHECK-FP16-NEXT: movne r4, r2 +; CHECK-FP16-NEXT: movpl r1, r6 ; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: moveq r4, r2 +; CHECK-FP16-NEXT: clz r0, r1 +; CHECK-FP16-NEXT: movwgt r7, #1 +; CHECK-FP16-NEXT: cmp r7, #0 +; CHECK-FP16-NEXT: movne r7, r3 +; CHECK-FP16-NEXT: lsrs r0, r0, #5 +; CHECK-FP16-NEXT: vmov.u16 r0, d8[2] +; CHECK-FP16-NEXT: movne r7, r3 +; CHECK-FP16-NEXT: vmov s0, r0 ; CHECK-FP16-NEXT: bl __fixhfdi +; CHECK-FP16-NEXT: clz r2, r1 ; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: vmov.32 d1[0], r4 ; CHECK-FP16-NEXT: movmi r8, r0 -; CHECK-FP16-NEXT: movpl r1, r5 -; CHECK-FP16-NEXT: moveq r8, r0 -; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: movwgt r5, #1 -; CHECK-FP16-NEXT: cmp r5, #0 ; CHECK-FP16-NEXT: vmov.32 d0[0], r7 -; CHECK-FP16-NEXT: movne r5, r8 +; CHECK-FP16-NEXT: lsrs r2, r2, #5 +; CHECK-FP16-NEXT: movne r8, r0 ; CHECK-FP16-NEXT: cmp r1, #0 -; CHECK-FP16-NEXT: vmov.32 d1[1], r6 -; CHECK-FP16-NEXT: moveq r5, r8 +; CHECK-FP16-NEXT: movpl r1, r6 +; CHECK-FP16-NEXT: cmp r1, #0 +; CHECK-FP16-NEXT: clz r0, r1 +; CHECK-FP16-NEXT: movwgt r6, #1 +; CHECK-FP16-NEXT: cmp r6, #0 +; CHECK-FP16-NEXT: movne r6, r8 +; CHECK-FP16-NEXT: lsrs r0, r0, #5 +; CHECK-FP16-NEXT: movne r6, r8 +; CHECK-FP16-NEXT: clz r0, r4 +; CHECK-FP16-NEXT: vmov.32 d1[0], r6 +; CHECK-FP16-NEXT: lsrs r0, r0, #5 +; CHECK-FP16-NEXT: movne r5, r9 +; CHECK-FP16-NEXT: vmov.32 d1[1], r10 ; CHECK-FP16-NEXT: vmov.32 d0[1], r5 ; CHECK-FP16-NEXT: vpop {d8, d9} -; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, pc} +; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc} entry: %conv = fptosi <4 x half> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) diff --git a/llvm/test/CodeGen/ARM/func-argpassing-endian.ll b/llvm/test/CodeGen/ARM/func-argpassing-endian.ll --- a/llvm/test/CodeGen/ARM/func-argpassing-endian.ll +++ b/llvm/test/CodeGen/ARM/func-argpassing-endian.ll @@ -36,22 +36,12 @@ } define void @arg_v4i32(<4 x i32> %vec ) { -; CHECK-LE-LABEL: arg_v4i32: -; CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: vmov d16, r0, r1 -; CHECK-LE-NEXT: movw r0, :lower16:var32 -; CHECK-LE-NEXT: movt r0, :upper16:var32 -; CHECK-LE-NEXT: vst1.32 {d16[0]}, [r0:32] -; CHECK-LE-NEXT: bx lr -; -; CHECK-BE-LABEL: arg_v4i32: -; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: vmov d16, r1, r0 -; CHECK-BE-NEXT: movw r0, :lower16:var32 -; CHECK-BE-NEXT: movt r0, :upper16:var32 -; CHECK-BE-NEXT: vrev64.32 q8, q8 -; CHECK-BE-NEXT: vst1.32 {d16[0]}, [r0:32] -; CHECK-BE-NEXT: bx lr +; CHECK-LABEL: arg_v4i32: +; CHECK: 
@ %bb.0: +; CHECK-NEXT: movw r1, :lower16:var32 +; CHECK-NEXT: movt r1, :upper16:var32 +; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: bx lr %tmp = extractelement <4 x i32> %vec, i32 0 store i32 %tmp, ptr @var32 ret void @@ -100,13 +90,35 @@ } define <4 x i32> @return_v4i32() { -; CHECK-LABEL: return_v4i32: -; CHECK: @ %bb.0: -; CHECK-NEXT: mov r0, #42 -; CHECK-NEXT: mov r1, #43 -; CHECK-NEXT: mov r2, #44 -; CHECK-NEXT: mov r3, #45 -; CHECK-NEXT: bx lr +; CHECK-LE-LABEL: return_v4i32: +; CHECK-LE: @ %bb.0: +; CHECK-LE-NEXT: adr r0, .LCPI6_0 +; CHECK-LE-NEXT: vld1.64 {d16, d17}, [r0:128] +; CHECK-LE-NEXT: vmov r0, r1, d16 +; CHECK-LE-NEXT: vmov r2, r3, d17 +; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: .p2align 4 +; CHECK-LE-NEXT: @ %bb.1: +; CHECK-LE-NEXT: .LCPI6_0: +; CHECK-LE-NEXT: .long 42 @ double 9.1245819032257467E-313 +; CHECK-LE-NEXT: .long 43 +; CHECK-LE-NEXT: .long 44 @ double 9.5489810615176143E-313 +; CHECK-LE-NEXT: .long 45 +; +; CHECK-BE-LABEL: return_v4i32: +; CHECK-BE: @ %bb.0: +; CHECK-BE-NEXT: adr r0, .LCPI6_0 +; CHECK-BE-NEXT: vld1.64 {d16, d17}, [r0:128] +; CHECK-BE-NEXT: vmov r1, r0, d16 +; CHECK-BE-NEXT: vmov r3, r2, d17 +; CHECK-BE-NEXT: bx lr +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: @ %bb.1: +; CHECK-BE-NEXT: .LCPI6_0: +; CHECK-BE-NEXT: .long 42 @ double 8.912382324178626E-313 +; CHECK-BE-NEXT: .long 43 +; CHECK-BE-NEXT: .long 44 @ double 9.3367814824704935E-313 +; CHECK-BE-NEXT: .long 45 ret < 4 x i32> < i32 42, i32 43, i32 44, i32 45 > } diff --git a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll --- a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll @@ -71,12 +71,13 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ands r3, r2, #32 +; CHECK-NEXT: ubfx r3, r2, #5, #1 ; CHECK-NEXT: and r12, r2, #31 +; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: mov r3, r0 -; CHECK-NEXT: mov r4, #31 ; CHECK-NEXT: movne r3, r1 ; CHECK-NEXT: movne r1, r0 +; CHECK-NEXT: mov r4, #31 ; CHECK-NEXT: bic r2, r4, r2 ; CHECK-NEXT: lsl lr, r3, r12 ; CHECK-NEXT: lsr r0, r1, #1 diff --git a/llvm/test/CodeGen/ARM/funnel-shift.ll b/llvm/test/CodeGen/ARM/funnel-shift.ll --- a/llvm/test/CodeGen/ARM/funnel-shift.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift.ll @@ -49,24 +49,25 @@ ; SCALAR: @ %bb.0: ; SCALAR-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} ; SCALAR-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr} -; SCALAR-NEXT: mov r8, r0 -; SCALAR-NEXT: ldr r0, [sp, #36] ; SCALAR-NEXT: mov r4, r1 -; SCALAR-NEXT: mov r6, r3 -; SCALAR-NEXT: and r1, r0, #31 +; SCALAR-NEXT: ldr r1, [sp, #36] +; SCALAR-NEXT: mov r8, r0 ; SCALAR-NEXT: ldr r0, [sp, #32] +; SCALAR-NEXT: and r1, r1, #31 +; SCALAR-NEXT: mov r6, r3 ; SCALAR-NEXT: mov r9, r2 ; SCALAR-NEXT: mov r2, #37 ; SCALAR-NEXT: mov r3, #0 ; SCALAR-NEXT: bl __aeabi_uldivmod ; SCALAR-NEXT: lsl r1, r6, #27 -; SCALAR-NEXT: ands r0, r2, #32 +; SCALAR-NEXT: ubfx r0, r2, #5, #1 ; SCALAR-NEXT: orr r1, r1, r9, lsr #5 +; SCALAR-NEXT: cmp r0, #0 ; SCALAR-NEXT: mov r3, r8 ; SCALAR-NEXT: and r6, r2, #31 -; SCALAR-NEXT: mov r7, #31 ; SCALAR-NEXT: movne r3, r1 ; SCALAR-NEXT: cmp r0, #0 +; SCALAR-NEXT: mov r7, #31 ; SCALAR-NEXT: lslne r1, r9, #27 ; SCALAR-NEXT: bic r2, r7, r2 ; SCALAR-NEXT: movne r4, r8 @@ -80,11 +81,11 @@ ; ; NEON-LABEL: fshl_i37: ; NEON: @ %bb.0: -; NEON-NEXT: .save {r4, r5, r6, r7, r11, lr} -; NEON-NEXT: push {r4, r5, r6, r7, r11, lr} +; NEON-NEXT: .save {r4, r5, r6, r7, r8, lr} +; NEON-NEXT: push {r4, r5, r6, r7, r8, lr} ; NEON-NEXT: mov r4, r1 ; 
NEON-NEXT: ldr r1, [sp, #28] -; NEON-NEXT: mov r6, r0 +; NEON-NEXT: mov r8, r0 ; NEON-NEXT: ldr r0, [sp, #24] ; NEON-NEXT: and r1, r1, #31 ; NEON-NEXT: mov r5, r3 @@ -95,21 +96,22 @@ ; NEON-NEXT: mov r0, #31 ; NEON-NEXT: bic r1, r0, r2 ; NEON-NEXT: lsl r0, r5, #27 -; NEON-NEXT: ands r12, r2, #32 +; NEON-NEXT: ubfx r3, r2, #5, #1 ; NEON-NEXT: orr r0, r0, r7, lsr #5 -; NEON-NEXT: mov r5, r6 +; NEON-NEXT: cmp r3, #0 +; NEON-NEXT: mov r5, r8 ; NEON-NEXT: and r2, r2, #31 ; NEON-NEXT: movne r5, r0 ; NEON-NEXT: lslne r0, r7, #27 -; NEON-NEXT: cmp r12, #0 -; NEON-NEXT: lsl r3, r5, r2 +; NEON-NEXT: cmp r3, #0 ; NEON-NEXT: lsr r0, r0, #1 -; NEON-NEXT: movne r4, r6 -; NEON-NEXT: orr r0, r3, r0, lsr r1 +; NEON-NEXT: movne r4, r8 ; NEON-NEXT: lsr r3, r5, #1 +; NEON-NEXT: lsl r6, r5, r2 +; NEON-NEXT: orr r0, r6, r0, lsr r1 ; NEON-NEXT: lsl r2, r4, r2 ; NEON-NEXT: orr r1, r2, r3, lsr r1 -; NEON-NEXT: pop {r4, r5, r6, r7, r11, pc} +; NEON-NEXT: pop {r4, r5, r6, r7, r8, pc} %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z) ret i37 %f } @@ -235,73 +237,39 @@ ; Verify that weird types are minimally supported. declare i37 @llvm.fshr.i37(i37, i37, i37) define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) { -; SCALAR-LABEL: fshr_i37: -; SCALAR: @ %bb.0: -; SCALAR-NEXT: .save {r4, r5, r6, r7, r8, lr} -; SCALAR-NEXT: push {r4, r5, r6, r7, r8, lr} -; SCALAR-NEXT: mov r8, r0 -; SCALAR-NEXT: ldr r0, [sp, #28] -; SCALAR-NEXT: mov r4, r1 -; SCALAR-NEXT: mov r5, r3 -; SCALAR-NEXT: and r1, r0, #31 -; SCALAR-NEXT: ldr r0, [sp, #24] -; SCALAR-NEXT: mov r7, r2 -; SCALAR-NEXT: mov r2, #37 -; SCALAR-NEXT: mov r3, #0 -; SCALAR-NEXT: bl __aeabi_uldivmod -; SCALAR-NEXT: lsl r3, r5, #27 -; SCALAR-NEXT: add r0, r2, #27 -; SCALAR-NEXT: orr r3, r3, r7, lsr #5 -; SCALAR-NEXT: ands r2, r0, #32 -; SCALAR-NEXT: mov r5, r8 -; SCALAR-NEXT: mov r1, #31 -; SCALAR-NEXT: moveq r5, r3 -; SCALAR-NEXT: lsleq r3, r7, #27 -; SCALAR-NEXT: cmp r2, #0 -; SCALAR-NEXT: bic r1, r1, r0 -; SCALAR-NEXT: moveq r4, r8 -; SCALAR-NEXT: lsl r6, r5, #1 -; SCALAR-NEXT: and r7, r0, #31 -; SCALAR-NEXT: lsl r2, r4, #1 -; SCALAR-NEXT: lsl r6, r6, r1 -; SCALAR-NEXT: lsl r1, r2, r1 -; SCALAR-NEXT: orr r0, r6, r3, lsr r7 -; SCALAR-NEXT: orr r1, r1, r5, lsr r7 -; SCALAR-NEXT: pop {r4, r5, r6, r7, r8, pc} -; -; NEON-LABEL: fshr_i37: -; NEON: @ %bb.0: -; NEON-NEXT: .save {r4, r5, r6, r7, r8, lr} -; NEON-NEXT: push {r4, r5, r6, r7, r8, lr} -; NEON-NEXT: mov r4, r1 -; NEON-NEXT: ldr r1, [sp, #28] -; NEON-NEXT: mov r8, r0 -; NEON-NEXT: ldr r0, [sp, #24] -; NEON-NEXT: and r1, r1, #31 -; NEON-NEXT: mov r5, r3 -; NEON-NEXT: mov r7, r2 -; NEON-NEXT: mov r2, #37 -; NEON-NEXT: mov r3, #0 -; NEON-NEXT: bl __aeabi_uldivmod -; NEON-NEXT: lsl r3, r5, #27 -; NEON-NEXT: add r0, r2, #27 -; NEON-NEXT: orr r3, r3, r7, lsr #5 -; NEON-NEXT: ands r2, r0, #32 -; NEON-NEXT: mov r5, r8 -; NEON-NEXT: mov r1, #31 -; NEON-NEXT: moveq r5, r3 -; NEON-NEXT: lsleq r3, r7, #27 -; NEON-NEXT: cmp r2, #0 -; NEON-NEXT: bic r1, r1, r0 -; NEON-NEXT: moveq r4, r8 -; NEON-NEXT: lsl r6, r5, #1 -; NEON-NEXT: and r7, r0, #31 -; NEON-NEXT: lsl r2, r4, #1 -; NEON-NEXT: lsl r6, r6, r1 -; NEON-NEXT: lsl r1, r2, r1 -; NEON-NEXT: orr r0, r6, r3, lsr r7 -; NEON-NEXT: orr r1, r1, r5, lsr r7 -; NEON-NEXT: pop {r4, r5, r6, r7, r8, pc} +; CHECK-LABEL: fshr_i37: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: ldr r1, [sp, #28] +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: ldr r0, [sp, #24] +; CHECK-NEXT: and r1, r1, #31 +; CHECK-NEXT: 
mov r5, r3 +; CHECK-NEXT: mov r7, r2 +; CHECK-NEXT: mov r2, #37 +; CHECK-NEXT: mov r3, #0 +; CHECK-NEXT: bl __aeabi_uldivmod +; CHECK-NEXT: lsl r3, r5, #27 +; CHECK-NEXT: add r0, r2, #27 +; CHECK-NEXT: orr r3, r3, r7, lsr #5 +; CHECK-NEXT: ands r2, r0, #32 +; CHECK-NEXT: mov r5, r8 +; CHECK-NEXT: mov r1, #31 +; CHECK-NEXT: moveq r5, r3 +; CHECK-NEXT: lsleq r3, r7, #27 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: bic r1, r1, r0 +; CHECK-NEXT: moveq r4, r8 +; CHECK-NEXT: lsl r6, r5, #1 +; CHECK-NEXT: and r7, r0, #31 +; CHECK-NEXT: lsl r2, r4, #1 +; CHECK-NEXT: lsl r6, r6, r1 +; CHECK-NEXT: lsl r1, r2, r1 +; CHECK-NEXT: orr r0, r6, r3, lsr r7 +; CHECK-NEXT: orr r1, r1, r5, lsr r7 +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, pc} %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z) ret i37 %f } diff --git a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll --- a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll @@ -90,16 +90,13 @@ ; ; BE-LABEL: i56_or: ; BE: @ %bb.0: -; BE-NEXT: mov r1, r0 -; BE-NEXT: ldr r0, [r0] -; BE-NEXT: ldrh r2, [r1, #4]! -; BE-NEXT: ldrb r3, [r1, #2] -; BE-NEXT: orr r2, r3, r2, lsl #8 -; BE-NEXT: orr r0, r2, r0, lsl #24 -; BE-NEXT: orr r0, r0, #384 -; BE-NEXT: strb r0, [r1, #2] -; BE-NEXT: lsr r0, r0, #8 -; BE-NEXT: strh r0, [r1] +; BE-NEXT: ldrh r1, [r0, #4]! +; BE-NEXT: ldrb r2, [r0, #2] +; BE-NEXT: orr r1, r2, r1, lsl #8 +; BE-NEXT: orr r1, r1, #384 +; BE-NEXT: strb r1, [r0, #2] +; BE-NEXT: lsr r1, r1, #8 +; BE-NEXT: strh r1, [r0] ; BE-NEXT: mov pc, lr %aa = load i56, ptr %a %b = or i56 %aa, 384 diff --git a/llvm/test/CodeGen/ARM/inc-of-add.ll b/llvm/test/CodeGen/ARM/inc-of-add.ll --- a/llvm/test/CodeGen/ARM/inc-of-add.ll +++ b/llvm/test/CodeGen/ARM/inc-of-add.ll @@ -484,16 +484,16 @@ ; ARM6-LABEL: vector_i128_i64: ; ARM6: @ %bb.0: ; ARM6-NEXT: push {r11, lr} -; ARM6-NEXT: ldr lr, [sp, #8] -; ARM6-NEXT: ldr r12, [sp, #12] -; ARM6-NEXT: adds r0, r0, lr -; ARM6-NEXT: ldr lr, [sp, #16] -; ARM6-NEXT: adc r1, r1, r12 +; ARM6-NEXT: ldr r12, [sp, #8] +; ARM6-NEXT: ldr lr, [sp, #12] +; ARM6-NEXT: adds r0, r0, r12 +; ARM6-NEXT: ldr r12, [sp, #16] +; ARM6-NEXT: adc r1, r1, lr ; ARM6-NEXT: adds r0, r0, #1 -; ARM6-NEXT: ldr r12, [sp, #20] +; ARM6-NEXT: ldr lr, [sp, #20] ; ARM6-NEXT: adc r1, r1, #0 -; ARM6-NEXT: adds r2, r2, lr -; ARM6-NEXT: adc r3, r3, r12 +; ARM6-NEXT: adds r2, r2, r12 +; ARM6-NEXT: adc r3, r3, lr ; ARM6-NEXT: adds r2, r2, #1 ; ARM6-NEXT: adc r3, r3, #0 ; ARM6-NEXT: pop {r11, pc} @@ -514,14 +514,16 @@ ; THUMB6: @ %bb.0: ; THUMB6-NEXT: push {r4, r5, r7, lr} ; THUMB6-NEXT: mvns r4, r1 +; THUMB6-NEXT: add r1, sp, #16 +; THUMB6-NEXT: ldr r1, [r1, #4] ; THUMB6-NEXT: mvns r0, r0 -; THUMB6-NEXT: ldr r1, [sp, #20] ; THUMB6-NEXT: ldr r5, [sp, #16] ; THUMB6-NEXT: subs r0, r5, r0 ; THUMB6-NEXT: sbcs r1, r4 ; THUMB6-NEXT: mvns r4, r3 +; THUMB6-NEXT: add r3, sp, #24 +; THUMB6-NEXT: ldr r3, [r3, #4] ; THUMB6-NEXT: mvns r2, r2 -; THUMB6-NEXT: ldr r3, [sp, #28] ; THUMB6-NEXT: ldr r5, [sp, #24] ; THUMB6-NEXT: subs r2, r5, r2 ; THUMB6-NEXT: sbcs r3, r4 diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll --- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll @@ -65,20 +65,36 @@ ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: ldr r0, [r0] -; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: ldrb r1, [r0, #2] +; 
CHECK-ARMv6-NEXT: ldrb r2, [r0, #3] +; CHECK-ARMv6-NEXT: ldrh r0, [r0] +; CHECK-ARMv6-NEXT: rev16 r0, r0 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #24 ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: load_i32_by_i8_bswap: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: ldr r0, [r0] -; CHECK-THUMBv6-NEXT: rev r0, r0 +; CHECK-THUMBv6-NEXT: ldrb r1, [r0, #2] +; CHECK-THUMBv6-NEXT: lsls r1, r1, #16 +; CHECK-THUMBv6-NEXT: ldrh r2, [r0] +; CHECK-THUMBv6-NEXT: rev16 r2, r2 +; CHECK-THUMBv6-NEXT: adds r1, r2, r1 +; CHECK-THUMBv6-NEXT: ldrb r0, [r0, #3] +; CHECK-THUMBv6-NEXT: lsls r0, r0, #24 +; CHECK-THUMBv6-NEXT: adds r0, r1, r0 ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: load_i32_by_i8_bswap: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldr r0, [r0] -; CHECK-THUMBv7-NEXT: rev r0, r0 +; CHECK-THUMBv7-NEXT: ldrb r1, [r0, #2] +; CHECK-THUMBv7-NEXT: lsls r1, r1, #16 +; CHECK-THUMBv7-NEXT: ldrh r2, [r0] +; CHECK-THUMBv7-NEXT: rev16 r2, r2 +; CHECK-THUMBv7-NEXT: adds r1, r2, r1 +; CHECK-THUMBv7-NEXT: ldrb r0, [r0, #3] +; CHECK-THUMBv7-NEXT: lsls r0, r0, #24 +; CHECK-THUMBv7-NEXT: adds r0, r1, r0 ; CHECK-THUMBv7-NEXT: bx lr %tmp2 = load i8, ptr %arg, align 4 @@ -389,22 +405,38 @@ ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: ldr r0, [r0, #1] -; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: ldrb r1, [r0, #3] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #4] +; CHECK-ARMv6-NEXT: ldrh r0, [r0, #1] +; CHECK-ARMv6-NEXT: rev16 r0, r0 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #24 ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK-THUMBv6: @ %bb.0: ; CHECK-THUMBv6-NEXT: movs r1, #1 -; CHECK-THUMBv6-NEXT: ldr r0, [r0, r1] -; CHECK-THUMBv6-NEXT: rev r0, r0 +; CHECK-THUMBv6-NEXT: ldrh r1, [r0, r1] +; CHECK-THUMBv6-NEXT: rev16 r1, r1 +; CHECK-THUMBv6-NEXT: ldrb r2, [r0, #3] +; CHECK-THUMBv6-NEXT: lsls r2, r2, #16 +; CHECK-THUMBv6-NEXT: adds r1, r1, r2 +; CHECK-THUMBv6-NEXT: ldrb r0, [r0, #4] +; CHECK-THUMBv6-NEXT: lsls r0, r0, #24 +; CHECK-THUMBv6-NEXT: adds r0, r1, r0 ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK-THUMBv7: @ %bb.0: ; CHECK-THUMBv7-NEXT: movs r1, #1 -; CHECK-THUMBv7-NEXT: ldr r0, [r0, r1] -; CHECK-THUMBv7-NEXT: rev r0, r0 +; CHECK-THUMBv7-NEXT: ldrh r1, [r0, r1] +; CHECK-THUMBv7-NEXT: rev16 r1, r1 +; CHECK-THUMBv7-NEXT: ldrb r2, [r0, #3] +; CHECK-THUMBv7-NEXT: lsls r2, r2, #16 +; CHECK-THUMBv7-NEXT: adds r1, r1, r2 +; CHECK-THUMBv7-NEXT: ldrb r0, [r0, #4] +; CHECK-THUMBv7-NEXT: lsls r0, r0, #24 +; CHECK-THUMBv7-NEXT: adds r0, r1, r0 ; CHECK-THUMBv7-NEXT: bx lr @@ -446,22 +478,42 @@ ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: ldr r0, [r0, #-4] -; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: ldrb r1, [r0, #-2] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #-1] +; CHECK-ARMv6-NEXT: ldrh r0, [r0, #-4] +; CHECK-ARMv6-NEXT: rev16 r0, r0 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #24 ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: load_i32_by_i8_neg_offset: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: subs r0, r0, #4 -; CHECK-THUMBv6-NEXT: ldr r0, [r0] -; CHECK-THUMBv6-NEXT: rev r0, r0 +; CHECK-THUMBv6-NEXT: subs r1, r0, #2 +; CHECK-THUMBv6-NEXT: ldrb r1, [r1] +; CHECK-THUMBv6-NEXT: lsls r1, r1, #16 +; CHECK-THUMBv6-NEXT: subs r2, r0, #4 +; CHECK-THUMBv6-NEXT: ldrh r2, [r2] +; 
CHECK-THUMBv6-NEXT: rev16 r2, r2 +; CHECK-THUMBv6-NEXT: adds r1, r2, r1 +; CHECK-THUMBv6-NEXT: subs r0, r0, #1 +; CHECK-THUMBv6-NEXT: ldrb r0, [r0] +; CHECK-THUMBv6-NEXT: lsls r0, r0, #24 +; CHECK-THUMBv6-NEXT: adds r0, r1, r0 ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: load_i32_by_i8_neg_offset: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: subs r0, r0, #4 -; CHECK-THUMBv7-NEXT: ldr r0, [r0] -; CHECK-THUMBv7-NEXT: rev r0, r0 +; CHECK-THUMBv7-NEXT: subs r1, r0, #2 +; CHECK-THUMBv7-NEXT: ldrb r1, [r1] +; CHECK-THUMBv7-NEXT: lsls r1, r1, #16 +; CHECK-THUMBv7-NEXT: subs r2, r0, #4 +; CHECK-THUMBv7-NEXT: ldrh r2, [r2] +; CHECK-THUMBv7-NEXT: rev16 r2, r2 +; CHECK-THUMBv7-NEXT: adds r1, r2, r1 +; CHECK-THUMBv7-NEXT: subs r0, r0, #1 +; CHECK-THUMBv7-NEXT: ldrb r0, [r0] +; CHECK-THUMBv7-NEXT: lsls r0, r0, #24 +; CHECK-THUMBv7-NEXT: adds r0, r1, r0 ; CHECK-THUMBv7-NEXT: bx lr @@ -680,22 +732,38 @@ ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index: ; CHECK-ARMv6: @ %bb.0: ; CHECK-ARMv6-NEXT: add r0, r0, r1 -; CHECK-ARMv6-NEXT: ldr r0, [r0, #12] -; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: ldrb r1, [r0, #14] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #15] +; CHECK-ARMv6-NEXT: ldrh r0, [r0, #12] +; CHECK-ARMv6-NEXT: rev16 r0, r0 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #24 ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: load_i32_by_i8_base_offset_index: ; CHECK-THUMBv6: @ %bb.0: ; CHECK-THUMBv6-NEXT: adds r0, r0, r1 -; CHECK-THUMBv6-NEXT: ldr r0, [r0, #12] -; CHECK-THUMBv6-NEXT: rev r0, r0 +; CHECK-THUMBv6-NEXT: ldrb r1, [r0, #14] +; CHECK-THUMBv6-NEXT: lsls r1, r1, #16 +; CHECK-THUMBv6-NEXT: ldrh r2, [r0, #12] +; CHECK-THUMBv6-NEXT: rev16 r2, r2 +; CHECK-THUMBv6-NEXT: adds r1, r2, r1 +; CHECK-THUMBv6-NEXT: ldrb r0, [r0, #15] +; CHECK-THUMBv6-NEXT: lsls r0, r0, #24 +; CHECK-THUMBv6-NEXT: adds r0, r1, r0 ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: load_i32_by_i8_base_offset_index: ; CHECK-THUMBv7: @ %bb.0: ; CHECK-THUMBv7-NEXT: adds r0, r0, r1 -; CHECK-THUMBv7-NEXT: ldr r0, [r0, #12] -; CHECK-THUMBv7-NEXT: rev r0, r0 +; CHECK-THUMBv7-NEXT: ldrb r1, [r0, #14] +; CHECK-THUMBv7-NEXT: lsls r1, r1, #16 +; CHECK-THUMBv7-NEXT: ldrh r2, [r0, #12] +; CHECK-THUMBv7-NEXT: rev16 r2, r2 +; CHECK-THUMBv7-NEXT: adds r1, r2, r1 +; CHECK-THUMBv7-NEXT: ldrb r0, [r0, #15] +; CHECK-THUMBv7-NEXT: lsls r0, r0, #24 +; CHECK-THUMBv7-NEXT: adds r0, r1, r0 ; CHECK-THUMBv7-NEXT: bx lr %tmp = add nuw nsw i32 %i, 3 %tmp2 = add nuw nsw i32 %i, 2 @@ -732,7 +800,7 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK: @ %bb.0: -; CHECK-NEXT: add r0, r1, r0 +; CHECK-NEXT: add r0, r0, r1 ; CHECK-NEXT: mov r1, #65280 ; CHECK-NEXT: ldr r0, [r0, #13] ; CHECK-NEXT: and r2, r0, #65280 @@ -745,25 +813,41 @@ ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: add r0, r1, r0 -; CHECK-ARMv6-NEXT: ldr r0, [r0, #13] -; CHECK-ARMv6-NEXT: rev r0, r0 +; CHECK-ARMv6-NEXT: add r0, r0, r1 +; CHECK-ARMv6-NEXT: ldrb r1, [r0, #15] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #16] +; CHECK-ARMv6-NEXT: ldrh r0, [r0, #13] +; CHECK-ARMv6-NEXT: rev16 r0, r0 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #24 ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK-THUMBv6: @ %bb.0: +; CHECK-THUMBv6-NEXT: adds r0, r0, r1 +; CHECK-THUMBv6-NEXT: ldrb r1, [r0, #15] +; CHECK-THUMBv6-NEXT: lsls r1, r1, 
#16 +; CHECK-THUMBv6-NEXT: movs r2, #13 +; CHECK-THUMBv6-NEXT: ldrh r2, [r0, r2] +; CHECK-THUMBv6-NEXT: rev16 r2, r2 +; CHECK-THUMBv6-NEXT: adds r1, r2, r1 +; CHECK-THUMBv6-NEXT: ldrb r0, [r0, #16] +; CHECK-THUMBv6-NEXT: lsls r0, r0, #24 ; CHECK-THUMBv6-NEXT: adds r0, r1, r0 -; CHECK-THUMBv6-NEXT: movs r1, #13 -; CHECK-THUMBv6-NEXT: ldr r0, [r0, r1] -; CHECK-THUMBv6-NEXT: rev r0, r0 ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK-THUMBv7: @ %bb.0: +; CHECK-THUMBv7-NEXT: adds r0, r0, r1 +; CHECK-THUMBv7-NEXT: ldrb r1, [r0, #15] +; CHECK-THUMBv7-NEXT: lsls r1, r1, #16 +; CHECK-THUMBv7-NEXT: movs r2, #13 +; CHECK-THUMBv7-NEXT: ldrh r2, [r0, r2] +; CHECK-THUMBv7-NEXT: rev16 r2, r2 +; CHECK-THUMBv7-NEXT: adds r1, r2, r1 +; CHECK-THUMBv7-NEXT: ldrb r0, [r0, #16] +; CHECK-THUMBv7-NEXT: lsls r0, r0, #24 ; CHECK-THUMBv7-NEXT: adds r0, r1, r0 -; CHECK-THUMBv7-NEXT: movs r1, #13 -; CHECK-THUMBv7-NEXT: ldr r0, [r0, r1] -; CHECK-THUMBv7-NEXT: rev r0, r0 ; CHECK-THUMBv7-NEXT: bx lr %tmp = add nuw nsw i32 %i, 4 diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -438,8 +438,12 @@ ; ; CHECK-THUMBv7-LABEL: load_i32_by_i8_nonzero_offset_bswap: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldr.w r0, [r0, #1] -; CHECK-THUMBv7-NEXT: rev r0, r0 +; CHECK-THUMBv7-NEXT: ldrb r1, [r0, #1] +; CHECK-THUMBv7-NEXT: ldrb r2, [r0, #2] +; CHECK-THUMBv7-NEXT: ldrh.w r0, [r0, #3] +; CHECK-THUMBv7-NEXT: rev16 r0, r0 +; CHECK-THUMBv7-NEXT: orr.w r0, r0, r2, lsl #16 +; CHECK-THUMBv7-NEXT: orr.w r0, r0, r1, lsl #24 ; CHECK-THUMBv7-NEXT: bx lr @@ -494,8 +498,12 @@ ; ; CHECK-THUMBv7-LABEL: load_i32_by_i8_neg_offset_bswap: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldr r0, [r0, #-4] -; CHECK-THUMBv7-NEXT: rev r0, r0 +; CHECK-THUMBv7-NEXT: ldrb r1, [r0, #-4] +; CHECK-THUMBv7-NEXT: ldrb r2, [r0, #-3] +; CHECK-THUMBv7-NEXT: ldrh r0, [r0, #-2] +; CHECK-THUMBv7-NEXT: rev16 r0, r0 +; CHECK-THUMBv7-NEXT: orr.w r0, r0, r2, lsl #16 +; CHECK-THUMBv7-NEXT: orr.w r0, r0, r1, lsl #24 ; CHECK-THUMBv7-NEXT: bx lr @@ -663,19 +671,19 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) { ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK: @ %bb.0: -; CHECK-NEXT: add r0, r1, r0 +; CHECK-NEXT: add r0, r0, r1 ; CHECK-NEXT: ldr r0, [r0, #13] ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK-ARMv6: @ %bb.0: -; CHECK-ARMv6-NEXT: add r0, r1, r0 +; CHECK-ARMv6-NEXT: add r0, r0, r1 ; CHECK-ARMv6-NEXT: ldr r0, [r0, #13] ; CHECK-ARMv6-NEXT: bx lr ; ; CHECK-THUMBv6-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: adds r0, r1, r0 +; CHECK-THUMBv6-NEXT: adds r0, r0, r1 ; CHECK-THUMBv6-NEXT: movs r1, #13 ; CHECK-THUMBv6-NEXT: ldr r0, [r0, r1] ; CHECK-THUMBv6-NEXT: bx lr diff --git a/llvm/test/CodeGen/ARM/lowerMUL-newload.ll b/llvm/test/CodeGen/ARM/lowerMUL-newload.ll --- a/llvm/test/CodeGen/ARM/lowerMUL-newload.ll +++ b/llvm/test/CodeGen/ARM/lowerMUL-newload.ll @@ -24,7 +24,7 @@ ; CHECK-NEXT: vldr d16, [r0, #16] ; CHECK-NEXT: vldr d17, [r1, #16] ; CHECK-NEXT: vldr d18, [r2, #16] -; CHECK-NEXT: vmull.u16 q8, d17, d16 +; CHECK-NEXT: vmull.s16 q8, d17, d16 ; CHECK-NEXT: vaddw.u16 q8, q8, d18 ; CHECK-NEXT: vmovn.i32 d16, q8 ; CHECK-NEXT: vstr d16, [r0, #16] @@ -67,12 +67,16 @@ define void @addmul_loadstore(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: addmul_loadstore: ; CHECK: @ 
%bb.0: @ %entry -; CHECK-NEXT: vldr d16, [r2, #16] -; CHECK-NEXT: vldr d17, [r1, #16] -; CHECK-NEXT: vmull.u16 q9, d17, d16 -; CHECK-NEXT: vldr d17, [r0, #16] -; CHECK-NEXT: vmlal.u16 q9, d17, d16 -; CHECK-NEXT: vmovn.i32 d16, q9 +; CHECK-NEXT: vldr d16, [r1, #16] +; CHECK-NEXT: add r1, r2, #16 +; CHECK-NEXT: vldr d20, [r0, #16] +; CHECK-NEXT: vld1.16 {d17}, [r1:64] +; CHECK-NEXT: vmovl.s16 q9, d17 +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vmovl.u16 q10, d20 +; CHECK-NEXT: vmul.i32 q8, q8, q9 +; CHECK-NEXT: vmla.i32 q8, q10, q9 +; CHECK-NEXT: vmovn.i32 d16, q8 ; CHECK-NEXT: vstr d16, [r0, #16] ; CHECK-NEXT: bx lr entry: @@ -96,23 +100,22 @@ define void @func1(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: func1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: add r3, r1, #16 +; CHECK-NEXT: vldr d16, [r1, #16] +; CHECK-NEXT: vldr d17, [r2, #16] +; CHECK-NEXT: vaddl.u16 q9, d17, d16 +; CHECK-NEXT: vmovn.i32 d18, q9 +; CHECK-NEXT: vldr d19, [r0, #16] +; CHECK-NEXT: vstr d18, [r0, #16] ; CHECK-NEXT: vldr d18, [r2, #16] -; CHECK-NEXT: vld1.16 {d16}, [r3:64] +; CHECK-NEXT: vmull.s16 q10, d17, d18 +; CHECK-NEXT: vmovl.s16 q11, d18 ; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vaddw.u16 q10, q8, d18 -; CHECK-NEXT: vmovn.i32 d19, q10 -; CHECK-NEXT: vldr d20, [r0, #16] -; CHECK-NEXT: vstr d19, [r0, #16] -; CHECK-NEXT: vldr d19, [r2, #16] -; CHECK-NEXT: vmull.s16 q11, d18, d19 -; CHECK-NEXT: vmovl.s16 q9, d19 -; CHECK-NEXT: vmla.i32 q11, q8, q9 -; CHECK-NEXT: vmovn.i32 d16, q11 +; CHECK-NEXT: vmla.i32 q10, q8, q11 +; CHECK-NEXT: vmovn.i32 d16, q10 ; CHECK-NEXT: vstr d16, [r1, #16] ; CHECK-NEXT: vldr d16, [r2, #16] -; CHECK-NEXT: vmlal.u16 q11, d16, d20 -; CHECK-NEXT: vmovn.i32 d16, q11 +; CHECK-NEXT: vmlal.s16 q10, d16, d19 +; CHECK-NEXT: vmovn.i32 d16, q10 ; CHECK-NEXT: vstr d16, [r0, #16] ; CHECK-NEXT: bx lr entry: @@ -160,25 +163,21 @@ ; CHECK-LABEL: func2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldr d16, [r1, #16] -; CHECK-NEXT: add r3, r0, #16 ; CHECK-NEXT: vldr d17, [r2, #16] ; CHECK-NEXT: vaddl.u16 q9, d17, d16 ; CHECK-NEXT: vmovn.i32 d18, q9 -; CHECK-NEXT: vld1.16 {d19}, [r3:64] +; CHECK-NEXT: vldr d19, [r0, #16] ; CHECK-NEXT: vstr d18, [r0, #16] ; CHECK-NEXT: vldr d18, [r2, #16] ; CHECK-NEXT: vmull.s16 q10, d17, d18 ; CHECK-NEXT: vmovl.s16 q11, d18 ; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vmovl.s16 q9, d19 ; CHECK-NEXT: vmla.i32 q10, q8, q11 ; CHECK-NEXT: vmovn.i32 d16, q10 ; CHECK-NEXT: vstr d16, [r1, #16] -; CHECK-NEXT: add r1, r2, #16 -; CHECK-NEXT: vld1.16 {d16}, [r1:64] -; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vmla.i32 q10, q8, q9 -; CHECK-NEXT: vadd.i32 q8, q10, q9 +; CHECK-NEXT: vldr d16, [r2, #16] +; CHECK-NEXT: vmlal.s16 q10, d16, d19 +; CHECK-NEXT: vaddw.u16 q8, q10, d19 ; CHECK-NEXT: vmovn.i32 d16, q8 ; CHECK-NEXT: vstr d16, [r0, #16] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/ARM/machine-cse-cmp.ll b/llvm/test/CodeGen/ARM/machine-cse-cmp.ll --- a/llvm/test/CodeGen/ARM/machine-cse-cmp.ll +++ b/llvm/test/CodeGen/ARM/machine-cse-cmp.ll @@ -116,8 +116,11 @@ ; CHECK-NEXT: beq LBB3_2 ; CHECK-NEXT: @ %bb.1: @ %if.end ; CHECK-NEXT: subs r0, r2, #10 +; CHECK-NEXT: mov r9, #0 ; CHECK-NEXT: sbcs r0, r3, #0 -; CHECK-NEXT: bxlt lr +; CHECK-NEXT: movwlt r9, #1 +; CHECK-NEXT: cmp r9, #0 +; CHECK-NEXT: bxne lr ; CHECK-NEXT: LBB3_2: @ %if.end3 ; CHECK-NEXT: subs r0, r2, #10 ; CHECK-NEXT: sbc r3, r3, #0 diff --git a/llvm/test/CodeGen/ARM/memcpy-inline.ll b/llvm/test/CodeGen/ARM/memcpy-inline.ll --- a/llvm/test/CodeGen/ARM/memcpy-inline.ll +++ 
b/llvm/test/CodeGen/ARM/memcpy-inline.ll @@ -67,7 +67,7 @@ ; CHECK-NEXT: LPC1_0: ; CHECK-NEXT: add r1, pc ; CHECK-NEXT: movs r2, #15 -; CHECK-NEXT: vld1.8 {d16, d17}, [r1], r2 +; CHECK-NEXT: vld1.64 {d16, d17}, [r1], r2 ; CHECK-NEXT: vst1.8 {d16, d17}, [r0], r2 ; CHECK-NEXT: vld1.8 {d16, d17}, [r1] ; CHECK-NEXT: vst1.8 {d16, d17}, [r0] @@ -93,17 +93,17 @@ define void @t2(ptr nocapture %C) nounwind { ; CHECK-LABEL: t2: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movw r1, #16716 +; CHECK-NEXT: movt r1, #72 +; CHECK-NEXT: str r1, [r0, #32] ; CHECK-NEXT: movw r1, :lower16:(L_.str2-(LPC2_0+4)) ; CHECK-NEXT: movt r1, :upper16:(L_.str2-(LPC2_0+4)) ; CHECK-NEXT: LPC2_0: ; CHECK-NEXT: add r1, pc -; CHECK-NEXT: vld1.8 {d16, d17}, [r1]! +; CHECK-NEXT: vld1.64 {d16, d17}, [r1]! ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! -; CHECK-NEXT: movw r1, #16716 -; CHECK-NEXT: movt r1, #72 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vst1.8 {d16, d17}, [r0] ; CHECK-NEXT: bx lr ; ; CHECK-T1-LABEL: t2: @@ -130,7 +130,7 @@ ; CHECK-NEXT: movt r1, :upper16:(L_.str3-(LPC3_0+4)) ; CHECK-NEXT: LPC3_0: ; CHECK-NEXT: add r1, pc -; CHECK-NEXT: vld1.8 {d16, d17}, [r1]! +; CHECK-NEXT: vld1.64 {d16, d17}, [r1]! ; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! ; CHECK-NEXT: vldr d16, [r1] ; CHECK-NEXT: vst1.8 {d16}, [r0] diff --git a/llvm/test/CodeGen/ARM/neon-copy.ll b/llvm/test/CodeGen/ARM/neon-copy.ll --- a/llvm/test/CodeGen/ARM/neon-copy.ll +++ b/llvm/test/CodeGen/ARM/neon-copy.ll @@ -1270,8 +1270,24 @@ define <8 x i8> @getl(<16 x i8> %x) #0 { ; CHECK-LABEL: getl: ; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vmov.u8 r0, d0[1] +; CHECK-NEXT: vmov.u8 r1, d0[2] +; CHECK-NEXT: vmov.u8 r2, d0[3] +; CHECK-NEXT: vmov.u8 r3, d0[4] +; CHECK-NEXT: vmov.u8 r12, d0[5] +; CHECK-NEXT: vmov.u8 lr, d0[6] +; CHECK-NEXT: vmov.u8 r4, d0[7] +; CHECK-NEXT: vmov.8 d0[1], r0 +; CHECK-NEXT: vmov.8 d0[2], r1 +; CHECK-NEXT: vmov.8 d0[3], r2 +; CHECK-NEXT: vmov.8 d0[4], r3 +; CHECK-NEXT: vmov.8 d0[5], r12 +; CHECK-NEXT: vmov.8 d0[6], lr +; CHECK-NEXT: vmov.8 d0[7], r4 ; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop {r4, pc} %vecext = extractelement <16 x i8> %x, i32 0 %vecinit = insertelement <8 x i8> undef, i8 %vecext, i32 0 %vecext1 = extractelement <16 x i8> %x, i32 1 @@ -1396,9 +1412,12 @@ define <4 x i16> @test_dup_v1i64_v4i16(<1 x i64> %a) { ; CHECK-LABEL: test_dup_v1i64_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.32 r0, d0[0] -; CHECK-NEXT: vmov.16 d16[0], r0 -; CHECK-NEXT: vdup.16 d0, d16[0] +; CHECK-NEXT: vmov.32 r0, d0[1] +; CHECK-NEXT: vmov.32 r1, d0[0] +; CHECK-NEXT: vmov d0, r1, r0 +; CHECK-NEXT: vmov.16 d0[1], r1 +; CHECK-NEXT: vmov.16 d0[2], r1 +; CHECK-NEXT: vmov.16 d0[3], r1 ; CHECK-NEXT: bx lr entry: %x = extractelement <1 x i64> %a, i32 0 @@ -1413,7 +1432,10 @@ define <2 x i32> @test_dup_v1i64_v2i32(<1 x i64> %a) { ; CHECK-LABEL: test_dup_v1i64_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vdup.32 d0, d0[0] +; CHECK-NEXT: vmov.32 r0, d0[1] +; CHECK-NEXT: vmov.32 r1, d0[0] +; CHECK-NEXT: vmov d0, r1, r0 +; CHECK-NEXT: vmov.32 d0[1], r1 ; CHECK-NEXT: bx lr entry: %x = extractelement <1 x i64> %a, i32 0 @@ -1479,9 +1501,11 @@ define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.32 r0, d0[0] -; CHECK-NEXT: vmov.16 d16[0], r0 -; CHECK-NEXT: vdup.16 d0, d16[0] +; CHECK-NEXT: vmov r0, r1, d0 
+; CHECK-NEXT: vmov.16 d0[1], r0 +; CHECK-NEXT: vmov.16 d0[2], r0 +; CHECK-NEXT: vmov.16 d0[3], r0 +; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bx lr entry: %x = extractelement <2 x i64> %a, i32 0 @@ -1496,7 +1520,9 @@ define <2 x i32> @test_dup_v2i64_v2i32(<2 x i64> %a) { ; CHECK-LABEL: test_dup_v2i64_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vdup.32 d0, d0[0] +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov.32 d0[1], r0 +; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bx lr entry: %x = extractelement <2 x i64> %a, i32 0 @@ -1520,7 +1546,8 @@ define <2 x i32> @test_concat_same_v1i32_v1i32(<2 x i32> %a) { ; CHECK-LABEL: test_concat_same_v1i32_v1i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vdup.32 d0, d0[0] +; CHECK-NEXT: vmov.32 r0, d0[0] +; CHECK-NEXT: vmov.32 d0[1], r0 ; CHECK-NEXT: bx lr entry: %0 = extractelement <2 x i32> %a, i32 0 @@ -1543,9 +1570,28 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: vmov.f64 d1, d2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.u8 r0, d0[0] +; CHECK-NEXT: vmov.u8 r1, d0[1] +; CHECK-NEXT: vmov.u8 r2, d0[2] +; CHECK-NEXT: vmov.u8 r3, d0[3] +; CHECK-NEXT: vmov.u8 r12, d0[4] +; CHECK-NEXT: vmov.u8 lr, d0[5] +; CHECK-NEXT: vmov.u8 r4, d0[6] +; CHECK-NEXT: vmov.8 d16[0], r0 +; CHECK-NEXT: vmov.u8 r0, d0[7] +; CHECK-NEXT: vmov.8 d16[1], r1 +; CHECK-NEXT: vmov.8 d16[2], r2 +; CHECK-NEXT: vmov.8 d16[3], r3 +; CHECK-NEXT: vmov.8 d16[4], r12 +; CHECK-NEXT: vmov.8 d16[5], lr +; CHECK-NEXT: vmov.8 d16[6], r4 +; CHECK-NEXT: vmov.8 d16[7], r0 +; CHECK-NEXT: vorr d17, d2, d2 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: pop {r4, pc} entry: %vecext = extractelement <8 x i8> %x, i32 0 %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 @@ -1570,8 +1616,25 @@ define <16 x i8> @test_concat_v16i8_v16i8_v8i8(<16 x i8> %x, <8 x i8> %y) #0 { ; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f64 d1, d2 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} +; CHECK-NEXT: vmov.u8 r3, d2[0] +; CHECK-NEXT: vmov.u8 r0, d2[1] +; CHECK-NEXT: vmov.u8 r1, d2[2] +; CHECK-NEXT: vmov.u8 r4, d2[3] +; CHECK-NEXT: vmov.u8 r5, d2[4] +; CHECK-NEXT: vmov.u8 r2, d2[5] +; CHECK-NEXT: vmov.u8 lr, d2[6] +; CHECK-NEXT: vmov.u8 r12, d2[7] +; CHECK-NEXT: vmov.8 d1[0], r3 +; CHECK-NEXT: vmov.8 d1[1], r0 +; CHECK-NEXT: vmov.8 d1[2], r1 +; CHECK-NEXT: vmov.8 d1[3], r4 +; CHECK-NEXT: vmov.8 d1[4], r5 +; CHECK-NEXT: vmov.8 d1[5], r2 +; CHECK-NEXT: vmov.8 d1[6], lr +; CHECK-NEXT: vmov.8 d1[7], r12 +; CHECK-NEXT: pop {r4, r5, r11, pc} entry: %vecext = extractelement <16 x i8> %x, i32 0 %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 @@ -1611,9 +1674,41 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 { ; CHECK-LABEL: test_concat_v16i8_v8i8_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 -; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vorr d16, d1, d1 +; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: vmov.u8 r1, d0[1] +; CHECK-NEXT: vmov.u8 r3, d0[2] +; CHECK-NEXT: vmov.u8 r6, d0[3] +; CHECK-NEXT: vmov.u8 r5, d0[4] +; 
CHECK-NEXT: vmov.u8 r0, d0[5] +; CHECK-NEXT: vmov.u8 r2, d0[6] +; CHECK-NEXT: vmov.u8 lr, d0[7] +; CHECK-NEXT: vmov.u8 r12, d16[5] +; CHECK-NEXT: vmov.u8 r4, d16[6] +; CHECK-NEXT: vmov.8 d0[1], r1 +; CHECK-NEXT: vmov.u8 r1, d16[1] +; CHECK-NEXT: vmov.8 d0[2], r3 +; CHECK-NEXT: vmov.u8 r3, d16[3] +; CHECK-NEXT: vmov.8 d0[3], r6 +; CHECK-NEXT: vmov.u8 r6, d16[0] +; CHECK-NEXT: vmov.8 d0[4], r5 +; CHECK-NEXT: vmov.u8 r5, d16[2] +; CHECK-NEXT: vmov.8 d0[5], r0 +; CHECK-NEXT: vmov.u8 r0, d16[4] +; CHECK-NEXT: vmov.8 d0[6], r2 +; CHECK-NEXT: vmov.u8 r2, d16[7] +; CHECK-NEXT: vmov.8 d0[7], lr +; CHECK-NEXT: vmov.8 d1[0], r6 +; CHECK-NEXT: vmov.8 d1[1], r1 +; CHECK-NEXT: vmov.8 d1[2], r5 +; CHECK-NEXT: vmov.8 d1[3], r3 +; CHECK-NEXT: vmov.8 d1[4], r0 +; CHECK-NEXT: vmov.8 d1[5], r12 +; CHECK-NEXT: vmov.8 d1[6], r4 +; CHECK-NEXT: vmov.8 d1[7], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %vecext = extractelement <8 x i8> %x, i32 0 %vecinit = insertelement <16 x i8> undef, i8 %vecext, i32 0 @@ -1664,6 +1759,14 @@ ; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: vmov.u16 r0, d0[0] +; CHECK-NEXT: vmov.u16 r1, d0[1] +; CHECK-NEXT: vmov.u16 r2, d0[2] +; CHECK-NEXT: vmov.u16 r3, d0[3] +; CHECK-NEXT: vmov.16 d0[0], r0 +; CHECK-NEXT: vmov.16 d0[1], r1 +; CHECK-NEXT: vmov.16 d0[2], r2 +; CHECK-NEXT: vmov.16 d0[3], r3 ; CHECK-NEXT: vmov.f64 d1, d2 ; CHECK-NEXT: bx lr entry: @@ -1682,7 +1785,14 @@ define <8 x i16> @test_concat_v8i16_v8i16_v4i16(<8 x i16> %x, <4 x i16> %y) #0 { ; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f64 d1, d2 +; CHECK-NEXT: vmov.u16 r0, d2[0] +; CHECK-NEXT: vmov.u16 r1, d2[1] +; CHECK-NEXT: vmov.u16 r2, d2[2] +; CHECK-NEXT: vmov.u16 r3, d2[3] +; CHECK-NEXT: vmov.16 d1[0], r0 +; CHECK-NEXT: vmov.16 d1[1], r1 +; CHECK-NEXT: vmov.16 d1[2], r2 +; CHECK-NEXT: vmov.16 d1[3], r3 ; CHECK-NEXT: bx lr entry: %vecext = extractelement <8 x i16> %x, i32 0 @@ -1707,9 +1817,25 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 { ; CHECK-LABEL: test_concat_v8i16_v4i16_v4i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 -; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vmov.u16 r2, d0[1] +; CHECK-NEXT: vorr d16, d0, d0 +; CHECK-NEXT: vmov.u16 r3, d0[2] +; CHECK-NEXT: vmov.u16 r0, d0[3] +; CHECK-NEXT: vmov.u16 r1, d1[0] +; CHECK-NEXT: vmov.u16 r4, d1[1] +; CHECK-NEXT: vmov.u16 lr, d1[2] +; CHECK-NEXT: vmov.u16 r12, d1[3] +; CHECK-NEXT: vmov.16 d16[1], r2 +; CHECK-NEXT: vmov.16 d16[2], r3 +; CHECK-NEXT: vmov.16 d16[3], r0 +; CHECK-NEXT: vmov.16 d17[0], r1 +; CHECK-NEXT: vmov.16 d17[1], r4 +; CHECK-NEXT: vmov.16 d17[2], lr +; CHECK-NEXT: vmov.16 d17[3], r12 +; CHECK-NEXT: vorr q0, q8, q8 +; CHECK-NEXT: pop {r4, pc} entry: %vecext = extractelement <4 x i16> %x, i32 0 %vecinit = insertelement <8 x i16> undef, i16 %vecext, i32 0 @@ -1744,7 +1870,10 @@ ; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: vmov.f64 d1, d2 +; CHECK-NEXT: vmov.32 r0, d0[1] +; CHECK-NEXT: vmov.32 d0[1], r0 +; CHECK-NEXT: vext.32 q8, q0, q0, #2 +; CHECK-NEXT: vext.32 q0, q8, q1, #2 ; CHECK-NEXT: bx lr entry: %vecext = extractelement <2 x i32> %x, i32 0 @@ -1758,7 +1887,10 @@ define <4 x i32> @test_concat_v4i32_v4i32_v2i32(<4 x i32> %x, <2 
x i32> %y) #0 { ; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f64 d1, d2 +; CHECK-NEXT: vmov.32 r0, d2[0] +; CHECK-NEXT: vmov.32 r1, d2[1] +; CHECK-NEXT: vmov.32 d1[0], r0 +; CHECK-NEXT: vmov.32 d1[1], r1 ; CHECK-NEXT: bx lr entry: %vecext = extractelement <4 x i32> %x, i32 0 @@ -1809,7 +1941,10 @@ define <2 x i64> @test_concat_v2i64_v2i64_v1i64(<2 x i64> %x, <1 x i64> %y) #0 { ; CHECK-LABEL: test_concat_v2i64_v2i64_v1i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f64 d1, d2 +; CHECK-NEXT: vmov.32 r0, d2[0] +; CHECK-NEXT: vmov.32 r1, d2[1] +; CHECK-NEXT: vmov.32 d1[0], r0 +; CHECK-NEXT: vmov.32 d1[1], r1 ; CHECK-NEXT: bx lr entry: %vecext = extractelement <2 x i64> %x, i32 0 @@ -1822,8 +1957,12 @@ define <2 x i64> @test_concat_v2i64_v1i64_v1i64(<1 x i64> %x, <1 x i64> %y) #0 { ; CHECK-LABEL: test_concat_v2i64_v1i64_v1i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 -; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 +; CHECK-NEXT: vmov.32 r0, d1[0] +; CHECK-NEXT: vorr d16, d0, d0 +; CHECK-NEXT: vmov.32 r1, d1[1] +; CHECK-NEXT: vmov.32 d17[0], r0 +; CHECK-NEXT: vmov.32 d17[1], r1 +; CHECK-NEXT: vorr q0, q8, q8 ; CHECK-NEXT: bx lr entry: %vecext = extractelement <1 x i64> %x, i32 0 diff --git a/llvm/test/CodeGen/ARM/neon_vabs.ll b/llvm/test/CodeGen/ARM/neon_vabs.ll --- a/llvm/test/CodeGen/ARM/neon_vabs.ll +++ b/llvm/test/CodeGen/ARM/neon_vabs.ll @@ -149,7 +149,10 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov d16, r2, r3 ; CHECK-NEXT: vmov d17, r0, r1 -; CHECK-NEXT: vabdl.u16 q8, d17, d16 +; CHECK-NEXT: vsubl.u16 q8, d17, d16 +; CHECK-NEXT: vshr.s32 q9, q8, #31 +; CHECK-NEXT: vsra.s32 q8, q8, #31 +; CHECK-NEXT: veor q8, q9, q8 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -166,7 +169,10 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov d16, r2, r3 ; CHECK-NEXT: vmov d17, r0, r1 -; CHECK-NEXT: vabdl.u8 q8, d17, d16 +; CHECK-NEXT: vsubl.u8 q8, d17, d16 +; CHECK-NEXT: vshr.s16 q9, q8, #15 +; CHECK-NEXT: vsra.s16 q8, q8, #15 +; CHECK-NEXT: veor q8, q9, q8 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr diff --git a/llvm/test/CodeGen/ARM/pr35103.ll b/llvm/test/CodeGen/ARM/pr35103.ll --- a/llvm/test/CodeGen/ARM/pr35103.ll +++ b/llvm/test/CodeGen/ARM/pr35103.ll @@ -6,15 +6,16 @@ ; CHECK-LABEL: foo: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: adds r2, r2, r0 +; CHECK-NEXT: adds lr, r2, r0 ; CHECK-NEXT: mov r12, #0 -; CHECK-NEXT: adc lr, r12, #0 +; CHECK-NEXT: adds lr, lr, r0 +; CHECK-NEXT: adc r12, r12, #0 ; CHECK-NEXT: adds r0, r2, r0 ; CHECK-NEXT: ldr r2, [sp, #8] ; CHECK-NEXT: adc r0, r12, #0 ; CHECK-NEXT: adds r1, r3, r1 ; CHECK-NEXT: adcs r1, r2, #0 -; CHECK-NEXT: adc r0, r0, lr +; CHECK-NEXT: adc r0, r0, #0 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr entry: diff --git a/llvm/test/CodeGen/ARM/rev.ll b/llvm/test/CodeGen/ARM/rev.ll --- a/llvm/test/CodeGen/ARM/rev.ll +++ b/llvm/test/CodeGen/ARM/rev.ll @@ -4,10 +4,37 @@ ; RUN: llc -mtriple=thumbv7m-none-eabi %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-V7 define i32 @test1(i32 %X) nounwind { -; CHECK-LABEL: test1: -; CHECK: @ %bb.0: -; CHECK-NEXT: rev16 r0, r0 -; CHECK-NEXT: bx lr +; CHECK-ARM-LABEL: test1: +; CHECK-ARM: @ %bb.0: +; CHECK-ARM-NEXT: mov r1, #16711680 +; CHECK-ARM-NEXT: and r2, r0, #16711680 +; CHECK-ARM-NEXT: and r1, r1, r0, lsr #8 +; CHECK-ARM-NEXT: orr r1, r1, r2, lsl #8 +; CHECK-ARM-NEXT: lsl r2, r0, #8 +; 
CHECK-ARM-NEXT: lsr r0, r0, #8 +; CHECK-ARM-NEXT: uxth r2, r2 +; CHECK-ARM-NEXT: orr r1, r1, r2 +; CHECK-ARM-NEXT: uxtb r0, r0 +; CHECK-ARM-NEXT: orr r0, r1, r0 +; CHECK-ARM-NEXT: bx lr +; +; CHECK-V6-LABEL: test1: +; CHECK-V6: @ %bb.0: +; CHECK-V6-NEXT: rev16 r0, r0 +; CHECK-V6-NEXT: bx lr +; +; CHECK-V7-LABEL: test1: +; CHECK-V7: @ %bb.0: +; CHECK-V7-NEXT: mov.w r1, #16711680 +; CHECK-V7-NEXT: and r2, r0, #16711680 +; CHECK-V7-NEXT: and.w r1, r1, r0, lsr #8 +; CHECK-V7-NEXT: orr.w r1, r1, r2, lsl #8 +; CHECK-V7-NEXT: lsls r2, r0, #8 +; CHECK-V7-NEXT: uxth r2, r2 +; CHECK-V7-NEXT: add r1, r2 +; CHECK-V7-NEXT: ubfx r0, r0, #8, #8 +; CHECK-V7-NEXT: add r0, r1 +; CHECK-V7-NEXT: bx lr %tmp1 = lshr i32 %X, 8 %X15 = bitcast i32 %X to i32 %tmp4 = shl i32 %X15, 8 @@ -51,10 +78,30 @@ declare i16 @llvm.bswap.i16(i16) nounwind readnone define i32 @test4(i16 zeroext %a) nounwind { -; CHECK-LABEL: test4: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: revsh r0, r0 -; CHECK-NEXT: bx lr +; CHECK-ARM-LABEL: test4: +; CHECK-ARM: @ %bb.0: @ %entry +; CHECK-ARM-NEXT: lsl r1, r0, #8 +; CHECK-ARM-NEXT: uxtb16 r1, r1 +; CHECK-ARM-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-ARM-NEXT: asr r0, r0, #16 +; CHECK-ARM-NEXT: bx lr +; +; CHECK-V6-LABEL: test4: +; CHECK-V6: @ %bb.0: @ %entry +; CHECK-V6-NEXT: lsls r1, r0, #24 +; CHECK-V6-NEXT: lsrs r0, r0, #8 +; CHECK-V6-NEXT: lsls r0, r0, #16 +; CHECK-V6-NEXT: adds r0, r1, r0 +; CHECK-V6-NEXT: asrs r0, r0, #16 +; CHECK-V6-NEXT: bx lr +; +; CHECK-V7-LABEL: test4: +; CHECK-V7: @ %bb.0: @ %entry +; CHECK-V7-NEXT: mov.w r1, #16711680 +; CHECK-V7-NEXT: and.w r1, r1, r0, lsl #8 +; CHECK-V7-NEXT: orr.w r0, r1, r0, lsl #24 +; CHECK-V7-NEXT: asrs r0, r0, #16 +; CHECK-V7-NEXT: bx lr entry: %conv = zext i16 %a to i32 %shr9 = lshr i16 %a, 8 @@ -97,10 +144,37 @@ ; rdar://9609108 define i32 @test6(i32 %x) nounwind readnone { -; CHECK-LABEL: test6: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: rev16 r0, r0 -; CHECK-NEXT: bx lr +; CHECK-ARM-LABEL: test6: +; CHECK-ARM: @ %bb.0: @ %entry +; CHECK-ARM-NEXT: mov r1, #16711680 +; CHECK-ARM-NEXT: and r2, r0, #16711680 +; CHECK-ARM-NEXT: and r1, r1, r0, lsr #8 +; CHECK-ARM-NEXT: orr r1, r1, r2, lsl #8 +; CHECK-ARM-NEXT: lsr r2, r0, #8 +; CHECK-ARM-NEXT: lsl r0, r0, #8 +; CHECK-ARM-NEXT: uxtb r2, r2 +; CHECK-ARM-NEXT: orr r1, r1, r2 +; CHECK-ARM-NEXT: uxth r0, r0 +; CHECK-ARM-NEXT: orr r0, r1, r0 +; CHECK-ARM-NEXT: bx lr +; +; CHECK-V6-LABEL: test6: +; CHECK-V6: @ %bb.0: @ %entry +; CHECK-V6-NEXT: rev16 r0, r0 +; CHECK-V6-NEXT: bx lr +; +; CHECK-V7-LABEL: test6: +; CHECK-V7: @ %bb.0: @ %entry +; CHECK-V7-NEXT: mov.w r1, #16711680 +; CHECK-V7-NEXT: and r2, r0, #16711680 +; CHECK-V7-NEXT: and.w r1, r1, r0, lsr #8 +; CHECK-V7-NEXT: orr.w r1, r1, r2, lsl #8 +; CHECK-V7-NEXT: ubfx r2, r0, #8, #8 +; CHECK-V7-NEXT: lsls r0, r0, #8 +; CHECK-V7-NEXT: add r1, r2 +; CHECK-V7-NEXT: uxth r0, r0 +; CHECK-V7-NEXT: add r0, r1 +; CHECK-V7-NEXT: bx lr entry: %and = shl i32 %x, 8 %shl = and i32 %and, 65280 @@ -143,10 +217,32 @@ } define i32 @test8(i32 %a) nounwind readnone { -; CHECK-LABEL: test8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: revsh r0, r0 -; CHECK-NEXT: bx lr +; CHECK-ARM-LABEL: test8: +; CHECK-ARM: @ %bb.0: @ %entry +; CHECK-ARM-NEXT: lsl r1, r0, #8 +; CHECK-ARM-NEXT: uxtb16 r1, r1 +; CHECK-ARM-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-ARM-NEXT: asr r0, r0, #16 +; CHECK-ARM-NEXT: bx lr +; +; CHECK-V6-LABEL: test8: +; CHECK-V6: @ %bb.0: @ %entry +; CHECK-V6-NEXT: movs r1, #255 +; CHECK-V6-NEXT: lsls r1, r1, #16 +; CHECK-V6-NEXT: lsls r2, r0, #8 +; CHECK-V6-NEXT: 
ands r2, r1 +; CHECK-V6-NEXT: lsls r0, r0, #24 +; CHECK-V6-NEXT: adds r0, r0, r2 +; CHECK-V6-NEXT: asrs r0, r0, #16 +; CHECK-V6-NEXT: bx lr +; +; CHECK-V7-LABEL: test8: +; CHECK-V7: @ %bb.0: @ %entry +; CHECK-V7-NEXT: mov.w r1, #16711680 +; CHECK-V7-NEXT: and.w r1, r1, r0, lsl #8 +; CHECK-V7-NEXT: orr.w r0, r1, r0, lsl #24 +; CHECK-V7-NEXT: asrs r0, r0, #16 +; CHECK-V7-NEXT: bx lr entry: %and = lshr i32 %a, 8 %shr4 = and i32 %and, 255 diff --git a/llvm/test/CodeGen/ARM/rotate.ll b/llvm/test/CodeGen/ARM/rotate.ll --- a/llvm/test/CodeGen/ARM/rotate.ll +++ b/llvm/test/CodeGen/ARM/rotate.ll @@ -7,8 +7,8 @@ ; CHECK-LABEL: testcase: ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d16, d17}, [r0] -; CHECK-NEXT: vshr.u64 q9, q8, #8 -; CHECK-NEXT: vshl.i64 q8, q8, #56 +; CHECK-NEXT: vshl.i64 q9, q8, #56 +; CHECK-NEXT: vshr.u64 q8, q8, #8 ; CHECK-NEXT: vorr q0, q8, q9 ; CHECK-NEXT: bx lr %1 = load <2 x i64>, ptr %in diff --git a/llvm/test/CodeGen/ARM/sadd_sat.ll b/llvm/test/CodeGen/ARM/sadd_sat.ll --- a/llvm/test/CodeGen/ARM/sadd_sat.ll +++ b/llvm/test/CodeGen/ARM/sadd_sat.ll @@ -151,33 +151,34 @@ } define signext i16 @func16(i16 signext %x, i16 signext %y) nounwind { -; CHECK-T1-LABEL: func16: -; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: adds r0, r0, r1 -; CHECK-T1-NEXT: ldr r1, .LCPI2_0 -; CHECK-T1-NEXT: cmp r0, r1 -; CHECK-T1-NEXT: blt .LBB2_2 -; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB2_2: -; CHECK-T1-NEXT: ldr r1, .LCPI2_1 -; CHECK-T1-NEXT: cmp r0, r1 -; CHECK-T1-NEXT: bgt .LBB2_4 -; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB2_4: -; CHECK-T1-NEXT: bx lr -; CHECK-T1-NEXT: .p2align 2 -; CHECK-T1-NEXT: @ %bb.5: -; CHECK-T1-NEXT: .LCPI2_0: -; CHECK-T1-NEXT: .long 32767 @ 0x7fff -; CHECK-T1-NEXT: .LCPI2_1: -; CHECK-T1-NEXT: .long 4294934528 @ 0xffff8000 +; CHECK-T16-LABEL: func16: +; CHECK-T16: @ %bb.0: +; CHECK-T16-NEXT: adds r0, r0, r1 +; CHECK-T16-NEXT: ldr r1, .LCPI2_0 +; CHECK-T16-NEXT: cmp r0, r1 +; CHECK-T16-NEXT: blt .LBB2_2 +; CHECK-T16-NEXT: @ %bb.1: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB2_2: +; CHECK-T16-NEXT: ldr r1, .LCPI2_1 +; CHECK-T16-NEXT: cmp r0, r1 +; CHECK-T16-NEXT: bgt .LBB2_4 +; CHECK-T16-NEXT: @ %bb.3: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB2_4: +; CHECK-T16-NEXT: bx lr +; CHECK-T16-NEXT: .p2align 2 +; CHECK-T16-NEXT: @ %bb.5: +; CHECK-T16-NEXT: .LCPI2_0: +; CHECK-T16-NEXT: .long 32767 @ 0x7fff +; CHECK-T16-NEXT: .LCPI2_1: +; CHECK-T16-NEXT: .long 4294934528 @ 0xffff8000 ; ; CHECK-T2NODSP-LABEL: func16: ; CHECK-T2NODSP: @ %bb.0: ; CHECK-T2NODSP-NEXT: add r0, r1 ; CHECK-T2NODSP-NEXT: ssat r0, #16, r0 +; CHECK-T2NODSP-NEXT: sxth r0, r0 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func16: @@ -210,6 +211,29 @@ ; CHECK-ARMBASEDSP-NEXT: asr r0, r0, #16 ; CHECK-ARMBASEDSP-NEXT: bx lr ; +; CHECK-T15TE-LABEL: func16: +; CHECK-T15TE: @ %bb.0: +; CHECK-T15TE-NEXT: adds r0, r0, r1 +; CHECK-T15TE-NEXT: ldr r1, .LCPI2_0 +; CHECK-T15TE-NEXT: cmp r0, r1 +; CHECK-T15TE-NEXT: blt .LBB2_2 +; CHECK-T15TE-NEXT: @ %bb.1: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB2_2: +; CHECK-T15TE-NEXT: ldr r1, .LCPI2_1 +; CHECK-T15TE-NEXT: cmp r0, r1 +; CHECK-T15TE-NEXT: bgt .LBB2_4 +; CHECK-T15TE-NEXT: @ %bb.3: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB2_4: +; CHECK-T15TE-NEXT: bx lr +; CHECK-T15TE-NEXT: .p2align 2 +; CHECK-T15TE-NEXT: @ %bb.5: +; CHECK-T15TE-NEXT: .LCPI2_0: +; CHECK-T15TE-NEXT: .long 32767 @ 0x7fff +; CHECK-T15TE-NEXT: .LCPI2_1: +; CHECK-T15TE-NEXT: .long 
4294934528 @ 0xffff8000 +; ; CHECK-ARMDSP-LABEL: func16: ; CHECK-ARMDSP: @ %bb.0: ; CHECK-ARMDSP-NEXT: qadd16 r0, r0, r1 @@ -220,27 +244,28 @@ } define signext i8 @func8(i8 signext %x, i8 signext %y) nounwind { -; CHECK-T1-LABEL: func8: -; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: adds r0, r0, r1 -; CHECK-T1-NEXT: movs r1, #127 -; CHECK-T1-NEXT: cmp r0, #127 -; CHECK-T1-NEXT: blt .LBB3_2 -; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB3_2: -; CHECK-T1-NEXT: mvns r1, r1 -; CHECK-T1-NEXT: cmp r0, r1 -; CHECK-T1-NEXT: bgt .LBB3_4 -; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB3_4: -; CHECK-T1-NEXT: bx lr +; CHECK-T16-LABEL: func8: +; CHECK-T16: @ %bb.0: +; CHECK-T16-NEXT: adds r0, r0, r1 +; CHECK-T16-NEXT: movs r1, #127 +; CHECK-T16-NEXT: cmp r0, #127 +; CHECK-T16-NEXT: blt .LBB3_2 +; CHECK-T16-NEXT: @ %bb.1: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB3_2: +; CHECK-T16-NEXT: mvns r1, r1 +; CHECK-T16-NEXT: cmp r0, r1 +; CHECK-T16-NEXT: bgt .LBB3_4 +; CHECK-T16-NEXT: @ %bb.3: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB3_4: +; CHECK-T16-NEXT: bx lr ; ; CHECK-T2NODSP-LABEL: func8: ; CHECK-T2NODSP: @ %bb.0: ; CHECK-T2NODSP-NEXT: add r0, r1 ; CHECK-T2NODSP-NEXT: ssat r0, #8, r0 +; CHECK-T2NODSP-NEXT: sxtb r0, r0 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func8: @@ -266,6 +291,23 @@ ; CHECK-ARMBASEDSP-NEXT: asr r0, r0, #24 ; CHECK-ARMBASEDSP-NEXT: bx lr ; +; CHECK-T15TE-LABEL: func8: +; CHECK-T15TE: @ %bb.0: +; CHECK-T15TE-NEXT: adds r0, r0, r1 +; CHECK-T15TE-NEXT: movs r1, #127 +; CHECK-T15TE-NEXT: cmp r0, #127 +; CHECK-T15TE-NEXT: blt .LBB3_2 +; CHECK-T15TE-NEXT: @ %bb.1: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB3_2: +; CHECK-T15TE-NEXT: mvns r1, r1 +; CHECK-T15TE-NEXT: cmp r0, r1 +; CHECK-T15TE-NEXT: bgt .LBB3_4 +; CHECK-T15TE-NEXT: @ %bb.3: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB3_4: +; CHECK-T15TE-NEXT: bx lr +; ; CHECK-ARMDSP-LABEL: func8: ; CHECK-ARMDSP: @ %bb.0: ; CHECK-ARMDSP-NEXT: qadd8 r0, r0, r1 @@ -276,27 +318,28 @@ } define signext i4 @func3(i4 signext %x, i4 signext %y) nounwind { -; CHECK-T1-LABEL: func3: -; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: adds r0, r0, r1 -; CHECK-T1-NEXT: movs r1, #7 -; CHECK-T1-NEXT: cmp r0, #7 -; CHECK-T1-NEXT: blt .LBB4_2 -; CHECK-T1-NEXT: @ %bb.1: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB4_2: -; CHECK-T1-NEXT: mvns r1, r1 -; CHECK-T1-NEXT: cmp r0, r1 -; CHECK-T1-NEXT: bgt .LBB4_4 -; CHECK-T1-NEXT: @ %bb.3: -; CHECK-T1-NEXT: {{movs|mov}} r0, r1 -; CHECK-T1-NEXT: .LBB4_4: -; CHECK-T1-NEXT: bx lr +; CHECK-T16-LABEL: func3: +; CHECK-T16: @ %bb.0: +; CHECK-T16-NEXT: adds r0, r0, r1 +; CHECK-T16-NEXT: movs r1, #7 +; CHECK-T16-NEXT: cmp r0, #7 +; CHECK-T16-NEXT: blt .LBB4_2 +; CHECK-T16-NEXT: @ %bb.1: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB4_2: +; CHECK-T16-NEXT: mvns r1, r1 +; CHECK-T16-NEXT: cmp r0, r1 +; CHECK-T16-NEXT: bgt .LBB4_4 +; CHECK-T16-NEXT: @ %bb.3: +; CHECK-T16-NEXT: mov r0, r1 +; CHECK-T16-NEXT: .LBB4_4: +; CHECK-T16-NEXT: bx lr ; ; CHECK-T2NODSP-LABEL: func3: ; CHECK-T2NODSP: @ %bb.0: ; CHECK-T2NODSP-NEXT: add r0, r1 ; CHECK-T2NODSP-NEXT: ssat r0, #4, r0 +; CHECK-T2NODSP-NEXT: sbfx r0, r0, #0, #4 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func3: @@ -324,6 +367,23 @@ ; CHECK-ARMBASEDSP-NEXT: asr r0, r0, #28 ; CHECK-ARMBASEDSP-NEXT: bx lr ; +; CHECK-T15TE-LABEL: func3: +; CHECK-T15TE: @ %bb.0: +; CHECK-T15TE-NEXT: adds r0, r0, r1 +; CHECK-T15TE-NEXT: movs r1, #7 +; 
CHECK-T15TE-NEXT: cmp r0, #7 +; CHECK-T15TE-NEXT: blt .LBB4_2 +; CHECK-T15TE-NEXT: @ %bb.1: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB4_2: +; CHECK-T15TE-NEXT: mvns r1, r1 +; CHECK-T15TE-NEXT: cmp r0, r1 +; CHECK-T15TE-NEXT: bgt .LBB4_4 +; CHECK-T15TE-NEXT: @ %bb.3: +; CHECK-T15TE-NEXT: movs r0, r1 +; CHECK-T15TE-NEXT: .LBB4_4: +; CHECK-T15TE-NEXT: bx lr +; ; CHECK-ARMDSP-LABEL: func3: ; CHECK-ARMDSP: @ %bb.0: ; CHECK-ARMDSP-NEXT: lsl r0, r0, #28 diff --git a/llvm/test/CodeGen/ARM/sadd_sat_plus.ll b/llvm/test/CodeGen/ARM/sadd_sat_plus.ll --- a/llvm/test/CodeGen/ARM/sadd_sat_plus.ll +++ b/llvm/test/CodeGen/ARM/sadd_sat_plus.ll @@ -55,7 +55,8 @@ ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: .save {r4, lr} ; CHECK-T1-NEXT: push {r4, lr} -; CHECK-T1-NEXT: ldr r3, [sp, #12] +; CHECK-T1-NEXT: add r2, sp, #8 +; CHECK-T1-NEXT: ldr r3, [r2, #4] ; CHECK-T1-NEXT: mov r2, r1 ; CHECK-T1-NEXT: eors r2, r3 ; CHECK-T1-NEXT: ldr r4, [sp, #8] @@ -84,11 +85,10 @@ ; ; CHECK-T2-LABEL: func64: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: ldr r2, [sp] -; CHECK-T2-NEXT: ldr.w r12, [sp, #4] -; CHECK-T2-NEXT: adds r0, r0, r2 -; CHECK-T2-NEXT: adc.w r2, r1, r12 -; CHECK-T2-NEXT: eor.w r3, r1, r12 +; CHECK-T2-NEXT: ldrd r12, r2, [sp] +; CHECK-T2-NEXT: eor.w r3, r1, r2 +; CHECK-T2-NEXT: adds.w r0, r0, r12 +; CHECK-T2-NEXT: adcs r2, r1 ; CHECK-T2-NEXT: eors r1, r2 ; CHECK-T2-NEXT: bics r1, r3 ; CHECK-T2-NEXT: it mi @@ -150,6 +150,7 @@ ; CHECK-T2NODSP-NEXT: sxth r1, r1 ; CHECK-T2NODSP-NEXT: add r0, r1 ; CHECK-T2NODSP-NEXT: ssat r0, #16, r0 +; CHECK-T2NODSP-NEXT: sxth r0, r0 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func16: @@ -196,6 +197,7 @@ ; CHECK-T2NODSP-NEXT: sxtb r1, r1 ; CHECK-T2NODSP-NEXT: add r0, r1 ; CHECK-T2NODSP-NEXT: ssat r0, #8, r0 +; CHECK-T2NODSP-NEXT: sxtb r0, r0 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func8: @@ -243,6 +245,7 @@ ; CHECK-T2NODSP-NEXT: lsls r1, r1, #28 ; CHECK-T2NODSP-NEXT: add.w r0, r0, r1, asr #28 ; CHECK-T2NODSP-NEXT: ssat r0, #4, r0 +; CHECK-T2NODSP-NEXT: sbfx r0, r0, #0, #4 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func4: diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll --- a/llvm/test/CodeGen/ARM/select-imm.ll +++ b/llvm/test/CodeGen/ARM/select-imm.ll @@ -430,10 +430,11 @@ ; ARM-NEXT: ldrsb r4, [r0] ; ARM-NEXT: mov r0, #1 ; ARM-NEXT: bl f -; ARM-NEXT: and r0, r4, #255 -; ARM-NEXT: cmp r0, r0 +; ARM-NEXT: mov r0, #0 +; ARM-NEXT: cmp r0, #0 ; ARM-NEXT: bne .LBB8_3 ; ARM-NEXT: @ %bb.1: @ %while.body.preheader +; ARM-NEXT: and r0, r4, #255 ; ARM-NEXT: add r1, r4, #1 ; ARM-NEXT: mov r2, r0 ; ARM-NEXT: .LBB8_2: @ %while.body @@ -454,10 +455,11 @@ ; ARMT2-NEXT: ldrsb r4, [r0] ; ARMT2-NEXT: mov r0, #1 ; ARMT2-NEXT: bl f -; ARMT2-NEXT: uxtb r0, r4 -; ARMT2-NEXT: cmp r0, r0 +; ARMT2-NEXT: mov r0, #0 +; ARMT2-NEXT: cmp r0, #0 ; ARMT2-NEXT: popne {r4, pc} ; ARMT2-NEXT: .LBB8_1: @ %while.body.preheader +; ARMT2-NEXT: uxtb r0, r4 ; ARMT2-NEXT: add r1, r4, #1 ; ARMT2-NEXT: mov r2, r0 ; ARMT2-NEXT: .LBB8_2: @ %while.body @@ -472,16 +474,16 @@ ; ; THUMB1-LABEL: t9: ; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: .save {r4, lr} -; THUMB1-NEXT: push {r4, lr} -; THUMB1-NEXT: movs r1, #0 -; THUMB1-NEXT: ldrsb r4, [r0, r1] +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: movs r5, #0 +; THUMB1-NEXT: ldrsb r4, [r0, r5] ; THUMB1-NEXT: movs r0, #1 ; THUMB1-NEXT: bl f -; THUMB1-NEXT: uxtb r0, r4 -; THUMB1-NEXT: cmp r0, r0 +; THUMB1-NEXT: cmp r5, #0 ; THUMB1-NEXT: bne .LBB8_3 ; THUMB1-NEXT: @ %bb.1: @ 
%while.body.preheader +; THUMB1-NEXT: uxtb r0, r4 ; THUMB1-NEXT: adds r1, r4, #1 ; THUMB1-NEXT: mov r2, r0 ; THUMB1-NEXT: .LBB8_2: @ %while.body @@ -492,7 +494,7 @@ ; THUMB1-NEXT: cmp r3, r0 ; THUMB1-NEXT: blt .LBB8_2 ; THUMB1-NEXT: .LBB8_3: @ %while.end -; THUMB1-NEXT: pop {r4, pc} +; THUMB1-NEXT: pop {r4, r5, r7, pc} ; ; THUMB2-LABEL: t9: ; THUMB2: @ %bb.0: @ %entry @@ -501,11 +503,12 @@ ; THUMB2-NEXT: ldrsb.w r4, [r0] ; THUMB2-NEXT: movs r0, #1 ; THUMB2-NEXT: bl f -; THUMB2-NEXT: uxtb r0, r4 -; THUMB2-NEXT: cmp r0, r0 +; THUMB2-NEXT: movs r0, #0 +; THUMB2-NEXT: cmp r0, #0 ; THUMB2-NEXT: it ne ; THUMB2-NEXT: popne {r4, pc} ; THUMB2-NEXT: .LBB8_1: @ %while.body.preheader +; THUMB2-NEXT: uxtb r0, r4 ; THUMB2-NEXT: adds r1, r4, #1 ; THUMB2-NEXT: mov r2, r0 ; THUMB2-NEXT: .LBB8_2: @ %while.body @@ -520,16 +523,15 @@ ; ; V8MBASE-LABEL: t9: ; V8MBASE: @ %bb.0: @ %entry -; V8MBASE-NEXT: .save {r4, lr} -; V8MBASE-NEXT: push {r4, lr} -; V8MBASE-NEXT: movs r1, #0 -; V8MBASE-NEXT: ldrsb r4, [r0, r1] +; V8MBASE-NEXT: .save {r4, r5, r7, lr} +; V8MBASE-NEXT: push {r4, r5, r7, lr} +; V8MBASE-NEXT: movs r5, #0 +; V8MBASE-NEXT: ldrsb r4, [r0, r5] ; V8MBASE-NEXT: movs r0, #1 ; V8MBASE-NEXT: bl f -; V8MBASE-NEXT: uxtb r0, r4 -; V8MBASE-NEXT: cmp r0, r0 -; V8MBASE-NEXT: bne .LBB8_3 +; V8MBASE-NEXT: cbnz r5, .LBB8_3 ; V8MBASE-NEXT: @ %bb.1: @ %while.body.preheader +; V8MBASE-NEXT: uxtb r0, r4 ; V8MBASE-NEXT: adds r1, r4, #1 ; V8MBASE-NEXT: mov r2, r0 ; V8MBASE-NEXT: .LBB8_2: @ %while.body @@ -540,7 +542,7 @@ ; V8MBASE-NEXT: cmp r3, r0 ; V8MBASE-NEXT: blt .LBB8_2 ; V8MBASE-NEXT: .LBB8_3: @ %while.end -; V8MBASE-NEXT: pop {r4, pc} +; V8MBASE-NEXT: pop {r4, r5, r7, pc} entry: %0 = load i8, ptr %a %conv = sext i8 %0 to i32 @@ -569,44 +571,27 @@ define i1 @t10() { ; ARM-LABEL: t10: ; ARM: @ %bb.0: @ %entry -; ARM-NEXT: .save {r11, lr} -; ARM-NEXT: push {r11, lr} ; ARM-NEXT: .pad #8 ; ARM-NEXT: sub sp, sp, #8 -; ARM-NEXT: mvn r0, #2 -; ARM-NEXT: mvn r1, #7 -; ARM-NEXT: str r0, [sp, #4] ; ARM-NEXT: mvn r0, #7 ; ARM-NEXT: str r0, [sp] ; ARM-NEXT: mvn r0, #2 -; ARM-NEXT: bl __aeabi_idivmod -; ARM-NEXT: sub r0, r1, r0, lsl #3 -; ARM-NEXT: add r0, r0, #3 -; ARM-NEXT: rsbs r1, r0, #0 -; ARM-NEXT: adc r0, r0, r1 +; ARM-NEXT: str r0, [sp, #4] +; ARM-NEXT: mov r0, #1 ; ARM-NEXT: add sp, sp, #8 -; ARM-NEXT: pop {r11, lr} ; ARM-NEXT: mov pc, lr ; ; ARMT2-LABEL: t10: ; ARMT2: @ %bb.0: @ %entry -; ARMT2-NEXT: .save {r11, lr} -; ARMT2-NEXT: push {r11, lr} ; ARMT2-NEXT: .pad #8 ; ARMT2-NEXT: sub sp, sp, #8 -; ARMT2-NEXT: mvn r0, #2 -; ARMT2-NEXT: str r0, [sp, #4] ; ARMT2-NEXT: mvn r0, #7 ; ARMT2-NEXT: str r0, [sp] ; ARMT2-NEXT: mvn r0, #2 -; ARMT2-NEXT: mvn r1, #7 -; ARMT2-NEXT: bl __aeabi_idivmod -; ARMT2-NEXT: sub r0, r1, r0, lsl #3 -; ARMT2-NEXT: add r0, r0, #3 -; ARMT2-NEXT: clz r0, r0 -; ARMT2-NEXT: lsr r0, r0, #5 +; ARMT2-NEXT: str r0, [sp, #4] +; ARMT2-NEXT: mov r0, #1 ; ARMT2-NEXT: add sp, sp, #8 -; ARMT2-NEXT: pop {r11, pc} +; ARMT2-NEXT: bx lr ; ; THUMB1-LABEL: t10: ; THUMB1: @ %bb.0: @ %entry @@ -632,23 +617,15 @@ ; ; THUMB2-LABEL: t10: ; THUMB2: @ %bb.0: @ %entry -; THUMB2-NEXT: .save {r7, lr} -; THUMB2-NEXT: push {r7, lr} ; THUMB2-NEXT: .pad #8 ; THUMB2-NEXT: sub sp, #8 -; THUMB2-NEXT: mvn r0, #2 -; THUMB2-NEXT: str r0, [sp, #4] ; THUMB2-NEXT: mvn r0, #7 ; THUMB2-NEXT: str r0, [sp] ; THUMB2-NEXT: mvn r0, #2 -; THUMB2-NEXT: mvn r1, #7 -; THUMB2-NEXT: bl __aeabi_idivmod -; THUMB2-NEXT: sub.w r0, r1, r0, lsl #3 -; THUMB2-NEXT: adds r0, #3 -; THUMB2-NEXT: clz r0, r0 -; THUMB2-NEXT: lsrs r0, r0, #5 +; THUMB2-NEXT: str 
r0, [sp, #4] +; THUMB2-NEXT: movs r0, #1 ; THUMB2-NEXT: add sp, #8 -; THUMB2-NEXT: pop {r7, pc} +; THUMB2-NEXT: bx lr ; ; V8MBASE-LABEL: t10: ; V8MBASE: @ %bb.0: @ %entry @@ -710,13 +687,11 @@ ; ARMT2-NEXT: mov r0, #10 ; ARMT2-NEXT: bfi r1, r0, #12, #13 ; ARMT2-NEXT: mov r0, r1 +; ARMT2-NEXT: bfc r1, #0, #12 ; ARMT2-NEXT: bfc r0, #12, #20 ; ARMT2-NEXT: umull r2, r3, r0, r2 ; ARMT2-NEXT: add r2, r3, r3, lsl #2 ; ARMT2-NEXT: sub r0, r0, r2, lsl #1 -; ARMT2-NEXT: movw r2, #40960 -; ARMT2-NEXT: movt r2, #65024 -; ARMT2-NEXT: and r1, r1, r2 ; ARMT2-NEXT: orr r0, r1, r0 ; ARMT2-NEXT: str r0, [sp] ; ARMT2-NEXT: bfc r0, #12, #20 @@ -728,36 +703,32 @@ ; ; THUMB1-LABEL: t11: ; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: .save {r4, r5, r7, lr} -; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} ; THUMB1-NEXT: .pad #8 ; THUMB1-NEXT: sub sp, #8 -; THUMB1-NEXT: movs r4, #33 -; THUMB1-NEXT: ldr r0, [sp, #4] -; THUMB1-NEXT: orrs r0, r4 -; THUMB1-NEXT: ldr r1, .LCPI10_0 -; THUMB1-NEXT: ands r1, r0 -; THUMB1-NEXT: movs r0, #5 -; THUMB1-NEXT: lsls r0, r0, #13 -; THUMB1-NEXT: adds r5, r1, r0 +; THUMB1-NEXT: movs r0, #127 +; THUMB1-NEXT: lsls r0, r0, #25 +; THUMB1-NEXT: ldr r4, [sp, #4] +; THUMB1-NEXT: ands r4, r0 +; THUMB1-NEXT: movs r0, #33 ; THUMB1-NEXT: movs r1, #10 -; THUMB1-NEXT: mov r0, r4 ; THUMB1-NEXT: bl __aeabi_uidivmod -; THUMB1-NEXT: bics r5, r4 -; THUMB1-NEXT: orrs r5, r1 -; THUMB1-NEXT: str r5, [sp, #4] -; THUMB1-NEXT: ldr r0, .LCPI10_1 -; THUMB1-NEXT: ands r0, r5 -; THUMB1-NEXT: subs r1, r0, #3 +; THUMB1-NEXT: orrs r1, r4 +; THUMB1-NEXT: movs r0, #5 +; THUMB1-NEXT: lsls r0, r0, #13 +; THUMB1-NEXT: orrs r0, r1 +; THUMB1-NEXT: str r0, [sp, #4] +; THUMB1-NEXT: ldr r1, .LCPI10_0 +; THUMB1-NEXT: ands r1, r0 +; THUMB1-NEXT: subs r1, r1, #3 ; THUMB1-NEXT: rsbs r0, r1, #0 ; THUMB1-NEXT: adcs r0, r1 ; THUMB1-NEXT: add sp, #8 -; THUMB1-NEXT: pop {r4, r5, r7, pc} +; THUMB1-NEXT: pop {r4, pc} ; THUMB1-NEXT: .p2align 2 ; THUMB1-NEXT: @ %bb.1: ; THUMB1-NEXT: .LCPI10_0: -; THUMB1-NEXT: .long 4261412897 @ 0xfe000021 -; THUMB1-NEXT: .LCPI10_1: ; THUMB1-NEXT: .long 4095 @ 0xfff ; ; THUMB2-LABEL: t11: @@ -773,12 +744,10 @@ ; THUMB2-NEXT: mov r0, r1 ; THUMB2-NEXT: movt r2, #6553 ; THUMB2-NEXT: bfc r0, #12, #20 +; THUMB2-NEXT: bfc r1, #0, #12 ; THUMB2-NEXT: umull r2, r3, r0, r2 ; THUMB2-NEXT: add.w r2, r3, r3, lsl #2 ; THUMB2-NEXT: sub.w r0, r0, r2, lsl #1 -; THUMB2-NEXT: movw r2, #40960 -; THUMB2-NEXT: movt r2, #65024 -; THUMB2-NEXT: ands r1, r2 ; THUMB2-NEXT: orrs r0, r1 ; THUMB2-NEXT: str r0, [sp] ; THUMB2-NEXT: bfc r0, #12, #20 @@ -799,9 +768,10 @@ ; V8MBASE-NEXT: movw r0, #40963 ; V8MBASE-NEXT: adds r0, r1, r0 ; V8MBASE-NEXT: str r0, [sp] -; V8MBASE-NEXT: movs r1, #0 -; V8MBASE-NEXT: rsbs r0, r1, #0 -; V8MBASE-NEXT: adcs r0, r1 +; V8MBASE-NEXT: movs r0, #0 +; V8MBASE-NEXT: movs r1, #1 +; V8MBASE-NEXT: subs r1, r1, #1 +; V8MBASE-NEXT: adcs r0, r0 ; V8MBASE-NEXT: add sp, #4 ; V8MBASE-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/ARM/select_xform.ll b/llvm/test/CodeGen/ARM/select_xform.ll --- a/llvm/test/CodeGen/ARM/select_xform.ll +++ b/llvm/test/CodeGen/ARM/select_xform.ll @@ -6,18 +6,19 @@ define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind { ; ARM-LABEL: t1: ; ARM: @ %bb.0: -; ARM-NEXT: mov r0, r1 +; ARM-NEXT: mvn r0, #-2147483648 ; ARM-NEXT: cmp r2, #10 -; ARM-NEXT: suble r0, r0, #-2147483647 +; ARM-NEXT: movwgt r0, #0 +; ARM-NEXT: add r0, r0, r1 ; ARM-NEXT: bx lr ; ; T2-LABEL: t1: ; T2: @ %bb.0: -; T2-NEXT: mov r0, r1 -; T2-NEXT: mvn r1, #-2147483648 +; T2-NEXT: 
mvn r0, #-2147483648 ; T2-NEXT: cmp r2, #10 -; T2-NEXT: it le -; T2-NEXT: addle r0, r1 +; T2-NEXT: it gt +; T2-NEXT: movgt r0, #0 +; T2-NEXT: add r0, r1 ; T2-NEXT: bx lr %tmp1 = icmp sgt i32 %c, 10 %tmp2 = select i1 %tmp1, i32 0, i32 2147483647 @@ -28,17 +29,19 @@ define i32 @t2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; ARM-LABEL: t2: ; ARM: @ %bb.0: -; ARM-NEXT: mov r0, r1 +; ARM-NEXT: mov r0, #10 ; ARM-NEXT: cmp r2, #10 -; ARM-NEXT: suble r0, r0, #10 +; ARM-NEXT: movwgt r0, #0 +; ARM-NEXT: sub r0, r1, r0 ; ARM-NEXT: bx lr ; ; T2-LABEL: t2: ; T2: @ %bb.0: -; T2-NEXT: mov r0, r1 +; T2-NEXT: movs r0, #10 ; T2-NEXT: cmp r2, #10 -; T2-NEXT: it le -; T2-NEXT: suble r0, #10 +; T2-NEXT: it gt +; T2-NEXT: movgt r0, #0 +; T2-NEXT: subs r0, r1, r0 ; T2-NEXT: bx lr %tmp1 = icmp sgt i32 %c, 10 %tmp2 = select i1 %tmp1, i32 0, i32 10 @@ -50,16 +53,16 @@ ; ARM-LABEL: t3: ; ARM: @ %bb.0: ; ARM-NEXT: cmp r0, r1 -; ARM-NEXT: andge r3, r3, r2 -; ARM-NEXT: mov r0, r3 +; ARM-NEXT: mvnlt r2, #0 +; ARM-NEXT: and r0, r2, r3 ; ARM-NEXT: bx lr ; ; T2-LABEL: t3: ; T2: @ %bb.0: ; T2-NEXT: cmp r0, r1 -; T2-NEXT: it ge -; T2-NEXT: andge r3, r2 -; T2-NEXT: mov r0, r3 +; T2-NEXT: it lt +; T2-NEXT: movlt.w r2, #-1 +; T2-NEXT: and.w r0, r2, r3 ; T2-NEXT: bx lr %cond = icmp slt i32 %a, %b %z = select i1 %cond, i32 -1, i32 %x @@ -71,16 +74,16 @@ ; ARM-LABEL: t4: ; ARM: @ %bb.0: ; ARM-NEXT: cmp r0, r1 -; ARM-NEXT: orrge r3, r3, r2 -; ARM-NEXT: mov r0, r3 +; ARM-NEXT: movwlt r2, #0 +; ARM-NEXT: orr r0, r2, r3 ; ARM-NEXT: bx lr ; ; T2-LABEL: t4: ; T2: @ %bb.0: ; T2-NEXT: cmp r0, r1 -; T2-NEXT: it ge -; T2-NEXT: orrge r3, r2 -; T2-NEXT: mov r0, r3 +; T2-NEXT: it lt +; T2-NEXT: movlt r2, #0 +; T2-NEXT: orr.w r0, r2, r3 ; T2-NEXT: bx lr %cond = icmp slt i32 %a, %b %z = select i1 %cond, i32 0, i32 %x @@ -114,16 +117,16 @@ ; ARM-LABEL: t6: ; ARM: @ %bb.0: ; ARM-NEXT: cmp r0, r1 -; ARM-NEXT: eorlt r3, r3, r2 -; ARM-NEXT: mov r0, r3 +; ARM-NEXT: movge r2, #0 +; ARM-NEXT: eor r0, r2, r3 ; ARM-NEXT: bx lr ; ; T2-LABEL: t6: ; T2: @ %bb.0: ; T2-NEXT: cmp r0, r1 -; T2-NEXT: it lt -; T2-NEXT: eorlt r3, r2 -; T2-NEXT: mov r0, r3 +; T2-NEXT: it ge +; T2-NEXT: movge r2, #0 +; T2-NEXT: eor.w r0, r2, r3 ; T2-NEXT: bx lr %cond = icmp slt i32 %a, %b %tmp1 = select i1 %cond, i32 %c, i32 0 @@ -134,17 +137,19 @@ define i32 @t7(i32 %a, i32 %b, i32 %c) nounwind { ; ARM-LABEL: t7: ; ARM: @ %bb.0: @ %entry +; ARM-NEXT: mvn r3, #0 ; ARM-NEXT: cmp r0, r1 -; ARM-NEXT: andeq r2, r2, r2, lsl #1 -; ARM-NEXT: mov r0, r2 +; ARM-NEXT: lsleq r3, r2, #1 +; ARM-NEXT: and r0, r2, r3 ; ARM-NEXT: bx lr ; ; T2-LABEL: t7: ; T2: @ %bb.0: @ %entry +; T2-NEXT: mov.w r3, #-1 ; T2-NEXT: cmp r0, r1 ; T2-NEXT: it eq -; T2-NEXT: andeq.w r2, r2, r2, lsl #1 -; T2-NEXT: mov r0, r2 +; T2-NEXT: lsleq r3, r2, #1 +; T2-NEXT: and.w r0, r2, r3 ; T2-NEXT: bx lr entry: %tmp1 = shl i32 %c, 1 @@ -316,15 +321,19 @@ define i32 @t14(i32 %c, i32 %a) nounwind readnone ssp { ; ARM-LABEL: t14: ; ARM: @ %bb.0: @ %entry +; ARM-NEXT: mov r2, #0 ; ARM-NEXT: cmp r1, #10 -; ARM-NEXT: subgt r0, r0, #1 +; ARM-NEXT: mvngt r2, #0 +; ARM-NEXT: add r0, r2, r0 ; ARM-NEXT: bx lr ; ; T2-LABEL: t14: ; T2: @ %bb.0: @ %entry +; T2-NEXT: movs r2, #0 ; T2-NEXT: cmp r1, #10 ; T2-NEXT: it gt -; T2-NEXT: subgt r0, #1 +; T2-NEXT: movgt.w r2, #-1 +; T2-NEXT: add r0, r2 ; T2-NEXT: bx lr entry: %cmp = icmp sgt i32 %a, 10 @@ -337,19 +346,19 @@ define i32 @t15(i32 %p) { ; ARM-LABEL: t15: ; ARM: @ %bb.0: @ %entry -; ARM-NEXT: mov r1, #3 +; ARM-NEXT: mov r1, #2 ; ARM-NEXT: cmp r0, #8 -; ARM-NEXT: movwgt r1, #0 -; ARM-NEXT: 
mov r0, r1 +; ARM-NEXT: movwgt r1, #1 +; ARM-NEXT: eor r0, r1, #1 ; ARM-NEXT: bx lr ; ; T2-LABEL: t15: ; T2: @ %bb.0: @ %entry -; T2-NEXT: movs r1, #3 +; T2-NEXT: movs r1, #2 ; T2-NEXT: cmp r0, #8 ; T2-NEXT: it gt -; T2-NEXT: movgt r1, #0 -; T2-NEXT: mov r0, r1 +; T2-NEXT: movgt r1, #1 +; T2-NEXT: eor r0, r1, #1 ; T2-NEXT: bx lr entry: %cmp = icmp sgt i32 %p, 8 diff --git a/llvm/test/CodeGen/ARM/shift-combine.ll b/llvm/test/CodeGen/ARM/shift-combine.ll --- a/llvm/test/CodeGen/ARM/shift-combine.ll +++ b/llvm/test/CodeGen/ARM/shift-combine.ll @@ -763,21 +763,40 @@ } define arm_aapcscc void @test_sext_shift8_mask8(ptr %p, ptr %q) { -; CHECK-COMMON-LABEL: test_sext_shift8_mask8: -; CHECK-COMMON: @ %bb.0: @ %entry -; CHECK-COMMON-NEXT: ldrb r0, [r0, #1] -; CHECK-COMMON-NEXT: str r0, [r1] -; CHECK-COMMON-NEXT: bx lr +; CHECK-ARM-LABEL: test_sext_shift8_mask8: +; CHECK-ARM: @ %bb.0: @ %entry +; CHECK-ARM-NEXT: ldrsh r0, [r0] +; CHECK-ARM-NEXT: ubfx r0, r0, #8, #8 +; CHECK-ARM-NEXT: str r0, [r1] +; CHECK-ARM-NEXT: bx lr ; ; CHECK-BE-LABEL: test_sext_shift8_mask8: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: ldrb r0, [r0] +; CHECK-BE-NEXT: ldrsh r0, [r0] +; CHECK-BE-NEXT: ubfx r0, r0, #8, #8 ; CHECK-BE-NEXT: str r0, [r1] ; CHECK-BE-NEXT: bx lr ; +; CHECK-THUMB-LABEL: test_sext_shift8_mask8: +; CHECK-THUMB: @ %bb.0: @ %entry +; CHECK-THUMB-NEXT: ldrsh.w r0, [r0] +; CHECK-THUMB-NEXT: ubfx r0, r0, #8, #8 +; CHECK-THUMB-NEXT: str r0, [r1] +; CHECK-THUMB-NEXT: bx lr +; +; CHECK-ALIGN-LABEL: test_sext_shift8_mask8: +; CHECK-ALIGN: @ %bb.0: @ %entry +; CHECK-ALIGN-NEXT: ldrsh.w r0, [r0] +; CHECK-ALIGN-NEXT: ubfx r0, r0, #8, #8 +; CHECK-ALIGN-NEXT: str r0, [r1] +; CHECK-ALIGN-NEXT: bx lr +; ; CHECK-V6M-LABEL: test_sext_shift8_mask8: ; CHECK-V6M: @ %bb.0: @ %entry -; CHECK-V6M-NEXT: ldrb r0, [r0, #1] +; CHECK-V6M-NEXT: movs r2, #0 +; CHECK-V6M-NEXT: ldrsh r0, [r0, r2] +; CHECK-V6M-NEXT: lsrs r0, r0, #8 +; CHECK-V6M-NEXT: uxtb r0, r0 ; CHECK-V6M-NEXT: str r0, [r1] ; CHECK-V6M-NEXT: bx lr entry: @@ -1222,24 +1241,87 @@ define <4 x i32> @or_tree_with_shifts_vec_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; CHECK-ARM-LABEL: or_tree_with_shifts_vec_i32: ; CHECK-ARM: @ %bb.0: -; CHECK-ARM-NEXT: vorr q8, q0, q2 -; CHECK-ARM-NEXT: vshl.i32 q8, q8, #16 -; CHECK-ARM-NEXT: vorr q8, q8, q1 -; CHECK-ARM-NEXT: vorr q0, q8, q3 +; CHECK-ARM-NEXT: vshl.i32 q8, q2, #16 +; CHECK-ARM-NEXT: vshl.i32 q9, q0, #16 +; CHECK-ARM-NEXT: vorr q8, q8, q3 +; CHECK-ARM-NEXT: vorr q9, q9, q1 +; CHECK-ARM-NEXT: vorr q0, q9, q8 ; CHECK-ARM-NEXT: bx lr ; ; CHECK-BE-LABEL: or_tree_with_shifts_vec_i32: ; CHECK-BE: @ %bb.0: ; CHECK-BE-NEXT: vrev64.32 q8, q2 ; CHECK-BE-NEXT: vrev64.32 q9, q0 -; CHECK-BE-NEXT: vorr q8, q9, q8 -; CHECK-BE-NEXT: vrev64.32 q9, q1 -; CHECK-BE-NEXT: vrev64.32 q10, q3 ; CHECK-BE-NEXT: vshl.i32 q8, q8, #16 -; CHECK-BE-NEXT: vorr q8, q8, q9 +; CHECK-BE-NEXT: vrev64.32 q10, q3 +; CHECK-BE-NEXT: vshl.i32 q9, q9, #16 +; CHECK-BE-NEXT: vrev64.32 q11, q1 ; CHECK-BE-NEXT: vorr q8, q8, q10 +; CHECK-BE-NEXT: vorr q9, q9, q11 +; CHECK-BE-NEXT: vorr q8, q9, q8 ; CHECK-BE-NEXT: vrev64.32 q0, q8 ; CHECK-BE-NEXT: bx lr +; +; CHECK-ALIGN-LABEL: or_tree_with_shifts_vec_i32: +; CHECK-ALIGN: @ %bb.0: +; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #16] +; CHECK-ALIGN-NEXT: orr.w r12, r12, r0 +; CHECK-ALIGN-NEXT: ldr r0, [sp] +; CHECK-ALIGN-NEXT: orr.w r12, r0, r12, lsl #16 +; CHECK-ALIGN-NEXT: ldr r0, [sp, #32] +; CHECK-ALIGN-NEXT: orr.w r0, r0, r12 +; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #20] +; CHECK-ALIGN-NEXT: orr.w r12, r12, 
r1 +; CHECK-ALIGN-NEXT: ldr r1, [sp, #4] +; CHECK-ALIGN-NEXT: orr.w r12, r1, r12, lsl #16 +; CHECK-ALIGN-NEXT: ldr r1, [sp, #36] +; CHECK-ALIGN-NEXT: orr.w r1, r1, r12 +; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #24] +; CHECK-ALIGN-NEXT: orr.w r12, r12, r2 +; CHECK-ALIGN-NEXT: ldr r2, [sp, #8] +; CHECK-ALIGN-NEXT: orr.w r12, r2, r12, lsl #16 +; CHECK-ALIGN-NEXT: ldr r2, [sp, #40] +; CHECK-ALIGN-NEXT: orr.w r2, r2, r12 +; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #28] +; CHECK-ALIGN-NEXT: orr.w r12, r12, r3 +; CHECK-ALIGN-NEXT: ldr r3, [sp, #12] +; CHECK-ALIGN-NEXT: orr.w r12, r3, r12, lsl #16 +; CHECK-ALIGN-NEXT: ldr r3, [sp, #44] +; CHECK-ALIGN-NEXT: orr.w r3, r3, r12 +; CHECK-ALIGN-NEXT: bx lr +; +; CHECK-V6M-LABEL: or_tree_with_shifts_vec_i32: +; CHECK-V6M: @ %bb.0: +; CHECK-V6M-NEXT: push {r4, lr} +; CHECK-V6M-NEXT: ldr r4, [sp, #24] +; CHECK-V6M-NEXT: orrs r4, r0 +; CHECK-V6M-NEXT: lsls r0, r4, #16 +; CHECK-V6M-NEXT: ldr r4, [sp, #8] +; CHECK-V6M-NEXT: orrs r4, r0 +; CHECK-V6M-NEXT: ldr r0, [sp, #40] +; CHECK-V6M-NEXT: orrs r0, r4 +; CHECK-V6M-NEXT: ldr r4, [sp, #28] +; CHECK-V6M-NEXT: orrs r4, r1 +; CHECK-V6M-NEXT: lsls r1, r4, #16 +; CHECK-V6M-NEXT: ldr r4, [sp, #12] +; CHECK-V6M-NEXT: orrs r4, r1 +; CHECK-V6M-NEXT: ldr r1, [sp, #44] +; CHECK-V6M-NEXT: orrs r1, r4 +; CHECK-V6M-NEXT: ldr r4, [sp, #32] +; CHECK-V6M-NEXT: orrs r4, r2 +; CHECK-V6M-NEXT: lsls r2, r4, #16 +; CHECK-V6M-NEXT: ldr r4, [sp, #16] +; CHECK-V6M-NEXT: orrs r4, r2 +; CHECK-V6M-NEXT: ldr r2, [sp, #48] +; CHECK-V6M-NEXT: orrs r2, r4 +; CHECK-V6M-NEXT: ldr r4, [sp, #36] +; CHECK-V6M-NEXT: orrs r4, r3 +; CHECK-V6M-NEXT: lsls r3, r4, #16 +; CHECK-V6M-NEXT: ldr r4, [sp, #20] +; CHECK-V6M-NEXT: orrs r4, r3 +; CHECK-V6M-NEXT: ldr r3, [sp, #52] +; CHECK-V6M-NEXT: orrs r3, r4 +; CHECK-V6M-NEXT: pop {r4, pc} %a.shifted = shl <4 x i32> %a, %c.shifted = shl <4 x i32> %c, %or.ab = or <4 x i32> %a.shifted, %b @@ -1271,6 +1353,72 @@ ; CHECK-BE-NEXT: vorr q8, q9, q8 ; CHECK-BE-NEXT: vrev64.32 q0, q8 ; CHECK-BE-NEXT: bx lr +; +; CHECK-ALIGN-LABEL: or_tree_with_mismatching_shifts_vec_i32: +; CHECK-ALIGN: @ %bb.0: +; CHECK-ALIGN-NEXT: push {r7, lr} +; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #24] +; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #40] +; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17 +; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #8] +; CHECK-ALIGN-NEXT: orr.w r0, lr, r0, lsl #16 +; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #44] +; CHECK-ALIGN-NEXT: orr.w r0, r0, r12 +; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #28] +; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17 +; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #12] +; CHECK-ALIGN-NEXT: orr.w r1, lr, r1, lsl #16 +; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #48] +; CHECK-ALIGN-NEXT: orr.w r1, r1, r12 +; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #32] +; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17 +; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #16] +; CHECK-ALIGN-NEXT: orr.w r2, lr, r2, lsl #16 +; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #52] +; CHECK-ALIGN-NEXT: orr.w r2, r2, r12 +; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #36] +; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17 +; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #20] +; CHECK-ALIGN-NEXT: orr.w r3, lr, r3, lsl #16 +; CHECK-ALIGN-NEXT: orr.w r3, r3, r12 +; CHECK-ALIGN-NEXT: pop {r7, pc} +; +; CHECK-V6M-LABEL: or_tree_with_mismatching_shifts_vec_i32: +; CHECK-V6M: @ %bb.0: +; CHECK-V6M-NEXT: push {r4, r5, r7, lr} +; CHECK-V6M-NEXT: ldr r4, [sp, #32] +; CHECK-V6M-NEXT: lsls r4, r4, #17 +; CHECK-V6M-NEXT: ldr r5, [sp, #48] +; CHECK-V6M-NEXT: orrs r5, r4 +; CHECK-V6M-NEXT: lsls r4, r0, #16 +; CHECK-V6M-NEXT: ldr r0, [sp, #16] +; 
CHECK-V6M-NEXT: orrs r0, r4 +; CHECK-V6M-NEXT: orrs r0, r5 +; CHECK-V6M-NEXT: ldr r4, [sp, #36] +; CHECK-V6M-NEXT: lsls r4, r4, #17 +; CHECK-V6M-NEXT: ldr r5, [sp, #52] +; CHECK-V6M-NEXT: orrs r5, r4 +; CHECK-V6M-NEXT: lsls r4, r1, #16 +; CHECK-V6M-NEXT: ldr r1, [sp, #20] +; CHECK-V6M-NEXT: orrs r1, r4 +; CHECK-V6M-NEXT: orrs r1, r5 +; CHECK-V6M-NEXT: ldr r4, [sp, #40] +; CHECK-V6M-NEXT: lsls r4, r4, #17 +; CHECK-V6M-NEXT: ldr r5, [sp, #56] +; CHECK-V6M-NEXT: orrs r5, r4 +; CHECK-V6M-NEXT: lsls r4, r2, #16 +; CHECK-V6M-NEXT: ldr r2, [sp, #24] +; CHECK-V6M-NEXT: orrs r2, r4 +; CHECK-V6M-NEXT: orrs r2, r5 +; CHECK-V6M-NEXT: ldr r4, [sp, #44] +; CHECK-V6M-NEXT: lsls r4, r4, #17 +; CHECK-V6M-NEXT: ldr r5, [sp, #60] +; CHECK-V6M-NEXT: orrs r5, r4 +; CHECK-V6M-NEXT: lsls r4, r3, #16 +; CHECK-V6M-NEXT: ldr r3, [sp, #28] +; CHECK-V6M-NEXT: orrs r3, r4 +; CHECK-V6M-NEXT: orrs r3, r5 +; CHECK-V6M-NEXT: pop {r4, r5, r7, pc} %a.shifted = shl <4 x i32> %a, %c.shifted = shl <4 x i32> %c, %or.ab = or <4 x i32> %a.shifted, %b diff --git a/llvm/test/CodeGen/ARM/smml.ll b/llvm/test/CodeGen/ARM/smml.ll --- a/llvm/test/CodeGen/ARM/smml.ll +++ b/llvm/test/CodeGen/ARM/smml.ll @@ -152,10 +152,13 @@ ; CHECK-V4-NEXT: .save {r11, lr} ; CHECK-V4-NEXT: push {r11, lr} ; CHECK-V4-NEXT: smull r2, r3, r0, r1 -; CHECK-V4-NEXT: rsbs r0, r2, #0 -; CHECK-V4-NEXT: rscs r0, r3, #0 -; CHECK-V4-NEXT: movge r0, #42 -; CHECK-V4-NEXT: movlt r0, #56 +; CHECK-V4-NEXT: mov r0, #0 +; CHECK-V4-NEXT: rsbs r1, r2, #0 +; CHECK-V4-NEXT: rscs r1, r3, #0 +; CHECK-V4-NEXT: movlt r0, #1 +; CHECK-V4-NEXT: cmp r0, #0 +; CHECK-V4-NEXT: moveq r0, #42 +; CHECK-V4-NEXT: movne r0, #56 ; CHECK-V4-NEXT: bl opaque ; CHECK-V4-NEXT: pop {r11, lr} ; CHECK-V4-NEXT: mov pc, lr @@ -194,14 +197,21 @@ ; CHECK-THUMB-NEXT: bl __aeabi_lmul ; CHECK-THUMB-NEXT: movs r2, #0 ; CHECK-THUMB-NEXT: rsbs r0, r0, #0 -; CHECK-THUMB-NEXT: sbcs r2, r1 +; CHECK-THUMB-NEXT: push {r2} +; CHECK-THUMB-NEXT: pop {r0} +; CHECK-THUMB-NEXT: sbcs r0, r1 ; CHECK-THUMB-NEXT: bge .LBB2_2 -; CHECK-THUMB-NEXT: @ %bb.1: @ %false +; CHECK-THUMB-NEXT: @ %bb.1: +; CHECK-THUMB-NEXT: movs r2, #1 +; CHECK-THUMB-NEXT: .LBB2_2: +; CHECK-THUMB-NEXT: cmp r2, #0 +; CHECK-THUMB-NEXT: beq .LBB2_4 +; CHECK-THUMB-NEXT: @ %bb.3: @ %false ; CHECK-THUMB-NEXT: movs r0, #56 -; CHECK-THUMB-NEXT: b .LBB2_3 -; CHECK-THUMB-NEXT: .LBB2_2: @ %true +; CHECK-THUMB-NEXT: b .LBB2_5 +; CHECK-THUMB-NEXT: .LBB2_4: @ %true ; CHECK-THUMB-NEXT: movs r0, #42 -; CHECK-THUMB-NEXT: .LBB2_3: @ %true +; CHECK-THUMB-NEXT: .LBB2_5: @ %true ; CHECK-THUMB-NEXT: bl opaque ; CHECK-THUMB-NEXT: pop {r7} ; CHECK-THUMB-NEXT: pop {r0} @@ -217,13 +227,19 @@ ; CHECK-THUMBV6-NEXT: bl __aeabi_lmul ; CHECK-THUMBV6-NEXT: movs r2, #0 ; CHECK-THUMBV6-NEXT: rsbs r0, r0, #0 -; CHECK-THUMBV6-NEXT: sbcs r2, r1 +; CHECK-THUMBV6-NEXT: mov r0, r2 +; CHECK-THUMBV6-NEXT: sbcs r0, r1 ; CHECK-THUMBV6-NEXT: bge .LBB2_2 -; CHECK-THUMBV6-NEXT: @ %bb.1: @ %false +; CHECK-THUMBV6-NEXT: @ %bb.1: +; CHECK-THUMBV6-NEXT: movs r2, #1 +; CHECK-THUMBV6-NEXT: .LBB2_2: +; CHECK-THUMBV6-NEXT: cmp r2, #0 +; CHECK-THUMBV6-NEXT: beq .LBB2_4 +; CHECK-THUMBV6-NEXT: @ %bb.3: @ %false ; CHECK-THUMBV6-NEXT: movs r0, #56 ; CHECK-THUMBV6-NEXT: bl opaque ; CHECK-THUMBV6-NEXT: pop {r7, pc} -; CHECK-THUMBV6-NEXT: .LBB2_2: @ %true +; CHECK-THUMBV6-NEXT: .LBB2_4: @ %true ; CHECK-THUMBV6-NEXT: movs r0, #42 ; CHECK-THUMBV6-NEXT: bl opaque ; CHECK-THUMBV6-NEXT: pop {r7, pc} @@ -236,9 +252,12 @@ ; CHECK-THUMBV6T2-NEXT: movs r2, #0 ; CHECK-THUMBV6T2-NEXT: rsbs r0, r0, #0 ; CHECK-THUMBV6T2-NEXT: 
sbcs.w r0, r2, r1 -; CHECK-THUMBV6T2-NEXT: ite lt -; CHECK-THUMBV6T2-NEXT: movlt r0, #56 -; CHECK-THUMBV6T2-NEXT: movge r0, #42 +; CHECK-THUMBV6T2-NEXT: it lt +; CHECK-THUMBV6T2-NEXT: movlt r2, #1 +; CHECK-THUMBV6T2-NEXT: cmp r2, #0 +; CHECK-THUMBV6T2-NEXT: ite ne +; CHECK-THUMBV6T2-NEXT: movne r0, #56 +; CHECK-THUMBV6T2-NEXT: moveq r0, #42 ; CHECK-THUMBV6T2-NEXT: bl opaque ; CHECK-THUMBV6T2-NEXT: pop {r7, pc} ; @@ -294,3 +313,5 @@ call void @opaque(i32 56) ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-CALLSITE: {{.*}} diff --git a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll @@ -304,13 +304,13 @@ ; ARM5-NEXT: mvn r2, #2 ; ARM5-NEXT: eor r0, r0, r2 ; ARM5-NEXT: orrs r5, r0, r1 -; ARM5-NEXT: ldr r0, [sp, #20] +; ARM5-NEXT: ldr r1, [sp, #20] +; ARM5-NEXT: ldr r0, [sp, #16] ; ARM5-NEXT: mvn r2, #8 +; ARM5-NEXT: and r1, r1, #1 ; ARM5-NEXT: mvn r3, #0 -; ARM5-NEXT: and r0, r0, #1 +; ARM5-NEXT: rsb r1, r1, #0 ; ARM5-NEXT: movne r5, #1 -; ARM5-NEXT: rsb r1, r0, #0 -; ARM5-NEXT: ldr r0, [sp, #16] ; ARM5-NEXT: bl __moddi3 ; ARM5-NEXT: eor r0, r0, #3 ; ARM5-NEXT: orrs r2, r0, r1 @@ -343,13 +343,13 @@ ; ARM6-NEXT: mvn r2, #2 ; ARM6-NEXT: eor r0, r0, r2 ; ARM6-NEXT: orrs r5, r0, r1 -; ARM6-NEXT: ldr r0, [sp, #20] +; ARM6-NEXT: ldr r1, [sp, #20] +; ARM6-NEXT: ldr r0, [sp, #16] ; ARM6-NEXT: mvn r2, #8 +; ARM6-NEXT: and r1, r1, #1 ; ARM6-NEXT: mvn r3, #0 -; ARM6-NEXT: and r0, r0, #1 +; ARM6-NEXT: rsb r1, r1, #0 ; ARM6-NEXT: movne r5, #1 -; ARM6-NEXT: rsb r1, r0, #0 -; ARM6-NEXT: ldr r0, [sp, #16] ; ARM6-NEXT: bl __moddi3 ; ARM6-NEXT: eor r0, r0, #3 ; ARM6-NEXT: orrs r2, r0, r1 @@ -360,42 +360,40 @@ ; ; ARM7-LABEL: test_srem_vec: ; ARM7: @ %bb.0: -; ARM7-NEXT: push {r4, r5, r6, r7, r11, lr} +; ARM7-NEXT: push {r4, r5, r6, lr} ; ARM7-NEXT: vpush {d8, d9} -; ARM7-NEXT: mov r6, r0 -; ARM7-NEXT: and r0, r3, #1 -; ARM7-NEXT: mov r5, r1 -; ARM7-NEXT: rsb r1, r0, #0 -; ARM7-NEXT: mov r0, r2 +; ARM7-NEXT: and r1, r1, #1 +; ARM7-NEXT: mov r5, r3 +; ARM7-NEXT: rsb r1, r1, #0 +; ARM7-NEXT: mov r6, r2 ; ARM7-NEXT: mov r2, #9 ; ARM7-NEXT: mov r3, #0 ; ARM7-NEXT: bl __moddi3 -; ARM7-NEXT: mov r7, r0 -; ARM7-NEXT: and r0, r5, #1 ; ARM7-NEXT: mov r4, r1 -; ARM7-NEXT: rsb r1, r0, #0 +; ARM7-NEXT: and r1, r5, #1 +; ARM7-NEXT: rsb r1, r1, #0 +; ARM7-NEXT: vmov.32 d8[0], r0 ; ARM7-NEXT: mov r0, r6 ; ARM7-NEXT: mov r2, #9 ; ARM7-NEXT: mov r3, #0 ; ARM7-NEXT: bl __moddi3 -; ARM7-NEXT: vmov.32 d8[0], r0 -; ARM7-NEXT: ldr r0, [sp, #44] -; ARM7-NEXT: ldr r2, [sp, #40] ; ARM7-NEXT: mov r5, r1 -; ARM7-NEXT: and r0, r0, #1 +; ARM7-NEXT: ldr r1, [sp, #36] +; ARM7-NEXT: ldr r2, [sp, #32] +; ARM7-NEXT: vmov.32 d9[0], r0 +; ARM7-NEXT: and r1, r1, #1 ; ARM7-NEXT: mvn r3, #0 -; ARM7-NEXT: rsb r1, r0, #0 -; ARM7-NEXT: vmov.32 d9[0], r7 +; ARM7-NEXT: rsb r1, r1, #0 ; ARM7-NEXT: mov r0, r2 ; ARM7-NEXT: mvn r2, #8 ; ARM7-NEXT: bl __moddi3 ; ARM7-NEXT: vmov.32 d16[0], r0 ; ARM7-NEXT: adr r0, .LCPI3_0 -; ARM7-NEXT: vmov.32 d9[1], r4 +; ARM7-NEXT: vmov.32 d9[1], r5 ; ARM7-NEXT: vld1.64 {d18, d19}, [r0:128] ; ARM7-NEXT: adr r0, .LCPI3_1 ; ARM7-NEXT: vmov.32 d16[1], r1 -; ARM7-NEXT: vmov.32 d8[1], r5 +; ARM7-NEXT: vmov.32 d8[1], r4 ; ARM7-NEXT: vand q8, q8, q9 ; ARM7-NEXT: vld1.64 {d20, d21}, [r0:128] ; ARM7-NEXT: adr r0, .LCPI3_2 @@ -415,7 +413,7 @@ ; ARM7-NEXT: vmov.32 r1, d18[1] ; ARM7-NEXT: vmov.32 r2, 
d16[0] ; ARM7-NEXT: vpop {d8, d9} -; ARM7-NEXT: pop {r4, r5, r6, r7, r11, pc} +; ARM7-NEXT: pop {r4, r5, r6, pc} ; ARM7-NEXT: .p2align 4 ; ARM7-NEXT: @ %bb.1: ; ARM7-NEXT: .LCPI3_0: @@ -436,42 +434,40 @@ ; ; ARM8-LABEL: test_srem_vec: ; ARM8: @ %bb.0: -; ARM8-NEXT: push {r4, r5, r6, r7, r11, lr} +; ARM8-NEXT: push {r4, r5, r6, lr} ; ARM8-NEXT: vpush {d8, d9} -; ARM8-NEXT: mov r6, r0 -; ARM8-NEXT: and r0, r3, #1 -; ARM8-NEXT: mov r5, r1 -; ARM8-NEXT: rsb r1, r0, #0 -; ARM8-NEXT: mov r0, r2 +; ARM8-NEXT: and r1, r1, #1 +; ARM8-NEXT: mov r5, r3 +; ARM8-NEXT: rsb r1, r1, #0 +; ARM8-NEXT: mov r6, r2 ; ARM8-NEXT: mov r2, #9 ; ARM8-NEXT: mov r3, #0 ; ARM8-NEXT: bl __moddi3 -; ARM8-NEXT: mov r7, r0 -; ARM8-NEXT: and r0, r5, #1 ; ARM8-NEXT: mov r4, r1 -; ARM8-NEXT: rsb r1, r0, #0 +; ARM8-NEXT: and r1, r5, #1 +; ARM8-NEXT: rsb r1, r1, #0 +; ARM8-NEXT: vmov.32 d8[0], r0 ; ARM8-NEXT: mov r0, r6 ; ARM8-NEXT: mov r2, #9 ; ARM8-NEXT: mov r3, #0 ; ARM8-NEXT: bl __moddi3 -; ARM8-NEXT: vmov.32 d8[0], r0 -; ARM8-NEXT: ldr r0, [sp, #44] -; ARM8-NEXT: ldr r2, [sp, #40] ; ARM8-NEXT: mov r5, r1 -; ARM8-NEXT: and r0, r0, #1 +; ARM8-NEXT: ldr r1, [sp, #36] +; ARM8-NEXT: ldr r2, [sp, #32] +; ARM8-NEXT: vmov.32 d9[0], r0 +; ARM8-NEXT: and r1, r1, #1 ; ARM8-NEXT: mvn r3, #0 -; ARM8-NEXT: rsb r1, r0, #0 -; ARM8-NEXT: vmov.32 d9[0], r7 +; ARM8-NEXT: rsb r1, r1, #0 ; ARM8-NEXT: mov r0, r2 ; ARM8-NEXT: mvn r2, #8 ; ARM8-NEXT: bl __moddi3 ; ARM8-NEXT: vmov.32 d16[0], r0 ; ARM8-NEXT: adr r0, .LCPI3_0 -; ARM8-NEXT: vmov.32 d9[1], r4 +; ARM8-NEXT: vmov.32 d9[1], r5 ; ARM8-NEXT: vld1.64 {d18, d19}, [r0:128] ; ARM8-NEXT: adr r0, .LCPI3_1 ; ARM8-NEXT: vmov.32 d16[1], r1 -; ARM8-NEXT: vmov.32 d8[1], r5 +; ARM8-NEXT: vmov.32 d8[1], r4 ; ARM8-NEXT: vand q8, q8, q9 ; ARM8-NEXT: vld1.64 {d20, d21}, [r0:128] ; ARM8-NEXT: adr r0, .LCPI3_2 @@ -491,7 +487,7 @@ ; ARM8-NEXT: vmov.32 r1, d18[1] ; ARM8-NEXT: vmov.32 r2, d16[0] ; ARM8-NEXT: vpop {d8, d9} -; ARM8-NEXT: pop {r4, r5, r6, r7, r11, pc} +; ARM8-NEXT: pop {r4, r5, r6, pc} ; ARM8-NEXT: .p2align 4 ; ARM8-NEXT: @ %bb.1: ; ARM8-NEXT: .LCPI3_0: @@ -512,42 +508,40 @@ ; ; NEON7-LABEL: test_srem_vec: ; NEON7: @ %bb.0: -; NEON7-NEXT: push {r4, r5, r6, r7, r11, lr} +; NEON7-NEXT: push {r4, r5, r6, lr} ; NEON7-NEXT: vpush {d8, d9} -; NEON7-NEXT: mov r6, r0 -; NEON7-NEXT: and r0, r3, #1 -; NEON7-NEXT: mov r5, r1 -; NEON7-NEXT: rsb r1, r0, #0 -; NEON7-NEXT: mov r0, r2 +; NEON7-NEXT: and r1, r1, #1 +; NEON7-NEXT: mov r5, r3 +; NEON7-NEXT: rsb r1, r1, #0 +; NEON7-NEXT: mov r6, r2 ; NEON7-NEXT: mov r2, #9 ; NEON7-NEXT: mov r3, #0 ; NEON7-NEXT: bl __moddi3 -; NEON7-NEXT: mov r7, r0 -; NEON7-NEXT: and r0, r5, #1 ; NEON7-NEXT: mov r4, r1 -; NEON7-NEXT: rsb r1, r0, #0 +; NEON7-NEXT: and r1, r5, #1 +; NEON7-NEXT: rsb r1, r1, #0 +; NEON7-NEXT: vmov.32 d8[0], r0 ; NEON7-NEXT: mov r0, r6 ; NEON7-NEXT: mov r2, #9 ; NEON7-NEXT: mov r3, #0 ; NEON7-NEXT: bl __moddi3 -; NEON7-NEXT: vmov.32 d8[0], r0 -; NEON7-NEXT: ldr r0, [sp, #44] -; NEON7-NEXT: ldr r2, [sp, #40] ; NEON7-NEXT: mov r5, r1 -; NEON7-NEXT: and r0, r0, #1 +; NEON7-NEXT: ldr r1, [sp, #36] +; NEON7-NEXT: ldr r2, [sp, #32] +; NEON7-NEXT: vmov.32 d9[0], r0 +; NEON7-NEXT: and r1, r1, #1 ; NEON7-NEXT: mvn r3, #0 -; NEON7-NEXT: rsb r1, r0, #0 -; NEON7-NEXT: vmov.32 d9[0], r7 +; NEON7-NEXT: rsb r1, r1, #0 ; NEON7-NEXT: mov r0, r2 ; NEON7-NEXT: mvn r2, #8 ; NEON7-NEXT: bl __moddi3 ; NEON7-NEXT: vmov.32 d16[0], r0 ; NEON7-NEXT: adr r0, .LCPI3_0 -; NEON7-NEXT: vmov.32 d9[1], r4 +; NEON7-NEXT: vmov.32 d9[1], r5 ; NEON7-NEXT: vld1.64 {d18, d19}, [r0:128] ; 
NEON7-NEXT: adr r0, .LCPI3_1 ; NEON7-NEXT: vmov.32 d16[1], r1 -; NEON7-NEXT: vmov.32 d8[1], r5 +; NEON7-NEXT: vmov.32 d8[1], r4 ; NEON7-NEXT: vand q8, q8, q9 ; NEON7-NEXT: vld1.64 {d20, d21}, [r0:128] ; NEON7-NEXT: adr r0, .LCPI3_2 @@ -567,7 +561,7 @@ ; NEON7-NEXT: vmov.32 r1, d18[1] ; NEON7-NEXT: vmov.32 r2, d16[0] ; NEON7-NEXT: vpop {d8, d9} -; NEON7-NEXT: pop {r4, r5, r6, r7, r11, pc} +; NEON7-NEXT: pop {r4, r5, r6, pc} ; NEON7-NEXT: .p2align 4 ; NEON7-NEXT: @ %bb.1: ; NEON7-NEXT: .LCPI3_0: @@ -588,42 +582,40 @@ ; ; NEON8-LABEL: test_srem_vec: ; NEON8: @ %bb.0: -; NEON8-NEXT: push {r4, r5, r6, r7, r11, lr} +; NEON8-NEXT: push {r4, r5, r6, lr} ; NEON8-NEXT: vpush {d8, d9} -; NEON8-NEXT: mov r6, r0 -; NEON8-NEXT: and r0, r3, #1 -; NEON8-NEXT: mov r5, r1 -; NEON8-NEXT: rsb r1, r0, #0 -; NEON8-NEXT: mov r0, r2 +; NEON8-NEXT: and r1, r1, #1 +; NEON8-NEXT: mov r5, r3 +; NEON8-NEXT: rsb r1, r1, #0 +; NEON8-NEXT: mov r6, r2 ; NEON8-NEXT: mov r2, #9 ; NEON8-NEXT: mov r3, #0 ; NEON8-NEXT: bl __moddi3 -; NEON8-NEXT: mov r7, r0 -; NEON8-NEXT: and r0, r5, #1 ; NEON8-NEXT: mov r4, r1 -; NEON8-NEXT: rsb r1, r0, #0 +; NEON8-NEXT: and r1, r5, #1 +; NEON8-NEXT: rsb r1, r1, #0 +; NEON8-NEXT: vmov.32 d8[0], r0 ; NEON8-NEXT: mov r0, r6 ; NEON8-NEXT: mov r2, #9 ; NEON8-NEXT: mov r3, #0 ; NEON8-NEXT: bl __moddi3 -; NEON8-NEXT: vmov.32 d8[0], r0 -; NEON8-NEXT: ldr r0, [sp, #44] -; NEON8-NEXT: ldr r2, [sp, #40] ; NEON8-NEXT: mov r5, r1 -; NEON8-NEXT: and r0, r0, #1 +; NEON8-NEXT: ldr r1, [sp, #36] +; NEON8-NEXT: ldr r2, [sp, #32] +; NEON8-NEXT: vmov.32 d9[0], r0 +; NEON8-NEXT: and r1, r1, #1 ; NEON8-NEXT: mvn r3, #0 -; NEON8-NEXT: rsb r1, r0, #0 -; NEON8-NEXT: vmov.32 d9[0], r7 +; NEON8-NEXT: rsb r1, r1, #0 ; NEON8-NEXT: mov r0, r2 ; NEON8-NEXT: mvn r2, #8 ; NEON8-NEXT: bl __moddi3 ; NEON8-NEXT: vmov.32 d16[0], r0 ; NEON8-NEXT: adr r0, .LCPI3_0 -; NEON8-NEXT: vmov.32 d9[1], r4 +; NEON8-NEXT: vmov.32 d9[1], r5 ; NEON8-NEXT: vld1.64 {d18, d19}, [r0:128] ; NEON8-NEXT: adr r0, .LCPI3_1 ; NEON8-NEXT: vmov.32 d16[1], r1 -; NEON8-NEXT: vmov.32 d8[1], r5 +; NEON8-NEXT: vmov.32 d8[1], r4 ; NEON8-NEXT: vand q8, q8, q9 ; NEON8-NEXT: vld1.64 {d20, d21}, [r0:128] ; NEON8-NEXT: adr r0, .LCPI3_2 @@ -643,7 +635,7 @@ ; NEON8-NEXT: vmov.32 r1, d18[1] ; NEON8-NEXT: vmov.32 r2, d16[0] ; NEON8-NEXT: vpop {d8, d9} -; NEON8-NEXT: pop {r4, r5, r6, r7, r11, pc} +; NEON8-NEXT: pop {r4, r5, r6, pc} ; NEON8-NEXT: .p2align 4 ; NEON8-NEXT: @ %bb.1: ; NEON8-NEXT: .LCPI3_0: diff --git a/llvm/test/CodeGen/ARM/ssub_sat.ll b/llvm/test/CodeGen/ARM/ssub_sat.ll --- a/llvm/test/CodeGen/ARM/ssub_sat.ll +++ b/llvm/test/CodeGen/ARM/ssub_sat.ll @@ -148,6 +148,7 @@ ; CHECK-T2NODSP: @ %bb.0: ; CHECK-T2NODSP-NEXT: subs r0, r0, r1 ; CHECK-T2NODSP-NEXT: ssat r0, #16, r0 +; CHECK-T2NODSP-NEXT: sxth r0, r0 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func16: @@ -211,6 +212,7 @@ ; CHECK-T2NODSP: @ %bb.0: ; CHECK-T2NODSP-NEXT: subs r0, r0, r1 ; CHECK-T2NODSP-NEXT: ssat r0, #8, r0 +; CHECK-T2NODSP-NEXT: sxtb r0, r0 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func8: @@ -267,6 +269,7 @@ ; CHECK-T2NODSP: @ %bb.0: ; CHECK-T2NODSP-NEXT: subs r0, r0, r1 ; CHECK-T2NODSP-NEXT: ssat r0, #4, r0 +; CHECK-T2NODSP-NEXT: sbfx r0, r0, #0, #4 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func3: diff --git a/llvm/test/CodeGen/ARM/ssub_sat_plus.ll b/llvm/test/CodeGen/ARM/ssub_sat_plus.ll --- a/llvm/test/CodeGen/ARM/ssub_sat_plus.ll +++ b/llvm/test/CodeGen/ARM/ssub_sat_plus.ll @@ -56,7 +56,8 @@ ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: .save {r4, r5, r7, lr} ; 
CHECK-T1-NEXT: push {r4, r5, r7, lr} -; CHECK-T1-NEXT: ldr r2, [sp, #20] +; CHECK-T1-NEXT: add r2, sp, #16 +; CHECK-T1-NEXT: ldr r2, [r2, #4] ; CHECK-T1-NEXT: mov r5, r1 ; CHECK-T1-NEXT: eors r5, r2 ; CHECK-T1-NEXT: ldr r3, [sp, #16] @@ -86,11 +87,10 @@ ; ; CHECK-T2-LABEL: func64: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: ldr r2, [sp] -; CHECK-T2-NEXT: ldr.w r12, [sp, #4] -; CHECK-T2-NEXT: subs r0, r0, r2 -; CHECK-T2-NEXT: sbc.w r2, r1, r12 -; CHECK-T2-NEXT: eor.w r3, r1, r12 +; CHECK-T2-NEXT: ldrd r12, r2, [sp] +; CHECK-T2-NEXT: eor.w r3, r1, r2 +; CHECK-T2-NEXT: subs.w r0, r0, r12 +; CHECK-T2-NEXT: sbc.w r2, r1, r2 ; CHECK-T2-NEXT: eors r1, r2 ; CHECK-T2-NEXT: ands r1, r3 ; CHECK-T2-NEXT: it mi @@ -152,6 +152,7 @@ ; CHECK-T2NODSP-NEXT: sxth r1, r1 ; CHECK-T2NODSP-NEXT: subs r0, r0, r1 ; CHECK-T2NODSP-NEXT: ssat r0, #16, r0 +; CHECK-T2NODSP-NEXT: sxth r0, r0 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func16: @@ -198,6 +199,7 @@ ; CHECK-T2NODSP-NEXT: sxtb r1, r1 ; CHECK-T2NODSP-NEXT: subs r0, r0, r1 ; CHECK-T2NODSP-NEXT: ssat r0, #8, r0 +; CHECK-T2NODSP-NEXT: sxtb r0, r0 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func8: @@ -245,6 +247,7 @@ ; CHECK-T2NODSP-NEXT: lsls r1, r1, #28 ; CHECK-T2NODSP-NEXT: sub.w r0, r0, r1, asr #28 ; CHECK-T2NODSP-NEXT: ssat r0, #4, r0 +; CHECK-T2NODSP-NEXT: sbfx r0, r0, #0, #4 ; CHECK-T2NODSP-NEXT: bx lr ; ; CHECK-T2DSP-LABEL: func4: diff --git a/llvm/test/CodeGen/ARM/static-addr-hoisting.ll b/llvm/test/CodeGen/ARM/static-addr-hoisting.ll --- a/llvm/test/CodeGen/ARM/static-addr-hoisting.ll +++ b/llvm/test/CodeGen/ARM/static-addr-hoisting.ll @@ -7,9 +7,9 @@ ; CHECK-NEXT: movw r0, #16960 ; CHECK-NEXT: movs r1, #42 ; CHECK-NEXT: movt r0, #15 -; CHECK-NEXT: str.w r1, [r0, #42] ; CHECK-NEXT: str r1, [r0, #24] ; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: str.w r1, [r0, #42] ; CHECK-NEXT: movw r0, #20394 ; CHECK-NEXT: movt r0, #18 ; CHECK-NEXT: str r1, [r0] diff --git a/llvm/test/CodeGen/ARM/store-postinc.ll b/llvm/test/CodeGen/ARM/store-postinc.ll --- a/llvm/test/CodeGen/ARM/store-postinc.ll +++ b/llvm/test/CodeGen/ARM/store-postinc.ll @@ -789,21 +789,20 @@ define ptr @i128_0(ptr %p, i128 %v) { ; CHECK-T1-LABEL: i128_0: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: ldr r1, [sp, #4] -; CHECK-T1-NEXT: str r1, [r0, #12] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #8] ; CHECK-T1-NEXT: stm r0!, {r2, r3} +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] +; CHECK-T1-NEXT: str r1, [r0, #4] ; CHECK-T1-NEXT: subs r0, #8 ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: i128_0: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] -; CHECK-T2-NEXT: str r1, [r0, #8] +; CHECK-T2-NEXT: ldrd r1, r12, [sp] ; CHECK-T2-NEXT: strd r2, r3, [r0] +; CHECK-T2-NEXT: strd r1, r12, [r0, #8] ; CHECK-T2-NEXT: bx lr ; ; CHECK-ARM-LABEL: i128_0: @@ -823,7 +822,8 @@ ; CHECK-T1-NEXT: movs r1, #3 ; CHECK-T1-NEXT: str r2, [r0, r1] ; CHECK-T1-NEXT: adds r0, r0, #3 -; CHECK-T1-NEXT: ldr r1, [sp, #4] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] ; CHECK-T1-NEXT: str r1, [r0, #12] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #8] @@ -833,9 +833,8 @@ ; CHECK-T2-LABEL: i128_3: ; CHECK-T2: @ %bb.0: ; CHECK-T2-NEXT: str r2, [r0, #3]! 
-; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r2, [sp] +; CHECK-T2-NEXT: str r2, [r0, #12] ; CHECK-T2-NEXT: strd r3, r1, [r0, #4] ; CHECK-T2-NEXT: bx lr ; @@ -855,21 +854,21 @@ define ptr @i128_4(ptr %p, i128 %v) { ; CHECK-T1-LABEL: i128_4: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: ldr r1, [sp, #4] -; CHECK-T1-NEXT: str r1, [r0, #16] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #12] ; CHECK-T1-NEXT: str r3, [r0, #8] ; CHECK-T1-NEXT: str r2, [r0, #4] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] +; CHECK-T1-NEXT: str r1, [r0, #16] ; CHECK-T1-NEXT: adds r0, r0, #4 ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: i128_4: ; CHECK-T2: @ %bb.0: ; CHECK-T2-NEXT: str r2, [r0, #4]! -; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r2, [sp] +; CHECK-T2-NEXT: str r2, [r0, #12] ; CHECK-T2-NEXT: strd r3, r1, [r0, #4] ; CHECK-T2-NEXT: bx lr ; @@ -889,21 +888,21 @@ define ptr @i128_8(ptr %p, i128 %v) { ; CHECK-T1-LABEL: i128_8: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: ldr r1, [sp, #4] -; CHECK-T1-NEXT: str r1, [r0, #20] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #16] ; CHECK-T1-NEXT: str r3, [r0, #12] ; CHECK-T1-NEXT: str r2, [r0, #8] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] +; CHECK-T1-NEXT: str r1, [r0, #20] ; CHECK-T1-NEXT: adds r0, #8 ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: i128_8: ; CHECK-T2: @ %bb.0: ; CHECK-T2-NEXT: str r2, [r0, #8]! -; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r2, [sp] +; CHECK-T2-NEXT: str r2, [r0, #12] ; CHECK-T2-NEXT: strd r3, r1, [r0, #4] ; CHECK-T2-NEXT: bx lr ; @@ -923,21 +922,21 @@ define ptr @i128_16(ptr %p, i128 %v) { ; CHECK-T1-LABEL: i128_16: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: ldr r1, [sp, #4] -; CHECK-T1-NEXT: str r1, [r0, #28] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #24] ; CHECK-T1-NEXT: str r3, [r0, #20] ; CHECK-T1-NEXT: str r2, [r0, #16] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] +; CHECK-T1-NEXT: str r1, [r0, #28] ; CHECK-T1-NEXT: adds r0, #16 ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: i128_16: ; CHECK-T2: @ %bb.0: ; CHECK-T2-NEXT: str r2, [r0, #16]! -; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r2, [sp] +; CHECK-T2-NEXT: str r2, [r0, #12] ; CHECK-T2-NEXT: strd r3, r1, [r0, #4] ; CHECK-T2-NEXT: bx lr ; @@ -958,7 +957,8 @@ ; CHECK-T1-LABEL: i128_m1: ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: subs r0, r0, #1 -; CHECK-T1-NEXT: ldr r1, [sp, #4] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] ; CHECK-T1-NEXT: str r1, [r0, #12] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #8] @@ -969,9 +969,8 @@ ; CHECK-T2-LABEL: i128_m1: ; CHECK-T2: @ %bb.0: ; CHECK-T2-NEXT: str r2, [r0, #-1]! 
-; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r2, [sp] +; CHECK-T2-NEXT: str r2, [r0, #12] ; CHECK-T2-NEXT: strd r3, r1, [r0, #4] ; CHECK-T2-NEXT: bx lr ; @@ -991,20 +990,20 @@ define ptr @i128_m4(ptr %p, i128 %v) { ; CHECK-T1-LABEL: i128_m4: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: ldr r1, [sp, #4] -; CHECK-T1-NEXT: str r1, [r0, #8] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #4] ; CHECK-T1-NEXT: str r3, [r0] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] +; CHECK-T1-NEXT: str r1, [r0, #8] ; CHECK-T1-NEXT: subs r0, r0, #4 ; CHECK-T1-NEXT: str r2, [r0] ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: i128_m4: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #8] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r12, [sp] +; CHECK-T2-NEXT: str.w r12, [r0, #8] ; CHECK-T2-NEXT: strd r3, r1, [r0] ; CHECK-T2-NEXT: str r2, [r0, #-4]! ; CHECK-T2-NEXT: bx lr diff --git a/llvm/test/CodeGen/ARM/store-preinc.ll b/llvm/test/CodeGen/ARM/store-preinc.ll --- a/llvm/test/CodeGen/ARM/store-preinc.ll +++ b/llvm/test/CodeGen/ARM/store-preinc.ll @@ -789,21 +789,20 @@ define ptr @i128_0(ptr %p, i128 %v) { ; CHECK-T1-LABEL: i128_0: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: ldr r1, [sp, #4] -; CHECK-T1-NEXT: str r1, [r0, #12] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #8] ; CHECK-T1-NEXT: stm r0!, {r2, r3} +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] +; CHECK-T1-NEXT: str r1, [r0, #4] ; CHECK-T1-NEXT: subs r0, #8 ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: i128_0: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] -; CHECK-T2-NEXT: str r1, [r0, #8] +; CHECK-T2-NEXT: ldrd r1, r12, [sp] ; CHECK-T2-NEXT: strd r2, r3, [r0] +; CHECK-T2-NEXT: strd r1, r12, [r0, #8] ; CHECK-T2-NEXT: bx lr ; ; CHECK-ARM-LABEL: i128_0: @@ -823,7 +822,8 @@ ; CHECK-T1-NEXT: movs r1, #3 ; CHECK-T1-NEXT: str r2, [r0, r1] ; CHECK-T1-NEXT: adds r0, r0, #3 -; CHECK-T1-NEXT: ldr r1, [sp, #4] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] ; CHECK-T1-NEXT: str r1, [r0, #12] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #8] @@ -833,9 +833,8 @@ ; CHECK-T2-LABEL: i128_3: ; CHECK-T2: @ %bb.0: ; CHECK-T2-NEXT: str r2, [r0, #3]! -; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r2, [sp] +; CHECK-T2-NEXT: str r2, [r0, #12] ; CHECK-T2-NEXT: strd r3, r1, [r0, #4] ; CHECK-T2-NEXT: bx lr ; @@ -855,21 +854,21 @@ define ptr @i128_4(ptr %p, i128 %v) { ; CHECK-T1-LABEL: i128_4: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: ldr r1, [sp, #4] -; CHECK-T1-NEXT: str r1, [r0, #16] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #12] ; CHECK-T1-NEXT: str r3, [r0, #8] ; CHECK-T1-NEXT: str r2, [r0, #4] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] +; CHECK-T1-NEXT: str r1, [r0, #16] ; CHECK-T1-NEXT: adds r0, r0, #4 ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: i128_4: ; CHECK-T2: @ %bb.0: ; CHECK-T2-NEXT: str r2, [r0, #4]! 
-; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r2, [sp] +; CHECK-T2-NEXT: str r2, [r0, #12] ; CHECK-T2-NEXT: strd r3, r1, [r0, #4] ; CHECK-T2-NEXT: bx lr ; @@ -889,21 +888,21 @@ define ptr @i128_8(ptr %p, i128 %v) { ; CHECK-T1-LABEL: i128_8: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: ldr r1, [sp, #4] -; CHECK-T1-NEXT: str r1, [r0, #20] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #16] ; CHECK-T1-NEXT: str r3, [r0, #12] ; CHECK-T1-NEXT: str r2, [r0, #8] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] +; CHECK-T1-NEXT: str r1, [r0, #20] ; CHECK-T1-NEXT: adds r0, #8 ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: i128_8: ; CHECK-T2: @ %bb.0: ; CHECK-T2-NEXT: str r2, [r0, #8]! -; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r2, [sp] +; CHECK-T2-NEXT: str r2, [r0, #12] ; CHECK-T2-NEXT: strd r3, r1, [r0, #4] ; CHECK-T2-NEXT: bx lr ; @@ -923,21 +922,21 @@ define ptr @i128_16(ptr %p, i128 %v) { ; CHECK-T1-LABEL: i128_16: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: ldr r1, [sp, #4] -; CHECK-T1-NEXT: str r1, [r0, #28] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #24] ; CHECK-T1-NEXT: str r3, [r0, #20] ; CHECK-T1-NEXT: str r2, [r0, #16] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] +; CHECK-T1-NEXT: str r1, [r0, #28] ; CHECK-T1-NEXT: adds r0, #16 ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: i128_16: ; CHECK-T2: @ %bb.0: ; CHECK-T2-NEXT: str r2, [r0, #16]! -; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r2, [sp] +; CHECK-T2-NEXT: str r2, [r0, #12] ; CHECK-T2-NEXT: strd r3, r1, [r0, #4] ; CHECK-T2-NEXT: bx lr ; @@ -958,7 +957,8 @@ ; CHECK-T1-LABEL: i128_m1: ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: subs r0, r0, #1 -; CHECK-T1-NEXT: ldr r1, [sp, #4] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] ; CHECK-T1-NEXT: str r1, [r0, #12] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #8] @@ -969,9 +969,8 @@ ; CHECK-T2-LABEL: i128_m1: ; CHECK-T2: @ %bb.0: ; CHECK-T2-NEXT: str r2, [r0, #-1]! -; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #12] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r2, [sp] +; CHECK-T2-NEXT: str r2, [r0, #12] ; CHECK-T2-NEXT: strd r3, r1, [r0, #4] ; CHECK-T2-NEXT: bx lr ; @@ -991,20 +990,20 @@ define ptr @i128_m4(ptr %p, i128 %v) { ; CHECK-T1-LABEL: i128_m4: ; CHECK-T1: @ %bb.0: -; CHECK-T1-NEXT: ldr r1, [sp, #4] -; CHECK-T1-NEXT: str r1, [r0, #8] ; CHECK-T1-NEXT: ldr r1, [sp] ; CHECK-T1-NEXT: str r1, [r0, #4] ; CHECK-T1-NEXT: str r3, [r0] +; CHECK-T1-NEXT: mov r1, sp +; CHECK-T1-NEXT: ldr r1, [r1, #4] +; CHECK-T1-NEXT: str r1, [r0, #8] ; CHECK-T1-NEXT: subs r0, r0, #4 ; CHECK-T1-NEXT: str r2, [r0] ; CHECK-T1-NEXT: bx lr ; ; CHECK-T2-LABEL: i128_m4: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: ldr r1, [sp, #4] -; CHECK-T2-NEXT: str r1, [r0, #8] -; CHECK-T2-NEXT: ldr r1, [sp] +; CHECK-T2-NEXT: ldrd r1, r12, [sp] +; CHECK-T2-NEXT: str.w r12, [r0, #8] ; CHECK-T2-NEXT: strd r3, r1, [r0] ; CHECK-T2-NEXT: str r2, [r0, #-4]! 
; CHECK-T2-NEXT: bx lr diff --git a/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll b/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll --- a/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll +++ b/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll @@ -294,12 +294,9 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: movw r0, :lower16:t -; CHECK-NEXT: movt r0, :upper16:t -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: sub r0, r0, #17 +; CHECK-NEXT: mov r0, #1 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: bhs .LBB12_2 +; CHECK-NEXT: bne .LBB12_2 ; CHECK-NEXT: @ %bb.1: @ %if.then ; CHECK-NEXT: bl abort ; CHECK-NEXT: .LBB12_2: @ %if.else diff --git a/llvm/test/CodeGen/ARM/sub-of-not.ll b/llvm/test/CodeGen/ARM/sub-of-not.ll --- a/llvm/test/CodeGen/ARM/sub-of-not.ll +++ b/llvm/test/CodeGen/ARM/sub-of-not.ll @@ -484,16 +484,16 @@ ; ARM6-LABEL: vector_i128_i64: ; ARM6: @ %bb.0: ; ARM6-NEXT: push {r11, lr} -; ARM6-NEXT: ldr lr, [sp, #8] -; ARM6-NEXT: ldr r12, [sp, #12] -; ARM6-NEXT: adds r0, lr, r0 -; ARM6-NEXT: ldr lr, [sp, #16] -; ARM6-NEXT: adc r1, r12, r1 +; ARM6-NEXT: ldr r12, [sp, #8] +; ARM6-NEXT: ldr lr, [sp, #12] +; ARM6-NEXT: adds r0, r12, r0 +; ARM6-NEXT: ldr r12, [sp, #16] +; ARM6-NEXT: adc r1, lr, r1 ; ARM6-NEXT: adds r0, r0, #1 -; ARM6-NEXT: ldr r12, [sp, #20] +; ARM6-NEXT: ldr lr, [sp, #20] ; ARM6-NEXT: adc r1, r1, #0 -; ARM6-NEXT: adds r2, lr, r2 -; ARM6-NEXT: adc r3, r12, r3 +; ARM6-NEXT: adds r2, r12, r2 +; ARM6-NEXT: adc r3, lr, r3 ; ARM6-NEXT: adds r2, r2, #1 ; ARM6-NEXT: adc r3, r3, #0 ; ARM6-NEXT: pop {r11, pc} @@ -514,14 +514,16 @@ ; THUMB6: @ %bb.0: ; THUMB6-NEXT: push {r4, r5, r7, lr} ; THUMB6-NEXT: mvns r4, r1 +; THUMB6-NEXT: add r1, sp, #16 +; THUMB6-NEXT: ldr r1, [r1, #4] ; THUMB6-NEXT: mvns r0, r0 -; THUMB6-NEXT: ldr r1, [sp, #20] ; THUMB6-NEXT: ldr r5, [sp, #16] ; THUMB6-NEXT: subs r0, r5, r0 ; THUMB6-NEXT: sbcs r1, r4 ; THUMB6-NEXT: mvns r4, r3 +; THUMB6-NEXT: add r3, sp, #24 +; THUMB6-NEXT: ldr r3, [r3, #4] ; THUMB6-NEXT: mvns r2, r2 -; THUMB6-NEXT: ldr r3, [sp, #28] ; THUMB6-NEXT: ldr r5, [sp, #24] ; THUMB6-NEXT: subs r2, r5, r2 ; THUMB6-NEXT: sbcs r3, r4 diff --git a/llvm/test/CodeGen/ARM/uadd_sat_plus.ll b/llvm/test/CodeGen/ARM/uadd_sat_plus.ll --- a/llvm/test/CodeGen/ARM/uadd_sat_plus.ll +++ b/llvm/test/CodeGen/ARM/uadd_sat_plus.ll @@ -46,8 +46,9 @@ ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: .save {r4, r5, r7, lr} ; CHECK-T1-NEXT: push {r4, r5, r7, lr} +; CHECK-T1-NEXT: add r2, sp, #16 +; CHECK-T1-NEXT: ldr r2, [r2, #4] ; CHECK-T1-NEXT: movs r5, #0 -; CHECK-T1-NEXT: ldr r2, [sp, #20] ; CHECK-T1-NEXT: ldr r3, [sp, #16] ; CHECK-T1-NEXT: adds r3, r0, r3 ; CHECK-T1-NEXT: adcs r2, r1 @@ -72,10 +73,10 @@ ; ; CHECK-T2-LABEL: func64: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: ldrd r2, r3, [sp] +; CHECK-T2-NEXT: ldrd r3, r2, [sp] ; CHECK-T2-NEXT: mov.w r12, #0 -; CHECK-T2-NEXT: adds r0, r0, r2 -; CHECK-T2-NEXT: adcs r1, r3 +; CHECK-T2-NEXT: adds r0, r0, r3 +; CHECK-T2-NEXT: adcs r1, r2 ; CHECK-T2-NEXT: adcs r2, r12, #0 ; CHECK-T2-NEXT: itt ne ; CHECK-T2-NEXT: movne.w r0, #-1 @@ -84,9 +85,8 @@ ; ; CHECK-ARM-LABEL: func64: ; CHECK-ARM: @ %bb.0: -; CHECK-ARM-NEXT: ldr r2, [sp] +; CHECK-ARM-NEXT: ldm sp, {r2, r3} ; CHECK-ARM-NEXT: mov r12, #0 -; CHECK-ARM-NEXT: ldr r3, [sp, #4] ; CHECK-ARM-NEXT: adds r0, r0, r2 ; CHECK-ARM-NEXT: adcs r1, r1, r3 ; CHECK-ARM-NEXT: adcs r2, r12, #0 diff --git a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll +++ 
b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll @@ -7,93 +7,89 @@ ; ARMV6: @ %bb.0: @ %start ; ARMV6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ARMV6-NEXT: sub sp, sp, #28 -; ARMV6-NEXT: ldr r7, [sp, #72] -; ARMV6-NEXT: mov r6, r0 -; ARMV6-NEXT: str r0, [sp, #8] @ 4-byte Spill -; ARMV6-NEXT: ldr r4, [sp, #84] -; ARMV6-NEXT: umull r1, r0, r2, r7 -; ARMV6-NEXT: mov lr, r7 -; ARMV6-NEXT: umull r5, r10, r4, r2 -; ARMV6-NEXT: str r1, [r6] -; ARMV6-NEXT: ldr r6, [sp, #80] -; ARMV6-NEXT: umull r1, r7, r3, r6 -; ARMV6-NEXT: str r7, [sp, #12] @ 4-byte Spill -; ARMV6-NEXT: add r1, r5, r1 -; ARMV6-NEXT: umull r7, r5, r6, r2 -; ARMV6-NEXT: mov r6, lr +; ARMV6-NEXT: ldr r10, [sp, #72] +; ARMV6-NEXT: mov r7, r0 +; ARMV6-NEXT: str r0, [sp, #12] @ 4-byte Spill +; ARMV6-NEXT: add lr, sp, #76 +; ARMV6-NEXT: umull r4, r0, r2, r10 +; ARMV6-NEXT: ldm lr, {r1, r5, lr} +; ARMV6-NEXT: str r4, [r7] +; ARMV6-NEXT: umull r4, r7, r3, r5 +; ARMV6-NEXT: umull r6, r9, lr, r2 +; ARMV6-NEXT: str r7, [sp, #4] @ 4-byte Spill +; ARMV6-NEXT: add r4, r6, r4 +; ARMV6-NEXT: umull r7, r6, r5, r2 +; ARMV6-NEXT: ldr r5, [sp, #64] ; ARMV6-NEXT: str r7, [sp, #16] @ 4-byte Spill ; ARMV6-NEXT: mov r7, #0 -; ARMV6-NEXT: adds r1, r5, r1 -; ARMV6-NEXT: str r1, [sp, #4] @ 4-byte Spill -; ARMV6-NEXT: adc r1, r7, #0 -; ARMV6-NEXT: str r1, [sp, #24] @ 4-byte Spill -; ARMV6-NEXT: ldr r1, [sp, #64] -; ARMV6-NEXT: ldr r7, [sp, #76] -; ARMV6-NEXT: ldr r5, [sp, #64] -; ARMV6-NEXT: umull r12, r9, r7, r1 -; ARMV6-NEXT: ldr r1, [sp, #68] -; ARMV6-NEXT: umull r11, r8, r1, lr +; ARMV6-NEXT: adds r6, r6, r4 +; ARMV6-NEXT: str r6, [sp, #8] @ 4-byte Spill +; ARMV6-NEXT: ldr r6, [sp, #68] +; ARMV6-NEXT: adc r7, r7, #0 +; ARMV6-NEXT: str r7, [sp, #24] @ 4-byte Spill +; ARMV6-NEXT: umull r12, r7, r1, r5 +; ARMV6-NEXT: umull r11, r4, r6, r10 ; ARMV6-NEXT: add r12, r11, r12 -; ARMV6-NEXT: umull r11, lr, r5, lr -; ARMV6-NEXT: mov r5, r6 -; ARMV6-NEXT: mov r6, #0 -; ARMV6-NEXT: adds r12, lr, r12 -; ARMV6-NEXT: umull r2, lr, r2, r7 -; ARMV6-NEXT: adc r6, r6, #0 -; ARMV6-NEXT: str r6, [sp, #20] @ 4-byte Spill -; ARMV6-NEXT: ldr r6, [sp, #16] @ 4-byte Reload -; ARMV6-NEXT: adds r11, r11, r6 -; ARMV6-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; ARMV6-NEXT: adc r6, r12, r6 -; ARMV6-NEXT: mov r12, #0 -; ARMV6-NEXT: umlal r0, r12, r3, r5 +; ARMV6-NEXT: umull r11, r8, r5, r10 +; ARMV6-NEXT: mov r5, #0 +; ARMV6-NEXT: adds r12, r8, r12 +; ARMV6-NEXT: umull r2, r8, r2, r1 +; ARMV6-NEXT: adc r5, r5, #0 +; ARMV6-NEXT: str r5, [sp, #20] @ 4-byte Spill +; ARMV6-NEXT: ldr r5, [sp, #16] @ 4-byte Reload +; ARMV6-NEXT: adds r11, r11, r5 ; ARMV6-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; ARMV6-NEXT: str r6, [sp, #16] @ 4-byte Spill -; ARMV6-NEXT: ldr r6, [sp, #64] +; ARMV6-NEXT: adc r5, r12, r5 +; ARMV6-NEXT: mov r12, #0 +; ARMV6-NEXT: umlal r0, r12, r3, r10 +; ARMV6-NEXT: str r5, [sp, #16] @ 4-byte Spill +; ARMV6-NEXT: ldr r5, [sp, #12] @ 4-byte Reload ; ARMV6-NEXT: adds r0, r2, r0 ; ARMV6-NEXT: str r0, [r5, #4] -; ARMV6-NEXT: adcs r0, r12, lr +; ARMV6-NEXT: adcs r0, r12, r8 ; ARMV6-NEXT: mov r2, #0 -; ARMV6-NEXT: adc r2, r2, #0 -; ARMV6-NEXT: orrs lr, r6, r1 -; ARMV6-NEXT: ldr r6, [sp, #80] -; ARMV6-NEXT: movne lr, #1 -; ARMV6-NEXT: umlal r0, r2, r3, r7 -; ARMV6-NEXT: orrs r12, r6, r4 -; ARMV6-NEXT: movne r12, #1 +; ARMV6-NEXT: adc r12, r2, #0 +; ARMV6-NEXT: cmp r7, #0 +; ARMV6-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; ARMV6-NEXT: movne r7, #1 +; ARMV6-NEXT: cmp r4, #0 +; ARMV6-NEXT: umlal r0, r12, r3, r1 +; ARMV6-NEXT: movne r4, #1 +; ARMV6-NEXT: cmp r2, #0 +; 
ARMV6-NEXT: movne r2, #1 ; ARMV6-NEXT: cmp r9, #0 -; ARMV6-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; ARMV6-NEXT: mov r10, r2 +; ARMV6-NEXT: ldr r2, [sp, #64] ; ARMV6-NEXT: movne r9, #1 -; ARMV6-NEXT: cmp r8, #0 +; ARMV6-NEXT: orrs r8, r2, r6 +; ARMV6-NEXT: ldr r2, [sp, #80] ; ARMV6-NEXT: movne r8, #1 +; ARMV6-NEXT: orrs r2, r2, lr +; ARMV6-NEXT: movne r2, #1 ; ARMV6-NEXT: cmp r6, #0 ; ARMV6-NEXT: movne r6, #1 -; ARMV6-NEXT: cmp r10, #0 -; ARMV6-NEXT: movne r10, #1 ; ARMV6-NEXT: cmp r1, #0 ; ARMV6-NEXT: movne r1, #1 -; ARMV6-NEXT: cmp r7, #0 -; ARMV6-NEXT: movne r7, #1 -; ARMV6-NEXT: cmp r4, #0 -; ARMV6-NEXT: movne r4, #1 +; ARMV6-NEXT: cmp lr, #0 +; ARMV6-NEXT: movne lr, #1 ; ARMV6-NEXT: cmp r3, #0 ; ARMV6-NEXT: movne r3, #1 ; ARMV6-NEXT: adds r0, r0, r11 ; ARMV6-NEXT: str r0, [r5, #8] -; ARMV6-NEXT: and r1, r1, r7 +; ARMV6-NEXT: and r1, r6, r1 ; ARMV6-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; ARMV6-NEXT: orr r1, r1, r8 -; ARMV6-NEXT: orr r1, r1, r9 -; ARMV6-NEXT: adcs r0, r2, r0 +; ARMV6-NEXT: orr r1, r1, r4 +; ARMV6-NEXT: orr r1, r1, r7 +; ARMV6-NEXT: and r2, r8, r2 +; ARMV6-NEXT: adcs r0, r12, r0 ; ARMV6-NEXT: str r0, [r5, #12] -; ARMV6-NEXT: and r0, r4, r3 -; ARMV6-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; ARMV6-NEXT: and r0, lr, r3 +; ARMV6-NEXT: ldr r3, [sp, #24] @ 4-byte Reload +; ARMV6-NEXT: orr r0, r0, r9 ; ARMV6-NEXT: orr r0, r0, r10 -; ARMV6-NEXT: orr r0, r0, r6 -; ARMV6-NEXT: orr r0, r0, r2 -; ARMV6-NEXT: ldr r2, [sp, #20] @ 4-byte Reload -; ARMV6-NEXT: orr r1, r1, r2 -; ARMV6-NEXT: and r2, lr, r12 +; ARMV6-NEXT: orr r0, r0, r3 +; ARMV6-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; ARMV6-NEXT: orr r1, r1, r3 ; ARMV6-NEXT: orr r1, r2, r1 ; ARMV6-NEXT: orr r0, r1, r0 ; ARMV6-NEXT: mov r1, #0 @@ -108,101 +104,97 @@ ; ARMV7: @ %bb.0: @ %start ; ARMV7-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ARMV7-NEXT: sub sp, sp, #36 -; ARMV7-NEXT: ldr r5, [sp, #84] +; ARMV7-NEXT: ldr r6, [sp, #84] ; ARMV7-NEXT: mov r8, r0 ; ARMV7-NEXT: ldr r1, [sp, #72] -; ARMV7-NEXT: ldr r10, [sp, #80] -; ARMV7-NEXT: ldr r9, [sp, #76] -; ARMV7-NEXT: umull r4, lr, r5, r1 -; ARMV7-NEXT: umull r0, r7, r2, r10 -; ARMV7-NEXT: str r4, [sp, #24] @ 4-byte Spill -; ARMV7-NEXT: ldr r4, [sp, #88] -; ARMV7-NEXT: umull r1, r6, r1, r10 +; ARMV7-NEXT: ldr r7, [sp, #80] +; ARMV7-NEXT: ldr lr, [sp, #88] +; ARMV7-NEXT: umull r9, r10, r2, r6 +; ARMV7-NEXT: umull r4, r11, r6, r1 +; ARMV7-NEXT: umull r0, r5, r2, r7 +; ARMV7-NEXT: str r4, [sp, #16] @ 4-byte Spill +; ARMV7-NEXT: umull r1, r4, r1, r7 ; ARMV7-NEXT: str r0, [sp, #32] @ 4-byte Spill -; ARMV7-NEXT: umull r11, r0, r2, r5 -; ARMV7-NEXT: str r6, [sp, #20] @ 4-byte Spill -; ARMV7-NEXT: str r1, [sp, #28] @ 4-byte Spill -; ARMV7-NEXT: umull r6, r12, r3, r4 -; ARMV7-NEXT: ldr r1, [sp, #92] -; ARMV7-NEXT: str r0, [sp, #8] @ 4-byte Spill ; ARMV7-NEXT: mov r0, #0 -; ARMV7-NEXT: umlal r7, r0, r3, r10 -; ARMV7-NEXT: str r6, [sp, #16] @ 4-byte Spill -; ARMV7-NEXT: umull r6, r1, r1, r2 -; ARMV7-NEXT: umull r2, r4, r4, r2 -; ARMV7-NEXT: str r6, [sp, #4] @ 4-byte Spill -; ARMV7-NEXT: str r2, [sp, #12] @ 4-byte Spill -; ARMV7-NEXT: adds r2, r11, r7 -; ARMV7-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; ARMV7-NEXT: mov r11, #0 -; ARMV7-NEXT: str r4, [sp] @ 4-byte Spill -; ARMV7-NEXT: umull r6, r4, r9, r10 -; ARMV7-NEXT: adcs r9, r0, r7 +; ARMV7-NEXT: umlal r5, r0, r3, r7 +; ARMV7-NEXT: str r1, [sp, #28] @ 4-byte Spill +; ARMV7-NEXT: str r4, [sp, #8] @ 4-byte Spill +; ARMV7-NEXT: umull r1, r12, r3, lr +; ARMV7-NEXT: adds r5, r9, r5 +; ARMV7-NEXT: adcs r9, r0, r10 ; ARMV7-NEXT: ldr r0, 
[sp, #32] @ 4-byte Reload -; ARMV7-NEXT: adc r10, r11, #0 -; ARMV7-NEXT: stm r8, {r0, r2} -; ARMV7-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; ARMV7-NEXT: umlal r9, r10, r3, r5 -; ARMV7-NEXT: ldr r2, [sp, #20] @ 4-byte Reload -; ARMV7-NEXT: add r0, r6, r0 -; ARMV7-NEXT: adds r0, r2, r0 -; ARMV7-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; ARMV7-NEXT: adc r2, r11, #0 -; ARMV7-NEXT: str r2, [sp, #32] @ 4-byte Spill -; ARMV7-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; ARMV7-NEXT: str r1, [sp, #20] @ 4-byte Spill +; ARMV7-NEXT: ldr r1, [sp, #92] +; ARMV7-NEXT: umull r4, r1, r1, r2 +; ARMV7-NEXT: str r4, [sp, #12] @ 4-byte Spill +; ARMV7-NEXT: umull r2, r4, lr, r2 +; ARMV7-NEXT: str r2, [sp, #24] @ 4-byte Spill +; ARMV7-NEXT: ldr r2, [sp, #76] +; ARMV7-NEXT: str r4, [sp, #4] @ 4-byte Spill +; ARMV7-NEXT: stm r8, {r0, r5} +; ARMV7-NEXT: umull lr, r2, r2, r7 +; ARMV7-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; ARMV7-NEXT: mov r7, #0 +; ARMV7-NEXT: adc r10, r7, #0 +; ARMV7-NEXT: ldr r4, [sp, #12] @ 4-byte Reload +; ARMV7-NEXT: umlal r9, r10, r3, r6 +; ARMV7-NEXT: add r5, lr, r0 +; ARMV7-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; ARMV7-NEXT: mov lr, #0 +; ARMV7-NEXT: adds r5, r0, r5 +; ARMV7-NEXT: adc r0, r7, #0 +; ARMV7-NEXT: str r0, [sp, #32] @ 4-byte Spill +; ARMV7-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; ARMV7-NEXT: ldr r7, [sp, #28] @ 4-byte Reload -; ARMV7-NEXT: add r2, r6, r2 -; ARMV7-NEXT: ldr r6, [sp] @ 4-byte Reload -; ARMV7-NEXT: adds r2, r6, r2 -; ARMV7-NEXT: ldr r6, [sp, #12] @ 4-byte Reload -; ARMV7-NEXT: adc r11, r11, #0 -; ARMV7-NEXT: adds r7, r7, r6 -; ARMV7-NEXT: ldr r6, [sp, #92] -; ARMV7-NEXT: adc r0, r0, r2 -; ARMV7-NEXT: str r0, [sp, #28] @ 4-byte Spill +; ARMV7-NEXT: add r0, r4, r0 +; ARMV7-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; ARMV7-NEXT: adds r0, r4, r0 +; ARMV7-NEXT: ldr r4, [sp, #24] @ 4-byte Reload +; ARMV7-NEXT: adc lr, lr, #0 +; ARMV7-NEXT: adds r7, r7, r4 +; ARMV7-NEXT: adc r4, r5, r0 ; ARMV7-NEXT: ldr r0, [sp, #92] ; ARMV7-NEXT: cmp r3, #0 ; ARMV7-NEXT: movwne r3, #1 -; ARMV7-NEXT: ldr r2, [sp, #76] ; ARMV7-NEXT: cmp r0, #0 -; ARMV7-NEXT: movwne r0, #1 +; ARMV7-NEXT: mov r5, r0 +; ARMV7-NEXT: movwne r5, #1 +; ARMV7-NEXT: and r3, r5, r3 +; ARMV7-NEXT: ldr r5, [sp, #76] ; ARMV7-NEXT: cmp r1, #0 ; ARMV7-NEXT: movwne r1, #1 ; ARMV7-NEXT: cmp r12, #0 -; ARMV7-NEXT: and r0, r0, r3 +; ARMV7-NEXT: orr r1, r3, r1 ; ARMV7-NEXT: movwne r12, #1 ; ARMV7-NEXT: cmp r5, #0 -; ARMV7-NEXT: orr r0, r0, r1 -; ARMV7-NEXT: movwne r5, #1 +; ARMV7-NEXT: mov r3, r5 +; ARMV7-NEXT: movwne r3, #1 +; ARMV7-NEXT: cmp r6, #0 +; ARMV7-NEXT: movwne r6, #1 ; ARMV7-NEXT: cmp r2, #0 -; ARMV7-NEXT: mov r1, r2 -; ARMV7-NEXT: mov r3, r2 -; ARMV7-NEXT: movwne r1, #1 -; ARMV7-NEXT: cmp r4, #0 -; ARMV7-NEXT: ldr r2, [sp, #72] -; ARMV7-NEXT: movwne r4, #1 -; ARMV7-NEXT: cmp lr, #0 -; ARMV7-NEXT: and r1, r1, r5 -; ARMV7-NEXT: movwne lr, #1 -; ARMV7-NEXT: orrs r2, r2, r3 -; ARMV7-NEXT: ldr r3, [sp, #88] +; ARMV7-NEXT: and r3, r3, r6 ; ARMV7-NEXT: movwne r2, #1 -; ARMV7-NEXT: orr r1, r1, r4 -; ARMV7-NEXT: orr r0, r0, r12 -; ARMV7-NEXT: orrs r3, r3, r6 -; ARMV7-NEXT: orr r1, r1, lr +; ARMV7-NEXT: orr r2, r3, r2 +; ARMV7-NEXT: ldr r3, [sp, #72] +; ARMV7-NEXT: cmp r11, #0 +; ARMV7-NEXT: ldr r6, [sp, #88] +; ARMV7-NEXT: movwne r11, #1 +; ARMV7-NEXT: orrs r3, r3, r5 ; ARMV7-NEXT: movwne r3, #1 +; ARMV7-NEXT: orrs r6, r6, r0 +; ARMV7-NEXT: movwne r6, #1 ; ARMV7-NEXT: adds r7, r9, r7 ; ARMV7-NEXT: str r7, [r8, #8] -; ARMV7-NEXT: and r2, r2, r3 -; ARMV7-NEXT: ldr r7, [sp, #28] @ 4-byte Reload -; ARMV7-NEXT: orr r0, r0, 
r11 -; ARMV7-NEXT: adcs r7, r10, r7 -; ARMV7-NEXT: str r7, [r8, #12] -; ARMV7-NEXT: ldr r7, [sp, #32] @ 4-byte Reload -; ARMV7-NEXT: orr r1, r1, r7 -; ARMV7-NEXT: orr r1, r2, r1 -; ARMV7-NEXT: orr r0, r1, r0 +; ARMV7-NEXT: adcs r0, r10, r4 +; ARMV7-NEXT: str r0, [r8, #12] +; ARMV7-NEXT: orr r2, r2, r11 +; ARMV7-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; ARMV7-NEXT: orr r1, r1, r12 +; ARMV7-NEXT: orr r1, r1, lr +; ARMV7-NEXT: orr r0, r2, r0 +; ARMV7-NEXT: and r2, r3, r6 +; ARMV7-NEXT: orr r0, r2, r0 +; ARMV7-NEXT: orr r0, r0, r1 ; ARMV7-NEXT: mov r1, #0 ; ARMV7-NEXT: adc r1, r1, #0 ; ARMV7-NEXT: orr r0, r0, r1 diff --git a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll @@ -193,62 +193,74 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; ARM5-LABEL: test_urem_odd_setne: ; ARM5: @ %bb.0: -; ARM5-NEXT: mov r1, #13 -; ARM5-NEXT: mul r2, r0, r1 -; ARM5-NEXT: mov r0, #0 -; ARM5-NEXT: and r1, r2, #15 -; ARM5-NEXT: cmp r1, #3 -; ARM5-NEXT: movhi r0, #1 +; ARM5-NEXT: and r1, r0, #15 +; ARM5-NEXT: mov r2, #13 +; ARM5-NEXT: mul r3, r1, r2 +; ARM5-NEXT: lsr r1, r3, #6 +; ARM5-NEXT: orr r1, r1, r1, lsl #2 +; ARM5-NEXT: sub r0, r0, r1 +; ARM5-NEXT: ands r0, r0, #15 +; ARM5-NEXT: movne r0, #1 ; ARM5-NEXT: bx lr ; ; ARM6-LABEL: test_urem_odd_setne: ; ARM6: @ %bb.0: -; ARM6-NEXT: mov r1, #13 -; ARM6-NEXT: mul r0, r0, r1 ; ARM6-NEXT: and r1, r0, #15 -; ARM6-NEXT: mov r0, #0 -; ARM6-NEXT: cmp r1, #3 -; ARM6-NEXT: movhi r0, #1 +; ARM6-NEXT: mov r2, #13 +; ARM6-NEXT: smulbb r1, r1, r2 +; ARM6-NEXT: lsr r1, r1, #6 +; ARM6-NEXT: orr r1, r1, r1, lsl #2 +; ARM6-NEXT: sub r0, r0, r1 +; ARM6-NEXT: ands r0, r0, #15 +; ARM6-NEXT: movne r0, #1 ; ARM6-NEXT: bx lr ; ; ARM7-LABEL: test_urem_odd_setne: ; ARM7: @ %bb.0: -; ARM7-NEXT: mov r1, #13 -; ARM7-NEXT: mul r0, r0, r1 ; ARM7-NEXT: and r1, r0, #15 -; ARM7-NEXT: mov r0, #0 -; ARM7-NEXT: cmp r1, #3 -; ARM7-NEXT: movwhi r0, #1 +; ARM7-NEXT: mov r2, #13 +; ARM7-NEXT: smulbb r1, r1, r2 +; ARM7-NEXT: lsr r1, r1, #6 +; ARM7-NEXT: orr r1, r1, r1, lsl #2 +; ARM7-NEXT: sub r0, r0, r1 +; ARM7-NEXT: ands r0, r0, #15 +; ARM7-NEXT: movwne r0, #1 ; ARM7-NEXT: bx lr ; ; ARM8-LABEL: test_urem_odd_setne: ; ARM8: @ %bb.0: -; ARM8-NEXT: mov r1, #13 -; ARM8-NEXT: mul r0, r0, r1 ; ARM8-NEXT: and r1, r0, #15 -; ARM8-NEXT: mov r0, #0 -; ARM8-NEXT: cmp r1, #3 -; ARM8-NEXT: movwhi r0, #1 +; ARM8-NEXT: mov r2, #13 +; ARM8-NEXT: smulbb r1, r1, r2 +; ARM8-NEXT: lsr r1, r1, #6 +; ARM8-NEXT: orr r1, r1, r1, lsl #2 +; ARM8-NEXT: sub r0, r0, r1 +; ARM8-NEXT: ands r0, r0, #15 +; ARM8-NEXT: movwne r0, #1 ; ARM8-NEXT: bx lr ; ; NEON7-LABEL: test_urem_odd_setne: ; NEON7: @ %bb.0: -; NEON7-NEXT: mov r1, #13 -; NEON7-NEXT: mul r0, r0, r1 ; NEON7-NEXT: and r1, r0, #15 -; NEON7-NEXT: mov r0, #0 -; NEON7-NEXT: cmp r1, #3 -; NEON7-NEXT: movwhi r0, #1 +; NEON7-NEXT: mov r2, #13 +; NEON7-NEXT: smulbb r1, r1, r2 +; NEON7-NEXT: lsr r1, r1, #6 +; NEON7-NEXT: orr r1, r1, r1, lsl #2 +; NEON7-NEXT: sub r0, r0, r1 +; NEON7-NEXT: ands r0, r0, #15 +; NEON7-NEXT: movwne r0, #1 ; NEON7-NEXT: bx lr ; ; NEON8-LABEL: test_urem_odd_setne: ; NEON8: @ %bb.0: -; NEON8-NEXT: mov r1, #13 -; NEON8-NEXT: mul r0, r0, r1 ; NEON8-NEXT: and r1, r0, #15 -; NEON8-NEXT: mov r0, #0 -; NEON8-NEXT: cmp r1, #3 -; NEON8-NEXT: movwhi r0, #1 +; NEON8-NEXT: mov r2, #13 +; NEON8-NEXT: smulbb r1, r1, r2 +; NEON8-NEXT: lsr r1, r1, #6 +; NEON8-NEXT: orr r1, r1, r1, lsl #2 +; NEON8-NEXT: 
sub r0, r0, r1 +; NEON8-NEXT: ands r0, r0, #15 +; NEON8-NEXT: movwne r0, #1 ; NEON8-NEXT: bx lr %urem = urem i4 %X, 5 %cmp = icmp ne i4 %urem, 0 diff --git a/llvm/test/CodeGen/ARM/usub_sat_plus.ll b/llvm/test/CodeGen/ARM/usub_sat_plus.ll --- a/llvm/test/CodeGen/ARM/usub_sat_plus.ll +++ b/llvm/test/CodeGen/ARM/usub_sat_plus.ll @@ -45,37 +45,39 @@ ; CHECK-T1: @ %bb.0: ; CHECK-T1-NEXT: .save {r4, lr} ; CHECK-T1-NEXT: push {r4, lr} -; CHECK-T1-NEXT: mov r2, r1 -; CHECK-T1-NEXT: movs r1, #0 -; CHECK-T1-NEXT: ldr r4, [sp, #12] +; CHECK-T1-NEXT: add r2, sp, #8 +; CHECK-T1-NEXT: ldr r4, [r2, #4] +; CHECK-T1-NEXT: movs r2, #0 ; CHECK-T1-NEXT: ldr r3, [sp, #8] ; CHECK-T1-NEXT: subs r3, r0, r3 -; CHECK-T1-NEXT: sbcs r2, r4 -; CHECK-T1-NEXT: mov r0, r1 -; CHECK-T1-NEXT: adcs r0, r1 +; CHECK-T1-NEXT: sbcs r1, r4 +; CHECK-T1-NEXT: mov r0, r2 +; CHECK-T1-NEXT: adcs r0, r2 ; CHECK-T1-NEXT: movs r4, #1 ; CHECK-T1-NEXT: eors r4, r0 -; CHECK-T1-NEXT: mov r0, r1 +; CHECK-T1-NEXT: mov r0, r2 ; CHECK-T1-NEXT: beq .LBB1_3 ; CHECK-T1-NEXT: @ %bb.1: ; CHECK-T1-NEXT: cmp r4, #0 ; CHECK-T1-NEXT: beq .LBB1_4 ; CHECK-T1-NEXT: .LBB1_2: +; CHECK-T1-NEXT: mov r1, r2 ; CHECK-T1-NEXT: pop {r4, pc} ; CHECK-T1-NEXT: .LBB1_3: ; CHECK-T1-NEXT: mov r0, r3 ; CHECK-T1-NEXT: cmp r4, #0 ; CHECK-T1-NEXT: bne .LBB1_2 ; CHECK-T1-NEXT: .LBB1_4: +; CHECK-T1-NEXT: mov r2, r1 ; CHECK-T1-NEXT: mov r1, r2 ; CHECK-T1-NEXT: pop {r4, pc} ; ; CHECK-T2-LABEL: func64: ; CHECK-T2: @ %bb.0: -; CHECK-T2-NEXT: ldrd r2, r3, [sp] +; CHECK-T2-NEXT: ldrd r3, r2, [sp] ; CHECK-T2-NEXT: mov.w r12, #0 -; CHECK-T2-NEXT: subs r0, r0, r2 -; CHECK-T2-NEXT: sbcs r1, r3 +; CHECK-T2-NEXT: subs r0, r0, r3 +; CHECK-T2-NEXT: sbcs r1, r2 ; CHECK-T2-NEXT: adc r2, r12, #0 ; CHECK-T2-NEXT: eors r2, r2, #1 ; CHECK-T2-NEXT: itt ne @@ -85,9 +87,8 @@ ; ; CHECK-ARM-LABEL: func64: ; CHECK-ARM: @ %bb.0: -; CHECK-ARM-NEXT: ldr r2, [sp] +; CHECK-ARM-NEXT: ldm sp, {r2, r3} ; CHECK-ARM-NEXT: mov r12, #0 -; CHECK-ARM-NEXT: ldr r3, [sp, #4] ; CHECK-ARM-NEXT: subs r0, r0, r2 ; CHECK-ARM-NEXT: sbcs r1, r1, r3 ; CHECK-ARM-NEXT: adc r2, r12, #0 diff --git a/llvm/test/CodeGen/ARM/vcvt.ll b/llvm/test/CodeGen/ARM/vcvt.ll --- a/llvm/test/CodeGen/ARM/vcvt.ll +++ b/llvm/test/CodeGen/ARM/vcvt.ll @@ -245,7 +245,8 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov d17, r2, r3 ; CHECK-NEXT: vmov d16, r0, r1 -; CHECK-NEXT: vcvt.u32.f32 q8, q8, #1 +; CHECK-NEXT: vadd.f32 q8, q8, q8 +; CHECK-NEXT: vcvt.u32.f32 q8, q8 ; CHECK-NEXT: vmovn.i32 d16, q8 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll --- a/llvm/test/CodeGen/ARM/vdup.ll +++ b/llvm/test/CodeGen/ARM/vdup.ll @@ -198,9 +198,9 @@ define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind { ; CHECK-LABEL: v_shuffledupQ32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vdup.32 q8, r0 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov pc, lr %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0 %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer @@ -386,11 +386,9 @@ define <4 x i32> @tduplane(<4 x i32> %invec) { ; CHECK-LABEL: tduplane: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: mov r2, r1 ; CHECK-NEXT: mov r3, #255 -; CHECK-NEXT: vmov.32 r0, d16[1] -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: mov r2, r0 ; CHECK-NEXT: mov pc, lr %in = extractelement <4 x i32> %invec, i32 1 %1 = insertelement <4 x i32> undef, i32 %in, i32 
0 @@ -403,9 +401,10 @@ define <2 x float> @check_f32(<4 x float> %v) nounwind { ; CHECK-LABEL: check_f32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d16, r2, r3 -; CHECK-NEXT: vdup.32 d16, d16[1] -; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vdup.32 d2, d1[1] +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov r0, r1, d2 ; CHECK-NEXT: mov pc, lr %x = extractelement <4 x float> %v, i32 3 %1 = insertelement <2 x float> undef, float %x, i32 0 @@ -416,8 +415,7 @@ define <2 x i32> @check_i32(<4 x i32> %v) nounwind { ; CHECK-LABEL: check_i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d16, r2, r3 -; CHECK-NEXT: vdup.32 d16, d16[1] +; CHECK-NEXT: vdup.32 d16, r3 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %x = extractelement <4 x i32> %v, i32 3 @@ -430,7 +428,9 @@ ; CHECK-LABEL: check_i16: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmov.u16 r0, d16[3] ; CHECK-NEXT: vdup.16 d16, d16[3] +; CHECK-NEXT: vmov.16 d16[1], r0 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %x = extractelement <8 x i16> %v, i32 3 @@ -443,7 +443,9 @@ ; CHECK-LABEL: check_i8: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmov.u8 r0, d16[3] ; CHECK-NEXT: vdup.8 d16, d16[3] +; CHECK-NEXT: vmov.8 d16[1], r0 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %x = extractelement <16 x i8> %v, i32 3 diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-soft-float.ll @@ -139,21 +139,20 @@ define fp128 @test_v2f128_reassoc(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128_reassoc: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldr r12, [sp, #36] -; CHECK-NEXT: str r12, [sp, #12] ; CHECK-NEXT: ldr r12, [sp, #32] -; CHECK-NEXT: str r12, [sp, #8] -; CHECK-NEXT: ldr r12, [sp, #28] -; CHECK-NEXT: str r12, [sp, #4] -; CHECK-NEXT: ldr r12, [sp, #24] -; CHECK-NEXT: str r12, [sp] +; CHECK-NEXT: ldr r4, [sp, #40] +; CHECK-NEXT: ldr r5, [sp, #44] +; CHECK-NEXT: ldr lr, [sp, #36] +; CHECK-NEXT: stm sp, {r12, lr} +; CHECK-NEXT: str r4, [sp, #8] +; CHECK-NEXT: str r5, [sp, #12] ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call reassoc fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 0xL00000000000000008000000000000000, <2 x fp128> %a) ret fp128 %b @@ -162,21 +161,20 @@ define fp128 @test_v2f128_seq(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128_seq: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldr r12, [sp, #36] -; CHECK-NEXT: str r12, [sp, #12] ; CHECK-NEXT: ldr r12, [sp, #32] -; CHECK-NEXT: str r12, [sp, #8] -; CHECK-NEXT: ldr r12, [sp, #28] -; CHECK-NEXT: str r12, [sp, #4] -; CHECK-NEXT: ldr r12, [sp, #24] -; CHECK-NEXT: str r12, [sp] +; CHECK-NEXT: ldr r4, [sp, #40] +; CHECK-NEXT: ldr r5, [sp, #44] +; CHECK-NEXT: ldr lr, [sp, #36] +; CHECK-NEXT: stm sp, {r12, lr} +; CHECK-NEXT: str r4, [sp, #8] +; CHECK-NEXT: str r5, [sp, #12] ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: add sp, sp, #16 
-; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 0xL00000000000000008000000000000000, <2 x fp128> %a) ret fp128 %b diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll @@ -43,9 +43,6 @@ ; CHECK-NEXT: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: bl __aeabi_f2h -; CHECK-NEXT: mov r1, #255 -; CHECK-NEXT: orr r1, r1, #65280 -; CHECK-NEXT: and r0, r0, r1 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr %b = call half @llvm.vector.reduce.fadd.f16.v1f16(half -0.0, <1 x half> %a) @@ -207,13 +204,11 @@ ; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: ldr r4, [sp, #32] -; CHECK-NEXT: ldr r5, [sp, #40] -; CHECK-NEXT: ldr lr, [sp, #44] -; CHECK-NEXT: ldr r12, [sp, #36] -; CHECK-NEXT: stm sp, {r4, r12} -; CHECK-NEXT: str r5, [sp, #8] -; CHECK-NEXT: str lr, [sp, #12] +; CHECK-NEXT: add lr, sp, #36 +; CHECK-NEXT: ldr r5, [sp, #32] +; CHECK-NEXT: str r5, [sp] +; CHECK-NEXT: ldm lr, {r4, r12, lr} +; CHECK-NEXT: stmib sp, {r4, r12, lr} ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r4, r5, r11, lr} @@ -229,12 +224,11 @@ ; CHECK-NEXT: push {r4, r5, r11, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldr r12, [sp, #36] -; CHECK-NEXT: ldr lr, [sp, #32] +; CHECK-NEXT: ldr r12, [sp, #32] ; CHECK-NEXT: ldr r4, [sp, #40] ; CHECK-NEXT: ldr r5, [sp, #44] -; CHECK-NEXT: str lr, [sp] -; CHECK-NEXT: str r12, [sp, #4] +; CHECK-NEXT: ldr lr, [sp, #36] +; CHECK-NEXT: stm sp, {r12, lr} ; CHECK-NEXT: str r4, [sp, #8] ; CHECK-NEXT: str r5, [sp, #12] ; CHECK-NEXT: bl __addtf3 diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll @@ -116,59 +116,53 @@ ; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #28 ; CHECK-NEXT: sub sp, sp, #28 -; CHECK-NEXT: ldr r5, [sp, #76] +; CHECK-NEXT: add r7, sp, #64 +; CHECK-NEXT: ldr r4, [sp, #76] ; CHECK-NEXT: mov r8, r3 -; CHECK-NEXT: ldr r6, [sp, #72] ; CHECK-NEXT: mov r9, r2 -; CHECK-NEXT: ldr r4, [sp, #68] +; CHECK-NEXT: ldm r7, {r5, r6, r7} ; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: ldr r7, [sp, #64] ; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: str r5, [sp, #12] -; CHECK-NEXT: str r6, [sp, #8] -; CHECK-NEXT: str r4, [sp, #4] -; CHECK-NEXT: str r7, [sp] +; CHECK-NEXT: str r4, [sp, #12] +; CHECK-NEXT: stm sp, {r5, r6, r7} ; CHECK-NEXT: bl __gttf2 ; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: str r7, [sp] -; CHECK-NEXT: stmib sp, {r4, r6} -; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: stm sp, {r5, r6, r7} +; CHECK-NEXT: str r4, [sp, #12] ; CHECK-NEXT: bl __gttf2 ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: str r7, [sp] -; CHECK-NEXT: stmib sp, {r4, r6} -; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: stm sp, {r5, r6, r7} +; CHECK-NEXT: str r4, [sp, #12] ; CHECK-NEXT: bl __gttf2 ; CHECK-NEXT: cmp 
r0, #0 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: str r7, [sp] -; CHECK-NEXT: movgt r7, r11 +; CHECK-NEXT: stm sp, {r5, r6, r7} +; CHECK-NEXT: movgt r5, r11 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: stmib sp, {r4, r6} -; CHECK-NEXT: movgt r4, r10 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: movgt r6, r10 ; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: str r5, [sp, #12] -; CHECK-NEXT: movgt r6, r9 +; CHECK-NEXT: str r4, [sp, #12] +; CHECK-NEXT: movgt r7, r9 ; CHECK-NEXT: bl __gttf2 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: movgt r5, r8 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: movgt r4, r8 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: add sp, sp, #28 ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: mov pc, lr diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll @@ -116,59 +116,53 @@ ; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #28 ; CHECK-NEXT: sub sp, sp, #28 -; CHECK-NEXT: ldr r5, [sp, #76] +; CHECK-NEXT: add r7, sp, #64 +; CHECK-NEXT: ldr r4, [sp, #76] ; CHECK-NEXT: mov r8, r3 -; CHECK-NEXT: ldr r6, [sp, #72] ; CHECK-NEXT: mov r9, r2 -; CHECK-NEXT: ldr r4, [sp, #68] +; CHECK-NEXT: ldm r7, {r5, r6, r7} ; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: ldr r7, [sp, #64] ; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: str r5, [sp, #12] -; CHECK-NEXT: str r6, [sp, #8] -; CHECK-NEXT: str r4, [sp, #4] -; CHECK-NEXT: str r7, [sp] +; CHECK-NEXT: str r4, [sp, #12] +; CHECK-NEXT: stm sp, {r5, r6, r7} ; CHECK-NEXT: bl __lttf2 ; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: str r7, [sp] -; CHECK-NEXT: stmib sp, {r4, r6} -; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: stm sp, {r5, r6, r7} +; CHECK-NEXT: str r4, [sp, #12] ; CHECK-NEXT: bl __lttf2 ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: str r7, [sp] -; CHECK-NEXT: stmib sp, {r4, r6} -; CHECK-NEXT: str r5, [sp, #12] +; CHECK-NEXT: stm sp, {r5, r6, r7} +; CHECK-NEXT: str r4, [sp, #12] ; CHECK-NEXT: bl __lttf2 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: str r7, [sp] -; CHECK-NEXT: movmi r7, r11 +; CHECK-NEXT: stm sp, {r5, r6, r7} +; CHECK-NEXT: movmi r5, r11 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: stmib sp, {r4, r6} -; CHECK-NEXT: movmi r4, r10 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: movmi r6, r10 ; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: str r5, [sp, #12] -; CHECK-NEXT: movmi r6, r9 +; CHECK-NEXT: str r4, [sp, #12] +; CHECK-NEXT: movmi r7, r9 ; CHECK-NEXT: bl __lttf2 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: movmi r5, r8 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: 
mov r3, r5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: movmi r4, r8 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: add sp, sp, #28 ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: mov pc, lr diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-soft-float.ll @@ -104,21 +104,20 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldr r12, [sp, #36] -; CHECK-NEXT: str r12, [sp, #12] ; CHECK-NEXT: ldr r12, [sp, #32] -; CHECK-NEXT: str r12, [sp, #8] -; CHECK-NEXT: ldr r12, [sp, #28] -; CHECK-NEXT: str r12, [sp, #4] -; CHECK-NEXT: ldr r12, [sp, #24] -; CHECK-NEXT: str r12, [sp] +; CHECK-NEXT: ldr r4, [sp, #40] +; CHECK-NEXT: ldr r5, [sp, #44] +; CHECK-NEXT: ldr lr, [sp, #36] +; CHECK-NEXT: stm sp, {r12, lr} +; CHECK-NEXT: str r4, [sp, #8] +; CHECK-NEXT: str r5, [sp, #12] ; CHECK-NEXT: bl __multf3 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fast fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a) ret fp128 %b @@ -127,21 +126,20 @@ define fp128 @test_v2f128_strict(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128_strict: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r11, lr} -; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: .save {r4, r5, r11, lr} +; CHECK-NEXT: push {r4, r5, r11, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldr r12, [sp, #36] -; CHECK-NEXT: str r12, [sp, #12] ; CHECK-NEXT: ldr r12, [sp, #32] -; CHECK-NEXT: str r12, [sp, #8] -; CHECK-NEXT: ldr r12, [sp, #28] -; CHECK-NEXT: str r12, [sp, #4] -; CHECK-NEXT: ldr r12, [sp, #24] -; CHECK-NEXT: str r12, [sp] +; CHECK-NEXT: ldr r4, [sp, #40] +; CHECK-NEXT: ldr r5, [sp, #44] +; CHECK-NEXT: ldr lr, [sp, #36] +; CHECK-NEXT: stm sp, {r12, lr} +; CHECK-NEXT: str r4, [sp, #8] +; CHECK-NEXT: str r5, [sp, #12] ; CHECK-NEXT: bl __multf3 ; CHECK-NEXT: add sp, sp, #16 -; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: pop {r4, r5, r11, lr} ; CHECK-NEXT: mov pc, lr %b = call fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a) ret fp128 %b diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll --- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll @@ -16,9 +16,6 @@ ; CHECK-NEXT: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: bl __aeabi_f2h -; CHECK-NEXT: mov r1, #255 -; CHECK-NEXT: orr r1, r1, #65280 -; CHECK-NEXT: and r0, r0, r1 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr %b = call half @llvm.vector.reduce.fmul.f16.v1f16(half 1.0, <1 x half> %a) @@ -69,12 +66,11 @@ ; CHECK-NEXT: push {r4, r5, r11, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldr r12, [sp, #36] -; CHECK-NEXT: ldr lr, [sp, #32] +; CHECK-NEXT: ldr r12, [sp, #32] ; CHECK-NEXT: ldr r4, [sp, #40] ; CHECK-NEXT: ldr r5, [sp, #44] -; CHECK-NEXT: str lr, [sp] -; 
CHECK-NEXT: str r12, [sp, #4] +; CHECK-NEXT: ldr lr, [sp, #36] +; CHECK-NEXT: stm sp, {r12, lr} ; CHECK-NEXT: str r4, [sp, #8] ; CHECK-NEXT: str r5, [sp, #12] ; CHECK-NEXT: bl __multf3 diff --git a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll --- a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll +++ b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll @@ -84,12 +84,13 @@ define void @lshrIllegalType(ptr %A) nounwind { ; CHECK-LABEL: lshrIllegalType: ; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128] -; CHECK-NEXT: vshr.u32 q8, q8, #3 -; CHECK-NEXT: vst1.32 {d16, d17}, [r0:128]! -; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: vld1.32 {d16, d17}, [r1:128]! +; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128] ; CHECK-NEXT: vshr.u32 q8, q8, #3 +; CHECK-NEXT: vshr.u32 q9, q9, #3 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] +; CHECK-NEXT: vst1.64 {d18, d19}, [r1:128] ; CHECK-NEXT: bx lr %tmp1 = load <8 x i32>, ptr %A %tmp2 = lshr <8 x i32> %tmp1, < i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> @@ -345,10 +346,9 @@ define <2 x i8> @test_truncate(<2 x i128> %in) { ; CHECK-LABEL: test_truncate: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.32 d16[0], r0 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vld1.32 {d16[1]}, [r0:32] -; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vldr s1, [sp] +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: bx lr entry: %res = trunc <2 x i128> %in to <2 x i8> diff --git a/llvm/test/CodeGen/ARM/vlddup.ll b/llvm/test/CodeGen/ARM/vlddup.ll --- a/llvm/test/CodeGen/ARM/vlddup.ll +++ b/llvm/test/CodeGen/ARM/vlddup.ll @@ -17,10 +17,10 @@ define <8 x i8> @vld1dupi8_preinc(ptr noalias nocapture %a, i32 %b) nounwind { ; CHECK-LABEL: vld1dupi8_preinc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldr r2, [r0] -; CHECK-NEXT: add r3, r2, r1 +; CHECK-NEXT: ldr r3, [r0] +; CHECK-NEXT: ldrb r1, [r3, r1]! ; CHECK-NEXT: str r3, [r0] -; CHECK-NEXT: vld1.8 {d16[]}, [r3] +; CHECK-NEXT: vdup.8 d16, r1 ; CHECK-NEXT: vmov r2, r1, d16 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov pc, lr @@ -38,8 +38,9 @@ ; CHECK-LABEL: vld1dupi8_postinc_fixed: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: vld1.8 {d16[]}, [r3]! +; CHECK-NEXT: ldrb r1, [r3], #1 ; CHECK-NEXT: str r3, [r0] +; CHECK-NEXT: vdup.8 d16, r1 ; CHECK-NEXT: vmov r2, r1, d16 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov pc, lr @@ -57,8 +58,9 @@ ; CHECK-LABEL: vld1dupi8_postinc_register: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldr r3, [r0] -; CHECK-NEXT: vld1.8 {d16[]}, [r3], r1 +; CHECK-NEXT: ldrb r1, [r3], r1 ; CHECK-NEXT: str r3, [r0] +; CHECK-NEXT: vdup.8 d16, r1 ; CHECK-NEXT: vmov r2, r1, d16 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov pc, lr @@ -77,10 +79,10 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} -; CHECK-NEXT: ldr r2, [r0] -; CHECK-NEXT: add lr, r2, r1 +; CHECK-NEXT: ldr lr, [r0] +; CHECK-NEXT: ldrb r1, [lr, r1]! ; CHECK-NEXT: str lr, [r0] -; CHECK-NEXT: vld1.8 {d16[], d17[]}, [lr] +; CHECK-NEXT: vdup.8 q8, r1 ; CHECK-NEXT: vmov r12, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov r0, r12 @@ -102,8 +104,9 @@ ; CHECK-NEXT: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: ldr lr, [r0] -; CHECK-NEXT: vld1.8 {d16[], d17[]}, [lr]! 
+; CHECK-NEXT: ldrb r1, [lr], #1 ; CHECK-NEXT: str lr, [r0] +; CHECK-NEXT: vdup.8 q8, r1 ; CHECK-NEXT: vmov r12, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov r0, r12 @@ -125,8 +128,9 @@ ; CHECK-NEXT: .save {r11, lr} ; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: ldr lr, [r0] -; CHECK-NEXT: vld1.8 {d16[], d17[]}, [lr], r1 +; CHECK-NEXT: ldrb r1, [lr], r1 ; CHECK-NEXT: str lr, [r0] +; CHECK-NEXT: vdup.8 q8, r1 ; CHECK-NEXT: vmov r12, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov r0, r12 @@ -231,9 +235,9 @@ ; CHECK-LABEL: load_i32_dup_zext: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vdup.32 q8, r0 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov pc, lr %tmp1 = load i8, ptr %A, align 1 %tmp2 = zext i8 %tmp1 to i32 @@ -247,9 +251,9 @@ ; CHECK-LABEL: load_i32_dup_sext: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldrsb r0, [r0] -; CHECK-NEXT: vdup.32 q8, r0 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov pc, lr %tmp1 = load i8, ptr %A, align 1 %tmp2 = sext i8 %tmp1 to i32 @@ -348,10 +352,12 @@ ; CHECK-LABEL: vld2dupi8_postinc_fixed: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldr r2, [r1] -; CHECK-NEXT: vld2.8 {d16[], d17[]}, [r2]! +; CHECK-NEXT: vld2.8 {d16[0], d17[0]}, [r2]! +; CHECK-NEXT: vdup.8 d16, d16[0] ; CHECK-NEXT: str r2, [r1] +; CHECK-NEXT: vdup.8 d18, d17[0] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! -; CHECK-NEXT: vstr d17, [r0] +; CHECK-NEXT: vstr d18, [r0] ; CHECK-NEXT: mov pc, lr entry: %0 = load ptr, ptr %a, align 4 @@ -372,10 +378,12 @@ ; CHECK-LABEL: vld2dupi8_postinc_variable: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldr r3, [r1] -; CHECK-NEXT: vld2.8 {d16[], d17[]}, [r3], r2 +; CHECK-NEXT: vld2.8 {d16[0], d17[0]}, [r3], r2 +; CHECK-NEXT: vdup.8 d16, d16[0] ; CHECK-NEXT: str r3, [r1] +; CHECK-NEXT: vdup.8 d18, d17[0] ; CHECK-NEXT: vst1.8 {d16}, [r0:64]! -; CHECK-NEXT: vstr d17, [r0] +; CHECK-NEXT: vstr d18, [r0] ; CHECK-NEXT: mov pc, lr entry: %0 = load ptr, ptr %a, align 4 diff --git a/llvm/test/CodeGen/ARM/vldlane.ll b/llvm/test/CodeGen/ARM/vldlane.ll --- a/llvm/test/CodeGen/ARM/vldlane.ll +++ b/llvm/test/CodeGen/ARM/vldlane.ll @@ -913,33 +913,17 @@ ; we don't currently have a QQQQ_VFP2 super-regclass. (The "0" for the low ; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.) 
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind { -; DEFAULT-LABEL: test_qqqq_regsequence_subreg: -; DEFAULT: @ %bb.0: -; DEFAULT-NEXT: add r0, sp, #24 -; DEFAULT-NEXT: vld1.32 {d21[0]}, [r0:32] -; DEFAULT-NEXT: add r0, sp, #28 -; DEFAULT-NEXT: vmov.i32 d20, #0x0 -; DEFAULT-NEXT: vld1.32 {d21[1]}, [r0:32] -; DEFAULT-NEXT: vld3.16 {d16[1], d18[1], d20[1]}, [r0] -; DEFAULT-NEXT: vadd.i16 q12, q8, q9 -; DEFAULT-NEXT: vadd.i16 q8, q10, q12 -; DEFAULT-NEXT: vmov r0, r1, d16 -; DEFAULT-NEXT: vmov r2, r3, d17 -; DEFAULT-NEXT: mov pc, lr -; -; BASIC-LABEL: test_qqqq_regsequence_subreg: -; BASIC: @ %bb.0: -; BASIC-NEXT: add r0, sp, #24 -; BASIC-NEXT: vld1.32 {d23[0]}, [r0:32] -; BASIC-NEXT: add r0, sp, #28 -; BASIC-NEXT: vmov.i32 d22, #0x0 -; BASIC-NEXT: vld1.32 {d23[1]}, [r0:32] -; BASIC-NEXT: vld3.16 {d18[1], d20[1], d22[1]}, [r0] -; BASIC-NEXT: vadd.i16 q8, q9, q10 -; BASIC-NEXT: vadd.i16 q8, q11, q8 -; BASIC-NEXT: vmov r0, r1, d16 -; BASIC-NEXT: vmov r2, r3, d17 -; BASIC-NEXT: mov pc, lr +; CHECK-LABEL: test_qqqq_regsequence_subreg: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 d4, #0x0 +; CHECK-NEXT: vldr s10, [sp, #24] +; CHECK-NEXT: vldr s11, [sp, #28] +; CHECK-NEXT: vld3.16 {d0[1], d2[1], d4[1]}, [r0] +; CHECK-NEXT: vadd.i16 q8, q0, q1 +; CHECK-NEXT: vadd.i16 q8, q2, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp63 = extractvalue [6 x i64] %b, 5 %tmp64 = zext i64 %tmp63 to i128 %tmp65 = shl i128 %tmp64, 64 diff --git a/llvm/test/CodeGen/ARM/vrev.ll b/llvm/test/CodeGen/ARM/vrev.ll --- a/llvm/test/CodeGen/ARM/vrev.ll +++ b/llvm/test/CodeGen/ARM/vrev.ll @@ -273,8 +273,9 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128] ; CHECK-NEXT: vadd.f32 d18, d17, d17 -; CHECK-NEXT: vrev64.32 d16, d16 -; CHECK-NEXT: vrev64.32 d17, d18 +; CHECK-NEXT: vext.32 q8, q8, q8, #2 +; CHECK-NEXT: vext.32 q8, q8, q9, #2 +; CHECK-NEXT: vrev64.32 q8, q8 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] ; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x float>, ptr %v, align 16 diff --git a/llvm/test/CodeGen/ARM/vselect_imax.ll b/llvm/test/CodeGen/ARM/vselect_imax.ll --- a/llvm/test/CodeGen/ARM/vselect_imax.ll +++ b/llvm/test/CodeGen/ARM/vselect_imax.ll @@ -1,9 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -passes='print<cost-model>' -mtriple=arm-apple-ios6.0.0 -mcpu=cortex-a8 2>&1 -disable-output | FileCheck %s --check-prefix=COST ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s ; Make sure that ARM backend with NEON handles vselect. define void @vmax_v4i32(ptr %m, <4 x i32> %a, <4 x i32> %b) { -; CHECK: vmax.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} +; CHECK-LABEL: vmax_v4i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: add r1, sp, #8 +; CHECK-NEXT: vldr d17, [sp] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmax.s32 q8, q8, q9 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: mov pc, lr %cmpres = icmp sgt <4 x i32> %a, %b %maxres = select <4 x i1> %cmpres, <4 x i32> %a, <4 x i32> %b store <4 x i32> %maxres, ptr %m @@ -14,12 +23,21 @@ %T1_10 = type <16 x i1> ; CHECK-LABEL: func_blend10: define void @func_blend10(ptr %loadaddr, ptr %loadaddr2, +; CHECK-LABEL: func_blend10: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]! +; CHECK-NEXT: vld1.16 {d18, d19}, [r0:128]!
+; CHECK-NEXT: vmin.s16 q8, q9, q8 +; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128] +; CHECK-NEXT: vmin.s16 q9, q9, q10 +; CHECK-NEXT: vst1.16 {d16, d17}, [r3:128]! +; CHECK-NEXT: vst1.64 {d18, d19}, [r3:128] +; CHECK-NEXT: mov pc, lr ptr %blend, ptr %storeaddr) { %v0 = load %T0_10, ptr %loadaddr %v1 = load %T0_10, ptr %loadaddr2 %c = icmp slt %T0_10 %v0, %v1 -; CHECK: vmin.s16 -; CHECK: vmin.s16 ; COST: func_blend10 ; COST: cost of 0 {{.*}} icmp ; COST: cost of 4 {{.*}} select @@ -31,12 +49,21 @@ %T1_14 = type <8 x i1> ; CHECK-LABEL: func_blend14: define void @func_blend14(ptr %loadaddr, ptr %loadaddr2, +; CHECK-LABEL: func_blend14: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16, d17}, [r1:128]! +; CHECK-NEXT: vld1.32 {d18, d19}, [r0:128]! +; CHECK-NEXT: vmin.s32 q8, q9, q8 +; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128] +; CHECK-NEXT: vmin.s32 q9, q9, q10 +; CHECK-NEXT: vst1.32 {d16, d17}, [r3:128]! +; CHECK-NEXT: vst1.64 {d18, d19}, [r3:128] +; CHECK-NEXT: mov pc, lr ptr %blend, ptr %storeaddr) { %v0 = load %T0_14, ptr %loadaddr %v1 = load %T0_14, ptr %loadaddr2 %c = icmp slt %T0_14 %v0, %v1 -; CHECK: vmin.s32 -; CHECK: vmin.s32 ; COST: func_blend14 ; COST: cost of 0 {{.*}} icmp ; COST: cost of 4 {{.*}} select @@ -48,9 +75,27 @@ %T1_15 = type <16 x i1> ; CHECK-LABEL: func_blend15: define void @func_blend15(ptr %loadaddr, ptr %loadaddr2, +; CHECK-LABEL: func_blend15: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16, d17}, [r1:128]! +; CHECK-NEXT: vld1.32 {d18, d19}, [r0:128]! +; CHECK-NEXT: vmin.s32 q8, q9, q8 +; CHECK-NEXT: vld1.32 {d20, d21}, [r1:128]! +; CHECK-NEXT: vld1.32 {d22, d23}, [r0:128]! +; CHECK-NEXT: vmin.s32 q10, q11, q10 +; CHECK-NEXT: vld1.32 {d24, d25}, [r1:128]! +; CHECK-NEXT: vld1.32 {d26, d27}, [r0:128]! +; CHECK-NEXT: vmin.s32 q12, q13, q12 +; CHECK-NEXT: vld1.64 {d18, d19}, [r1:128] +; CHECK-NEXT: vld1.64 {d22, d23}, [r0:128] +; CHECK-NEXT: add r0, r3, #48 +; CHECK-NEXT: vmin.s32 q9, q11, q9 +; CHECK-NEXT: vst1.32 {d16, d17}, [r3:128]! +; CHECK-NEXT: vst1.32 {d20, d21}, [r3:128]! 
+; CHECK-NEXT: vst1.64 {d24, d25}, [r3:128] +; CHECK-NEXT: vst1.64 {d18, d19}, [r0:128] +; CHECK-NEXT: mov pc, lr ptr %blend, ptr %storeaddr) { -; CHECK: vmin.s32 -; CHECK: vmin.s32 %v0 = load %T0_15, ptr %loadaddr %v1 = load %T0_15, ptr %loadaddr2 %c = icmp slt %T0_15 %v0, %v1 @@ -67,7 +112,6 @@ %T0_18 = type <4 x i64> %T1_18 = type <4 x i1> define void @func_blend18(ptr %loadaddr, ptr %loadaddr2, - ptr %blend, ptr %storeaddr) { ; CHECK-LABEL: func_blend18: ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} @@ -118,6 +162,7 @@ ; CHECK-NEXT: vst1.64 {d18, d19}, [r3:128] ; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr} ; CHECK-NEXT: mov pc, lr + ptr %blend, ptr %storeaddr) { %v0 = load %T0_18, ptr %loadaddr %v1 = load %T0_18, ptr %loadaddr2 %c = icmp slt %T0_18 %v0, %v1 @@ -131,7 +176,6 @@ %T0_19 = type <8 x i64> %T1_19 = type <8 x i1> define void @func_blend19(ptr %loadaddr, ptr %loadaddr2, - ptr %blend, ptr %storeaddr) { ; CHECK-LABEL: func_blend19: ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r6, lr} @@ -213,19 +257,21 @@ ; CHECK-NEXT: cmp lr, #0 ; CHECK-NEXT: mvnne lr, #0 ; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: vdup.32 d31, lr ; CHECK-NEXT: mvnne r6, #0 -; CHECK-NEXT: vdup.32 d30, r2 +; CHECK-NEXT: vdup.32 d31, lr ; CHECK-NEXT: vdup.32 d3, r6 -; CHECK-NEXT: vbit q11, q13, q15 +; CHECK-NEXT: add r0, r3, #48 +; CHECK-NEXT: vdup.32 d30, r2 ; CHECK-NEXT: vdup.32 d2, r12 +; CHECK-NEXT: vbit q11, q13, q15 ; CHECK-NEXT: vst1.64 {d28, d29}, [r3:128]! ; CHECK-NEXT: vbit q8, q9, q1 ; CHECK-NEXT: vst1.64 {d20, d21}, [r3:128]! -; CHECK-NEXT: vst1.64 {d22, d23}, [r3:128]! -; CHECK-NEXT: vst1.64 {d16, d17}, [r3:128] +; CHECK-NEXT: vst1.64 {d22, d23}, [r3:128] +; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128] ; CHECK-NEXT: pop {r4, r5, r6, lr} ; CHECK-NEXT: mov pc, lr + ptr %blend, ptr %storeaddr) { %v0 = load %T0_19, ptr %loadaddr %v1 = load %T0_19, ptr %loadaddr2 %c = icmp slt %T0_19 %v0, %v1 @@ -239,7 +285,6 @@ %T0_20 = type <16 x i64> %T1_20 = type <16 x i1> define void @func_blend20(ptr %loadaddr, ptr %loadaddr2, - ptr %blend, ptr %storeaddr) { ; CHECK-LABEL: func_blend20: ; CHECK: @ %bb.0: ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} @@ -249,192 +294,196 @@ ; CHECK-NEXT: mov r8, r1 ; CHECK-NEXT: mov lr, r0 ; CHECK-NEXT: vld1.64 {d16, d17}, [r8:128]! -; CHECK-NEXT: add r9, r0, #64 -; CHECK-NEXT: add r10, r1, #64 +; CHECK-NEXT: add r9, r0, #96 ; CHECK-NEXT: mov r12, #0 -; CHECK-NEXT: vld1.64 {d22, d23}, [lr:128]! -; CHECK-NEXT: vld1.64 {d18, d19}, [r8:128]! -; CHECK-NEXT: vld1.64 {d20, d21}, [lr:128]! -; CHECK-NEXT: vmov r6, r4, d19 -; CHECK-NEXT: vmov r5, r7, d21 -; CHECK-NEXT: vld1.64 {d4, d5}, [r9:128]! -; CHECK-NEXT: vld1.64 {d6, d7}, [r10:128]! -; CHECK-NEXT: vld1.64 {d0, d1}, [r10:128]! -; CHECK-NEXT: vld1.64 {d2, d3}, [r9:128]! -; CHECK-NEXT: subs r6, r5, r6 -; CHECK-NEXT: sbcs r4, r7, r4 -; CHECK-NEXT: vmov r5, r6, d18 -; CHECK-NEXT: vmov r7, r2, d20 +; CHECK-NEXT: vld1.64 {d24, d25}, [lr:128]! +; CHECK-NEXT: vmov r4, r5, d16 +; CHECK-NEXT: vmov r6, r7, d24 +; CHECK-NEXT: subs r4, r6, r4 +; CHECK-NEXT: sbcs r4, r7, r5 +; CHECK-NEXT: vmov r5, r6, d17 +; CHECK-NEXT: vmov r7, r2, d25 ; CHECK-NEXT: mov r4, #0 ; CHECK-NEXT: movlt r4, #1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: mvnne r4, #0 -; CHECK-NEXT: vdup.32 d31, r4 ; CHECK-NEXT: subs r5, r7, r5 ; CHECK-NEXT: sbcs r2, r2, r6 -; CHECK-NEXT: vmov r4, r5, d3 +; CHECK-NEXT: add r5, r0, #64 ; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: vld1.64 {d22, d23}, [r5:128]! 
; CHECK-NEXT: movlt r2, #1 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: mvnne r2, #0 -; CHECK-NEXT: vdup.32 d30, r2 -; CHECK-NEXT: vmov r0, r2, d1 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r5, r2 -; CHECK-NEXT: vmov r4, r5, d2 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d9, r0 -; CHECK-NEXT: vmov r0, r2, d0 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r5, r2 -; CHECK-NEXT: vmov r4, r5, d5 -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: movlt r0, #1 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d8, r0 -; CHECK-NEXT: vmov r0, r2, d7 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r5, r2 -; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vld1.64 {d20, d21}, [r5:128] +; CHECK-NEXT: vdup.32 d27, r2 +; CHECK-NEXT: add r2, r1, #64 +; CHECK-NEXT: vld1.64 {d30, d31}, [r2:128]! +; CHECK-NEXT: vmov r5, r6, d20 +; CHECK-NEXT: add r1, r1, #96 +; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128] +; CHECK-NEXT: vdup.32 d26, r4 +; CHECK-NEXT: vmov r2, r4, d18 +; CHECK-NEXT: vbit q8, q12, q13 +; CHECK-NEXT: vld1.64 {d24, d25}, [lr:128]! +; CHECK-NEXT: vld1.64 {d26, d27}, [r8:128]! +; CHECK-NEXT: vld1.64 {d8, d9}, [r1:128]! +; CHECK-NEXT: vld1.64 {d4, d5}, [r9:128]! +; CHECK-NEXT: vld1.64 {d28, d29}, [r1:128] +; CHECK-NEXT: subs r2, r5, r2 +; CHECK-NEXT: sbcs r2, r6, r4 +; CHECK-NEXT: vmov r6, r7, d25 +; CHECK-NEXT: vmov r2, r5, d27 +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mvnne r4, #0 +; CHECK-NEXT: subs r2, r6, r2 +; CHECK-NEXT: sbcs r2, r7, r5 +; CHECK-NEXT: vmov r6, r7, d24 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: mvnne r2, #0 +; CHECK-NEXT: vdup.32 d3, r2 +; CHECK-NEXT: vmov r2, r5, d26 +; CHECK-NEXT: subs r2, r6, r2 +; CHECK-NEXT: sbcs r2, r7, r5 +; CHECK-NEXT: vmov r7, r0, d5 +; CHECK-NEXT: vmov r2, r6, d9 +; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: movlt r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: mvnne r5, #0 +; CHECK-NEXT: vdup.32 d2, r5 +; CHECK-NEXT: subs r2, r7, r2 +; CHECK-NEXT: sbcs r0, r0, r6 +; CHECK-NEXT: vmov r6, r7, d4 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: vdup.32 d11, r0 -; CHECK-NEXT: vmov r0, r2, d6 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r5, r2 -; CHECK-NEXT: vmov r4, r5, d23 +; CHECK-NEXT: vmov r0, r2, d8 +; CHECK-NEXT: subs r0, r6, r0 +; CHECK-NEXT: sbcs r0, r7, r2 +; CHECK-NEXT: vmov r6, r7, d23 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 ; CHECK-NEXT: vdup.32 d10, r0 -; CHECK-NEXT: vmov r0, r2, d17 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r5, r2 -; CHECK-NEXT: vmov r4, r5, d22 +; CHECK-NEXT: vmov r0, r2, d31 +; CHECK-NEXT: subs r0, r6, r0 +; CHECK-NEXT: sbcs r0, r7, r2 +; CHECK-NEXT: vmov r6, r7, d22 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d25, r0 -; CHECK-NEXT: vmov r0, r2, d16 -; CHECK-NEXT: subs r0, r4, r0 -; CHECK-NEXT: sbcs r0, r5, r2 +; CHECK-NEXT: vdup.32 d1, r0 +; CHECK-NEXT: vmov r0, r2, d30 +; CHECK-NEXT: subs r0, r6, r0 +; CHECK-NEXT: sbcs r0, r7, r2 +; CHECK-NEXT: vmov r2, r6, d21 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: vdup.32 d24, r0 -; CHECK-NEXT: vorr q13, q12, q12 -; CHECK-NEXT: vbsl q13, q11, q8 -; 
CHECK-NEXT: vld1.64 {d24, d25}, [r9:128]! -; CHECK-NEXT: vorr q8, q5, q5 -; CHECK-NEXT: vld1.64 {d28, d29}, [r10:128]! -; CHECK-NEXT: vbsl q8, q2, q3 +; CHECK-NEXT: vdup.32 d0, r0 +; CHECK-NEXT: vmov r0, r1, d19 +; CHECK-NEXT: vbif q11, q15, q0 ; CHECK-NEXT: vld1.64 {d6, d7}, [r8:128]! -; CHECK-NEXT: vld1.64 {d22, d23}, [r8:128] -; CHECK-NEXT: vld1.64 {d4, d5}, [lr:128]! -; CHECK-NEXT: vbif q10, q9, q15 -; CHECK-NEXT: vorr q9, q4, q4 -; CHECK-NEXT: vmov r0, r2, d22 -; CHECK-NEXT: vbsl q9, q1, q0 -; CHECK-NEXT: vld1.64 {d30, d31}, [lr:128] -; CHECK-NEXT: mov lr, #0 -; CHECK-NEXT: vmov r7, r5, d30 -; CHECK-NEXT: vld1.64 {d0, d1}, [r9:128] -; CHECK-NEXT: vld1.64 {d2, d3}, [r10:128] -; CHECK-NEXT: subs r0, r7, r0 -; CHECK-NEXT: sbcs r0, r5, r2 -; CHECK-NEXT: vmov r5, r4, d24 -; CHECK-NEXT: vmov r0, r7, d28 -; CHECK-NEXT: movlt lr, #1 -; CHECK-NEXT: cmp lr, #0 -; CHECK-NEXT: mvnne lr, #0 -; CHECK-NEXT: subs r0, r5, r0 -; CHECK-NEXT: sbcs r0, r4, r7 -; CHECK-NEXT: vmov r7, r5, d29 -; CHECK-NEXT: vmov r4, r6, d25 +; CHECK-NEXT: vorr q15, q5, q5 +; CHECK-NEXT: vld1.64 {d0, d1}, [r8:128] +; CHECK-NEXT: vmov r10, r8, d7 +; CHECK-NEXT: vbsl q15, q2, q4 +; CHECK-NEXT: vld1.64 {d4, d5}, [r9:128] +; CHECK-NEXT: vbif q12, q13, q1 +; CHECK-NEXT: vld1.64 {d2, d3}, [lr:128]! +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs r0, r6, r1 +; CHECK-NEXT: vmov r5, r6, d28 +; CHECK-NEXT: vmov r7, r1, d4 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mvnne r0, #0 -; CHECK-NEXT: subs r7, r4, r7 -; CHECK-NEXT: mov r4, #0 -; CHECK-NEXT: sbcs r7, r6, r5 -; CHECK-NEXT: vmov r5, r1, d31 -; CHECK-NEXT: vmov r7, r6, d23 -; CHECK-NEXT: movlt r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: mvnne r4, #0 -; CHECK-NEXT: subs r7, r5, r7 -; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: vdup.32 d27, r0 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: vdup.32 d26, r4 +; CHECK-NEXT: vbit q9, q10, q13 +; CHECK-NEXT: subs r5, r7, r5 ; CHECK-NEXT: sbcs r1, r1, r6 -; CHECK-NEXT: vmov r6, r2, d5 -; CHECK-NEXT: vmov r1, r7, d7 -; CHECK-NEXT: movlt r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: mvnne r5, #0 -; CHECK-NEXT: subs r1, r6, r1 -; CHECK-NEXT: sbcs r1, r2, r7 -; CHECK-NEXT: vmov r6, r7, d4 +; CHECK-NEXT: vmov r5, r6, d2 +; CHECK-NEXT: vmov r1, r4, d6 +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mvnne r0, #0 +; CHECK-NEXT: subs r1, r5, r1 +; CHECK-NEXT: vmov r2, r5, d5 +; CHECK-NEXT: sbcs r1, r6, r4 +; CHECK-NEXT: vmov r6, r7, d29 +; CHECK-NEXT: vmov r4, r9, d3 ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: vdup.32 d9, r1 -; CHECK-NEXT: vmov r1, r2, d6 -; CHECK-NEXT: subs r1, r6, r1 -; CHECK-NEXT: sbcs r1, r7, r2 -; CHECK-NEXT: vmov r6, r7, d0 +; CHECK-NEXT: subs r2, r2, r6 +; CHECK-NEXT: sbcs r2, r5, r7 +; CHECK-NEXT: vmov r6, r5, d0 +; CHECK-NEXT: mov r2, #0 +; CHECK-NEXT: movlt r2, #1 +; CHECK-NEXT: subs r4, r4, r10 +; CHECK-NEXT: sbcs r4, r9, r8 +; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mvnne r4, #0 +; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: vdup.32 d9, r4 +; CHECK-NEXT: mvnne r2, #0 +; CHECK-NEXT: vdup.32 d8, r1 +; CHECK-NEXT: vorr q10, q4, q4 +; CHECK-NEXT: vdup.32 d27, r2 +; CHECK-NEXT: vdup.32 d26, r0 +; CHECK-NEXT: vmov r0, r8, d1 +; CHECK-NEXT: vbsl q10, q1, q3 +; CHECK-NEXT: vld1.64 {d2, d3}, [lr:128] +; CHECK-NEXT: vbsl q13, q2, q14 +; CHECK-NEXT: vmov r4, r1, d2 +; CHECK-NEXT: vmov r2, r7, d3 +; CHECK-NEXT: subs r6, r4, r6 +; CHECK-NEXT: sbcs 
r1, r1, r5 ; CHECK-NEXT: mov r1, #0 ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: mvnne r1, #0 -; CHECK-NEXT: vdup.32 d8, r1 -; CHECK-NEXT: vmov r1, r2, d2 -; CHECK-NEXT: vbif q2, q3, q4 -; CHECK-NEXT: vdup.32 d7, r5 -; CHECK-NEXT: vdup.32 d9, r4 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vdup.32 d8, r0 -; CHECK-NEXT: mov r0, r3 -; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128]! -; CHECK-NEXT: vbif q12, q14, q4 -; CHECK-NEXT: vdup.32 d6, lr -; CHECK-NEXT: vbit q11, q15, q3 -; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128]! -; CHECK-NEXT: subs r1, r6, r1 -; CHECK-NEXT: mov r6, #0 -; CHECK-NEXT: sbcs r1, r7, r2 -; CHECK-NEXT: vmov r1, r2, d3 -; CHECK-NEXT: movlt r6, #1 -; CHECK-NEXT: subs r1, r4, r1 -; CHECK-NEXT: sbcs r1, r5, r2 +; CHECK-NEXT: subs r0, r2, r0 +; CHECK-NEXT: sbcs r0, r7, r8 ; CHECK-NEXT: movlt r12, #1 ; CHECK-NEXT: cmp r12, #0 ; CHECK-NEXT: mvnne r12, #0 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: vdup.32 d27, r12 -; CHECK-NEXT: mvnne r6, #0 -; CHECK-NEXT: vdup.32 d26, r6 -; CHECK-NEXT: vorr q10, q13, q13 -; CHECK-NEXT: vbsl q10, q0, q1 -; CHECK-NEXT: vst1.64 {d4, d5}, [r0:128]! -; CHECK-NEXT: vst1.64 {d22, d23}, [r0:128] -; CHECK-NEXT: add r0, r3, #64 +; CHECK-NEXT: add r0, r3, #112 +; CHECK-NEXT: vdup.32 d7, r12 +; CHECK-NEXT: vdup.32 d6, r1 +; CHECK-NEXT: vorr q14, q3, q3 +; CHECK-NEXT: vbsl q14, q1, q0 +; CHECK-NEXT: vst1.64 {d26, d27}, [r0:128] +; CHECK-NEXT: mov r0, r3 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]! -; CHECK-NEXT: vst1.64 {d18, d19}, [r0:128]! ; CHECK-NEXT: vst1.64 {d24, d25}, [r0:128]! ; CHECK-NEXT: vst1.64 {d20, d21}, [r0:128] +; CHECK-NEXT: add r0, r3, #80 +; CHECK-NEXT: vst1.64 {d18, d19}, [r0:128]! +; CHECK-NEXT: vst1.64 {d30, d31}, [r0:128] +; CHECK-NEXT: add r0, r3, #48 +; CHECK-NEXT: vst1.64 {d28, d29}, [r0:128]! 
+; CHECK-NEXT: vst1.64 {d22, d23}, [r0:128] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: mov pc, lr + ptr %blend, ptr %storeaddr) { %v0 = load %T0_20, ptr %loadaddr %v1 = load %T0_20, ptr %loadaddr2 %c = icmp slt %T0_20 %v0, %v1 diff --git a/llvm/test/CodeGen/ARM/vuzp.ll b/llvm/test/CodeGen/ARM/vuzp.ll --- a/llvm/test/CodeGen/ARM/vuzp.ll +++ b/llvm/test/CodeGen/ARM/vuzp.ll @@ -451,52 +451,35 @@ define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1, ; CHECK-LABEL: vuzp_wide_type: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: add r12, sp, #32 -; CHECK-NEXT: add lr, sp, #48 -; CHECK-NEXT: vld1.32 {d17[0]}, [r12:32] -; CHECK-NEXT: add r12, sp, #24 -; CHECK-NEXT: vld1.32 {d16[0]}, [r12:32] -; CHECK-NEXT: add r12, sp, #56 -; CHECK-NEXT: vld1.32 {d19[0]}, [r12:32] -; CHECK-NEXT: vld1.32 {d18[0]}, [lr:32] -; CHECK-NEXT: add lr, sp, #40 -; CHECK-NEXT: vld1.32 {d20[0]}, [lr:32] +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: ldr r12, [sp, #68] -; CHECK-NEXT: ldr r4, [r12] -; CHECK-NEXT: vmov.32 d23[0], r4 -; CHECK-NEXT: add r4, sp, #64 -; CHECK-NEXT: vld1.32 {d24[0]}, [r4:32] -; CHECK-NEXT: add r4, sp, #36 -; CHECK-NEXT: vcgt.u32 q10, q12, q10 -; CHECK-NEXT: vld1.32 {d17[1]}, [r4:32] -; CHECK-NEXT: add r4, sp, #28 -; CHECK-NEXT: vld1.32 {d16[1]}, [r4:32] -; CHECK-NEXT: add r4, sp, #60 -; CHECK-NEXT: vld1.32 {d19[1]}, [r4:32] -; CHECK-NEXT: add r4, sp, #52 -; CHECK-NEXT: vld1.32 {d18[1]}, [r4:32] -; CHECK-NEXT: add r4, r12, #4 -; CHECK-NEXT: vcgt.u32 q8, q9, q8 +; CHECK-NEXT: add r11, sp, #24 +; CHECK-NEXT: vldmia r11, {s0, s1, s2, s3, s4} +; CHECK-NEXT: add r11, sp, #48 +; CHECK-NEXT: ldr lr, [r12], #4 +; CHECK-NEXT: vldmia r11, {s8, s9, s10, s11, s12} +; CHECK-NEXT: vcgt.u32 q10, q3, q1 +; CHECK-NEXT: vmov.32 d17[0], lr +; CHECK-NEXT: vcgt.u32 q9, q2, q0 +; CHECK-NEXT: vmov.u8 lr, d17[3] +; CHECK-NEXT: vmovn.i32 d18, q9 ; CHECK-NEXT: vmovn.i32 d19, q10 -; CHECK-NEXT: vmov.u8 lr, d23[3] -; CHECK-NEXT: vmovn.i32 d18, q8 -; CHECK-NEXT: vmovn.i16 d22, q9 -; CHECK-NEXT: vldr d18, .LCPI23_0 -; CHECK-NEXT: vmov.8 d17[0], lr -; CHECK-NEXT: vtbl.8 d16, {d22, d23}, d18 +; CHECK-NEXT: vldr d20, .LCPI23_0 +; CHECK-NEXT: vmovn.i16 d16, q9 +; CHECK-NEXT: vmov.8 d19[0], lr +; CHECK-NEXT: vtbl.8 d18, {d16, d17}, d20 +; CHECK-NEXT: vld1.8 {d19[1]}, [r12] +; CHECK-NEXT: add r12, sp, #8 +; CHECK-NEXT: vshl.i8 q8, q9, #7 ; CHECK-NEXT: vmov d19, r2, r3 -; CHECK-NEXT: vld1.8 {d17[1]}, [r4] -; CHECK-NEXT: add r4, sp, #8 +; CHECK-NEXT: vld1.64 {d20, d21}, [r12] ; CHECK-NEXT: vmov d18, r0, r1 -; CHECK-NEXT: vshl.i8 q8, q8, #7 -; CHECK-NEXT: vld1.64 {d20, d21}, [r4] ; CHECK-NEXT: vshr.s8 q8, q8, #7 ; CHECK-NEXT: vbsl q8, q9, q10 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: pop {r4, lr} +; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/ARM/wide-compares.ll b/llvm/test/CodeGen/ARM/wide-compares.ll --- a/llvm/test/CodeGen/ARM/wide-compares.ll +++ b/llvm/test/CodeGen/ARM/wide-compares.ll @@ -18,25 +18,41 @@ ; CHECK-THUMB1-NOMOV: @ %bb.0: @ %entry ; CHECK-THUMB1-NOMOV-NEXT: subs r0, r0, r2 ; CHECK-THUMB1-NOMOV-NEXT: sbcs r1, r3 -; CHECK-THUMB1-NOMOV-NEXT: bge .LBB0_2 -; CHECK-THUMB1-NOMOV-NEXT: @ %bb.1: @ %bb1 -; CHECK-THUMB1-NOMOV-NEXT: movs r0, #1 -; CHECK-THUMB1-NOMOV-NEXT: bx lr +; CHECK-THUMB1-NOMOV-NEXT: bge .LBB0_3 +; CHECK-THUMB1-NOMOV-NEXT: @ %bb.1: @ %entry +; 
CHECK-THUMB1-NOMOV-NEXT: movs r0, #0 +; CHECK-THUMB1-NOMOV-NEXT: cmp r0, #0 +; CHECK-THUMB1-NOMOV-NEXT: beq .LBB0_4 ; CHECK-THUMB1-NOMOV-NEXT: .LBB0_2: @ %bb2 ; CHECK-THUMB1-NOMOV-NEXT: movs r0, #2 ; CHECK-THUMB1-NOMOV-NEXT: bx lr +; CHECK-THUMB1-NOMOV-NEXT: .LBB0_3: +; CHECK-THUMB1-NOMOV-NEXT: movs r0, #1 +; CHECK-THUMB1-NOMOV-NEXT: cmp r0, #0 +; CHECK-THUMB1-NOMOV-NEXT: bne .LBB0_2 +; CHECK-THUMB1-NOMOV-NEXT: .LBB0_4: @ %bb1 +; CHECK-THUMB1-NOMOV-NEXT: movs r0, #1 +; CHECK-THUMB1-NOMOV-NEXT: bx lr ; ; CHECK-THUMB1-LABEL: test_slt1: ; CHECK-THUMB1: @ %bb.0: @ %entry ; CHECK-THUMB1-NEXT: subs r0, r0, r2 ; CHECK-THUMB1-NEXT: sbcs r1, r3 -; CHECK-THUMB1-NEXT: bge .LBB0_2 -; CHECK-THUMB1-NEXT: @ %bb.1: @ %bb1 -; CHECK-THUMB1-NEXT: movs r0, #1 -; CHECK-THUMB1-NEXT: bx lr +; CHECK-THUMB1-NEXT: bge .LBB0_3 +; CHECK-THUMB1-NEXT: @ %bb.1: @ %entry +; CHECK-THUMB1-NEXT: movs r0, #0 +; CHECK-THUMB1-NEXT: cmp r0, #0 +; CHECK-THUMB1-NEXT: beq .LBB0_4 ; CHECK-THUMB1-NEXT: .LBB0_2: @ %bb2 ; CHECK-THUMB1-NEXT: movs r0, #2 ; CHECK-THUMB1-NEXT: bx lr +; CHECK-THUMB1-NEXT: .LBB0_3: +; CHECK-THUMB1-NEXT: movs r0, #1 +; CHECK-THUMB1-NEXT: cmp r0, #0 +; CHECK-THUMB1-NEXT: bne .LBB0_2 +; CHECK-THUMB1-NEXT: .LBB0_4: @ %bb1 +; CHECK-THUMB1-NEXT: movs r0, #1 +; CHECK-THUMB1-NEXT: bx lr ; ; CHECK-THUMB2-LABEL: test_slt1: ; CHECK-THUMB2: @ %bb.0: @ %entry @@ -61,8 +77,11 @@ ; CHECK-ARM: @ %bb.0: @ %entry ; CHECK-ARM-NEXT: push {r11, lr} ; CHECK-ARM-NEXT: subs r0, r0, r2 +; CHECK-ARM-NEXT: mov r12, #0 ; CHECK-ARM-NEXT: sbcs r0, r1, r3 -; CHECK-ARM-NEXT: bge .LBB1_2 +; CHECK-ARM-NEXT: movwlt r12, #1 +; CHECK-ARM-NEXT: cmp r12, #0 +; CHECK-ARM-NEXT: beq .LBB1_2 ; CHECK-ARM-NEXT: @ %bb.1: @ %bb1 ; CHECK-ARM-NEXT: bl f ; CHECK-ARM-NEXT: pop {r11, pc} @@ -76,13 +95,21 @@ ; CHECK-THUMB1-NOMOV-NEXT: push {r7, lr} ; CHECK-THUMB1-NOMOV-NEXT: subs r0, r0, r2 ; CHECK-THUMB1-NOMOV-NEXT: sbcs r1, r3 -; CHECK-THUMB1-NOMOV-NEXT: bge .LBB1_2 -; CHECK-THUMB1-NOMOV-NEXT: @ %bb.1: @ %bb1 -; CHECK-THUMB1-NOMOV-NEXT: bl f -; CHECK-THUMB1-NOMOV-NEXT: b .LBB1_3 +; CHECK-THUMB1-NOMOV-NEXT: bge .LBB1_3 +; CHECK-THUMB1-NOMOV-NEXT: @ %bb.1: @ %entry +; CHECK-THUMB1-NOMOV-NEXT: movs r0, #0 +; CHECK-THUMB1-NOMOV-NEXT: cmp r0, #0 +; CHECK-THUMB1-NOMOV-NEXT: beq .LBB1_4 ; CHECK-THUMB1-NOMOV-NEXT: .LBB1_2: @ %bb2 ; CHECK-THUMB1-NOMOV-NEXT: bl g -; CHECK-THUMB1-NOMOV-NEXT: .LBB1_3: @ %bb1 +; CHECK-THUMB1-NOMOV-NEXT: b .LBB1_5 +; CHECK-THUMB1-NOMOV-NEXT: .LBB1_3: +; CHECK-THUMB1-NOMOV-NEXT: movs r0, #1 +; CHECK-THUMB1-NOMOV-NEXT: cmp r0, #0 +; CHECK-THUMB1-NOMOV-NEXT: bne .LBB1_2 +; CHECK-THUMB1-NOMOV-NEXT: .LBB1_4: @ %bb1 +; CHECK-THUMB1-NOMOV-NEXT: bl f +; CHECK-THUMB1-NOMOV-NEXT: .LBB1_5: @ %bb1 ; CHECK-THUMB1-NOMOV-NEXT: pop {r7} ; CHECK-THUMB1-NOMOV-NEXT: pop {r0} ; CHECK-THUMB1-NOMOV-NEXT: bx r0 @@ -92,20 +119,32 @@ ; CHECK-THUMB1-NEXT: push {r7, lr} ; CHECK-THUMB1-NEXT: subs r0, r0, r2 ; CHECK-THUMB1-NEXT: sbcs r1, r3 -; CHECK-THUMB1-NEXT: bge .LBB1_2 -; CHECK-THUMB1-NEXT: @ %bb.1: @ %bb1 -; CHECK-THUMB1-NEXT: bl f -; CHECK-THUMB1-NEXT: pop {r7, pc} +; CHECK-THUMB1-NEXT: bge .LBB1_3 +; CHECK-THUMB1-NEXT: @ %bb.1: @ %entry +; CHECK-THUMB1-NEXT: movs r0, #0 +; CHECK-THUMB1-NEXT: cmp r0, #0 +; CHECK-THUMB1-NEXT: beq .LBB1_4 ; CHECK-THUMB1-NEXT: .LBB1_2: @ %bb2 ; CHECK-THUMB1-NEXT: bl g ; CHECK-THUMB1-NEXT: pop {r7, pc} +; CHECK-THUMB1-NEXT: .LBB1_3: +; CHECK-THUMB1-NEXT: movs r0, #1 +; CHECK-THUMB1-NEXT: cmp r0, #0 +; CHECK-THUMB1-NEXT: bne .LBB1_2 +; CHECK-THUMB1-NEXT: .LBB1_4: @ %bb1 +; CHECK-THUMB1-NEXT: bl f +; CHECK-THUMB1-NEXT: pop {r7, pc} 
; ; CHECK-THUMB2-LABEL: test_slt2: ; CHECK-THUMB2: @ %bb.0: @ %entry ; CHECK-THUMB2-NEXT: push {r7, lr} ; CHECK-THUMB2-NEXT: subs r0, r0, r2 +; CHECK-THUMB2-NEXT: mov.w r12, #0 ; CHECK-THUMB2-NEXT: sbcs.w r0, r1, r3 -; CHECK-THUMB2-NEXT: bge .LBB1_2 +; CHECK-THUMB2-NEXT: it lt +; CHECK-THUMB2-NEXT: movlt.w r12, #1 +; CHECK-THUMB2-NEXT: cmp.w r12, #0 +; CHECK-THUMB2-NEXT: beq .LBB1_2 ; CHECK-THUMB2-NEXT: @ %bb.1: @ %bb1 ; CHECK-THUMB2-NEXT: bl f ; CHECK-THUMB2-NEXT: pop {r7, pc} @@ -129,19 +168,17 @@ define i64 @test_slt_select(i64 %c, i64 %d, i64 %a, i64 %b) { ; CHECK-ARM-LABEL: test_slt_select: ; CHECK-ARM: @ %bb.0: @ %entry -; CHECK-ARM-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-ARM-NEXT: ldr r12, [sp, #32] +; CHECK-ARM-NEXT: push {r4, r5, r6, lr} +; CHECK-ARM-NEXT: add lr, sp, #16 ; CHECK-ARM-NEXT: mov r6, #0 -; CHECK-ARM-NEXT: ldr lr, [sp, #24] -; CHECK-ARM-NEXT: ldr r7, [sp, #36] -; CHECK-ARM-NEXT: ldr r5, [sp, #28] -; CHECK-ARM-NEXT: subs r4, lr, r12 -; CHECK-ARM-NEXT: sbcs r7, r5, r7 +; CHECK-ARM-NEXT: ldm lr, {r4, r5, r12, lr} +; CHECK-ARM-NEXT: subs r4, r4, r12 +; CHECK-ARM-NEXT: sbcs r5, r5, lr ; CHECK-ARM-NEXT: movwlo r6, #1 ; CHECK-ARM-NEXT: cmp r6, #0 ; CHECK-ARM-NEXT: moveq r0, r2 ; CHECK-ARM-NEXT: moveq r1, r3 -; CHECK-ARM-NEXT: pop {r4, r5, r6, r7, r11, pc} +; CHECK-ARM-NEXT: pop {r4, r5, r6, pc} ; ; CHECK-THUMB1-NOMOV-LABEL: test_slt_select: ; CHECK-THUMB1-NOMOV: @ %bb.0: @ %entry @@ -149,8 +186,10 @@ ; CHECK-THUMB1-NOMOV-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-THUMB1-NOMOV-NEXT: .pad #4 ; CHECK-THUMB1-NOMOV-NEXT: sub sp, #4 -; CHECK-THUMB1-NOMOV-NEXT: ldr r4, [sp, #36] -; CHECK-THUMB1-NOMOV-NEXT: ldr r5, [sp, #28] +; CHECK-THUMB1-NOMOV-NEXT: add r4, sp, #32 +; CHECK-THUMB1-NOMOV-NEXT: ldr r4, [r4, #4] +; CHECK-THUMB1-NOMOV-NEXT: add r5, sp, #24 +; CHECK-THUMB1-NOMOV-NEXT: ldr r5, [r5, #4] ; CHECK-THUMB1-NOMOV-NEXT: ldr r6, [sp, #32] ; CHECK-THUMB1-NOMOV-NEXT: ldr r7, [sp, #24] ; CHECK-THUMB1-NOMOV-NEXT: subs r6, r7, r6 @@ -182,8 +221,10 @@ ; CHECK-THUMB1: @ %bb.0: @ %entry ; CHECK-THUMB1-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-THUMB1-NEXT: sub sp, #4 -; CHECK-THUMB1-NEXT: ldr r4, [sp, #36] -; CHECK-THUMB1-NEXT: ldr r5, [sp, #28] +; CHECK-THUMB1-NEXT: add r4, sp, #32 +; CHECK-THUMB1-NEXT: ldr r4, [r4, #4] +; CHECK-THUMB1-NEXT: add r5, sp, #24 +; CHECK-THUMB1-NEXT: ldr r5, [r5, #4] ; CHECK-THUMB1-NEXT: ldr r6, [sp, #32] ; CHECK-THUMB1-NEXT: ldr r7, [sp, #24] ; CHECK-THUMB1-NEXT: subs r6, r7, r6 @@ -213,21 +254,19 @@ ; ; CHECK-THUMB2-LABEL: test_slt_select: ; CHECK-THUMB2: @ %bb.0: @ %entry -; CHECK-THUMB2-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-THUMB2-NEXT: sub sp, #4 -; CHECK-THUMB2-NEXT: ldrd r12, r7, [sp, #32] +; CHECK-THUMB2-NEXT: push {r4, r5, r6, lr} +; CHECK-THUMB2-NEXT: add.w lr, sp, #16 ; CHECK-THUMB2-NEXT: movs r6, #0 -; CHECK-THUMB2-NEXT: ldrd lr, r5, [sp, #24] -; CHECK-THUMB2-NEXT: subs.w r4, lr, r12 -; CHECK-THUMB2-NEXT: sbcs.w r7, r5, r7 +; CHECK-THUMB2-NEXT: ldm.w lr, {r4, r5, r12, lr} +; CHECK-THUMB2-NEXT: subs.w r4, r4, r12 +; CHECK-THUMB2-NEXT: sbcs.w r5, r5, lr ; CHECK-THUMB2-NEXT: it lo ; CHECK-THUMB2-NEXT: movlo r6, #1 ; CHECK-THUMB2-NEXT: cmp r6, #0 ; CHECK-THUMB2-NEXT: itt eq ; CHECK-THUMB2-NEXT: moveq r0, r2 ; CHECK-THUMB2-NEXT: moveq r1, r3 -; CHECK-THUMB2-NEXT: add sp, #4 -; CHECK-THUMB2-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-THUMB2-NEXT: pop {r4, r5, r6, pc} entry: %cmp = icmp ult i64 %a, %b %r1 = select i1 %cmp, i64 %c, i64 %d @@ -237,11 +276,10 @@ define {i32, i32} @test_slt_not(i32 %c, i32 %d, i64 %a, i64 %b) { ; CHECK-ARM-LABEL: 
test_slt_not: ; CHECK-ARM: @ %bb.0: @ %entry -; CHECK-ARM-NEXT: ldr r12, [sp] +; CHECK-ARM-NEXT: ldm sp, {r0, r12} ; CHECK-ARM-NEXT: mov r1, #0 -; CHECK-ARM-NEXT: ldr r0, [sp, #4] -; CHECK-ARM-NEXT: subs r2, r2, r12 -; CHECK-ARM-NEXT: sbcs r0, r3, r0 +; CHECK-ARM-NEXT: subs r0, r2, r0 +; CHECK-ARM-NEXT: sbcs r0, r3, r12 ; CHECK-ARM-NEXT: mov r0, #0 ; CHECK-ARM-NEXT: movwge r1, #1 ; CHECK-ARM-NEXT: movwlt r0, #1 @@ -251,9 +289,10 @@ ; CHECK-THUMB1-NOMOV: @ %bb.0: @ %entry ; CHECK-THUMB1-NOMOV-NEXT: .save {r4, r5, r7, lr} ; CHECK-THUMB1-NOMOV-NEXT: push {r4, r5, r7, lr} +; CHECK-THUMB1-NOMOV-NEXT: add r0, sp, #16 +; CHECK-THUMB1-NOMOV-NEXT: ldr r0, [r0, #4] ; CHECK-THUMB1-NOMOV-NEXT: movs r1, #1 ; CHECK-THUMB1-NOMOV-NEXT: movs r4, #0 -; CHECK-THUMB1-NOMOV-NEXT: ldr r0, [sp, #20] ; CHECK-THUMB1-NOMOV-NEXT: ldr r5, [sp, #16] ; CHECK-THUMB1-NOMOV-NEXT: subs r2, r2, r5 ; CHECK-THUMB1-NOMOV-NEXT: sbcs r3, r0 @@ -275,9 +314,10 @@ ; CHECK-THUMB1-LABEL: test_slt_not: ; CHECK-THUMB1: @ %bb.0: @ %entry ; CHECK-THUMB1-NEXT: push {r4, r5, r7, lr} +; CHECK-THUMB1-NEXT: add r0, sp, #16 +; CHECK-THUMB1-NEXT: ldr r0, [r0, #4] ; CHECK-THUMB1-NEXT: movs r1, #1 ; CHECK-THUMB1-NEXT: movs r4, #0 -; CHECK-THUMB1-NEXT: ldr r0, [sp, #20] ; CHECK-THUMB1-NEXT: ldr r5, [sp, #16] ; CHECK-THUMB1-NEXT: subs r2, r2, r5 ; CHECK-THUMB1-NEXT: sbcs r3, r0 @@ -296,11 +336,10 @@ ; ; CHECK-THUMB2-LABEL: test_slt_not: ; CHECK-THUMB2: @ %bb.0: @ %entry -; CHECK-THUMB2-NEXT: ldr.w r12, [sp] +; CHECK-THUMB2-NEXT: ldrd r0, r12, [sp] ; CHECK-THUMB2-NEXT: movs r1, #0 -; CHECK-THUMB2-NEXT: ldr r0, [sp, #4] -; CHECK-THUMB2-NEXT: subs.w r2, r2, r12 -; CHECK-THUMB2-NEXT: sbcs.w r0, r3, r0 +; CHECK-THUMB2-NEXT: subs r0, r2, r0 +; CHECK-THUMB2-NEXT: sbcs.w r0, r3, r12 ; CHECK-THUMB2-NEXT: mov.w r0, #0 ; CHECK-THUMB2-NEXT: ite lt ; CHECK-THUMB2-NEXT: movlt r0, #1 diff --git a/llvm/test/CodeGen/Mips/atomic.ll b/llvm/test/CodeGen/Mips/atomic.ll --- a/llvm/test/CodeGen/Mips/atomic.ll +++ b/llvm/test/CodeGen/Mips/atomic.ll @@ -1688,12 +1688,13 @@ ; MIPS4-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(AtomicSwap32))) ; MIPS4-NEXT: sw $4, 12($sp) ; MIPS4-NEXT: ld $1, %got_disp(x)($1) +; MIPS4-NEXT: lw $3, 12($sp) ; MIPS4-NEXT: .LBB6_1: # %entry ; MIPS4-NEXT: # =>This Inner Loop Header: Depth=1 ; MIPS4-NEXT: ll $2, 0($1) -; MIPS4-NEXT: move $3, $4 -; MIPS4-NEXT: sc $3, 0($1) -; MIPS4-NEXT: beqz $3, .LBB6_1 +; MIPS4-NEXT: move $4, $3 +; MIPS4-NEXT: sc $4, 0($1) +; MIPS4-NEXT: beqz $4, .LBB6_1 ; MIPS4-NEXT: nop ; MIPS4-NEXT: # %bb.2: # %entry ; MIPS4-NEXT: jr $ra @@ -1707,12 +1708,13 @@ ; MIPS64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(AtomicSwap32))) ; MIPS64-NEXT: sw $4, 12($sp) ; MIPS64-NEXT: ld $1, %got_disp(x)($1) +; MIPS64-NEXT: lw $3, 12($sp) ; MIPS64-NEXT: .LBB6_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 ; MIPS64-NEXT: ll $2, 0($1) -; MIPS64-NEXT: move $3, $4 -; MIPS64-NEXT: sc $3, 0($1) -; MIPS64-NEXT: beqz $3, .LBB6_1 +; MIPS64-NEXT: move $4, $3 +; MIPS64-NEXT: sc $4, 0($1) +; MIPS64-NEXT: beqz $4, .LBB6_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry ; MIPS64-NEXT: jr $ra @@ -1726,12 +1728,13 @@ ; MIPS64R2-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(AtomicSwap32))) ; MIPS64R2-NEXT: sw $4, 12($sp) ; MIPS64R2-NEXT: ld $1, %got_disp(x)($1) +; MIPS64R2-NEXT: lw $3, 12($sp) ; MIPS64R2-NEXT: .LBB6_1: # %entry ; MIPS64R2-NEXT: # =>This Inner Loop Header: Depth=1 ; MIPS64R2-NEXT: ll $2, 0($1) -; MIPS64R2-NEXT: move $3, $4 -; MIPS64R2-NEXT: sc $3, 0($1) -; MIPS64R2-NEXT: beqz $3, .LBB6_1 +; MIPS64R2-NEXT: move $4, $3 +; MIPS64R2-NEXT: sc $4, 
0($1) +; MIPS64R2-NEXT: beqz $4, .LBB6_1 ; MIPS64R2-NEXT: nop ; MIPS64R2-NEXT: # %bb.2: # %entry ; MIPS64R2-NEXT: jr $ra @@ -1745,12 +1748,13 @@ ; MIPS64R6-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(AtomicSwap32))) ; MIPS64R6-NEXT: sw $4, 12($sp) ; MIPS64R6-NEXT: ld $1, %got_disp(x)($1) +; MIPS64R6-NEXT: lw $3, 12($sp) ; MIPS64R6-NEXT: .LBB6_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 ; MIPS64R6-NEXT: ll $2, 0($1) -; MIPS64R6-NEXT: move $3, $4 -; MIPS64R6-NEXT: sc $3, 0($1) -; MIPS64R6-NEXT: beqzc $3, .LBB6_1 +; MIPS64R6-NEXT: move $4, $3 +; MIPS64R6-NEXT: sc $4, 0($1) +; MIPS64R6-NEXT: beqzc $4, .LBB6_1 ; MIPS64R6-NEXT: nop ; MIPS64R6-NEXT: # %bb.2: # %entry ; MIPS64R6-NEXT: jr $ra @@ -2005,6 +2009,7 @@ ; MIPS4-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(AtomicCmpSwap32))) ; MIPS4-NEXT: sw $5, 12($sp) ; MIPS4-NEXT: ld $1, %got_disp(x)($1) +; MIPS4-NEXT: lw $3, 12($sp) ; MIPS4-NEXT: .LBB7_1: # %entry ; MIPS4-NEXT: # =>This Inner Loop Header: Depth=1 ; MIPS4-NEXT: ll $2, 0($1) @@ -2012,9 +2017,9 @@ ; MIPS4-NEXT: nop ; MIPS4-NEXT: # %bb.2: # %entry ; MIPS4-NEXT: # in Loop: Header=BB7_1 Depth=1 -; MIPS4-NEXT: move $3, $5 -; MIPS4-NEXT: sc $3, 0($1) -; MIPS4-NEXT: beqz $3, .LBB7_1 +; MIPS4-NEXT: move $5, $3 +; MIPS4-NEXT: sc $5, 0($1) +; MIPS4-NEXT: beqz $5, .LBB7_1 ; MIPS4-NEXT: nop ; MIPS4-NEXT: .LBB7_3: # %entry ; MIPS4-NEXT: jr $ra @@ -2028,6 +2033,7 @@ ; MIPS64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(AtomicCmpSwap32))) ; MIPS64-NEXT: sw $5, 12($sp) ; MIPS64-NEXT: ld $1, %got_disp(x)($1) +; MIPS64-NEXT: lw $3, 12($sp) ; MIPS64-NEXT: .LBB7_1: # %entry ; MIPS64-NEXT: # =>This Inner Loop Header: Depth=1 ; MIPS64-NEXT: ll $2, 0($1) @@ -2035,9 +2041,9 @@ ; MIPS64-NEXT: nop ; MIPS64-NEXT: # %bb.2: # %entry ; MIPS64-NEXT: # in Loop: Header=BB7_1 Depth=1 -; MIPS64-NEXT: move $3, $5 -; MIPS64-NEXT: sc $3, 0($1) -; MIPS64-NEXT: beqz $3, .LBB7_1 +; MIPS64-NEXT: move $5, $3 +; MIPS64-NEXT: sc $5, 0($1) +; MIPS64-NEXT: beqz $5, .LBB7_1 ; MIPS64-NEXT: nop ; MIPS64-NEXT: .LBB7_3: # %entry ; MIPS64-NEXT: jr $ra @@ -2051,6 +2057,7 @@ ; MIPS64R2-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(AtomicCmpSwap32))) ; MIPS64R2-NEXT: sw $5, 12($sp) ; MIPS64R2-NEXT: ld $1, %got_disp(x)($1) +; MIPS64R2-NEXT: lw $3, 12($sp) ; MIPS64R2-NEXT: .LBB7_1: # %entry ; MIPS64R2-NEXT: # =>This Inner Loop Header: Depth=1 ; MIPS64R2-NEXT: ll $2, 0($1) @@ -2058,9 +2065,9 @@ ; MIPS64R2-NEXT: nop ; MIPS64R2-NEXT: # %bb.2: # %entry ; MIPS64R2-NEXT: # in Loop: Header=BB7_1 Depth=1 -; MIPS64R2-NEXT: move $3, $5 -; MIPS64R2-NEXT: sc $3, 0($1) -; MIPS64R2-NEXT: beqz $3, .LBB7_1 +; MIPS64R2-NEXT: move $5, $3 +; MIPS64R2-NEXT: sc $5, 0($1) +; MIPS64R2-NEXT: beqz $5, .LBB7_1 ; MIPS64R2-NEXT: nop ; MIPS64R2-NEXT: .LBB7_3: # %entry ; MIPS64R2-NEXT: jr $ra @@ -2074,15 +2081,16 @@ ; MIPS64R6-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(AtomicCmpSwap32))) ; MIPS64R6-NEXT: sw $5, 12($sp) ; MIPS64R6-NEXT: ld $1, %got_disp(x)($1) +; MIPS64R6-NEXT: lw $3, 12($sp) ; MIPS64R6-NEXT: .LBB7_1: # %entry ; MIPS64R6-NEXT: # =>This Inner Loop Header: Depth=1 ; MIPS64R6-NEXT: ll $2, 0($1) ; MIPS64R6-NEXT: bnec $2, $4, .LBB7_3 ; MIPS64R6-NEXT: # %bb.2: # %entry ; MIPS64R6-NEXT: # in Loop: Header=BB7_1 Depth=1 -; MIPS64R6-NEXT: move $3, $5 -; MIPS64R6-NEXT: sc $3, 0($1) -; MIPS64R6-NEXT: beqzc $3, .LBB7_1 +; MIPS64R6-NEXT: move $5, $3 +; MIPS64R6-NEXT: sc $5, 0($1) +; MIPS64R6-NEXT: beqzc $5, .LBB7_1 ; MIPS64R6-NEXT: nop ; MIPS64R6-NEXT: .LBB7_3: # %entry ; MIPS64R6-NEXT: jr $ra diff --git a/llvm/test/CodeGen/Mips/cconv/byval.ll b/llvm/test/CodeGen/Mips/cconv/byval.ll 
--- a/llvm/test/CodeGen/Mips/cconv/byval.ll +++ b/llvm/test/CodeGen/Mips/cconv/byval.ll @@ -150,6 +150,104 @@ ; N64-NEXT: lui $1, 1 ; N64-NEXT: jr $ra ; N64-NEXT: daddu $sp, $sp, $1 +; +; O32-SDAG-LABEL: g: +; O32-SDAG: # %bb.0: # %entry +; O32-SDAG-NEXT: lui $1, 1 +; O32-SDAG-NEXT: subu $sp, $sp, $1 +; O32-SDAG-NEXT: .cfi_def_cfa_offset 65536 +; O32-SDAG-NEXT: lui $1, 1 +; O32-SDAG-NEXT: addu $1, $sp, $1 +; O32-SDAG-NEXT: sw $ra, -4($1) # 4-byte Folded Spill +; O32-SDAG-NEXT: .cfi_offset 31, -4 +; O32-SDAG-NEXT: ori $1, $zero, 65520 +; O32-SDAG-NEXT: subu $sp, $sp, $1 +; O32-SDAG-NEXT: addiu $1, $sp, 8 +; O32-SDAG-NEXT: addiu $5, $1, 16 +; O32-SDAG-NEXT: addiu $4, $sp, 16 +; O32-SDAG-NEXT: jal memcpy +; O32-SDAG-NEXT: ori $6, $zero, 65504 +; O32-SDAG-NEXT: lw $7, 20($sp) +; O32-SDAG-NEXT: lw $6, 16($sp) +; O32-SDAG-NEXT: lw $5, 12($sp) +; O32-SDAG-NEXT: jal f2 +; O32-SDAG-NEXT: lw $4, 8($sp) +; O32-SDAG-NEXT: ori $1, $zero, 65520 +; O32-SDAG-NEXT: addu $sp, $sp, $1 +; O32-SDAG-NEXT: lui $1, 1 +; O32-SDAG-NEXT: addu $1, $sp, $1 +; O32-SDAG-NEXT: lw $ra, -4($1) # 4-byte Folded Reload +; O32-SDAG-NEXT: lui $1, 1 +; O32-SDAG-NEXT: jr $ra +; O32-SDAG-NEXT: addu $sp, $sp, $1 +; +; N32-SDAG-LABEL: g: +; N32-SDAG: # %bb.0: # %entry +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: subu $sp, $sp, $1 +; N32-SDAG-NEXT: .cfi_def_cfa_offset 65536 +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: addu $1, $sp, $1 +; N32-SDAG-NEXT: sd $ra, -8($1) # 8-byte Folded Spill +; N32-SDAG-NEXT: .cfi_offset 31, -8 +; N32-SDAG-NEXT: ori $1, $zero, 65456 +; N32-SDAG-NEXT: subu $sp, $sp, $1 +; N32-SDAG-NEXT: addiu $1, $sp, 8 +; N32-SDAG-NEXT: addiu $5, $1, 64 +; N32-SDAG-NEXT: ori $6, $zero, 65456 +; N32-SDAG-NEXT: jal memcpy +; N32-SDAG-NEXT: move $4, $sp +; N32-SDAG-NEXT: ld $11, 64($sp) +; N32-SDAG-NEXT: ld $10, 56($sp) +; N32-SDAG-NEXT: ld $9, 48($sp) +; N32-SDAG-NEXT: ld $8, 40($sp) +; N32-SDAG-NEXT: ld $7, 32($sp) +; N32-SDAG-NEXT: ld $6, 24($sp) +; N32-SDAG-NEXT: ld $5, 16($sp) +; N32-SDAG-NEXT: jal f2 +; N32-SDAG-NEXT: ld $4, 8($sp) +; N32-SDAG-NEXT: ori $1, $zero, 65456 +; N32-SDAG-NEXT: addu $sp, $sp, $1 +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: addu $1, $sp, $1 +; N32-SDAG-NEXT: ld $ra, -8($1) # 8-byte Folded Reload +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: jr $ra +; N32-SDAG-NEXT: addu $sp, $sp, $1 +; +; N64-SDAG-LABEL: g: +; N64-SDAG: # %bb.0: # %entry +; N64-SDAG-NEXT: lui $1, 1 +; N64-SDAG-NEXT: dsubu $sp, $sp, $1 +; N64-SDAG-NEXT: .cfi_def_cfa_offset 65536 +; N64-SDAG-NEXT: lui $1, 1 +; N64-SDAG-NEXT: daddu $1, $sp, $1 +; N64-SDAG-NEXT: sd $ra, -8($1) # 8-byte Folded Spill +; N64-SDAG-NEXT: .cfi_offset 31, -8 +; N64-SDAG-NEXT: ori $1, $zero, 65456 +; N64-SDAG-NEXT: dsubu $sp, $sp, $1 +; N64-SDAG-NEXT: daddiu $1, $sp, 8 +; N64-SDAG-NEXT: daddiu $5, $1, 64 +; N64-SDAG-NEXT: ori $6, $zero, 65456 +; N64-SDAG-NEXT: jal memcpy +; N64-SDAG-NEXT: move $4, $sp +; N64-SDAG-NEXT: ld $11, 64($sp) +; N64-SDAG-NEXT: ld $10, 56($sp) +; N64-SDAG-NEXT: ld $9, 48($sp) +; N64-SDAG-NEXT: ld $8, 40($sp) +; N64-SDAG-NEXT: ld $7, 32($sp) +; N64-SDAG-NEXT: ld $6, 24($sp) +; N64-SDAG-NEXT: ld $5, 16($sp) +; N64-SDAG-NEXT: jal f2 +; N64-SDAG-NEXT: ld $4, 8($sp) +; N64-SDAG-NEXT: ori $1, $zero, 65456 +; N64-SDAG-NEXT: daddu $sp, $sp, $1 +; N64-SDAG-NEXT: lui $1, 1 +; N64-SDAG-NEXT: daddu $1, $sp, $1 +; N64-SDAG-NEXT: ld $ra, -8($1) # 8-byte Folded Reload +; N64-SDAG-NEXT: lui $1, 1 +; N64-SDAG-NEXT: jr $ra +; N64-SDAG-NEXT: daddu $sp, $sp, $1 entry: %a = alloca %struct.S1, align 8 call void @f2(ptr byval(%struct.S1) align 4 
%a) @@ -254,11 +352,13 @@ ; N32-NEXT: sd $16, 0($1) # 8-byte Folded Spill ; N32-NEXT: .cfi_offset 31, -8 ; N32-NEXT: .cfi_offset 16, -16 -; N32-NEXT: move $5, $4 ; N32-NEXT: lui $1, 1 ; N32-NEXT: addu $1, $sp, $1 ; N32-NEXT: sw $4, -4($1) ; N32-NEXT: addiu $16, $sp, 8 +; N32-NEXT: lui $1, 1 +; N32-NEXT: addu $1, $sp, $1 +; N32-NEXT: lw $5, -4($1) ; N32-NEXT: ori $6, $zero, 65520 ; N32-NEXT: jal memcpy ; N32-NEXT: move $4, $16 @@ -339,6 +439,156 @@ ; N64-NEXT: daddiu $1, $1, 16 ; N64-NEXT: jr $ra ; N64-NEXT: daddu $sp, $sp, $1 +; +; O32-SDAG-LABEL: g2: +; O32-SDAG: # %bb.0: # %entry +; O32-SDAG-NEXT: lui $1, 1 +; O32-SDAG-NEXT: addiu $1, $1, 8 +; O32-SDAG-NEXT: subu $sp, $sp, $1 +; O32-SDAG-NEXT: .cfi_def_cfa_offset 65544 +; O32-SDAG-NEXT: lui $1, 1 +; O32-SDAG-NEXT: addu $1, $sp, $1 +; O32-SDAG-NEXT: sw $ra, 4($1) # 4-byte Folded Spill +; O32-SDAG-NEXT: lui $1, 1 +; O32-SDAG-NEXT: addu $1, $sp, $1 +; O32-SDAG-NEXT: sw $16, 0($1) # 4-byte Folded Spill +; O32-SDAG-NEXT: .cfi_offset 31, -4 +; O32-SDAG-NEXT: .cfi_offset 16, -8 +; O32-SDAG-NEXT: move $5, $4 +; O32-SDAG-NEXT: lui $1, 1 +; O32-SDAG-NEXT: addu $1, $sp, $1 +; O32-SDAG-NEXT: sw $4, -4($1) +; O32-SDAG-NEXT: addiu $sp, $sp, -16 +; O32-SDAG-NEXT: addiu $16, $sp, 8 +; O32-SDAG-NEXT: ori $6, $zero, 65520 +; O32-SDAG-NEXT: jal memcpy +; O32-SDAG-NEXT: move $4, $16 +; O32-SDAG-NEXT: addiu $sp, $sp, 16 +; O32-SDAG-NEXT: ori $1, $zero, 65520 +; O32-SDAG-NEXT: subu $sp, $sp, $1 +; O32-SDAG-NEXT: addiu $5, $16, 16 +; O32-SDAG-NEXT: addiu $4, $sp, 16 +; O32-SDAG-NEXT: jal memcpy +; O32-SDAG-NEXT: ori $6, $zero, 65504 +; O32-SDAG-NEXT: lw $7, 20($sp) +; O32-SDAG-NEXT: lw $6, 16($sp) +; O32-SDAG-NEXT: lw $5, 12($sp) +; O32-SDAG-NEXT: jal f2 +; O32-SDAG-NEXT: lw $4, 8($sp) +; O32-SDAG-NEXT: ori $1, $zero, 65520 +; O32-SDAG-NEXT: addu $sp, $sp, $1 +; O32-SDAG-NEXT: lui $1, 1 +; O32-SDAG-NEXT: addu $1, $sp, $1 +; O32-SDAG-NEXT: lw $16, 0($1) # 4-byte Folded Reload +; O32-SDAG-NEXT: lui $1, 1 +; O32-SDAG-NEXT: addu $1, $sp, $1 +; O32-SDAG-NEXT: lw $ra, 4($1) # 4-byte Folded Reload +; O32-SDAG-NEXT: lui $1, 1 +; O32-SDAG-NEXT: addiu $1, $1, 8 +; O32-SDAG-NEXT: jr $ra +; O32-SDAG-NEXT: addu $sp, $sp, $1 +; +; N32-SDAG-LABEL: g2: +; N32-SDAG: # %bb.0: # %entry +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: addiu $1, $1, 16 +; N32-SDAG-NEXT: subu $sp, $sp, $1 +; N32-SDAG-NEXT: .cfi_def_cfa_offset 65552 +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: addu $1, $sp, $1 +; N32-SDAG-NEXT: sd $ra, 8($1) # 8-byte Folded Spill +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: addu $1, $sp, $1 +; N32-SDAG-NEXT: sd $16, 0($1) # 8-byte Folded Spill +; N32-SDAG-NEXT: .cfi_offset 31, -8 +; N32-SDAG-NEXT: .cfi_offset 16, -16 +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: addu $1, $sp, $1 +; N32-SDAG-NEXT: sw $4, -4($1) +; N32-SDAG-NEXT: addiu $16, $sp, 8 +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: addu $1, $sp, $1 +; N32-SDAG-NEXT: lw $5, -4($1) +; N32-SDAG-NEXT: ori $6, $zero, 65520 +; N32-SDAG-NEXT: jal memcpy +; N32-SDAG-NEXT: move $4, $16 +; N32-SDAG-NEXT: addiu $5, $16, 64 +; N32-SDAG-NEXT: ori $1, $zero, 65456 +; N32-SDAG-NEXT: subu $sp, $sp, $1 +; N32-SDAG-NEXT: ori $6, $zero, 65456 +; N32-SDAG-NEXT: jal memcpy +; N32-SDAG-NEXT: move $4, $sp +; N32-SDAG-NEXT: ld $11, 64($sp) +; N32-SDAG-NEXT: ld $10, 56($sp) +; N32-SDAG-NEXT: ld $9, 48($sp) +; N32-SDAG-NEXT: ld $8, 40($sp) +; N32-SDAG-NEXT: ld $7, 32($sp) +; N32-SDAG-NEXT: ld $6, 24($sp) +; N32-SDAG-NEXT: ld $5, 16($sp) +; N32-SDAG-NEXT: jal f2 +; N32-SDAG-NEXT: ld $4, 8($sp) +; N32-SDAG-NEXT: ori $1, $zero, 65456 +; 
N32-SDAG-NEXT: addu $sp, $sp, $1 +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: addu $1, $sp, $1 +; N32-SDAG-NEXT: ld $16, 0($1) # 8-byte Folded Reload +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: addu $1, $sp, $1 +; N32-SDAG-NEXT: ld $ra, 8($1) # 8-byte Folded Reload +; N32-SDAG-NEXT: lui $1, 1 +; N32-SDAG-NEXT: addiu $1, $1, 16 +; N32-SDAG-NEXT: jr $ra +; N32-SDAG-NEXT: addu $sp, $sp, $1 +; +; N64-SDAG-LABEL: g2: +; N64-SDAG: # %bb.0: # %entry +; N64-SDAG-NEXT: lui $1, 1 +; N64-SDAG-NEXT: daddiu $1, $1, 16 +; N64-SDAG-NEXT: dsubu $sp, $sp, $1 +; N64-SDAG-NEXT: .cfi_def_cfa_offset 65552 +; N64-SDAG-NEXT: lui $1, 1 +; N64-SDAG-NEXT: daddu $1, $sp, $1 +; N64-SDAG-NEXT: sd $ra, 8($1) # 8-byte Folded Spill +; N64-SDAG-NEXT: lui $1, 1 +; N64-SDAG-NEXT: daddu $1, $sp, $1 +; N64-SDAG-NEXT: sd $16, 0($1) # 8-byte Folded Spill +; N64-SDAG-NEXT: .cfi_offset 31, -8 +; N64-SDAG-NEXT: .cfi_offset 16, -16 +; N64-SDAG-NEXT: move $5, $4 +; N64-SDAG-NEXT: lui $1, 1 +; N64-SDAG-NEXT: daddu $1, $sp, $1 +; N64-SDAG-NEXT: sd $4, -8($1) +; N64-SDAG-NEXT: daddiu $16, $sp, 8 +; N64-SDAG-NEXT: ori $6, $zero, 65520 +; N64-SDAG-NEXT: jal memcpy +; N64-SDAG-NEXT: move $4, $16 +; N64-SDAG-NEXT: ori $1, $zero, 65456 +; N64-SDAG-NEXT: dsubu $sp, $sp, $1 +; N64-SDAG-NEXT: daddiu $5, $16, 64 +; N64-SDAG-NEXT: ori $6, $zero, 65456 +; N64-SDAG-NEXT: jal memcpy +; N64-SDAG-NEXT: move $4, $sp +; N64-SDAG-NEXT: ld $11, 64($sp) +; N64-SDAG-NEXT: ld $10, 56($sp) +; N64-SDAG-NEXT: ld $9, 48($sp) +; N64-SDAG-NEXT: ld $8, 40($sp) +; N64-SDAG-NEXT: ld $7, 32($sp) +; N64-SDAG-NEXT: ld $6, 24($sp) +; N64-SDAG-NEXT: ld $5, 16($sp) +; N64-SDAG-NEXT: jal f2 +; N64-SDAG-NEXT: ld $4, 8($sp) +; N64-SDAG-NEXT: ori $1, $zero, 65456 +; N64-SDAG-NEXT: daddu $sp, $sp, $1 +; N64-SDAG-NEXT: lui $1, 1 +; N64-SDAG-NEXT: daddu $1, $sp, $1 +; N64-SDAG-NEXT: ld $16, 0($1) # 8-byte Folded Reload +; N64-SDAG-NEXT: lui $1, 1 +; N64-SDAG-NEXT: daddu $1, $sp, $1 +; N64-SDAG-NEXT: ld $ra, 8($1) # 8-byte Folded Reload +; N64-SDAG-NEXT: lui $1, 1 +; N64-SDAG-NEXT: daddiu $1, $1, 16 +; N64-SDAG-NEXT: jr $ra +; N64-SDAG-NEXT: daddu $sp, $sp, $1 entry: %a.addr = alloca ptr %byval-temp = alloca %struct.S1, align 8 @@ -386,8 +636,10 @@ ; N32-NEXT: .cfi_def_cfa_offset 16 ; N32-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill ; N32-NEXT: .cfi_offset 31, -8 -; N32-NEXT: sw $5, 0($sp) ; N32-NEXT: sw $4, 4($sp) +; N32-NEXT: sw $5, 0($sp) +; N32-NEXT: lw $4, 4($sp) +; N32-NEXT: lw $5, 0($sp) ; N32-NEXT: jal memcpy ; N32-NEXT: ori $6, $zero, 65520 ; N32-NEXT: addiu $2, $zero, 4 @@ -409,6 +661,53 @@ ; N64-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload ; N64-NEXT: jr $ra ; N64-NEXT: daddiu $sp, $sp, 32 +; +; O32-SDAG-LABEL: g3: +; O32-SDAG: # %bb.0: # %entry +; O32-SDAG-NEXT: addiu $sp, $sp, -32 +; O32-SDAG-NEXT: .cfi_def_cfa_offset 32 +; O32-SDAG-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; O32-SDAG-NEXT: .cfi_offset 31, -4 +; O32-SDAG-NEXT: sw $5, 20($sp) +; O32-SDAG-NEXT: sw $4, 24($sp) +; O32-SDAG-NEXT: jal memcpy +; O32-SDAG-NEXT: ori $6, $zero, 65520 +; O32-SDAG-NEXT: addiu $2, $zero, 4 +; O32-SDAG-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload +; O32-SDAG-NEXT: jr $ra +; O32-SDAG-NEXT: addiu $sp, $sp, 32 +; +; N32-SDAG-LABEL: g3: +; N32-SDAG: # %bb.0: # %entry +; N32-SDAG-NEXT: addiu $sp, $sp, -16 +; N32-SDAG-NEXT: .cfi_def_cfa_offset 16 +; N32-SDAG-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; N32-SDAG-NEXT: .cfi_offset 31, -8 +; N32-SDAG-NEXT: sw $4, 4($sp) +; N32-SDAG-NEXT: sw $5, 0($sp) +; N32-SDAG-NEXT: lw $4, 4($sp) +; N32-SDAG-NEXT: lw $5, 0($sp) +; N32-SDAG-NEXT: 
jal memcpy +; N32-SDAG-NEXT: ori $6, $zero, 65520 +; N32-SDAG-NEXT: addiu $2, $zero, 4 +; N32-SDAG-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; N32-SDAG-NEXT: jr $ra +; N32-SDAG-NEXT: addiu $sp, $sp, 16 +; +; N64-SDAG-LABEL: g3: +; N64-SDAG: # %bb.0: # %entry +; N64-SDAG-NEXT: daddiu $sp, $sp, -32 +; N64-SDAG-NEXT: .cfi_def_cfa_offset 32 +; N64-SDAG-NEXT: sd $ra, 24($sp) # 8-byte Folded Spill +; N64-SDAG-NEXT: .cfi_offset 31, -8 +; N64-SDAG-NEXT: sd $5, 8($sp) +; N64-SDAG-NEXT: sd $4, 16($sp) +; N64-SDAG-NEXT: jal memcpy +; N64-SDAG-NEXT: ori $6, $zero, 65520 +; N64-SDAG-NEXT: addiu $2, $zero, 4 +; N64-SDAG-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload +; N64-SDAG-NEXT: jr $ra +; N64-SDAG-NEXT: daddiu $sp, $sp, 32 entry: %a.addr = alloca ptr %b.addr = alloca ptr diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll --- a/llvm/test/CodeGen/Mips/cconv/vector.ll +++ b/llvm/test/CodeGen/Mips/cconv/vector.ll @@ -637,18 +637,16 @@ ; MIPS64: # %bb.0: ; MIPS64-NEXT: dsrl $1, $5, 48 ; MIPS64-NEXT: sll $1, $1, 0 -; MIPS64-NEXT: dsrl $2, $4, 48 -; MIPS64-NEXT: sll $2, $2, 0 -; MIPS64-NEXT: addu $1, $2, $1 -; MIPS64-NEXT: dsrl $2, $5, 56 -; MIPS64-NEXT: andi $1, $1, 255 -; MIPS64-NEXT: sll $2, $2, 0 -; MIPS64-NEXT: dsrl $3, $4, 56 +; MIPS64-NEXT: srl $2, $1, 8 +; MIPS64-NEXT: dsrl $3, $4, 48 ; MIPS64-NEXT: sll $3, $3, 0 -; MIPS64-NEXT: addu $2, $3, $2 +; MIPS64-NEXT: srl $6, $3, 8 +; MIPS64-NEXT: addu $2, $6, $2 +; MIPS64-NEXT: addu $1, $3, $1 +; MIPS64-NEXT: andi $1, $1, 255 ; MIPS64-NEXT: dsrl $3, $5, 40 -; MIPS64-NEXT: sll $2, $2, 24 ; MIPS64-NEXT: sll $1, $1, 16 +; MIPS64-NEXT: sll $2, $2, 24 ; MIPS64-NEXT: sll $3, $3, 0 ; MIPS64-NEXT: dsrl $6, $4, 40 ; MIPS64-NEXT: sll $6, $6, 0 @@ -1001,93 +999,89 @@ ; ; MIPS64-LABEL: i8_16: ; MIPS64: # %bb.0: -; MIPS64-NEXT: sll $1, $6, 0 -; MIPS64-NEXT: dsrl $2, $6, 56 -; MIPS64-NEXT: dsrl $3, $6, 48 -; MIPS64-NEXT: dsrl $8, $4, 48 -; MIPS64-NEXT: srl $9, $1, 16 -; MIPS64-NEXT: sll $10, $4, 0 -; MIPS64-NEXT: srl $11, $10, 16 -; MIPS64-NEXT: dsrl $12, $7, 56 -; MIPS64-NEXT: addu $13, $10, $1 -; MIPS64-NEXT: addu $9, $11, $9 -; MIPS64-NEXT: sll $2, $2, 0 -; MIPS64-NEXT: dsrl $11, $7, 48 -; MIPS64-NEXT: srl $14, $1, 8 -; MIPS64-NEXT: srl $15, $10, 8 -; MIPS64-NEXT: addu $14, $15, $14 -; MIPS64-NEXT: dsrl $15, $4, 56 -; MIPS64-NEXT: dsrl $24, $7, 40 +; MIPS64-NEXT: dsrl $1, $6, 48 +; MIPS64-NEXT: sll $2, $6, 0 +; MIPS64-NEXT: dsrl $3, $4, 48 ; MIPS64-NEXT: sll $3, $3, 0 -; MIPS64-NEXT: sll $8, $8, 0 -; MIPS64-NEXT: sll $15, $15, 0 -; MIPS64-NEXT: andi $9, $9, 255 -; MIPS64-NEXT: addu $2, $15, $2 -; MIPS64-NEXT: andi $13, $13, 255 -; MIPS64-NEXT: sll $14, $14, 8 -; MIPS64-NEXT: addu $3, $8, $3 -; MIPS64-NEXT: sll $8, $11, 0 -; MIPS64-NEXT: srl $1, $1, 24 -; MIPS64-NEXT: sll $11, $12, 0 -; MIPS64-NEXT: dsrl $12, $5, 56 +; MIPS64-NEXT: sll $1, $1, 0 +; MIPS64-NEXT: srl $8, $2, 8 +; MIPS64-NEXT: dsrl $9, $7, 48 +; MIPS64-NEXT: srl $10, $2, 16 +; MIPS64-NEXT: sll $11, $4, 0 +; MIPS64-NEXT: srl $12, $11, 16 +; MIPS64-NEXT: addu $13, $11, $2 +; MIPS64-NEXT: addu $10, $12, $10 +; MIPS64-NEXT: srl $12, $11, 8 +; MIPS64-NEXT: addu $8, $12, $8 +; MIPS64-NEXT: srl $12, $1, 8 +; MIPS64-NEXT: srl $14, $3, 8 +; MIPS64-NEXT: sll $9, $9, 0 ; MIPS64-NEXT: dsrl $15, $5, 48 -; MIPS64-NEXT: andi $3, $3, 255 -; MIPS64-NEXT: dsrl $25, $6, 40 ; MIPS64-NEXT: sll $15, $15, 0 -; MIPS64-NEXT: srl $10, $10, 24 -; MIPS64-NEXT: sll $12, $12, 0 -; MIPS64-NEXT: or $13, $13, $14 -; MIPS64-NEXT: sll $14, $24, 0 +; MIPS64-NEXT: srl $2, $2, 24 +; MIPS64-NEXT: 
srl $11, $11, 24 +; MIPS64-NEXT: andi $10, $10, 255 +; MIPS64-NEXT: addu $2, $11, $2 +; MIPS64-NEXT: addu $11, $15, $9 +; MIPS64-NEXT: addu $12, $14, $12 +; MIPS64-NEXT: andi $13, $13, 255 +; MIPS64-NEXT: sll $8, $8, 8 +; MIPS64-NEXT: addu $1, $3, $1 +; MIPS64-NEXT: srl $3, $9, 8 +; MIPS64-NEXT: dsrl $9, $7, 40 +; MIPS64-NEXT: dsrl $14, $6, 40 +; MIPS64-NEXT: srl $15, $15, 8 +; MIPS64-NEXT: or $8, $13, $8 +; MIPS64-NEXT: sll $9, $9, 0 +; MIPS64-NEXT: sll $12, $12, 24 +; MIPS64-NEXT: addu $3, $15, $3 +; MIPS64-NEXT: andi $11, $11, 255 ; MIPS64-NEXT: sll $2, $2, 24 -; MIPS64-NEXT: addu $11, $12, $11 -; MIPS64-NEXT: sll $9, $9, 16 -; MIPS64-NEXT: addu $1, $10, $1 -; MIPS64-NEXT: addu $8, $15, $8 -; MIPS64-NEXT: sll $10, $25, 0 -; MIPS64-NEXT: dsrl $12, $4, 40 -; MIPS64-NEXT: sll $12, $12, 0 -; MIPS64-NEXT: addu $10, $12, $10 -; MIPS64-NEXT: sll $3, $3, 16 -; MIPS64-NEXT: andi $8, $8, 255 -; MIPS64-NEXT: sll $1, $1, 24 -; MIPS64-NEXT: dsrl $12, $5, 40 -; MIPS64-NEXT: sll $12, $12, 0 +; MIPS64-NEXT: sll $10, $10, 16 +; MIPS64-NEXT: andi $1, $1, 255 +; MIPS64-NEXT: dsrl $13, $5, 40 +; MIPS64-NEXT: sll $13, $13, 0 +; MIPS64-NEXT: sll $14, $14, 0 +; MIPS64-NEXT: dsrl $15, $4, 40 +; MIPS64-NEXT: sll $15, $15, 0 +; MIPS64-NEXT: addu $14, $15, $14 +; MIPS64-NEXT: sll $1, $1, 16 ; MIPS64-NEXT: dsrl $6, $6, 32 -; MIPS64-NEXT: or $1, $1, $9 -; MIPS64-NEXT: addu $9, $12, $14 -; MIPS64-NEXT: sll $11, $11, 24 -; MIPS64-NEXT: sll $8, $8, 16 -; MIPS64-NEXT: dsrl $12, $7, 32 -; MIPS64-NEXT: andi $13, $13, 65535 -; MIPS64-NEXT: or $2, $2, $3 -; MIPS64-NEXT: sll $3, $10, 8 +; MIPS64-NEXT: or $2, $2, $10 +; MIPS64-NEXT: addu $9, $13, $9 +; MIPS64-NEXT: sll $10, $11, 16 +; MIPS64-NEXT: sll $3, $3, 24 +; MIPS64-NEXT: dsrl $11, $7, 32 +; MIPS64-NEXT: andi $8, $8, 65535 +; MIPS64-NEXT: or $1, $12, $1 +; MIPS64-NEXT: sll $12, $14, 8 ; MIPS64-NEXT: sll $6, $6, 0 ; MIPS64-NEXT: dsrl $4, $4, 32 ; MIPS64-NEXT: sll $4, $4, 0 ; MIPS64-NEXT: addu $4, $4, $6 ; MIPS64-NEXT: andi $4, $4, 255 -; MIPS64-NEXT: or $3, $4, $3 -; MIPS64-NEXT: andi $3, $3, 65535 -; MIPS64-NEXT: or $2, $3, $2 -; MIPS64-NEXT: or $1, $13, $1 -; MIPS64-NEXT: or $3, $11, $8 +; MIPS64-NEXT: or $4, $4, $12 +; MIPS64-NEXT: andi $4, $4, 65535 +; MIPS64-NEXT: or $1, $4, $1 +; MIPS64-NEXT: or $2, $8, $2 +; MIPS64-NEXT: or $3, $3, $10 ; MIPS64-NEXT: sll $4, $9, 8 -; MIPS64-NEXT: sll $6, $12, 0 +; MIPS64-NEXT: sll $6, $11, 0 ; MIPS64-NEXT: dsrl $8, $5, 32 ; MIPS64-NEXT: sll $8, $8, 0 ; MIPS64-NEXT: addu $6, $8, $6 ; MIPS64-NEXT: andi $6, $6, 255 ; MIPS64-NEXT: or $4, $6, $4 ; MIPS64-NEXT: andi $4, $4, 65535 -; MIPS64-NEXT: dsll $1, $1, 32 +; MIPS64-NEXT: dsll $2, $2, 32 ; MIPS64-NEXT: or $3, $4, $3 ; MIPS64-NEXT: sll $4, $7, 0 ; MIPS64-NEXT: srl $6, $4, 24 ; MIPS64-NEXT: sll $5, $5, 0 ; MIPS64-NEXT: srl $7, $5, 24 ; MIPS64-NEXT: addu $8, $5, $4 -; MIPS64-NEXT: dsll $2, $2, 32 -; MIPS64-NEXT: dsrl $1, $1, 32 +; MIPS64-NEXT: dsll $1, $1, 32 +; MIPS64-NEXT: dsrl $2, $2, 32 ; MIPS64-NEXT: addu $6, $7, $6 ; MIPS64-NEXT: sll $6, $6, 24 ; MIPS64-NEXT: srl $7, $4, 16 @@ -1095,7 +1089,7 @@ ; MIPS64-NEXT: addu $7, $9, $7 ; MIPS64-NEXT: andi $7, $7, 255 ; MIPS64-NEXT: sll $7, $7, 16 -; MIPS64-NEXT: or $2, $1, $2 +; MIPS64-NEXT: or $2, $2, $1 ; MIPS64-NEXT: dsll $1, $3, 32 ; MIPS64-NEXT: or $3, $6, $7 ; MIPS64-NEXT: andi $6, $8, 255 diff --git a/llvm/test/CodeGen/Mips/dins.ll b/llvm/test/CodeGen/Mips/dins.ll --- a/llvm/test/CodeGen/Mips/dins.ll +++ b/llvm/test/CodeGen/Mips/dins.ll @@ -37,19 +37,23 @@ ; MIPS64R2-NEXT: daddiu $sp, $sp, -16 ; MIPS64R2-NEXT: .cfi_def_cfa_offset 
16 ; MIPS64R2-NEXT: sd $4, 8($sp) -; MIPS64R2-NEXT: sd $5, 0($sp) ; MIPS64R2-NEXT: daddiu $1, $zero, 123 +; MIPS64R2-NEXT: sd $5, 0($sp) +; MIPS64R2-NEXT: dsll $1, $1, 27 ; MIPS64R2-NEXT: ld $2, 8($sp) -; MIPS64R2-NEXT: dinsm $2, $1, 27, 37 -; MIPS64R2-NEXT: sd $2, 8($sp) -; MIPS64R2-NEXT: daddiu $1, $zero, 4 -; MIPS64R2-NEXT: ld $2, 0($sp) -; MIPS64R2-NEXT: dinsm $2, $1, 28, 6 +; MIPS64R2-NEXT: dext $2, $2, 0, 27 +; MIPS64R2-NEXT: or $1, $2, $1 +; MIPS64R2-NEXT: sd $1, 8($sp) ; MIPS64R2-NEXT: daddiu $1, $zero, 5 -; MIPS64R2-NEXT: sd $2, 0($sp) +; MIPS64R2-NEXT: daddiu $2, $zero, 4 +; MIPS64R2-NEXT: ld $3, 0($sp) +; MIPS64R2-NEXT: dinsm $3, $2, 28, 6 +; MIPS64R2-NEXT: dsll $1, $1, 50 +; MIPS64R2-NEXT: sd $3, 0($sp) ; MIPS64R2-NEXT: ld $2, 0($sp) -; MIPS64R2-NEXT: dinsu $2, $1, 50, 14 -; MIPS64R2-NEXT: sd $2, 0($sp) +; MIPS64R2-NEXT: dextm $2, $2, 0, 50 +; MIPS64R2-NEXT: or $1, $2, $1 +; MIPS64R2-NEXT: sd $1, 0($sp) ; MIPS64R2-NEXT: ld $1, 0($sp) ; MIPS64R2-NEXT: dsrl $1, $1, 50 ; MIPS64R2-NEXT: ld $2, 0($sp) @@ -181,19 +185,23 @@ ; MIPS64R2N32-NEXT: addiu $sp, $sp, -16 ; MIPS64R2N32-NEXT: .cfi_def_cfa_offset 16 ; MIPS64R2N32-NEXT: sd $4, 8($sp) -; MIPS64R2N32-NEXT: sd $5, 0($sp) ; MIPS64R2N32-NEXT: daddiu $1, $zero, 123 +; MIPS64R2N32-NEXT: sd $5, 0($sp) +; MIPS64R2N32-NEXT: dsll $1, $1, 27 ; MIPS64R2N32-NEXT: ld $2, 8($sp) -; MIPS64R2N32-NEXT: dinsm $2, $1, 27, 37 -; MIPS64R2N32-NEXT: sd $2, 8($sp) -; MIPS64R2N32-NEXT: daddiu $1, $zero, 4 -; MIPS64R2N32-NEXT: ld $2, 0($sp) -; MIPS64R2N32-NEXT: dinsm $2, $1, 28, 6 +; MIPS64R2N32-NEXT: dext $2, $2, 0, 27 +; MIPS64R2N32-NEXT: or $1, $2, $1 +; MIPS64R2N32-NEXT: sd $1, 8($sp) ; MIPS64R2N32-NEXT: daddiu $1, $zero, 5 -; MIPS64R2N32-NEXT: sd $2, 0($sp) +; MIPS64R2N32-NEXT: daddiu $2, $zero, 4 +; MIPS64R2N32-NEXT: ld $3, 0($sp) +; MIPS64R2N32-NEXT: dinsm $3, $2, 28, 6 +; MIPS64R2N32-NEXT: dsll $1, $1, 50 +; MIPS64R2N32-NEXT: sd $3, 0($sp) ; MIPS64R2N32-NEXT: ld $2, 0($sp) -; MIPS64R2N32-NEXT: dinsu $2, $1, 50, 14 -; MIPS64R2N32-NEXT: sd $2, 0($sp) +; MIPS64R2N32-NEXT: dextm $2, $2, 0, 50 +; MIPS64R2N32-NEXT: or $1, $2, $1 +; MIPS64R2N32-NEXT: sd $1, 0($sp) ; MIPS64R2N32-NEXT: ld $1, 0($sp) ; MIPS64R2N32-NEXT: dsrl $1, $1, 50 ; MIPS64R2N32-NEXT: ld $2, 0($sp) diff --git a/llvm/test/CodeGen/Mips/load-store-left-right.ll b/llvm/test/CodeGen/Mips/load-store-left-right.ll --- a/llvm/test/CodeGen/Mips/load-store-left-right.ll +++ b/llvm/test/CodeGen/Mips/load-store-left-right.ll @@ -670,11 +670,11 @@ ; MIPS32-NEXT: addiu $2, $2, %lo(_gp_disp) ; MIPS32-NEXT: addu $1, $2, $25 ; MIPS32-NEXT: lw $1, %got(struct_s0)($1) -; MIPS32-NEXT: lbu $2, 0($1) -; MIPS32-NEXT: sb $2, 2($1) ; MIPS32-NEXT: lbu $2, 1($1) -; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: sb $2, 3($1) +; MIPS32-NEXT: lbu $2, 0($1) +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: sb $2, 2($1) ; ; MIPS32R6-LABEL: copy_struct_S0: ; MIPS32R6: # %bb.0: # %entry @@ -692,11 +692,11 @@ ; MIPS64-NEXT: daddu $1, $1, $25 ; MIPS64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(copy_struct_S0))) ; MIPS64-NEXT: ld $1, %got_disp(struct_s0)($1) -; MIPS64-NEXT: lbu $2, 0($1) -; MIPS64-NEXT: sb $2, 2($1) ; MIPS64-NEXT: lbu $2, 1($1) -; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: sb $2, 3($1) +; MIPS64-NEXT: lbu $2, 0($1) +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: sb $2, 2($1) ; ; MIPS64R6-LABEL: copy_struct_S0: ; MIPS64R6: # %bb.0: # %entry @@ -818,15 +818,15 @@ ; MIPS32-EL-NEXT: addiu $2, $2, %lo(_gp_disp) ; MIPS32-EL-NEXT: addu $1, $2, $25 ; MIPS32-EL-NEXT: lw $1, %got(struct_s2)($1) -; MIPS32-EL-NEXT: lwl $2, 3($1) -; MIPS32-EL-NEXT: lwr $2, 0($1) 
-; MIPS32-EL-NEXT: swl $2, 11($1) -; MIPS32-EL-NEXT: lwl $3, 7($1) -; MIPS32-EL-NEXT: lwr $3, 4($1) -; MIPS32-EL-NEXT: swl $3, 15($1) -; MIPS32-EL-NEXT: swr $2, 8($1) +; MIPS32-EL-NEXT: lwl $2, 7($1) +; MIPS32-EL-NEXT: lwr $2, 4($1) +; MIPS32-EL-NEXT: swl $2, 15($1) +; MIPS32-EL-NEXT: lwl $3, 3($1) +; MIPS32-EL-NEXT: lwr $3, 0($1) +; MIPS32-EL-NEXT: swl $3, 11($1) +; MIPS32-EL-NEXT: swr $2, 12($1) ; MIPS32-EL-NEXT: jr $ra -; MIPS32-EL-NEXT: swr $3, 12($1) +; MIPS32-EL-NEXT: swr $3, 8($1) ; ; MIPS32-EB-LABEL: copy_struct_S2: ; MIPS32-EB: # %bb.0: # %entry @@ -834,15 +834,15 @@ ; MIPS32-EB-NEXT: addiu $2, $2, %lo(_gp_disp) ; MIPS32-EB-NEXT: addu $1, $2, $25 ; MIPS32-EB-NEXT: lw $1, %got(struct_s2)($1) -; MIPS32-EB-NEXT: lwl $2, 0($1) -; MIPS32-EB-NEXT: lwr $2, 3($1) -; MIPS32-EB-NEXT: swl $2, 8($1) -; MIPS32-EB-NEXT: lwl $3, 4($1) -; MIPS32-EB-NEXT: lwr $3, 7($1) -; MIPS32-EB-NEXT: swl $3, 12($1) -; MIPS32-EB-NEXT: swr $2, 11($1) +; MIPS32-EB-NEXT: lwl $2, 4($1) +; MIPS32-EB-NEXT: lwr $2, 7($1) +; MIPS32-EB-NEXT: swl $2, 12($1) +; MIPS32-EB-NEXT: lwl $3, 0($1) +; MIPS32-EB-NEXT: lwr $3, 3($1) +; MIPS32-EB-NEXT: swl $3, 8($1) +; MIPS32-EB-NEXT: swr $2, 15($1) ; MIPS32-EB-NEXT: jr $ra -; MIPS32-EB-NEXT: swr $3, 15($1) +; MIPS32-EB-NEXT: swr $3, 11($1) ; ; MIPS32R6-LABEL: copy_struct_S2: ; MIPS32R6: # %bb.0: # %entry @@ -850,11 +850,11 @@ ; MIPS32R6-NEXT: addiu $2, $2, %lo(_gp_disp) ; MIPS32R6-NEXT: addu $1, $2, $25 ; MIPS32R6-NEXT: lw $1, %got(struct_s2)($1) -; MIPS32R6-NEXT: lw $2, 0($1) -; MIPS32R6-NEXT: sw $2, 8($1) ; MIPS32R6-NEXT: lw $2, 4($1) -; MIPS32R6-NEXT: jr $ra ; MIPS32R6-NEXT: sw $2, 12($1) +; MIPS32R6-NEXT: lw $2, 0($1) +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: sw $2, 8($1) ; ; MIPS64-EL-LABEL: copy_struct_S2: ; MIPS64-EL: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/Mips/micromips-sizereduction/micromips-lbu16-lhu16-sb16-sh16.ll b/llvm/test/CodeGen/Mips/micromips-sizereduction/micromips-lbu16-lhu16-sb16-sh16.ll --- a/llvm/test/CodeGen/Mips/micromips-sizereduction/micromips-lbu16-lhu16-sb16-sh16.ll +++ b/llvm/test/CodeGen/Mips/micromips-sizereduction/micromips-lbu16-lhu16-sb16-sh16.ll @@ -30,7 +30,7 @@ define void @f2(ptr %p) { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lhu16 $2, 0($4) +; CHECK-NEXT: lbu16 $2, 0($4) ; CHECK-NEXT: andi16 $2, $2, 2 ; CHECK-NEXT: bnezc $2, $BB1_2 ; CHECK-NEXT: # %bb.1: # %if.then diff --git a/llvm/test/CodeGen/Mips/mips64-f128.ll b/llvm/test/CodeGen/Mips/mips64-f128.ll --- a/llvm/test/CodeGen/Mips/mips64-f128.ll +++ b/llvm/test/CodeGen/Mips/mips64-f128.ll @@ -1986,13 +1986,16 @@ ; CMP_CC_FMT-NEXT: lui $1, %hi(%neg(%gp_rel(libcall2_copysignl))) ; CMP_CC_FMT-NEXT: daddu $1, $1, $25 ; CMP_CC_FMT-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(libcall2_copysignl))) -; CMP_CC_FMT-NEXT: ld $2, %got_disp(gld0)($1) -; CMP_CC_FMT-NEXT: ld $4, 8($2) -; CMP_CC_FMT-NEXT: ld $1, %got_disp(gld1)($1) -; CMP_CC_FMT-NEXT: ld $1, 8($1) -; CMP_CC_FMT-NEXT: dsrl $1, $1, 63 -; CMP_CC_FMT-NEXT: dinsu $4, $1, 63, 1 -; CMP_CC_FMT-NEXT: ld $2, 0($2) +; CMP_CC_FMT-NEXT: daddiu $2, $zero, 1 +; CMP_CC_FMT-NEXT: dsll $2, $2, 63 +; CMP_CC_FMT-NEXT: ld $3, %got_disp(gld1)($1) +; CMP_CC_FMT-NEXT: ld $3, 8($3) +; CMP_CC_FMT-NEXT: and $2, $3, $2 +; CMP_CC_FMT-NEXT: ld $1, %got_disp(gld0)($1) +; CMP_CC_FMT-NEXT: ld $3, 8($1) +; CMP_CC_FMT-NEXT: dextm $3, $3, 0, 63 +; CMP_CC_FMT-NEXT: or $4, $3, $2 +; CMP_CC_FMT-NEXT: ld $2, 0($1) ; CMP_CC_FMT-NEXT: jrc $ra entry: %0 = load fp128, ptr @gld0, align 16 diff --git a/llvm/test/CodeGen/Mips/o32_cc_byval.ll 
b/llvm/test/CodeGen/Mips/o32_cc_byval.ll --- a/llvm/test/CodeGen/Mips/o32_cc_byval.ll +++ b/llvm/test/CodeGen/Mips/o32_cc_byval.ll @@ -190,9 +190,9 @@ ; CHECK-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill ; CHECK-NEXT: addu $gp, $2, $25 ; CHECK-NEXT: move $4, $7 +; CHECK-NEXT: sw $7, 60($sp) ; CHECK-NEXT: sw $6, 56($sp) ; CHECK-NEXT: sw $5, 52($sp) -; CHECK-NEXT: sw $7, 60($sp) ; CHECK-NEXT: lw $1, 80($sp) ; CHECK-NEXT: sll $2, $5, 24 ; CHECK-NEXT: sra $2, $2, 24 diff --git a/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll @@ -79,24 +79,36 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; MIPSEL-LABEL: test_urem_odd_setne: ; MIPSEL: # %bb.0: -; MIPSEL-NEXT: sll $1, $4, 1 -; MIPSEL-NEXT: addu $1, $1, $4 -; MIPSEL-NEXT: negu $1, $1 +; MIPSEL-NEXT: andi $1, $4, 15 +; MIPSEL-NEXT: sll $2, $1, 1 +; MIPSEL-NEXT: addu $1, $2, $1 +; MIPSEL-NEXT: sll $2, $4, 4 +; MIPSEL-NEXT: subu $1, $2, $1 +; MIPSEL-NEXT: srl $1, $1, 4 +; MIPSEL-NEXT: andi $1, $1, 12 +; MIPSEL-NEXT: srl $2, $1, 2 +; MIPSEL-NEXT: or $1, $1, $2 +; MIPSEL-NEXT: subu $1, $4, $1 ; MIPSEL-NEXT: andi $1, $1, 15 -; MIPSEL-NEXT: addiu $2, $zero, 3 ; MIPSEL-NEXT: jr $ra -; MIPSEL-NEXT: sltu $2, $2, $1 +; MIPSEL-NEXT: sltu $2, $zero, $1 ; ; MIPS64EL-LABEL: test_urem_odd_setne: ; MIPS64EL: # %bb.0: ; MIPS64EL-NEXT: sll $1, $4, 0 -; MIPS64EL-NEXT: sll $2, $1, 1 -; MIPS64EL-NEXT: addu $1, $2, $1 -; MIPS64EL-NEXT: negu $1, $1 +; MIPS64EL-NEXT: andi $2, $1, 15 +; MIPS64EL-NEXT: sll $3, $2, 1 +; MIPS64EL-NEXT: addu $2, $3, $2 +; MIPS64EL-NEXT: sll $3, $1, 4 +; MIPS64EL-NEXT: subu $2, $3, $2 +; MIPS64EL-NEXT: srl $2, $2, 4 +; MIPS64EL-NEXT: andi $2, $2, 12 +; MIPS64EL-NEXT: srl $3, $2, 2 +; MIPS64EL-NEXT: or $2, $2, $3 +; MIPS64EL-NEXT: subu $1, $1, $2 ; MIPS64EL-NEXT: andi $1, $1, 15 -; MIPS64EL-NEXT: addiu $2, $zero, 3 ; MIPS64EL-NEXT: jr $ra -; MIPS64EL-NEXT: sltu $2, $2, $1 +; MIPS64EL-NEXT: sltu $2, $zero, $1 %urem = urem i4 %X, 5 %cmp = icmp ne i4 %urem, 0 ret i1 %cmp @@ -159,28 +171,28 @@ ; MIPSEL-NEXT: lui $9, 12057 ; MIPSEL-NEXT: ori $9, $9, 37186 ; MIPSEL-NEXT: multu $6, $9 -; MIPSEL-NEXT: mflo $10 -; MIPSEL-NEXT: mfhi $11 +; MIPSEL-NEXT: mflo $9 +; MIPSEL-NEXT: mfhi $10 ; MIPSEL-NEXT: addu $2, $8, $2 -; MIPSEL-NEXT: addu $12, $10, $2 +; MIPSEL-NEXT: addu $11, $9, $2 ; MIPSEL-NEXT: sltu $2, $2, $8 ; MIPSEL-NEXT: addu $2, $7, $2 -; MIPSEL-NEXT: sltu $7, $12, $10 -; MIPSEL-NEXT: sll $8, $12, 31 -; MIPSEL-NEXT: srl $10, $12, 1 -; MIPSEL-NEXT: sll $12, $3, 1 +; MIPSEL-NEXT: sltu $7, $11, $9 +; MIPSEL-NEXT: sll $8, $11, 31 +; MIPSEL-NEXT: srl $9, $11, 1 +; MIPSEL-NEXT: sll $11, $3, 1 ; MIPSEL-NEXT: srl $3, $3, 1 ; MIPSEL-NEXT: mul $1, $4, $1 -; MIPSEL-NEXT: mul $4, $5, $9 -; MIPSEL-NEXT: sll $5, $6, 1 +; MIPSEL-NEXT: sll $4, $6, 1 +; MIPSEL-NEXT: sll $5, $5, 1 ; MIPSEL-NEXT: lui $6, 60010 -; MIPSEL-NEXT: addu $7, $11, $7 +; MIPSEL-NEXT: addu $7, $10, $7 ; MIPSEL-NEXT: addu $2, $2, $7 -; MIPSEL-NEXT: addu $2, $4, $2 -; MIPSEL-NEXT: addu $1, $5, $1 +; MIPSEL-NEXT: subu $2, $2, $5 +; MIPSEL-NEXT: addu $1, $4, $1 ; MIPSEL-NEXT: addu $1, $2, $1 ; MIPSEL-NEXT: sll $2, $1, 31 -; MIPSEL-NEXT: or $4, $10, $2 +; MIPSEL-NEXT: or $4, $9, $2 ; MIPSEL-NEXT: sltiu $2, $4, 13 ; MIPSEL-NEXT: xori $4, $4, 13 ; MIPSEL-NEXT: or $3, $3, $8 @@ -189,7 +201,7 @@ ; MIPSEL-NEXT: movz $2, $3, $4 ; MIPSEL-NEXT: andi $1, $1, 2 ; MIPSEL-NEXT: srl $1, $1, 1 -; MIPSEL-NEXT: or $1, $1, $12 +; MIPSEL-NEXT: 
or $1, $1, $11 ; MIPSEL-NEXT: andi $1, $1, 3 ; MIPSEL-NEXT: jr $ra ; MIPSEL-NEXT: movn $2, $zero, $1 diff --git a/llvm/test/CodeGen/Mips/v2i16tof32.ll b/llvm/test/CodeGen/Mips/v2i16tof32.ll --- a/llvm/test/CodeGen/Mips/v2i16tof32.ll +++ b/llvm/test/CodeGen/Mips/v2i16tof32.ll @@ -17,15 +17,15 @@ ; CHECK-NEXT: .cfi_def_cfa_register 30 ; CHECK-NEXT: addiu $1, $zero, -16 ; CHECK-NEXT: and $sp, $sp, $1 -; CHECK-NEXT: lw $1, 12($4) -; CHECK-NEXT: lw $2, 0($4) -; CHECK-NEXT: lw $3, 8($4) -; CHECK-NEXT: sw $3, 8($sp) -; CHECK-NEXT: sw $1, 12($sp) -; CHECK-NEXT: sw $2, 0($sp) -; CHECK-NEXT: lw $1, 4($4) -; CHECK-NEXT: sw $1, 4($sp) -; CHECK-NEXT: mtc1 $2, $f0 +; CHECK-NEXT: lw $1, 8($4) +; CHECK-NEXT: lw $2, 4($4) +; CHECK-NEXT: lw $3, 12($4) +; CHECK-NEXT: sw $3, 12($sp) +; CHECK-NEXT: sw $1, 8($sp) +; CHECK-NEXT: sw $2, 4($sp) +; CHECK-NEXT: lw $1, 0($4) +; CHECK-NEXT: sw $1, 0($sp) +; CHECK-NEXT: mtc1 $1, $f0 ; CHECK-NEXT: move $sp, $fp ; CHECK-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload ; CHECK-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/andc.ll b/llvm/test/CodeGen/PowerPC/andc.ll --- a/llvm/test/CodeGen/PowerPC/andc.ll +++ b/llvm/test/CodeGen/PowerPC/andc.ll @@ -4,9 +4,11 @@ define i1 @and_cmp1(i32 %x, i32 %y) { ; CHECK-LABEL: and_cmp1: ; CHECK: # %bb.0: -; CHECK-NEXT: andc 3, 4, 3 -; CHECK-NEXT: cntlzw 3, 3 -; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31 +; CHECK-NEXT: and 3, 3, 4 +; CHECK-NEXT: li 5, 0 +; CHECK-NEXT: cmpw 3, 4 +; CHECK-NEXT: li 3, 1 +; CHECK-NEXT: iseleq 3, 3, 5 ; CHECK-NEXT: blr %and = and i32 %x, %y %cmp = icmp eq i32 %and, %y diff --git a/llvm/test/CodeGen/PowerPC/combine_ext_trunc.ll b/llvm/test/CodeGen/PowerPC/combine_ext_trunc.ll --- a/llvm/test/CodeGen/PowerPC/combine_ext_trunc.ll +++ b/llvm/test/CodeGen/PowerPC/combine_ext_trunc.ll @@ -8,9 +8,6 @@ define i32 @pattern1(i32 %x, i32 %y){ ; CHECK-LABEL: pattern1: ; CHECK: # %bb.0: -; CHECK-NEXT: xori 5, 4, 65535 -; CHECK-NEXT: xoris 5, 5, 65535 -; CHECK-NEXT: and 3, 3, 5 ; CHECK-NEXT: or 3, 3, 4 ; CHECK-NEXT: blr %a = xor i32 %y, -1 @@ -23,9 +20,6 @@ define i32 @pattern2(i32 %x, i32 %y){ ; CHECK-LABEL: pattern2: ; CHECK: # %bb.0: -; CHECK-NEXT: xori 5, 4, 65535 -; CHECK-NEXT: xoris 5, 5, 65535 -; CHECK-NEXT: and 3, 5, 3 ; CHECK-NEXT: or 3, 3, 4 ; CHECK-NEXT: blr %a = xor i32 %y, -1 @@ -38,10 +32,8 @@ define i32 @pattern3(i1 %cond, i32 %x) { ; CHECK-LABEL: pattern3: ; CHECK: # %bb.0: -; CHECK-NEXT: li 5, -1 -; CHECK-NEXT: andi. 3, 3, 1 -; CHECK-NEXT: rldic 3, 5, 0, 32 -; CHECK-NEXT: iselgt 3, 0, 3 +; CHECK-NEXT: clrlwi 3, 3, 31 +; CHECK-NEXT: addi 3, 3, -1 ; CHECK-NEXT: and 3, 3, 4 ; CHECK-NEXT: blr %sel = select i1 %cond, i32 0, i32 -1 @@ -53,11 +45,8 @@ define i32 @pattern4(i1 %cond, i32 %x) { ; CHECK-LABEL: pattern4: ; CHECK: # %bb.0: -; CHECK-NEXT: li 5, -1 -; CHECK-NEXT: andi. 
3, 3, 1 -; CHECK-NEXT: rldic 3, 5, 0, 32 -; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: iselgt 3, 3, 5 +; CHECK-NEXT: clrlwi 3, 3, 31 +; CHECK-NEXT: neg 3, 3 ; CHECK-NEXT: or 3, 4, 3 ; CHECK-NEXT: blr %sel = select i1 %cond, i32 -1, i32 0 diff --git a/llvm/test/CodeGen/PowerPC/fma-combine.ll b/llvm/test/CodeGen/PowerPC/fma-combine.ll --- a/llvm/test/CodeGen/PowerPC/fma-combine.ll +++ b/llvm/test/CodeGen/PowerPC/fma-combine.ll @@ -177,10 +177,10 @@ ; CHECK-NEXT: lfs 1, .LCPI4_1@toc@l(3) ; CHECK-NEXT: xvcvsxwdp 0, 34 ; CHECK-NEXT: fmr 4, 0 -; CHECK-NEXT: xsmaddasp 0, 2, 3 -; CHECK-NEXT: xsnmaddasp 4, 2, 3 -; CHECK-NEXT: xsmaddasp 1, 2, 0 -; CHECK-NEXT: xsmaddasp 1, 4, 2 +; CHECK-NEXT: xsnmaddasp 0, 2, 3 +; CHECK-NEXT: xsmaddasp 4, 2, 3 +; CHECK-NEXT: xsmaddasp 1, 2, 4 +; CHECK-NEXT: xsmaddasp 1, 0, 2 ; CHECK-NEXT: blr %tmp = load float, ptr undef, align 4 %tmp2 = load float, ptr undef, align 4 @@ -202,14 +202,14 @@ ; CHECK-FAST-LABEL: getNegatedExpression_crash: ; CHECK-FAST: # %bb.0: ; CHECK-FAST-NEXT: vspltisw 2, -1 +; CHECK-FAST-NEXT: vspltisw 3, 1 ; CHECK-FAST-NEXT: addis 3, 2, .LCPI5_0@toc@ha ; CHECK-FAST-NEXT: lfs 4, .LCPI5_0@toc@l(3) -; CHECK-FAST-NEXT: xvcvsxwdp 3, 34 -; CHECK-FAST-NEXT: xssubdp 0, 1, 3 -; CHECK-FAST-NEXT: # kill: def $f3 killed $f3 killed $vsl3 -; CHECK-FAST-NEXT: xsmaddadp 3, 1, 4 -; CHECK-FAST-NEXT: xsmaddadp 0, 3, 2 -; CHECK-FAST-NEXT: fmr 1, 0 +; CHECK-FAST-NEXT: xvcvsxwdp 0, 34 +; CHECK-FAST-NEXT: xvcvsxwdp 3, 35 +; CHECK-FAST-NEXT: xsmaddadp 0, 1, 4 +; CHECK-FAST-NEXT: xsadddp 1, 1, 3 +; CHECK-FAST-NEXT: xsmaddadp 1, 0, 2 ; CHECK-FAST-NEXT: blr ; ; CHECK-FAST-NOVSX-LABEL: getNegatedExpression_crash: @@ -217,23 +217,25 @@ ; CHECK-FAST-NOVSX-NEXT: addis 3, 2, .LCPI5_0@toc@ha ; CHECK-FAST-NOVSX-NEXT: addis 4, 2, .LCPI5_1@toc@ha ; CHECK-FAST-NOVSX-NEXT: lfs 0, .LCPI5_0@toc@l(3) +; CHECK-FAST-NOVSX-NEXT: addis 3, 2, .LCPI5_2@toc@ha ; CHECK-FAST-NOVSX-NEXT: lfs 3, .LCPI5_1@toc@l(4) -; CHECK-FAST-NOVSX-NEXT: fmadd 3, 1, 3, 0 -; CHECK-FAST-NOVSX-NEXT: fsub 0, 1, 0 -; CHECK-FAST-NOVSX-NEXT: fmadd 1, 3, 2, 0 +; CHECK-FAST-NOVSX-NEXT: lfs 4, .LCPI5_2@toc@l(3) +; CHECK-FAST-NOVSX-NEXT: fmadd 0, 1, 3, 0 +; CHECK-FAST-NOVSX-NEXT: fadd 1, 1, 4 +; CHECK-FAST-NOVSX-NEXT: fmadd 1, 0, 2, 1 ; CHECK-FAST-NOVSX-NEXT: blr ; ; CHECK-LABEL: getNegatedExpression_crash: ; CHECK: # %bb.0: ; CHECK-NEXT: vspltisw 2, -1 +; CHECK-NEXT: vspltisw 3, 1 ; CHECK-NEXT: addis 3, 2, .LCPI5_0@toc@ha ; CHECK-NEXT: lfs 4, .LCPI5_0@toc@l(3) -; CHECK-NEXT: xvcvsxwdp 3, 34 -; CHECK-NEXT: xssubdp 0, 1, 3 -; CHECK-NEXT: # kill: def $f3 killed $f3 killed $vsl3 -; CHECK-NEXT: xsmaddadp 3, 1, 4 -; CHECK-NEXT: xsmaddadp 0, 3, 2 -; CHECK-NEXT: fmr 1, 0 +; CHECK-NEXT: xvcvsxwdp 0, 34 +; CHECK-NEXT: xvcvsxwdp 3, 35 +; CHECK-NEXT: xsmaddadp 0, 1, 4 +; CHECK-NEXT: xsadddp 1, 1, 3 +; CHECK-NEXT: xsmaddadp 1, 0, 2 ; CHECK-NEXT: blr %neg = fneg reassoc double %x %fma = call reassoc nsz double @llvm.fma.f64(double %neg, double 42.0, double -1.0) diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll b/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll --- a/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/PowerPC/funnel-shift-rot.ll @@ -87,16 +87,17 @@ define i64 @rotl_i64(i64 %x, i64 %z) { ; CHECK32_32-LABEL: rotl_i64: ; CHECK32_32: # %bb.0: -; CHECK32_32-NEXT: andi. 5, 6, 32 +; CHECK32_32-NEXT: srwi 5, 6, 5 +; CHECK32_32-NEXT: andi. 
5, 5, 1 ; CHECK32_32-NEXT: clrlwi 5, 6, 27 ; CHECK32_32-NEXT: subfic 6, 5, 32 -; CHECK32_32-NEXT: bc 12, 2, .LBB4_2 +; CHECK32_32-NEXT: bc 12, 1, .LBB4_2 ; CHECK32_32-NEXT: # %bb.1: -; CHECK32_32-NEXT: ori 7, 3, 0 -; CHECK32_32-NEXT: ori 3, 4, 0 +; CHECK32_32-NEXT: ori 7, 4, 0 ; CHECK32_32-NEXT: b .LBB4_3 ; CHECK32_32-NEXT: .LBB4_2: -; CHECK32_32-NEXT: addi 7, 4, 0 +; CHECK32_32-NEXT: addi 7, 3, 0 +; CHECK32_32-NEXT: addi 3, 4, 0 ; CHECK32_32-NEXT: .LBB4_3: ; CHECK32_32-NEXT: srw 4, 7, 6 ; CHECK32_32-NEXT: slw 8, 3, 5 @@ -108,23 +109,24 @@ ; ; CHECK32_64-LABEL: rotl_i64: ; CHECK32_64: # %bb.0: -; CHECK32_64-NEXT: andi. 5, 6, 32 -; CHECK32_64-NEXT: clrlwi 5, 6, 27 -; CHECK32_64-NEXT: bc 12, 2, .LBB4_2 +; CHECK32_64-NEXT: srwi 5, 6, 5 +; CHECK32_64-NEXT: clrlwi 6, 6, 27 +; CHECK32_64-NEXT: andi. 5, 5, 1 +; CHECK32_64-NEXT: subfic 5, 6, 32 +; CHECK32_64-NEXT: bc 12, 1, .LBB4_2 ; CHECK32_64-NEXT: # %bb.1: -; CHECK32_64-NEXT: ori 7, 3, 0 -; CHECK32_64-NEXT: ori 3, 4, 0 +; CHECK32_64-NEXT: ori 7, 4, 0 ; CHECK32_64-NEXT: b .LBB4_3 ; CHECK32_64-NEXT: .LBB4_2: -; CHECK32_64-NEXT: addi 7, 4, 0 +; CHECK32_64-NEXT: addi 7, 3, 0 +; CHECK32_64-NEXT: addi 3, 4, 0 ; CHECK32_64-NEXT: .LBB4_3: -; CHECK32_64-NEXT: subfic 6, 5, 32 -; CHECK32_64-NEXT: srw 4, 7, 6 -; CHECK32_64-NEXT: slw 8, 3, 5 -; CHECK32_64-NEXT: srw 6, 3, 6 -; CHECK32_64-NEXT: slw 5, 7, 5 +; CHECK32_64-NEXT: srw 4, 7, 5 +; CHECK32_64-NEXT: slw 8, 3, 6 +; CHECK32_64-NEXT: srw 5, 3, 5 +; CHECK32_64-NEXT: slw 6, 7, 6 ; CHECK32_64-NEXT: or 3, 8, 4 -; CHECK32_64-NEXT: or 4, 5, 6 +; CHECK32_64-NEXT: or 4, 6, 5 ; CHECK32_64-NEXT: blr ; ; CHECK64-LABEL: rotl_i64: diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift.ll b/llvm/test/CodeGen/PowerPC/funnel-shift.ll --- a/llvm/test/CodeGen/PowerPC/funnel-shift.ll +++ b/llvm/test/CodeGen/PowerPC/funnel-shift.ll @@ -43,18 +43,19 @@ define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) { ; CHECK32_32-LABEL: fshl_i64: ; CHECK32_32: # %bb.0: -; CHECK32_32-NEXT: andi. 7, 8, 32 +; CHECK32_32-NEXT: srwi 7, 8, 5 +; CHECK32_32-NEXT: andi. 7, 7, 1 ; CHECK32_32-NEXT: clrlwi 7, 8, 27 ; CHECK32_32-NEXT: subfic 8, 7, 32 -; CHECK32_32-NEXT: bc 12, 2, .LBB1_2 +; CHECK32_32-NEXT: bc 12, 1, .LBB1_2 ; CHECK32_32-NEXT: # %bb.1: -; CHECK32_32-NEXT: ori 9, 5, 0 -; CHECK32_32-NEXT: ori 3, 4, 0 -; CHECK32_32-NEXT: ori 4, 6, 0 +; CHECK32_32-NEXT: ori 9, 4, 0 +; CHECK32_32-NEXT: ori 4, 5, 0 ; CHECK32_32-NEXT: b .LBB1_3 ; CHECK32_32-NEXT: .LBB1_2: -; CHECK32_32-NEXT: addi 9, 4, 0 -; CHECK32_32-NEXT: addi 4, 5, 0 +; CHECK32_32-NEXT: addi 9, 5, 0 +; CHECK32_32-NEXT: addi 3, 4, 0 +; CHECK32_32-NEXT: addi 4, 6, 0 ; CHECK32_32-NEXT: .LBB1_3: ; CHECK32_32-NEXT: srw 5, 9, 8 ; CHECK32_32-NEXT: slw 3, 3, 7 @@ -66,22 +67,23 @@ ; ; CHECK32_64-LABEL: fshl_i64: ; CHECK32_64: # %bb.0: -; CHECK32_64-NEXT: andi. 7, 8, 32 -; CHECK32_64-NEXT: clrlwi 7, 8, 27 -; CHECK32_64-NEXT: bc 12, 2, .LBB1_2 +; CHECK32_64-NEXT: srwi 7, 8, 5 +; CHECK32_64-NEXT: clrlwi 8, 8, 27 +; CHECK32_64-NEXT: andi. 
7, 7, 1 +; CHECK32_64-NEXT: subfic 7, 8, 32 +; CHECK32_64-NEXT: bc 12, 1, .LBB1_2 ; CHECK32_64-NEXT: # %bb.1: -; CHECK32_64-NEXT: ori 9, 5, 0 -; CHECK32_64-NEXT: ori 3, 4, 0 -; CHECK32_64-NEXT: ori 5, 6, 0 +; CHECK32_64-NEXT: ori 9, 4, 0 ; CHECK32_64-NEXT: b .LBB1_3 ; CHECK32_64-NEXT: .LBB1_2: -; CHECK32_64-NEXT: addi 9, 4, 0 +; CHECK32_64-NEXT: addi 9, 5, 0 +; CHECK32_64-NEXT: addi 3, 4, 0 +; CHECK32_64-NEXT: addi 5, 6, 0 ; CHECK32_64-NEXT: .LBB1_3: -; CHECK32_64-NEXT: subfic 8, 7, 32 -; CHECK32_64-NEXT: srw 4, 9, 8 -; CHECK32_64-NEXT: slw 3, 3, 7 -; CHECK32_64-NEXT: srw 5, 5, 8 -; CHECK32_64-NEXT: slw 6, 9, 7 +; CHECK32_64-NEXT: srw 4, 9, 7 +; CHECK32_64-NEXT: slw 3, 3, 8 +; CHECK32_64-NEXT: srw 5, 5, 7 +; CHECK32_64-NEXT: slw 6, 9, 8 ; CHECK32_64-NEXT: or 3, 3, 4 ; CHECK32_64-NEXT: or 4, 6, 5 ; CHECK32_64-NEXT: blr @@ -102,37 +104,39 @@ ; CHECK32_32-LABEL: fshl_i128: ; CHECK32_32: # %bb.0: ; CHECK32_32-NEXT: lwz 11, 20(1) -; CHECK32_32-NEXT: andi. 12, 11, 64 -; CHECK32_32-NEXT: mcrf 1, 0 -; CHECK32_32-NEXT: andi. 12, 11, 32 +; CHECK32_32-NEXT: srwi 12, 11, 6 +; CHECK32_32-NEXT: andi. 12, 12, 1 +; CHECK32_32-NEXT: srwi 12, 11, 5 +; CHECK32_32-NEXT: crmove 20, 1 +; CHECK32_32-NEXT: andi. 12, 12, 1 ; CHECK32_32-NEXT: clrlwi 11, 11, 27 -; CHECK32_32-NEXT: bc 12, 6, .LBB2_2 +; CHECK32_32-NEXT: bc 12, 20, .LBB2_2 ; CHECK32_32-NEXT: # %bb.1: -; CHECK32_32-NEXT: ori 4, 6, 0 -; CHECK32_32-NEXT: ori 12, 7, 0 -; CHECK32_32-NEXT: ori 3, 5, 0 -; CHECK32_32-NEXT: ori 5, 8, 0 -; CHECK32_32-NEXT: ori 6, 9, 0 -; CHECK32_32-NEXT: ori 7, 10, 0 +; CHECK32_32-NEXT: ori 12, 5, 0 +; CHECK32_32-NEXT: ori 5, 6, 0 +; CHECK32_32-NEXT: ori 6, 7, 0 +; CHECK32_32-NEXT: ori 7, 8, 0 ; CHECK32_32-NEXT: b .LBB2_3 ; CHECK32_32-NEXT: .LBB2_2: -; CHECK32_32-NEXT: addi 12, 5, 0 -; CHECK32_32-NEXT: addi 5, 6, 0 -; CHECK32_32-NEXT: addi 6, 7, 0 -; CHECK32_32-NEXT: addi 7, 8, 0 +; CHECK32_32-NEXT: addi 4, 6, 0 +; CHECK32_32-NEXT: addi 12, 7, 0 +; CHECK32_32-NEXT: addi 3, 5, 0 +; CHECK32_32-NEXT: addi 5, 8, 0 +; CHECK32_32-NEXT: addi 6, 9, 0 +; CHECK32_32-NEXT: addi 7, 10, 0 ; CHECK32_32-NEXT: .LBB2_3: ; CHECK32_32-NEXT: subfic 8, 11, 32 -; CHECK32_32-NEXT: bc 12, 2, .LBB2_5 +; CHECK32_32-NEXT: bc 12, 1, .LBB2_5 ; CHECK32_32-NEXT: # %bb.4: -; CHECK32_32-NEXT: ori 9, 12, 0 -; CHECK32_32-NEXT: ori 3, 4, 0 -; CHECK32_32-NEXT: ori 4, 5, 0 -; CHECK32_32-NEXT: ori 5, 6, 0 -; CHECK32_32-NEXT: ori 6, 7, 0 +; CHECK32_32-NEXT: ori 9, 4, 0 +; CHECK32_32-NEXT: ori 4, 12, 0 ; CHECK32_32-NEXT: b .LBB2_6 ; CHECK32_32-NEXT: .LBB2_5: -; CHECK32_32-NEXT: addi 9, 4, 0 -; CHECK32_32-NEXT: addi 4, 12, 0 +; CHECK32_32-NEXT: addi 9, 12, 0 +; CHECK32_32-NEXT: addi 3, 4, 0 +; CHECK32_32-NEXT: addi 4, 5, 0 +; CHECK32_32-NEXT: addi 5, 6, 0 +; CHECK32_32-NEXT: addi 6, 7, 0 ; CHECK32_32-NEXT: .LBB2_6: ; CHECK32_32-NEXT: srw 7, 9, 8 ; CHECK32_32-NEXT: slw 3, 3, 11 @@ -152,56 +156,58 @@ ; CHECK32_64: # %bb.0: ; CHECK32_64-NEXT: stwu 1, -16(1) ; CHECK32_64-NEXT: lwz 11, 36(1) -; CHECK32_64-NEXT: andi. 12, 11, 64 +; CHECK32_64-NEXT: srwi 12, 11, 6 +; CHECK32_64-NEXT: srwi 0, 11, 5 ; CHECK32_64-NEXT: stw 30, 8(1) # 4-byte Folded Spill -; CHECK32_64-NEXT: mcrf 1, 0 -; CHECK32_64-NEXT: clrlwi 12, 11, 27 -; CHECK32_64-NEXT: andi. 11, 11, 32 -; CHECK32_64-NEXT: bc 12, 6, .LBB2_2 +; CHECK32_64-NEXT: andi. 12, 12, 1 +; CHECK32_64-NEXT: clrlwi 11, 11, 27 +; CHECK32_64-NEXT: crmove 20, 1 +; CHECK32_64-NEXT: andi. 
12, 0, 1 +; CHECK32_64-NEXT: bc 12, 20, .LBB2_2 ; CHECK32_64-NEXT: # %bb.1: -; CHECK32_64-NEXT: ori 4, 6, 0 -; CHECK32_64-NEXT: ori 30, 7, 0 -; CHECK32_64-NEXT: ori 3, 5, 0 -; CHECK32_64-NEXT: ori 7, 9, 0 +; CHECK32_64-NEXT: ori 30, 5, 0 ; CHECK32_64-NEXT: b .LBB2_3 ; CHECK32_64-NEXT: .LBB2_2: -; CHECK32_64-NEXT: addi 30, 5, 0 +; CHECK32_64-NEXT: addi 4, 6, 0 +; CHECK32_64-NEXT: addi 30, 7, 0 +; CHECK32_64-NEXT: addi 3, 5, 0 +; CHECK32_64-NEXT: addi 7, 9, 0 ; CHECK32_64-NEXT: .LBB2_3: -; CHECK32_64-NEXT: bc 12, 2, .LBB2_5 +; CHECK32_64-NEXT: bc 12, 1, .LBB2_5 ; CHECK32_64-NEXT: # %bb.4: -; CHECK32_64-NEXT: ori 5, 30, 0 -; CHECK32_64-NEXT: ori 3, 4, 0 +; CHECK32_64-NEXT: ori 5, 4, 0 ; CHECK32_64-NEXT: b .LBB2_6 ; CHECK32_64-NEXT: .LBB2_5: -; CHECK32_64-NEXT: addi 5, 4, 0 +; CHECK32_64-NEXT: addi 5, 30, 0 +; CHECK32_64-NEXT: addi 3, 4, 0 ; CHECK32_64-NEXT: .LBB2_6: -; CHECK32_64-NEXT: bc 12, 6, .LBB2_8 +; CHECK32_64-NEXT: bc 12, 20, .LBB2_8 ; CHECK32_64-NEXT: # %bb.7: -; CHECK32_64-NEXT: ori 4, 8, 0 -; CHECK32_64-NEXT: ori 8, 10, 0 +; CHECK32_64-NEXT: ori 4, 6, 0 ; CHECK32_64-NEXT: b .LBB2_9 ; CHECK32_64-NEXT: .LBB2_8: -; CHECK32_64-NEXT: addi 4, 6, 0 +; CHECK32_64-NEXT: addi 4, 8, 0 +; CHECK32_64-NEXT: addi 8, 10, 0 ; CHECK32_64-NEXT: .LBB2_9: -; CHECK32_64-NEXT: subfic 11, 12, 32 -; CHECK32_64-NEXT: bc 12, 2, .LBB2_11 +; CHECK32_64-NEXT: subfic 12, 11, 32 +; CHECK32_64-NEXT: bc 12, 1, .LBB2_11 ; CHECK32_64-NEXT: # %bb.10: -; CHECK32_64-NEXT: ori 0, 4, 0 -; CHECK32_64-NEXT: ori 4, 7, 0 -; CHECK32_64-NEXT: ori 7, 8, 0 +; CHECK32_64-NEXT: ori 0, 30, 0 ; CHECK32_64-NEXT: b .LBB2_12 ; CHECK32_64-NEXT: .LBB2_11: -; CHECK32_64-NEXT: addi 0, 30, 0 +; CHECK32_64-NEXT: addi 0, 4, 0 +; CHECK32_64-NEXT: addi 4, 7, 0 +; CHECK32_64-NEXT: addi 7, 8, 0 ; CHECK32_64-NEXT: .LBB2_12: -; CHECK32_64-NEXT: srw 6, 5, 11 +; CHECK32_64-NEXT: srw 6, 5, 12 ; CHECK32_64-NEXT: lwz 30, 8(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: slw 3, 3, 12 -; CHECK32_64-NEXT: srw 9, 0, 11 -; CHECK32_64-NEXT: slw 5, 5, 12 -; CHECK32_64-NEXT: srw 10, 4, 11 -; CHECK32_64-NEXT: slw 0, 0, 12 -; CHECK32_64-NEXT: srw 7, 7, 11 -; CHECK32_64-NEXT: slw 8, 4, 12 +; CHECK32_64-NEXT: slw 3, 3, 11 +; CHECK32_64-NEXT: srw 9, 0, 12 +; CHECK32_64-NEXT: slw 5, 5, 11 +; CHECK32_64-NEXT: srw 10, 4, 12 +; CHECK32_64-NEXT: slw 0, 0, 11 +; CHECK32_64-NEXT: srw 7, 7, 12 +; CHECK32_64-NEXT: slw 8, 4, 11 ; CHECK32_64-NEXT: or 3, 3, 6 ; CHECK32_64-NEXT: or 4, 5, 9 ; CHECK32_64-NEXT: or 5, 0, 10 @@ -211,12 +217,13 @@ ; ; CHECK64-LABEL: fshl_i128: ; CHECK64: # %bb.0: -; CHECK64-NEXT: andi. 8, 7, 64 +; CHECK64-NEXT: rldicl 8, 7, 58, 6 ; CHECK64-NEXT: clrlwi 7, 7, 26 -; CHECK64-NEXT: iseleq 5, 6, 5 +; CHECK64-NEXT: andi. 8, 8, 1 +; CHECK64-NEXT: iselgt 5, 5, 6 ; CHECK64-NEXT: subfic 8, 7, 64 -; CHECK64-NEXT: iseleq 6, 3, 6 -; CHECK64-NEXT: iseleq 3, 4, 3 +; CHECK64-NEXT: iselgt 6, 6, 3 +; CHECK64-NEXT: iselgt 3, 3, 4 ; CHECK64-NEXT: srd 4, 5, 8 ; CHECK64-NEXT: sld 5, 6, 7 ; CHECK64-NEXT: srd 6, 6, 8 @@ -256,20 +263,21 @@ ; CHECK32_32-NEXT: li 6, 37 ; CHECK32_32-NEXT: bl __umoddi3 ; CHECK32_32-NEXT: rotlwi 3, 30, 27 +; CHECK32_32-NEXT: srwi 6, 4, 5 ; CHECK32_32-NEXT: slwi 5, 30, 27 -; CHECK32_32-NEXT: andi. 6, 4, 32 ; CHECK32_32-NEXT: rlwimi 3, 29, 27, 0, 4 +; CHECK32_32-NEXT: andi. 
6, 6, 1 ; CHECK32_32-NEXT: clrlwi 4, 4, 27 ; CHECK32_32-NEXT: subfic 6, 4, 32 -; CHECK32_32-NEXT: bc 12, 2, .LBB3_2 +; CHECK32_32-NEXT: bc 12, 1, .LBB3_2 ; CHECK32_32-NEXT: # %bb.1: -; CHECK32_32-NEXT: ori 7, 3, 0 -; CHECK32_32-NEXT: ori 8, 28, 0 -; CHECK32_32-NEXT: ori 3, 5, 0 +; CHECK32_32-NEXT: ori 7, 28, 0 +; CHECK32_32-NEXT: ori 8, 27, 0 ; CHECK32_32-NEXT: b .LBB3_3 ; CHECK32_32-NEXT: .LBB3_2: -; CHECK32_32-NEXT: addi 7, 28, 0 -; CHECK32_32-NEXT: addi 8, 27, 0 +; CHECK32_32-NEXT: addi 7, 3, 0 +; CHECK32_32-NEXT: addi 8, 28, 0 +; CHECK32_32-NEXT: addi 3, 5, 0 ; CHECK32_32-NEXT: .LBB3_3: ; CHECK32_32-NEXT: lwz 30, 24(1) # 4-byte Folded Reload ; CHECK32_32-NEXT: srw 5, 7, 6 @@ -310,40 +318,36 @@ ; CHECK32_64-NEXT: mr 30, 6 ; CHECK32_64-NEXT: li 6, 37 ; CHECK32_64-NEXT: bl __umoddi3 +; CHECK32_64-NEXT: srwi 5, 4, 5 ; CHECK32_64-NEXT: rotlwi 3, 30, 27 -; CHECK32_64-NEXT: andi. 5, 4, 32 -; CHECK32_64-NEXT: bc 12, 2, .LBB3_2 +; CHECK32_64-NEXT: andi. 5, 5, 1 +; CHECK32_64-NEXT: rlwimi 3, 29, 27, 0, 4 +; CHECK32_64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK32_64-NEXT: bc 12, 1, .LBB3_2 ; CHECK32_64-NEXT: # %bb.1: -; CHECK32_64-NEXT: ori 8, 28, 0 +; CHECK32_64-NEXT: ori 7, 28, 0 +; CHECK32_64-NEXT: ori 8, 27, 0 ; CHECK32_64-NEXT: b .LBB3_3 ; CHECK32_64-NEXT: .LBB3_2: -; CHECK32_64-NEXT: addi 8, 27, 0 +; CHECK32_64-NEXT: addi 7, 3, 0 +; CHECK32_64-NEXT: addi 8, 28, 0 ; CHECK32_64-NEXT: .LBB3_3: -; CHECK32_64-NEXT: lwz 27, 12(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: rlwimi 3, 29, 27, 0, 4 ; CHECK32_64-NEXT: clrlwi 4, 4, 27 -; CHECK32_64-NEXT: bc 12, 2, .LBB3_5 -; CHECK32_64-NEXT: # %bb.4: -; CHECK32_64-NEXT: ori 7, 3, 0 -; CHECK32_64-NEXT: b .LBB3_6 -; CHECK32_64-NEXT: .LBB3_5: -; CHECK32_64-NEXT: addi 7, 28, 0 -; CHECK32_64-NEXT: .LBB3_6: +; CHECK32_64-NEXT: lwz 28, 16(1) # 4-byte Folded Reload ; CHECK32_64-NEXT: slwi 5, 30, 27 -; CHECK32_64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload -; CHECK32_64-NEXT: bc 12, 2, .LBB3_8 -; CHECK32_64-NEXT: # %bb.7: -; CHECK32_64-NEXT: ori 3, 5, 0 -; CHECK32_64-NEXT: b .LBB3_8 -; CHECK32_64-NEXT: .LBB3_8: ; CHECK32_64-NEXT: subfic 6, 4, 32 -; CHECK32_64-NEXT: slw 8, 8, 4 -; CHECK32_64-NEXT: lwz 29, 20(1) # 4-byte Folded Reload +; CHECK32_64-NEXT: bc 12, 1, .LBB3_4 +; CHECK32_64-NEXT: b .LBB3_5 +; CHECK32_64-NEXT: .LBB3_4: +; CHECK32_64-NEXT: addi 3, 5, 0 +; CHECK32_64-NEXT: .LBB3_5: ; CHECK32_64-NEXT: srw 9, 7, 6 +; CHECK32_64-NEXT: slw 8, 8, 4 +; CHECK32_64-NEXT: lwz 30, 24(1) # 4-byte Folded Reload ; CHECK32_64-NEXT: srw 5, 3, 6 ; CHECK32_64-NEXT: slw 4, 7, 4 +; CHECK32_64-NEXT: lwz 27, 12(1) # 4-byte Folded Reload ; CHECK32_64-NEXT: or 3, 8, 9 -; CHECK32_64-NEXT: lwz 28, 16(1) # 4-byte Folded Reload ; CHECK32_64-NEXT: or 4, 4, 5 ; CHECK32_64-NEXT: lwz 0, 36(1) ; CHECK32_64-NEXT: addi 1, 1, 32 diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll --- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll @@ -478,58 +478,204 @@ ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, g@toc@ha ; LE-PWR9-NEXT: sldi r3, r3, 5 -; LE-PWR9-NEXT: sldi r4, r4, 5 +; LE-PWR9-NEXT: std r25, -56(r1) # 8-byte Folded Spill ; LE-PWR9-NEXT: addi r5, r5, g@toc@l -; LE-PWR9-NEXT: add r6, r5, r3 -; LE-PWR9-NEXT: lxvx vs1, r5, r3 -; LE-PWR9-NEXT: lxv vs0, 16(r6) -; LE-PWR9-NEXT: add r6, r5, r4 -; LE-PWR9-NEXT: stxvx vs1, r5, r4 -; LE-PWR9-NEXT: stxv vs0, 16(r6) +; LE-PWR9-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; LE-PWR9-NEXT: std r27, -40(r1) # 8-byte Folded Spill +; 
LE-PWR9-NEXT: std r28, -32(r1) # 8-byte Folded Spill +; LE-PWR9-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; LE-PWR9-NEXT: mr r6, r5 +; LE-PWR9-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; LE-PWR9-NEXT: sldi r4, r4, 5 +; LE-PWR9-NEXT: lwzux r3, r6, r3 +; LE-PWR9-NEXT: lhz r30, 18(r6) +; LE-PWR9-NEXT: lhz r29, 20(r6) +; LE-PWR9-NEXT: lhz r28, 22(r6) +; LE-PWR9-NEXT: lhz r27, 24(r6) +; LE-PWR9-NEXT: lhz r26, 26(r6) +; LE-PWR9-NEXT: lhz r25, 28(r6) +; LE-PWR9-NEXT: lhz r7, 4(r6) +; LE-PWR9-NEXT: lhz r8, 6(r6) +; LE-PWR9-NEXT: lhz r9, 8(r6) +; LE-PWR9-NEXT: lhz r10, 10(r6) +; LE-PWR9-NEXT: lhz r11, 12(r6) +; LE-PWR9-NEXT: lhz r12, 14(r6) +; LE-PWR9-NEXT: lhz r0, 16(r6) +; LE-PWR9-NEXT: lhz r6, 30(r6) +; LE-PWR9-NEXT: stwux r3, r4, r5 +; LE-PWR9-NEXT: sth r25, 28(r4) +; LE-PWR9-NEXT: sth r26, 26(r4) +; LE-PWR9-NEXT: sth r27, 24(r4) +; LE-PWR9-NEXT: sth r28, 22(r4) +; LE-PWR9-NEXT: sth r29, 20(r4) +; LE-PWR9-NEXT: sth r30, 18(r4) +; LE-PWR9-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; LE-PWR9-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; LE-PWR9-NEXT: ld r28, -32(r1) # 8-byte Folded Reload +; LE-PWR9-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; LE-PWR9-NEXT: ld r26, -48(r1) # 8-byte Folded Reload +; LE-PWR9-NEXT: ld r25, -56(r1) # 8-byte Folded Reload +; LE-PWR9-NEXT: sth r6, 30(r4) +; LE-PWR9-NEXT: sth r0, 16(r4) +; LE-PWR9-NEXT: sth r12, 14(r4) +; LE-PWR9-NEXT: sth r11, 12(r4) +; LE-PWR9-NEXT: sth r10, 10(r4) +; LE-PWR9-NEXT: sth r9, 8(r4) +; LE-PWR9-NEXT: sth r8, 6(r4) +; LE-PWR9-NEXT: sth r7, 4(r4) ; LE-PWR9-NEXT: blr ; ; LE-PWR8-LABEL: testXLdStPair: ; LE-PWR8: # %bb.0: # %entry ; LE-PWR8-NEXT: addis r5, r2, g@toc@ha ; LE-PWR8-NEXT: sldi r3, r3, 5 -; LE-PWR8-NEXT: li r7, 16 +; LE-PWR8-NEXT: sldi r4, r4, 5 +; LE-PWR8-NEXT: std r25, -56(r1) # 8-byte Folded Spill +; LE-PWR8-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; LE-PWR8-NEXT: std r27, -40(r1) # 8-byte Folded Spill +; LE-PWR8-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; LE-PWR8-NEXT: addi r5, r5, g@toc@l -; LE-PWR8-NEXT: add r6, r5, r3 -; LE-PWR8-NEXT: lxvd2x vs1, r5, r3 -; LE-PWR8-NEXT: sldi r3, r4, 5 -; LE-PWR8-NEXT: lxvd2x vs0, r6, r7 -; LE-PWR8-NEXT: add r4, r5, r3 -; LE-PWR8-NEXT: stxvd2x vs1, r5, r3 -; LE-PWR8-NEXT: stxvd2x vs0, r4, r7 +; LE-PWR8-NEXT: mr r6, r5 +; LE-PWR8-NEXT: lwzux r3, r6, r3 +; LE-PWR8-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; LE-PWR8-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; LE-PWR8-NEXT: lhz r7, 4(r6) +; LE-PWR8-NEXT: lhz r8, 6(r6) +; LE-PWR8-NEXT: lhz r9, 8(r6) +; LE-PWR8-NEXT: lhz r10, 10(r6) +; LE-PWR8-NEXT: lhz r11, 12(r6) +; LE-PWR8-NEXT: lhz r12, 14(r6) +; LE-PWR8-NEXT: lhz r0, 16(r6) +; LE-PWR8-NEXT: lhz r30, 18(r6) +; LE-PWR8-NEXT: lhz r29, 20(r6) +; LE-PWR8-NEXT: lhz r28, 22(r6) +; LE-PWR8-NEXT: lhz r27, 24(r6) +; LE-PWR8-NEXT: lhz r26, 26(r6) +; LE-PWR8-NEXT: lhz r25, 28(r6) +; LE-PWR8-NEXT: lhz r6, 30(r6) +; LE-PWR8-NEXT: stwux r3, r4, r5 +; LE-PWR8-NEXT: sth r25, 28(r4) +; LE-PWR8-NEXT: sth r26, 26(r4) +; LE-PWR8-NEXT: ld r26, -48(r1) # 8-byte Folded Reload +; LE-PWR8-NEXT: ld r25, -56(r1) # 8-byte Folded Reload +; LE-PWR8-NEXT: sth r27, 24(r4) +; LE-PWR8-NEXT: sth r28, 22(r4) +; LE-PWR8-NEXT: sth r29, 20(r4) +; LE-PWR8-NEXT: sth r30, 18(r4) +; LE-PWR8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; LE-PWR8-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; LE-PWR8-NEXT: ld r28, -32(r1) # 8-byte Folded Reload +; LE-PWR8-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; LE-PWR8-NEXT: sth r6, 30(r4) +; LE-PWR8-NEXT: sth r0, 16(r4) +; LE-PWR8-NEXT: sth r12, 14(r4) +; 
LE-PWR8-NEXT: sth r11, 12(r4) +; LE-PWR8-NEXT: sth r10, 10(r4) +; LE-PWR8-NEXT: sth r9, 8(r4) +; LE-PWR8-NEXT: sth r8, 6(r4) +; LE-PWR8-NEXT: sth r7, 4(r4) ; LE-PWR8-NEXT: blr ; ; BE-PWR9-LABEL: testXLdStPair: ; BE-PWR9: # %bb.0: # %entry ; BE-PWR9-NEXT: addis r5, r2, g@toc@ha ; BE-PWR9-NEXT: sldi r3, r3, 5 -; BE-PWR9-NEXT: sldi r4, r4, 5 +; BE-PWR9-NEXT: std r25, -56(r1) # 8-byte Folded Spill ; BE-PWR9-NEXT: addi r5, r5, g@toc@l -; BE-PWR9-NEXT: add r6, r5, r3 -; BE-PWR9-NEXT: lxvx vs1, r5, r3 -; BE-PWR9-NEXT: lxv vs0, 16(r6) -; BE-PWR9-NEXT: add r6, r5, r4 -; BE-PWR9-NEXT: stxvx vs1, r5, r4 -; BE-PWR9-NEXT: stxv vs0, 16(r6) +; BE-PWR9-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; BE-PWR9-NEXT: std r27, -40(r1) # 8-byte Folded Spill +; BE-PWR9-NEXT: std r28, -32(r1) # 8-byte Folded Spill +; BE-PWR9-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; BE-PWR9-NEXT: mr r6, r5 +; BE-PWR9-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; BE-PWR9-NEXT: sldi r4, r4, 5 +; BE-PWR9-NEXT: lwzux r3, r6, r3 +; BE-PWR9-NEXT: lhz r30, 18(r6) +; BE-PWR9-NEXT: lhz r29, 20(r6) +; BE-PWR9-NEXT: lhz r28, 22(r6) +; BE-PWR9-NEXT: lhz r27, 24(r6) +; BE-PWR9-NEXT: lhz r26, 26(r6) +; BE-PWR9-NEXT: lhz r25, 28(r6) +; BE-PWR9-NEXT: lhz r7, 4(r6) +; BE-PWR9-NEXT: lhz r8, 6(r6) +; BE-PWR9-NEXT: lhz r9, 8(r6) +; BE-PWR9-NEXT: lhz r10, 10(r6) +; BE-PWR9-NEXT: lhz r11, 12(r6) +; BE-PWR9-NEXT: lhz r12, 14(r6) +; BE-PWR9-NEXT: lhz r0, 16(r6) +; BE-PWR9-NEXT: lhz r6, 30(r6) +; BE-PWR9-NEXT: stwux r3, r4, r5 +; BE-PWR9-NEXT: sth r25, 28(r4) +; BE-PWR9-NEXT: sth r26, 26(r4) +; BE-PWR9-NEXT: sth r27, 24(r4) +; BE-PWR9-NEXT: sth r28, 22(r4) +; BE-PWR9-NEXT: sth r29, 20(r4) +; BE-PWR9-NEXT: sth r30, 18(r4) +; BE-PWR9-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; BE-PWR9-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; BE-PWR9-NEXT: ld r28, -32(r1) # 8-byte Folded Reload +; BE-PWR9-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; BE-PWR9-NEXT: ld r26, -48(r1) # 8-byte Folded Reload +; BE-PWR9-NEXT: ld r25, -56(r1) # 8-byte Folded Reload +; BE-PWR9-NEXT: sth r6, 30(r4) +; BE-PWR9-NEXT: sth r0, 16(r4) +; BE-PWR9-NEXT: sth r12, 14(r4) +; BE-PWR9-NEXT: sth r11, 12(r4) +; BE-PWR9-NEXT: sth r10, 10(r4) +; BE-PWR9-NEXT: sth r9, 8(r4) +; BE-PWR9-NEXT: sth r8, 6(r4) +; BE-PWR9-NEXT: sth r7, 4(r4) ; BE-PWR9-NEXT: blr ; ; BE-PWR8-LABEL: testXLdStPair: ; BE-PWR8: # %bb.0: # %entry ; BE-PWR8-NEXT: addis r5, r2, g@toc@ha ; BE-PWR8-NEXT: sldi r3, r3, 5 -; BE-PWR8-NEXT: li r7, 16 +; BE-PWR8-NEXT: sldi r4, r4, 5 +; BE-PWR8-NEXT: std r25, -56(r1) # 8-byte Folded Spill +; BE-PWR8-NEXT: std r26, -48(r1) # 8-byte Folded Spill +; BE-PWR8-NEXT: std r27, -40(r1) # 8-byte Folded Spill +; BE-PWR8-NEXT: std r28, -32(r1) # 8-byte Folded Spill ; BE-PWR8-NEXT: addi r5, r5, g@toc@l -; BE-PWR8-NEXT: add r6, r5, r3 -; BE-PWR8-NEXT: lxvd2x vs0, r5, r3 -; BE-PWR8-NEXT: sldi r3, r4, 5 -; BE-PWR8-NEXT: lxvd2x vs1, r6, r7 -; BE-PWR8-NEXT: add r4, r5, r3 -; BE-PWR8-NEXT: stxvd2x vs0, r5, r3 -; BE-PWR8-NEXT: stxvd2x vs1, r4, r7 +; BE-PWR8-NEXT: mr r6, r5 +; BE-PWR8-NEXT: lwzux r3, r6, r3 +; BE-PWR8-NEXT: std r29, -24(r1) # 8-byte Folded Spill +; BE-PWR8-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; BE-PWR8-NEXT: lhz r7, 4(r6) +; BE-PWR8-NEXT: lhz r8, 6(r6) +; BE-PWR8-NEXT: lhz r9, 8(r6) +; BE-PWR8-NEXT: lhz r10, 10(r6) +; BE-PWR8-NEXT: lhz r11, 12(r6) +; BE-PWR8-NEXT: lhz r12, 14(r6) +; BE-PWR8-NEXT: lhz r0, 16(r6) +; BE-PWR8-NEXT: lhz r30, 18(r6) +; BE-PWR8-NEXT: lhz r29, 20(r6) +; BE-PWR8-NEXT: lhz r28, 22(r6) +; BE-PWR8-NEXT: lhz r27, 24(r6) +; BE-PWR8-NEXT: lhz r26, 
26(r6) +; BE-PWR8-NEXT: lhz r25, 28(r6) +; BE-PWR8-NEXT: lhz r6, 30(r6) +; BE-PWR8-NEXT: stwux r3, r4, r5 +; BE-PWR8-NEXT: sth r25, 28(r4) +; BE-PWR8-NEXT: sth r26, 26(r4) +; BE-PWR8-NEXT: ld r26, -48(r1) # 8-byte Folded Reload +; BE-PWR8-NEXT: ld r25, -56(r1) # 8-byte Folded Reload +; BE-PWR8-NEXT: sth r27, 24(r4) +; BE-PWR8-NEXT: sth r28, 22(r4) +; BE-PWR8-NEXT: sth r29, 20(r4) +; BE-PWR8-NEXT: sth r30, 18(r4) +; BE-PWR8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; BE-PWR8-NEXT: ld r29, -24(r1) # 8-byte Folded Reload +; BE-PWR8-NEXT: ld r28, -32(r1) # 8-byte Folded Reload +; BE-PWR8-NEXT: ld r27, -40(r1) # 8-byte Folded Reload +; BE-PWR8-NEXT: sth r6, 30(r4) +; BE-PWR8-NEXT: sth r0, 16(r4) +; BE-PWR8-NEXT: sth r12, 14(r4) +; BE-PWR8-NEXT: sth r11, 12(r4) +; BE-PWR8-NEXT: sth r10, 10(r4) +; BE-PWR8-NEXT: sth r9, 8(r4) +; BE-PWR8-NEXT: sth r8, 6(r4) +; BE-PWR8-NEXT: sth r7, 4(r4) ; BE-PWR8-NEXT: blr entry: %arrayidx = getelementptr inbounds <256 x i1>, ptr @g, i64 %SrcIdx diff --git a/llvm/test/CodeGen/PowerPC/mul-high.ll b/llvm/test/CodeGen/PowerPC/mul-high.ll --- a/llvm/test/CodeGen/PowerPC/mul-high.ll +++ b/llvm/test/CodeGen/PowerPC/mul-high.ll @@ -17,8 +17,10 @@ define i32 @test_mulhw(i32 %a, i32 %b) { ; CHECK-LABEL: test_mulhw: ; CHECK: # %bb.0: -; CHECK-NEXT: mulhw r3, r3, r4 -; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: extsw r3, r3 +; CHECK-NEXT: extsw r4, r4 +; CHECK-NEXT: mulld r3, r3, r4 +; CHECK-NEXT: rldicl r3, r3, 32, 32 ; CHECK-NEXT: blr %1 = sext i32 %a to i64 %2 = sext i32 %b to i64 @@ -31,8 +33,10 @@ define i32 @test_mulhu(i32 %a, i32 %b) { ; CHECK-LABEL: test_mulhu: ; CHECK: # %bb.0: -; CHECK-NEXT: mulhwu r3, r3, r4 ; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: mulld r3, r3, r4 +; CHECK-NEXT: rldicl r3, r3, 32, 32 ; CHECK-NEXT: blr %1 = zext i32 %a to i64 %2 = zext i32 %b to i64 @@ -73,8 +77,10 @@ define signext i32 @test_mulhw_signext(i32 %a, i32 %b) { ; CHECK-LABEL: test_mulhw_signext: ; CHECK: # %bb.0: -; CHECK-NEXT: mulhw r3, r3, r4 ; CHECK-NEXT: extsw r3, r3 +; CHECK-NEXT: extsw r4, r4 +; CHECK-NEXT: mulld r3, r3, r4 +; CHECK-NEXT: sradi r3, r3, 32 ; CHECK-NEXT: blr %1 = sext i32 %a to i64 %2 = sext i32 %b to i64 @@ -87,8 +93,10 @@ define zeroext i32 @test_mulhu_zeroext(i32 %a, i32 %b) { ; CHECK-LABEL: test_mulhu_zeroext: ; CHECK: # %bb.0: -; CHECK-NEXT: mulhwu r3, r3, r4 ; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: clrldi r4, r4, 32 +; CHECK-NEXT: mulld r3, r3, r4 +; CHECK-NEXT: rldicl r3, r3, 32, 32 ; CHECK-NEXT: blr %1 = zext i32 %a to i64 %2 = zext i32 %b to i64 diff --git a/llvm/test/CodeGen/PowerPC/pr44183.ll b/llvm/test/CodeGen/PowerPC/pr44183.ll --- a/llvm/test/CodeGen/PowerPC/pr44183.ll +++ b/llvm/test/CodeGen/PowerPC/pr44183.ll @@ -10,9 +10,10 @@ ; CHECK-LABEL: _ZN1m1nEv: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mflr r0 +; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; CHECK-NEXT: stdu r1, -48(r1) -; CHECK-NEXT: std r0, 64(r1) +; CHECK-NEXT: stdu r1, -64(r1) +; CHECK-NEXT: std r0, 80(r1) ; CHECK-NEXT: mr r30, r3 ; CHECK-NEXT: ld r3, 8(r3) ; CHECK-NEXT: lwz r4, 36(r30) @@ -22,18 +23,15 @@ ; CHECK-NEXT: rlwimi r4, r3, 0, 0, 0 ; CHECK-NEXT: bl _ZN1llsE1d ; CHECK-NEXT: nop -; CHECK-NEXT: ld r3, 16(r30) -; CHECK-NEXT: ld r4, 8(r30) -; CHECK-NEXT: rldicl r4, r4, 60, 4 -; CHECK-NEXT: sldi r3, r3, 60 -; CHECK-NEXT: or r3, r3, r4 -; CHECK-NEXT: sldi r3, r3, 31 -; CHECK-NEXT: rlwinm r4, r3, 0, 0, 0 +; CHECK-NEXT: ld r3, 8(r30) +; CHECK-NEXT: rldicl r3, r3, 60, 4 +; 
CHECK-NEXT: rlwinm r4, r3, 31, 0, 0 ; CHECK-NEXT: bl _ZN1llsE1d ; CHECK-NEXT: nop -; CHECK-NEXT: addi r1, r1, 48 +; CHECK-NEXT: addi r1, r1, 64 ; CHECK-NEXT: ld r0, 16(r1) ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload ; CHECK-NEXT: mtlr r0 ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/pr45301.ll b/llvm/test/CodeGen/PowerPC/pr45301.ll --- a/llvm/test/CodeGen/PowerPC/pr45301.ll +++ b/llvm/test/CodeGen/PowerPC/pr45301.ll @@ -12,21 +12,22 @@ ; CHECK-NEXT: bl i ; CHECK-NEXT: nop ; CHECK-NEXT: addis r4, r2, g@toc@ha -; CHECK-NEXT: addi r5, r4, g@toc@l -; CHECK-NEXT: ld r6, 16(r5) +; CHECK-NEXT: ld r5, g@toc@l(r4) +; CHECK-NEXT: addi r4, r4, g@toc@l +; CHECK-NEXT: std r5, 0(r3) +; CHECK-NEXT: rldicl r5, r5, 32, 32 +; CHECK-NEXT: ld r6, 16(r4) ; CHECK-NEXT: std r6, 16(r3) -; CHECK-NEXT: ld r4, g@toc@l(r4) -; CHECK-NEXT: std r4, 0(r3) -; CHECK-NEXT: rldicl r4, r4, 32, 32 -; CHECK-NEXT: ld r7, 8(r5) -; CHECK-NEXT: std r7, 8(r3) -; CHECK-NEXT: ld r7, 24(r5) -; CHECK-NEXT: std r7, 24(r3) -; CHECK-NEXT: ld r5, 32(r5) -; CHECK-NEXT: std r5, 32(r3) -; CHECK-NEXT: stwbrx r4, 0, r3 +; CHECK-NEXT: ld r6, 32(r4) +; CHECK-NEXT: std r6, 32(r3) +; CHECK-NEXT: ld r6, 24(r4) +; CHECK-NEXT: std r6, 24(r3) +; CHECK-NEXT: ld r4, 8(r4) +; CHECK-NEXT: std r4, 8(r3) +; CHECK-NEXT: stwbrx r5, 0, r3 ; CHECK-NEXT: li r4, 20 -; CHECK-NEXT: stwbrx r6, r3, r4 +; CHECK-NEXT: lwbrx r4, r3, r4 +; CHECK-NEXT: stw r4, 20(r3) ; CHECK-NEXT: addi r1, r1, 112 ; CHECK-NEXT: ld r0, 16(r1) ; CHECK-NEXT: mtlr r0 diff --git a/llvm/test/CodeGen/PowerPC/pr45432.ll b/llvm/test/CodeGen/PowerPC/pr45432.ll --- a/llvm/test/CodeGen/PowerPC/pr45432.ll +++ b/llvm/test/CodeGen/PowerPC/pr45432.ll @@ -15,8 +15,8 @@ ; CHECK-NEXT: addis 3, 2, g@toc@ha ; CHECK-NEXT: std 0, 80(1) ; CHECK-NEXT: std 30, 48(1) # 8-byte Folded Spill -; CHECK-NEXT: lwz 3, g@toc@l(3) -; CHECK-NEXT: extswsli 30, 3, 2 +; CHECK-NEXT: lwa 3, g@toc@l(3) +; CHECK-NEXT: sldi 30, 3, 2 ; CHECK-NEXT: addis 3, 2, f@got@tlsld@ha ; CHECK-NEXT: addi 3, 3, f@got@tlsld@l ; CHECK-NEXT: bl __tls_get_addr(f@tlsld) diff --git a/llvm/test/CodeGen/PowerPC/prefer-dqform.ll b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll --- a/llvm/test/CodeGen/PowerPC/prefer-dqform.ll +++ b/llvm/test/CodeGen/PowerPC/prefer-dqform.ll @@ -19,32 +19,33 @@ ; CHECK-P9-NEXT: lwz r4, 0(r4) ; CHECK-P9-NEXT: lwz r5, 0(r5) ; CHECK-P9-NEXT: iseleq r3, r10, r3 -; CHECK-P9-NEXT: subfic r10, r3, 1 -; CHECK-P9-NEXT: add r4, r10, r4 -; CHECK-P9-NEXT: srawi r4, r4, 4 -; CHECK-P9-NEXT: addze r4, r4 +; CHECK-P9-NEXT: sub r10, r4, r3 +; CHECK-P9-NEXT: addi r10, r10, 1 +; CHECK-P9-NEXT: srawi r11, r10, 4 +; CHECK-P9-NEXT: addze r11, r11 ; CHECK-P9-NEXT: srawi r5, r5, 1 -; CHECK-P9-NEXT: slwi r4, r4, 4 +; CHECK-P9-NEXT: slwi r11, r11, 4 ; CHECK-P9-NEXT: addze r5, r5 -; CHECK-P9-NEXT: sub r4, r4, r10 +; CHECK-P9-NEXT: sub r10, r11, r10 +; CHECK-P9-NEXT: add r4, r4, r10 ; CHECK-P9-NEXT: cmpw r3, r4 ; CHECK-P9-NEXT: bgtlr cr0 ; CHECK-P9-NEXT: # %bb.1: # %_loop_2_do_.lr.ph ; CHECK-P9-NEXT: extswsli r5, r5, 3 ; CHECK-P9-NEXT: add r5, r8, r5 ; CHECK-P9-NEXT: addi r8, r5, -8 -; CHECK-P9-NEXT: lwz r5, 0(r7) +; CHECK-P9-NEXT: lwa r5, 0(r7) ; CHECK-P9-NEXT: extsw r7, r4 ; CHECK-P9-NEXT: rldic r4, r3, 3, 29 ; CHECK-P9-NEXT: sub r3, r7, r3 -; CHECK-P9-NEXT: addi r10, r4, 8 ; CHECK-P9-NEXT: lxvdsx vs0, 0, r8 +; CHECK-P9-NEXT: addi r10, r4, 8 ; CHECK-P9-NEXT: rldicl r3, r3, 60, 4 -; CHECK-P9-NEXT: extswsli r5, r5, 3 ; CHECK-P9-NEXT: addi r3, r3, 1 +; CHECK-P9-NEXT: sldi r5, r5, 
3 ; CHECK-P9-NEXT: sub r4, r10, r5 -; CHECK-P9-NEXT: add r5, r9, r10 ; CHECK-P9-NEXT: mtctr r3 +; CHECK-P9-NEXT: add r5, r9, r10 ; CHECK-P9-NEXT: add r4, r6, r4 ; CHECK-P9-NEXT: .p2align 4 ; CHECK-P9-NEXT: .LBB0_2: # %_loop_2_do_ @@ -65,39 +66,40 @@ ; ; CHECK-P10-LABEL: test: ; CHECK-P10: # %bb.0: # %test_entry +; CHECK-P10-NEXT: lwz r4, 0(r4) ; CHECK-P10-NEXT: andi. r3, r6, 15 ; CHECK-P10-NEXT: li r3, 2 ; CHECK-P10-NEXT: li r10, 1 -; CHECK-P10-NEXT: lwz r4, 0(r4) ; CHECK-P10-NEXT: lwz r5, 0(r5) ; CHECK-P10-NEXT: iseleq r3, r10, r3 -; CHECK-P10-NEXT: subfic r10, r3, 1 -; CHECK-P10-NEXT: add r4, r10, r4 -; CHECK-P10-NEXT: srawi r4, r4, 4 -; CHECK-P10-NEXT: addze r4, r4 +; CHECK-P10-NEXT: sub r10, r4, r3 +; CHECK-P10-NEXT: addi r10, r10, 1 +; CHECK-P10-NEXT: srawi r11, r10, 4 +; CHECK-P10-NEXT: addze r11, r11 ; CHECK-P10-NEXT: srawi r5, r5, 1 -; CHECK-P10-NEXT: slwi r4, r4, 4 +; CHECK-P10-NEXT: slwi r11, r11, 4 ; CHECK-P10-NEXT: addze r5, r5 -; CHECK-P10-NEXT: sub r4, r4, r10 +; CHECK-P10-NEXT: sub r10, r11, r10 +; CHECK-P10-NEXT: add r4, r4, r10 ; CHECK-P10-NEXT: cmpw r3, r4 ; CHECK-P10-NEXT: bgtlr cr0 ; CHECK-P10-NEXT: # %bb.1: # %_loop_2_do_.lr.ph ; CHECK-P10-NEXT: extswsli r5, r5, 3 ; CHECK-P10-NEXT: add r5, r8, r5 ; CHECK-P10-NEXT: addi r8, r5, -8 -; CHECK-P10-NEXT: lwz r5, 0(r7) +; CHECK-P10-NEXT: lwa r5, 0(r7) ; CHECK-P10-NEXT: extsw r7, r4 ; CHECK-P10-NEXT: rldic r4, r3, 3, 29 ; CHECK-P10-NEXT: addi r10, r4, 8 ; CHECK-P10-NEXT: sub r3, r7, r3 ; CHECK-P10-NEXT: lxvdsx vs0, 0, r8 ; CHECK-P10-NEXT: rldicl r3, r3, 60, 4 -; CHECK-P10-NEXT: extswsli r5, r5, 3 -; CHECK-P10-NEXT: addi r3, r3, 1 -; CHECK-P10-NEXT: sub r4, r10, r5 +; CHECK-P10-NEXT: sldi r4, r5, 3 +; CHECK-P10-NEXT: sub r4, r10, r4 ; CHECK-P10-NEXT: add r5, r9, r10 -; CHECK-P10-NEXT: mtctr r3 +; CHECK-P10-NEXT: addi r3, r3, 1 ; CHECK-P10-NEXT: add r4, r6, r4 +; CHECK-P10-NEXT: mtctr r3 ; CHECK-P10-NEXT: .p2align 4 ; CHECK-P10-NEXT: .LBB0_2: # %_loop_2_do_ ; CHECK-P10-NEXT: # diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll --- a/llvm/test/CodeGen/PowerPC/sat-add.ll +++ b/llvm/test/CodeGen/PowerPC/sat-add.ll @@ -26,9 +26,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: clrlwi 3, 3, 24 ; CHECK-NEXT: addi 3, 3, 42 -; CHECK-NEXT: andi. 4, 3, 256 +; CHECK-NEXT: srwi 4, 3, 8 +; CHECK-NEXT: andi. 4, 4, 1 ; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: iselgt 3, 4, 3 ; CHECK-NEXT: blr %a = add i8 %x, 42 %c = icmp ugt i8 %x, %a @@ -71,9 +72,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: clrlwi 3, 3, 16 ; CHECK-NEXT: addi 3, 3, 42 -; CHECK-NEXT: andis. 4, 3, 1 +; CHECK-NEXT: srwi 4, 3, 16 +; CHECK-NEXT: andi. 4, 4, 1 ; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: iselgt 3, 4, 3 ; CHECK-NEXT: blr %a = add i16 %x, 42 %c = icmp ugt i16 %x, %a @@ -205,9 +207,10 @@ ; CHECK-NEXT: clrlwi 4, 4, 24 ; CHECK-NEXT: clrlwi 3, 3, 24 ; CHECK-NEXT: add 3, 3, 4 -; CHECK-NEXT: andi. 4, 3, 256 +; CHECK-NEXT: srwi 4, 3, 8 +; CHECK-NEXT: andi. 4, 4, 1 ; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: iselgt 3, 4, 3 ; CHECK-NEXT: blr %a = add i8 %x, %y %c = icmp ugt i8 %x, %a @@ -256,9 +259,10 @@ ; CHECK-NEXT: clrlwi 4, 4, 16 ; CHECK-NEXT: clrlwi 3, 3, 16 ; CHECK-NEXT: add 3, 3, 4 -; CHECK-NEXT: andis. 4, 3, 1 +; CHECK-NEXT: srwi 4, 3, 16 +; CHECK-NEXT: andi. 
4, 4, 1 ; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: iselgt 3, 4, 3 ; CHECK-NEXT: blr %a = add i16 %x, %y %c = icmp ugt i16 %x, %a diff --git a/llvm/test/CodeGen/PowerPC/select-constant-xor.ll b/llvm/test/CodeGen/PowerPC/select-constant-xor.ll --- a/llvm/test/CodeGen/PowerPC/select-constant-xor.ll +++ b/llvm/test/CodeGen/PowerPC/select-constant-xor.ll @@ -29,9 +29,10 @@ define i32 @selecti64i32(i64 %a) { ; CHECK-LABEL: selecti64i32: ; CHECK: # %bb.0: -; CHECK-NEXT: sradi 3, 3, 63 -; CHECK-NEXT: xori 3, 3, 65535 -; CHECK-NEXT: xoris 3, 3, 32767 +; CHECK-NEXT: rldicl 3, 3, 1, 63 +; CHECK-NEXT: lis 4, -32768 +; CHECK-NEXT: xori 3, 3, 1 +; CHECK-NEXT: sub 3, 4, 3 ; CHECK-NEXT: blr %c = icmp sgt i64 %a, -1 %s = select i1 %c, i32 2147483647, i32 -2147483648 diff --git a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll --- a/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll +++ b/llvm/test/CodeGen/PowerPC/select-i1-vs-i1.ll @@ -134,21 +134,21 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpw 5, 6 ; CHECK-NEXT: cmpw 1, 3, 4 -; CHECK-NEXT: creqv 20, 6, 2 -; CHECK-NEXT: isel 3, 7, 8, 20 +; CHECK-NEXT: crxor 20, 6, 2 +; CHECK-NEXT: isel 3, 8, 7, 20 ; CHECK-NEXT: blr ; ; CHECK-NO-ISEL-LABEL: testi32eq: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpw 5, 6 ; CHECK-NO-ISEL-NEXT: cmpw 1, 3, 4 -; CHECK-NO-ISEL-NEXT: creqv 20, 6, 2 +; CHECK-NO-ISEL-NEXT: crxor 20, 6, 2 ; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB4_2 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 +; CHECK-NO-ISEL-NEXT: ori 3, 7, 0 ; CHECK-NO-ISEL-NEXT: blr ; CHECK-NO-ISEL-NEXT: .LBB4_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: addi 3, 8, 0 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 @@ -434,21 +434,21 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpd 5, 6 ; CHECK-NEXT: cmpd 1, 3, 4 -; CHECK-NEXT: creqv 20, 6, 2 -; CHECK-NEXT: isel 3, 7, 8, 20 +; CHECK-NEXT: crxor 20, 6, 2 +; CHECK-NEXT: isel 3, 8, 7, 20 ; CHECK-NEXT: blr ; ; CHECK-NO-ISEL-LABEL: testi64eq: ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: cmpd 5, 6 ; CHECK-NO-ISEL-NEXT: cmpd 1, 3, 4 -; CHECK-NO-ISEL-NEXT: creqv 20, 6, 2 +; CHECK-NO-ISEL-NEXT: crxor 20, 6, 2 ; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB14_2 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry -; CHECK-NO-ISEL-NEXT: ori 3, 8, 0 +; CHECK-NO-ISEL-NEXT: ori 3, 7, 0 ; CHECK-NO-ISEL-NEXT: blr ; CHECK-NO-ISEL-NEXT: .LBB14_2: # %entry -; CHECK-NO-ISEL-NEXT: addi 3, 7, 0 +; CHECK-NO-ISEL-NEXT: addi 3, 8, 0 ; CHECK-NO-ISEL-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 @@ -758,10 +758,11 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: fcmpu 0, 3, 4 ; CHECK-NEXT: fcmpu 1, 1, 2 -; CHECK-NEXT: fmr 1, 5 -; CHECK-NEXT: creqv 20, 6, 2 -; CHECK-NEXT: bclr 12, 20, 0 +; CHECK-NEXT: crxor 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB24_2 ; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: fmr 6, 5 +; CHECK-NEXT: .LBB24_2: # %entry ; CHECK-NEXT: fmr 1, 6 ; CHECK-NEXT: blr ; @@ -769,10 +770,11 @@ ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: fcmpu 0, 3, 4 ; CHECK-NO-ISEL-NEXT: fcmpu 1, 1, 2 -; CHECK-NO-ISEL-NEXT: fmr 1, 5 -; CHECK-NO-ISEL-NEXT: creqv 20, 6, 2 -; CHECK-NO-ISEL-NEXT: bclr 12, 20, 0 +; CHECK-NO-ISEL-NEXT: crxor 20, 6, 2 +; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB24_2 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: fmr 6, 5 +; CHECK-NO-ISEL-NEXT: .LBB24_2: # %entry ; CHECK-NO-ISEL-NEXT: fmr 1, 6 ; CHECK-NO-ISEL-NEXT: blr entry: @@ -1108,10 +1110,11 @@ ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: fcmpu 0, 3, 4 ; CHECK-NEXT: fcmpu 1, 1, 2 -; CHECK-NEXT: fmr 1, 5 -; CHECK-NEXT: creqv 20, 6, 2 -; CHECK-NEXT: bclr 12, 20, 0 +; CHECK-NEXT: crxor 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB34_2 ; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: fmr 6, 5 +; CHECK-NEXT: .LBB34_2: # %entry ; CHECK-NEXT: fmr 1, 6 ; CHECK-NEXT: blr ; @@ -1119,10 +1122,11 @@ ; CHECK-NO-ISEL: # %bb.0: # %entry ; CHECK-NO-ISEL-NEXT: fcmpu 0, 3, 4 ; CHECK-NO-ISEL-NEXT: fcmpu 1, 1, 2 -; CHECK-NO-ISEL-NEXT: fmr 1, 5 -; CHECK-NO-ISEL-NEXT: creqv 20, 6, 2 -; CHECK-NO-ISEL-NEXT: bclr 12, 20, 0 +; CHECK-NO-ISEL-NEXT: crxor 20, 6, 2 +; CHECK-NO-ISEL-NEXT: bc 12, 20, .LBB34_2 ; CHECK-NO-ISEL-NEXT: # %bb.1: # %entry +; CHECK-NO-ISEL-NEXT: fmr 6, 5 +; CHECK-NO-ISEL-NEXT: .LBB34_2: # %entry ; CHECK-NO-ISEL-NEXT: fmr 1, 6 ; CHECK-NO-ISEL-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/select_const.ll b/llvm/test/CodeGen/PowerPC/select_const.ll --- a/llvm/test/CodeGen/PowerPC/select_const.ll +++ b/llvm/test/CodeGen/PowerPC/select_const.ll @@ -69,7 +69,7 @@ define i32 @select_0_or_neg1(i1 %cond) { ; ALL-LABEL: select_0_or_neg1: ; ALL: # %bb.0: -; ALL-NEXT: clrldi 3, 3, 63 +; ALL-NEXT: clrlwi 3, 3, 31 ; ALL-NEXT: addi 3, 3, -1 ; ALL-NEXT: blr %sel = select i1 %cond, i32 0, i32 -1 @@ -88,7 +88,8 @@ define i32 @select_0_or_neg1_signext(i1 signext %cond) { ; ALL-LABEL: select_0_or_neg1_signext: ; ALL: # %bb.0: -; ALL-NEXT: not 3, 3 +; ALL-NEXT: xori 3, 3, 65535 +; ALL-NEXT: xoris 3, 3, 65535 ; ALL-NEXT: blr %sel = select i1 %cond, i32 0, i32 -1 ret i32 %sel @@ -99,7 +100,7 @@ define i32 @select_neg1_or_0(i1 %cond) { ; ALL-LABEL: select_neg1_or_0: ; ALL: # %bb.0: -; ALL-NEXT: clrldi 3, 3, 63 +; ALL-NEXT: clrlwi 3, 3, 31 ; ALL-NEXT: neg 3, 3 ; ALL-NEXT: blr %sel = select i1 %cond, i32 -1, i32 0 @@ -128,7 +129,7 @@ define i32 @select_Cplus1_C(i1 %cond) { ; ALL-LABEL: select_Cplus1_C: ; ALL: # %bb.0: -; ALL-NEXT: clrldi 3, 3, 63 +; ALL-NEXT: clrlwi 3, 3, 31 ; ALL-NEXT: addi 3, 3, 41 ; ALL-NEXT: blr %sel = select i1 %cond, i32 42, i32 41 @@ -158,7 +159,7 @@ define i32 @select_C_Cplus1(i1 %cond) { ; ALL-LABEL: select_C_Cplus1: ; ALL: # %bb.0: -; ALL-NEXT: clrldi 3, 3, 63 +; ALL-NEXT: clrlwi 3, 3, 31 ; ALL-NEXT: subfic 3, 3, 42 ; ALL-NEXT: blr %sel = select i1 %cond, i32 41, i32 42 diff --git a/llvm/test/CodeGen/PowerPC/signbit-shift.ll b/llvm/test/CodeGen/PowerPC/signbit-shift.ll --- a/llvm/test/CodeGen/PowerPC/signbit-shift.ll +++ b/llvm/test/CodeGen/PowerPC/signbit-shift.ll @@ -45,9 +45,8 @@ define i32 @sel_ifpos_tval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifpos_tval_bigger: ; CHECK: # %bb.0: -; CHECK-NEXT: rlwinm 3, 3, 1, 31, 31 -; CHECK-NEXT: xori 3, 3, 1 -; CHECK-NEXT: addi 3, 3, 41 +; CHECK-NEXT: srawi 3, 3, 31 +; CHECK-NEXT: addi 3, 3, 42 ; CHECK-NEXT: blr %c = icmp sgt i32 %x, -1 %r = select i1 %c, i32 42, i32 41 @@ -68,8 +67,9 @@ define i32 @add_sext_ifpos(i32 %x) { ; CHECK-LABEL: add_sext_ifpos: ; CHECK: # %bb.0: -; CHECK-NEXT: srwi 3, 3, 31 -; CHECK-NEXT: addi 3, 3, 41 +; CHECK-NEXT: not 3, 3 +; CHECK-NEXT: srawi 3, 3, 31 +; CHECK-NEXT: addi 3, 3, 42 ; CHECK-NEXT: blr %c = icmp sgt i32 %x, -1 %e = sext i1 %c to i32 @@ -96,9 +96,8 @@ define i32 @sel_ifpos_fval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifpos_fval_bigger: ; CHECK: # %bb.0: -; CHECK-NEXT: rlwinm 3, 3, 1, 31, 31 -; CHECK-NEXT: xori 3, 3, 1 -; CHECK-NEXT: subfic 3, 3, 42 +; CHECK-NEXT: srwi 3, 3, 31 +; CHECK-NEXT: addi 3, 3, 41 ; CHECK-NEXT: blr %c = icmp sgt i32 %x, -1 %r = select i1 %c, i32 41, i32 42 @@ -132,7 +131,7 @@ define i32 @sel_ifneg_tval_bigger(i32 
%x) { ; CHECK-LABEL: sel_ifneg_tval_bigger: ; CHECK: # %bb.0: -; CHECK-NEXT: rlwinm 3, 3, 1, 31, 31 +; CHECK-NEXT: srwi 3, 3, 31 ; CHECK-NEXT: addi 3, 3, 41 ; CHECK-NEXT: blr %c = icmp slt i32 %x, 0 @@ -165,8 +164,8 @@ define i32 @sel_ifneg_fval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifneg_fval_bigger: ; CHECK: # %bb.0: -; CHECK-NEXT: rlwinm 3, 3, 1, 31, 31 -; CHECK-NEXT: subfic 3, 3, 42 +; CHECK-NEXT: srawi 3, 3, 31 +; CHECK-NEXT: addi 3, 3, 42 ; CHECK-NEXT: blr %c = icmp slt i32 %x, 0 %r = select i1 %c, i32 41, i32 42 diff --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll --- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll @@ -478,9 +478,9 @@ ; P9LE-NEXT: mtvsrd v3, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r6, r3 -; P9LE-NEXT: mulhw r7, r6, r4 -; P9LE-NEXT: add r6, r7, r6 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: mulhw r6, r3, r4 +; P9LE-NEXT: add r6, r6, r3 ; P9LE-NEXT: srwi r7, r6, 31 ; P9LE-NEXT: srawi r6, r6, 6 ; P9LE-NEXT: add r6, r6, r7 @@ -490,9 +490,9 @@ ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: extsh r7, r3 -; P9LE-NEXT: mulhw r8, r7, r4 -; P9LE-NEXT: add r7, r8, r7 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: mulhw r7, r3, r4 +; P9LE-NEXT: add r7, r7, r3 ; P9LE-NEXT: srwi r8, r7, 31 ; P9LE-NEXT: srawi r7, r7, 6 ; P9LE-NEXT: add r7, r7, r8 @@ -501,9 +501,9 @@ ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: extsh r8, r3 -; P9LE-NEXT: mulhw r4, r8, r4 -; P9LE-NEXT: add r4, r4, r8 +; P9LE-NEXT: extsh r3, r3 +; P9LE-NEXT: mulhw r4, r3, r4 +; P9LE-NEXT: add r4, r4, r3 ; P9LE-NEXT: srwi r8, r4, 31 ; P9LE-NEXT: srawi r4, r4, 6 ; P9LE-NEXT: add r4, r4, r8 @@ -525,23 +525,23 @@ ; P9BE-LABEL: combine_srem_sdiv: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r5, -21386 +; P9BE-NEXT: lis r4, -21386 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r5, r5, 37253 -; P9BE-NEXT: extsh r4, r3 -; P9BE-NEXT: mulhw r6, r4, r5 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r6, r4, 31 -; P9BE-NEXT: srawi r4, r4, 6 -; P9BE-NEXT: add r4, r4, r6 -; P9BE-NEXT: mulli r6, r4, 95 +; P9BE-NEXT: ori r4, r4, 37253 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: mulhw r5, r3, r4 +; P9BE-NEXT: add r5, r5, r3 +; P9BE-NEXT: srwi r6, r5, 31 +; P9BE-NEXT: srawi r5, r5, 6 +; P9BE-NEXT: add r5, r5, r6 +; P9BE-NEXT: mulli r6, r5, 95 ; P9BE-NEXT: sub r3, r3, r6 ; P9BE-NEXT: mtfprwz f0, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r6, r3 -; P9BE-NEXT: mulhw r7, r6, r5 -; P9BE-NEXT: add r6, r7, r6 +; P9BE-NEXT: extsh r3, r3 +; P9BE-NEXT: mulhw r6, r3, r4 +; P9BE-NEXT: add r6, r6, r3 ; P9BE-NEXT: srwi r7, r6, 31 ; P9BE-NEXT: srawi r6, r6, 6 ; P9BE-NEXT: add r6, r6, r7 @@ -553,10 +553,10 @@ ; P9BE-NEXT: lxv vs2, 0(r3) ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: extsh r7, r3 +; P9BE-NEXT: extsh r3, r3 ; P9BE-NEXT: xxperm vs0, vs1, vs2 -; P9BE-NEXT: mulhw r8, r7, r5 -; P9BE-NEXT: add r7, r8, r7 +; P9BE-NEXT: mulhw r7, r3, r4 +; P9BE-NEXT: add r7, r7, r3 ; P9BE-NEXT: srwi r8, r7, 31 ; P9BE-NEXT: srawi r7, r7, 6 ; P9BE-NEXT: add r7, r7, r8 @@ -566,18 +566,18 @@ ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: extsh r3, r3 -; P9BE-NEXT: mulhw r5, r3, r5 -; P9BE-NEXT: add r5, r5, r3 -; P9BE-NEXT: srwi r8, r5, 31 -; P9BE-NEXT: srawi r5, r5, 6 -; P9BE-NEXT: add r5, r5, r8 -; P9BE-NEXT: mulli r8, r5, 95 +; P9BE-NEXT: 
mulhw r4, r3, r4 +; P9BE-NEXT: add r4, r4, r3 +; P9BE-NEXT: srwi r8, r4, 31 +; P9BE-NEXT: srawi r4, r4, 6 +; P9BE-NEXT: add r4, r4, r8 +; P9BE-NEXT: mulli r8, r4, 95 ; P9BE-NEXT: sub r3, r3, r8 ; P9BE-NEXT: mtfprwz f3, r3 ; P9BE-NEXT: xxperm vs1, vs3, vs2 -; P9BE-NEXT: mtfprwz f3, r5 +; P9BE-NEXT: mtfprwz f3, r4 ; P9BE-NEXT: xxmrghw v2, vs1, vs0 -; P9BE-NEXT: mtfprwz f0, r4 +; P9BE-NEXT: mtfprwz f0, r5 ; P9BE-NEXT: mtfprwz f1, r6 ; P9BE-NEXT: xxperm vs0, vs1, vs2 ; P9BE-NEXT: mtfprwz f1, r7 @@ -596,44 +596,44 @@ ; P8LE-NEXT: rldicl r6, r4, 48, 48 ; P8LE-NEXT: rldicl r7, r4, 32, 48 ; P8LE-NEXT: extsh r5, r5 -; P8LE-NEXT: extsh r8, r6 -; P8LE-NEXT: extsh r9, r7 -; P8LE-NEXT: mulhw r10, r5, r3 -; P8LE-NEXT: mulhw r11, r8, r3 +; P8LE-NEXT: extsh r6, r6 +; P8LE-NEXT: extsh r7, r7 +; P8LE-NEXT: mulhw r8, r5, r3 +; P8LE-NEXT: mulhw r9, r6, r3 ; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: mulhw r12, r9, r3 -; P8LE-NEXT: extsh r0, r4 -; P8LE-NEXT: mulhw r3, r0, r3 -; P8LE-NEXT: add r10, r10, r5 -; P8LE-NEXT: add r8, r11, r8 -; P8LE-NEXT: srwi r11, r10, 31 -; P8LE-NEXT: add r9, r12, r9 -; P8LE-NEXT: srawi r10, r10, 6 -; P8LE-NEXT: srawi r12, r8, 6 -; P8LE-NEXT: srwi r8, r8, 31 -; P8LE-NEXT: add r10, r10, r11 -; P8LE-NEXT: add r3, r3, r0 -; P8LE-NEXT: srawi r11, r9, 6 +; P8LE-NEXT: mulhw r10, r7, r3 +; P8LE-NEXT: extsh r4, r4 +; P8LE-NEXT: mulhw r3, r4, r3 +; P8LE-NEXT: add r8, r8, r5 +; P8LE-NEXT: add r9, r9, r6 +; P8LE-NEXT: srwi r11, r8, 31 +; P8LE-NEXT: add r10, r10, r7 +; P8LE-NEXT: srawi r8, r8, 6 +; P8LE-NEXT: srawi r12, r9, 6 ; P8LE-NEXT: srwi r9, r9, 31 -; P8LE-NEXT: add r8, r12, r8 -; P8LE-NEXT: mtvsrd v2, r10 -; P8LE-NEXT: mulli r12, r10, 95 -; P8LE-NEXT: add r9, r11, r9 +; P8LE-NEXT: add r8, r8, r11 +; P8LE-NEXT: add r3, r3, r4 +; P8LE-NEXT: srawi r11, r10, 6 +; P8LE-NEXT: srwi r10, r10, 31 +; P8LE-NEXT: add r9, r12, r9 +; P8LE-NEXT: mtvsrd v2, r8 +; P8LE-NEXT: mulli r12, r8, 95 +; P8LE-NEXT: add r10, r11, r10 ; P8LE-NEXT: srwi r11, r3, 31 -; P8LE-NEXT: mtvsrd v3, r8 +; P8LE-NEXT: mtvsrd v3, r9 ; P8LE-NEXT: srawi r3, r3, 6 -; P8LE-NEXT: mulli r10, r8, 95 -; P8LE-NEXT: mtvsrd v4, r9 -; P8LE-NEXT: add r3, r3, r11 ; P8LE-NEXT: mulli r8, r9, 95 +; P8LE-NEXT: mtvsrd v4, r10 +; P8LE-NEXT: add r3, r3, r11 +; P8LE-NEXT: mulli r9, r10, 95 ; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: mulli r9, r3, 95 +; P8LE-NEXT: mulli r10, r3, 95 ; P8LE-NEXT: sub r5, r5, r12 -; P8LE-NEXT: sub r6, r6, r10 +; P8LE-NEXT: sub r6, r6, r8 ; P8LE-NEXT: mtvsrd v3, r5 ; P8LE-NEXT: mtvsrd v5, r6 -; P8LE-NEXT: sub r5, r7, r8 -; P8LE-NEXT: sub r4, r4, r9 +; P8LE-NEXT: sub r5, r7, r9 +; P8LE-NEXT: sub r4, r4, r10 ; P8LE-NEXT: mtvsrd v0, r5 ; P8LE-NEXT: mtvsrd v1, r4 ; P8LE-NEXT: vmrghh v3, v5, v3 @@ -647,60 +647,60 @@ ; ; P8BE-LABEL: combine_srem_sdiv: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r5, v2 -; P8BE-NEXT: lis r4, -21386 +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, -21386 ; P8BE-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; P8BE-NEXT: addis r30, r2, .LCPI2_0@toc@ha -; P8BE-NEXT: ori r4, r4, 37253 -; P8BE-NEXT: clrldi r3, r5, 48 -; P8BE-NEXT: rldicl r6, r5, 48, 48 -; P8BE-NEXT: rldicl r7, r5, 32, 48 -; P8BE-NEXT: extsh r8, r3 -; P8BE-NEXT: extsh r9, r6 -; P8BE-NEXT: extsh r10, r7 -; P8BE-NEXT: mulhw r11, r8, r4 -; P8BE-NEXT: mulhw r12, r9, r4 -; P8BE-NEXT: rldicl r5, r5, 16, 48 -; P8BE-NEXT: mulhw r0, r10, r4 +; P8BE-NEXT: ori r3, r3, 37253 +; P8BE-NEXT: addi r0, r30, .LCPI2_0@toc@l +; P8BE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; 
P8BE-NEXT: lxvw4x v2, 0, r0 +; P8BE-NEXT: rldicl r7, r4, 32, 48 ; P8BE-NEXT: extsh r5, r5 -; P8BE-NEXT: mulhw r4, r5, r4 -; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: add r9, r12, r9 +; P8BE-NEXT: extsh r6, r6 +; P8BE-NEXT: extsh r7, r7 +; P8BE-NEXT: mulhw r8, r5, r3 +; P8BE-NEXT: mulhw r9, r6, r3 +; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: mulhw r10, r7, r3 +; P8BE-NEXT: extsh r4, r4 +; P8BE-NEXT: mulhw r3, r4, r3 +; P8BE-NEXT: add r8, r8, r5 +; P8BE-NEXT: add r9, r9, r6 ; P8BE-NEXT: srwi r11, r8, 31 -; P8BE-NEXT: add r10, r0, r10 +; P8BE-NEXT: add r10, r10, r7 ; P8BE-NEXT: srawi r8, r8, 6 -; P8BE-NEXT: addi r0, r30, .LCPI2_0@toc@l -; P8BE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload ; P8BE-NEXT: srawi r12, r9, 6 ; P8BE-NEXT: srwi r9, r9, 31 ; P8BE-NEXT: add r8, r8, r11 -; P8BE-NEXT: add r4, r4, r5 -; P8BE-NEXT: lxvw4x v2, 0, r0 +; P8BE-NEXT: add r3, r3, r4 ; P8BE-NEXT: srawi r11, r10, 6 ; P8BE-NEXT: srwi r10, r10, 31 ; P8BE-NEXT: add r9, r12, r9 ; P8BE-NEXT: mtvsrwz v3, r8 ; P8BE-NEXT: mulli r12, r8, 95 ; P8BE-NEXT: add r10, r11, r10 -; P8BE-NEXT: srwi r11, r4, 31 +; P8BE-NEXT: srwi r11, r3, 31 ; P8BE-NEXT: mtvsrwz v4, r9 -; P8BE-NEXT: srawi r4, r4, 6 +; P8BE-NEXT: srawi r3, r3, 6 ; P8BE-NEXT: mulli r8, r9, 95 ; P8BE-NEXT: mtvsrwz v5, r10 -; P8BE-NEXT: add r4, r4, r11 +; P8BE-NEXT: add r3, r3, r11 ; P8BE-NEXT: mulli r9, r10, 95 ; P8BE-NEXT: vperm v3, v4, v3, v2 -; P8BE-NEXT: mulli r10, r4, 95 -; P8BE-NEXT: sub r3, r3, r12 +; P8BE-NEXT: mulli r10, r3, 95 +; P8BE-NEXT: sub r5, r5, r12 ; P8BE-NEXT: sub r6, r6, r8 -; P8BE-NEXT: mtvsrwz v4, r3 +; P8BE-NEXT: mtvsrwz v4, r5 ; P8BE-NEXT: mtvsrwz v0, r6 -; P8BE-NEXT: sub r3, r7, r9 -; P8BE-NEXT: sub r5, r5, r10 -; P8BE-NEXT: mtvsrwz v1, r3 -; P8BE-NEXT: mtvsrwz v6, r5 +; P8BE-NEXT: sub r5, r7, r9 +; P8BE-NEXT: sub r4, r4, r10 +; P8BE-NEXT: mtvsrwz v1, r5 +; P8BE-NEXT: mtvsrwz v6, r4 ; P8BE-NEXT: vperm v4, v0, v4, v2 -; P8BE-NEXT: mtvsrwz v0, r4 +; P8BE-NEXT: mtvsrwz v0, r3 ; P8BE-NEXT: vperm v1, v6, v1, v2 ; P8BE-NEXT: vperm v2, v0, v5, v2 ; P8BE-NEXT: xxmrghw v4, v1, v4 diff --git a/llvm/test/CodeGen/PowerPC/store-combine.ll b/llvm/test/CodeGen/PowerPC/store-combine.ll --- a/llvm/test/CodeGen/PowerPC/store-combine.ll +++ b/llvm/test/CodeGen/PowerPC/store-combine.ll @@ -10,12 +10,24 @@ define void @store_i32_by_i8(i32 signext %m, ptr %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: stw 3, 0(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 +; CHECK-PPC64LE-NEXT: stb 3, 0(4) +; CHECK-PPC64LE-NEXT: stb 5, 1(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 +; CHECK-PPC64LE-NEXT: srwi 3, 3, 24 +; CHECK-PPC64LE-NEXT: stb 5, 2(4) +; CHECK-PPC64LE-NEXT: stb 3, 3(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: stwbrx 3, 0, 4 +; CHECK-PPC64-NEXT: srwi 5, 3, 8 +; CHECK-PPC64-NEXT: stb 3, 0(4) +; CHECK-PPC64-NEXT: stb 5, 1(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 16 +; CHECK-PPC64-NEXT: srwi 3, 3, 24 +; CHECK-PPC64-NEXT: stb 5, 2(4) +; CHECK-PPC64-NEXT: stb 3, 3(4) ; CHECK-PPC64-NEXT: blr entry: %conv = trunc i32 %m to i8 @@ -43,12 +55,24 @@ define void @store_i32_by_i8_bswap(i32 signext %m, ptr %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: stwbrx 3, 0, 4 +; CHECK-PPC64LE-NEXT: srwi 5, 3, 24 +; CHECK-PPC64LE-NEXT: stb 3, 3(4) +; CHECK-PPC64LE-NEXT: stb 5, 0(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 +; CHECK-PPC64LE-NEXT: stb 5, 1(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 +; CHECK-PPC64LE-NEXT: 
stb 5, 2(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: stw 3, 0(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 24 +; CHECK-PPC64-NEXT: srwi 6, 3, 16 +; CHECK-PPC64-NEXT: stb 3, 3(4) +; CHECK-PPC64-NEXT: stb 5, 0(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 8 +; CHECK-PPC64-NEXT: stb 6, 1(4) +; CHECK-PPC64-NEXT: stb 5, 2(4) ; CHECK-PPC64-NEXT: blr entry: %0 = lshr i32 %m, 24 @@ -80,12 +104,40 @@ define void @store_i64_by_i8(i64 %m, ptr %p) { ; CHECK-PPC64LE-LABEL: store_i64_by_i8: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: std 3, 0(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 56, 8 +; CHECK-PPC64LE-NEXT: stb 3, 0(4) +; CHECK-PPC64LE-NEXT: stb 5, 1(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 48, 16 +; CHECK-PPC64LE-NEXT: stb 5, 2(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 40, 24 +; CHECK-PPC64LE-NEXT: stb 5, 3(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 32, 32 +; CHECK-PPC64LE-NEXT: stb 5, 4(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 24, 40 +; CHECK-PPC64LE-NEXT: stb 5, 5(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 16, 48 +; CHECK-PPC64LE-NEXT: rldicl 3, 3, 8, 56 +; CHECK-PPC64LE-NEXT: stb 5, 6(4) +; CHECK-PPC64LE-NEXT: stb 3, 7(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i64_by_i8: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: stdbrx 3, 0, 4 +; CHECK-PPC64-NEXT: rldicl 5, 3, 56, 8 +; CHECK-PPC64-NEXT: rldicl 6, 3, 48, 16 +; CHECK-PPC64-NEXT: stb 3, 0(4) +; CHECK-PPC64-NEXT: stb 5, 1(4) +; CHECK-PPC64-NEXT: rldicl 5, 3, 40, 24 +; CHECK-PPC64-NEXT: stb 6, 2(4) +; CHECK-PPC64-NEXT: rldicl 6, 3, 32, 32 +; CHECK-PPC64-NEXT: stb 5, 3(4) +; CHECK-PPC64-NEXT: rldicl 5, 3, 24, 40 +; CHECK-PPC64-NEXT: stb 6, 4(4) +; CHECK-PPC64-NEXT: stb 5, 5(4) +; CHECK-PPC64-NEXT: rldicl 5, 3, 16, 48 +; CHECK-PPC64-NEXT: rldicl 3, 3, 8, 56 +; CHECK-PPC64-NEXT: stb 5, 6(4) +; CHECK-PPC64-NEXT: stb 3, 7(4) ; CHECK-PPC64-NEXT: blr entry: %conv = trunc i64 %m to i8 @@ -133,12 +185,40 @@ define void @store_i64_by_i8_bswap(i64 %m, ptr %p) { ; CHECK-PPC64LE-LABEL: store_i64_by_i8_bswap: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: stdbrx 3, 0, 4 +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 56, 8 +; CHECK-PPC64LE-NEXT: stb 3, 7(4) +; CHECK-PPC64LE-NEXT: stb 5, 6(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 48, 16 +; CHECK-PPC64LE-NEXT: stb 5, 5(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 40, 24 +; CHECK-PPC64LE-NEXT: stb 5, 4(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 32, 32 +; CHECK-PPC64LE-NEXT: stb 5, 3(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 24, 40 +; CHECK-PPC64LE-NEXT: stb 5, 2(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 16, 48 +; CHECK-PPC64LE-NEXT: rldicl 3, 3, 8, 56 +; CHECK-PPC64LE-NEXT: stb 5, 1(4) +; CHECK-PPC64LE-NEXT: stb 3, 0(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i64_by_i8_bswap: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: std 3, 0(4) +; CHECK-PPC64-NEXT: rldicl 5, 3, 56, 8 +; CHECK-PPC64-NEXT: rldicl 6, 3, 48, 16 +; CHECK-PPC64-NEXT: stb 3, 7(4) +; CHECK-PPC64-NEXT: stb 5, 6(4) +; CHECK-PPC64-NEXT: rldicl 5, 3, 40, 24 +; CHECK-PPC64-NEXT: stb 6, 5(4) +; CHECK-PPC64-NEXT: rldicl 6, 3, 32, 32 +; CHECK-PPC64-NEXT: stb 5, 4(4) +; CHECK-PPC64-NEXT: rldicl 5, 3, 24, 40 +; CHECK-PPC64-NEXT: stb 6, 3(4) +; CHECK-PPC64-NEXT: stb 5, 2(4) +; CHECK-PPC64-NEXT: rldicl 5, 3, 16, 48 +; CHECK-PPC64-NEXT: rldicl 3, 3, 8, 56 +; CHECK-PPC64-NEXT: stb 5, 1(4) +; CHECK-PPC64-NEXT: stb 3, 0(4) ; CHECK-PPC64-NEXT: blr entry: %conv = trunc i64 %m to i8 @@ -190,7 +270,21 @@ ; CHECK-PPC64LE-NEXT: slwi 5, 3, 3 ; CHECK-PPC64LE-NEXT: sub 3, 5, 3 ; 
CHECK-PPC64LE-NEXT: extsw 3, 3 -; CHECK-PPC64LE-NEXT: stdbrx 3, 0, 4 +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 56, 8 +; CHECK-PPC64LE-NEXT: stb 3, 7(4) +; CHECK-PPC64LE-NEXT: stb 5, 6(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 48, 16 +; CHECK-PPC64LE-NEXT: stb 5, 5(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 40, 24 +; CHECK-PPC64LE-NEXT: stb 5, 4(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 32, 32 +; CHECK-PPC64LE-NEXT: stb 5, 3(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 24, 40 +; CHECK-PPC64LE-NEXT: stb 5, 2(4) +; CHECK-PPC64LE-NEXT: rldicl 5, 3, 16, 48 +; CHECK-PPC64LE-NEXT: rldicl 3, 3, 8, 56 +; CHECK-PPC64LE-NEXT: stb 5, 1(4) +; CHECK-PPC64LE-NEXT: stb 3, 0(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i64_by_i8_bswap_uses: @@ -198,7 +292,21 @@ ; CHECK-PPC64-NEXT: slwi 5, 3, 3 ; CHECK-PPC64-NEXT: sub 3, 5, 3 ; CHECK-PPC64-NEXT: extsw 3, 3 -; CHECK-PPC64-NEXT: std 3, 0(4) +; CHECK-PPC64-NEXT: rldicl 5, 3, 56, 8 +; CHECK-PPC64-NEXT: rldicl 6, 3, 48, 16 +; CHECK-PPC64-NEXT: stb 3, 7(4) +; CHECK-PPC64-NEXT: stb 5, 6(4) +; CHECK-PPC64-NEXT: rldicl 5, 3, 40, 24 +; CHECK-PPC64-NEXT: stb 6, 5(4) +; CHECK-PPC64-NEXT: rldicl 6, 3, 32, 32 +; CHECK-PPC64-NEXT: stb 5, 4(4) +; CHECK-PPC64-NEXT: rldicl 5, 3, 24, 40 +; CHECK-PPC64-NEXT: stb 6, 3(4) +; CHECK-PPC64-NEXT: stb 5, 2(4) +; CHECK-PPC64-NEXT: rldicl 5, 3, 16, 48 +; CHECK-PPC64-NEXT: rldicl 3, 3, 8, 56 +; CHECK-PPC64-NEXT: stb 5, 1(4) +; CHECK-PPC64-NEXT: stb 3, 0(4) ; CHECK-PPC64-NEXT: blr entry: %mul = mul nsw i32 %t, 7 @@ -246,8 +354,9 @@ define void @store_i32_by_i8_bswap_volatile(i32 signext %m, ptr %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_volatile: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: li 5, 2 -; CHECK-PPC64LE-NEXT: sthbrx 3, 4, 5 +; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 +; CHECK-PPC64LE-NEXT: stb 3, 3(4) +; CHECK-PPC64LE-NEXT: stb 5, 2(4) ; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 ; CHECK-PPC64LE-NEXT: srwi 3, 3, 24 ; CHECK-PPC64LE-NEXT: stb 5, 1(4) @@ -256,7 +365,9 @@ ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_volatile: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: sth 3, 2(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 8 +; CHECK-PPC64-NEXT: stb 3, 3(4) +; CHECK-PPC64-NEXT: stb 5, 2(4) ; CHECK-PPC64-NEXT: srwi 5, 3, 16 ; CHECK-PPC64-NEXT: srwi 3, 3, 24 ; CHECK-PPC64-NEXT: stb 5, 1(4) @@ -290,8 +401,9 @@ define void @store_i32_by_i8_bswap_store_in_between(i32 signext %m, ptr %p, ptr %q) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_store_in_between: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: li 6, 2 -; CHECK-PPC64LE-NEXT: sthbrx 3, 4, 6 +; CHECK-PPC64LE-NEXT: srwi 6, 3, 8 +; CHECK-PPC64LE-NEXT: stb 3, 3(4) +; CHECK-PPC64LE-NEXT: stb 6, 2(4) ; CHECK-PPC64LE-NEXT: li 6, 3 ; CHECK-PPC64LE-NEXT: stb 6, 0(5) ; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 @@ -303,7 +415,9 @@ ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_store_in_between: ; CHECK-PPC64: # %bb.0: # %entry ; CHECK-PPC64-NEXT: li 6, 3 -; CHECK-PPC64-NEXT: sth 3, 2(4) +; CHECK-PPC64-NEXT: srwi 7, 3, 8 +; CHECK-PPC64-NEXT: stb 3, 3(4) +; CHECK-PPC64-NEXT: stb 7, 2(4) ; CHECK-PPC64-NEXT: stb 6, 0(5) ; CHECK-PPC64-NEXT: srwi 5, 3, 16 ; CHECK-PPC64-NEXT: srwi 3, 3, 24 @@ -377,13 +491,24 @@ define void @store_i32_by_i8_bswap_nonzero_offset(i32 signext %m, ptr %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_nonzero_offset: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: addi 4, 4, 1 -; CHECK-PPC64LE-NEXT: stwbrx 3, 0, 4 +; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 +; CHECK-PPC64LE-NEXT: stb 3, 4(4) +; CHECK-PPC64LE-NEXT: stb 5, 3(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 +; 
CHECK-PPC64LE-NEXT: srwi 3, 3, 24 +; CHECK-PPC64LE-NEXT: stb 5, 2(4) +; CHECK-PPC64LE-NEXT: stb 3, 1(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_nonzero_offset: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: stw 3, 1(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 8 +; CHECK-PPC64-NEXT: stb 3, 4(4) +; CHECK-PPC64-NEXT: stb 5, 3(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 16 +; CHECK-PPC64-NEXT: srwi 3, 3, 24 +; CHECK-PPC64-NEXT: stb 5, 2(4) +; CHECK-PPC64-NEXT: stb 3, 1(4) ; CHECK-PPC64-NEXT: blr entry: %0 = lshr i32 %m, 8 @@ -412,13 +537,24 @@ define void @store_i32_by_i8_neg_offset(i32 signext %m, ptr %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_neg_offset: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: stw 3, -4(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 +; CHECK-PPC64LE-NEXT: stb 3, -4(4) +; CHECK-PPC64LE-NEXT: stb 5, -3(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 +; CHECK-PPC64LE-NEXT: srwi 3, 3, 24 +; CHECK-PPC64LE-NEXT: stb 5, -2(4) +; CHECK-PPC64LE-NEXT: stb 3, -1(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_neg_offset: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: addi 4, 4, -4 -; CHECK-PPC64-NEXT: stwbrx 3, 0, 4 +; CHECK-PPC64-NEXT: srwi 5, 3, 8 +; CHECK-PPC64-NEXT: stb 3, -4(4) +; CHECK-PPC64-NEXT: stb 5, -3(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 16 +; CHECK-PPC64-NEXT: srwi 3, 3, 24 +; CHECK-PPC64-NEXT: stb 5, -2(4) +; CHECK-PPC64-NEXT: stb 3, -1(4) ; CHECK-PPC64-NEXT: blr entry: %0 = lshr i32 %m, 8 @@ -447,13 +583,24 @@ define void @store_i32_by_i8_bswap_neg_offset(i32 signext %m, ptr %p) { ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_neg_offset: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: addi 4, 4, -4 -; CHECK-PPC64LE-NEXT: stwbrx 3, 0, 4 +; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 +; CHECK-PPC64LE-NEXT: stb 3, -1(4) +; CHECK-PPC64LE-NEXT: stb 5, -3(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 24 +; CHECK-PPC64LE-NEXT: stb 5, -4(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 +; CHECK-PPC64LE-NEXT: stb 5, -2(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_neg_offset: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: stw 3, -4(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 16 +; CHECK-PPC64-NEXT: srwi 6, 3, 24 +; CHECK-PPC64-NEXT: stb 3, -1(4) +; CHECK-PPC64-NEXT: stb 5, -3(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 8 +; CHECK-PPC64-NEXT: stb 6, -4(4) +; CHECK-PPC64-NEXT: stb 5, -2(4) ; CHECK-PPC64-NEXT: blr entry: %0 = lshr i32 %m, 16 @@ -483,16 +630,27 @@ ; CHECK-PPC64LE-LABEL: store_i32_by_i8_bswap_base_index_offset: ; CHECK-PPC64LE: # %bb.0: # %entry ; CHECK-PPC64LE-NEXT: extsw 4, 4 +; CHECK-PPC64LE-NEXT: srwi 6, 3, 16 ; CHECK-PPC64LE-NEXT: add 4, 5, 4 -; CHECK-PPC64LE-NEXT: addi 4, 4, -4 -; CHECK-PPC64LE-NEXT: stwbrx 3, 0, 4 +; CHECK-PPC64LE-NEXT: srwi 5, 3, 24 +; CHECK-PPC64LE-NEXT: stb 5, -4(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 +; CHECK-PPC64LE-NEXT: stb 6, -3(4) +; CHECK-PPC64LE-NEXT: stb 3, -1(4) +; CHECK-PPC64LE-NEXT: stb 5, -2(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_base_index_offset: ; CHECK-PPC64: # %bb.0: # %entry ; CHECK-PPC64-NEXT: extsw 4, 4 +; CHECK-PPC64-NEXT: srwi 6, 3, 16 ; CHECK-PPC64-NEXT: add 4, 5, 4 -; CHECK-PPC64-NEXT: stw 3, -4(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 24 +; CHECK-PPC64-NEXT: stb 6, -3(4) +; CHECK-PPC64-NEXT: srwi 6, 3, 8 +; CHECK-PPC64-NEXT: stb 5, -4(4) +; CHECK-PPC64-NEXT: stb 6, -2(4) +; CHECK-PPC64-NEXT: stb 3, -1(4) ; CHECK-PPC64-NEXT: blr entry: %0 = lshr i32 %m, 16 @@ -536,15 +694,26 @@ ; CHECK-PPC64LE: # %bb.0: # %entry ; CHECK-PPC64LE-NEXT: extsw 4, 
4 ; CHECK-PPC64LE-NEXT: add 4, 5, 4 -; CHECK-PPC64LE-NEXT: addi 4, 4, 3 -; CHECK-PPC64LE-NEXT: stwbrx 3, 0, 4 +; CHECK-PPC64LE-NEXT: srwi 5, 3, 24 +; CHECK-PPC64LE-NEXT: stb 5, 3(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 16 +; CHECK-PPC64LE-NEXT: stb 3, 6(4) +; CHECK-PPC64LE-NEXT: stb 5, 4(4) +; CHECK-PPC64LE-NEXT: srwi 5, 3, 8 +; CHECK-PPC64LE-NEXT: stb 5, 5(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_i32_by_i8_bswap_complicated: ; CHECK-PPC64: # %bb.0: # %entry ; CHECK-PPC64-NEXT: extsw 4, 4 +; CHECK-PPC64-NEXT: srwi 6, 3, 24 ; CHECK-PPC64-NEXT: add 4, 5, 4 -; CHECK-PPC64-NEXT: stw 3, 3(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 16 +; CHECK-PPC64-NEXT: stb 5, 4(4) +; CHECK-PPC64-NEXT: srwi 5, 3, 8 +; CHECK-PPC64-NEXT: stb 6, 3(4) +; CHECK-PPC64-NEXT: stb 5, 5(4) +; CHECK-PPC64-NEXT: stb 3, 6(4) ; CHECK-PPC64-NEXT: blr entry: %idx.ext = sext i32 %i to i64 @@ -596,12 +765,16 @@ define void @store_16_by_i8(i16 %m, ptr %p) { ; CHECK-PPC64LE-LABEL: store_16_by_i8: ; CHECK-PPC64LE: # %bb.0: # %entry -; CHECK-PPC64LE-NEXT: sth 3, 0(4) +; CHECK-PPC64LE-NEXT: stb 3, 0(4) +; CHECK-PPC64LE-NEXT: srwi 3, 3, 8 +; CHECK-PPC64LE-NEXT: stb 3, 1(4) ; CHECK-PPC64LE-NEXT: blr ; ; CHECK-PPC64-LABEL: store_16_by_i8: ; CHECK-PPC64: # %bb.0: # %entry -; CHECK-PPC64-NEXT: sthbrx 3, 0, 4 +; CHECK-PPC64-NEXT: srwi 5, 3, 8 +; CHECK-PPC64-NEXT: stb 3, 0(4) +; CHECK-PPC64-NEXT: stb 5, 1(4) ; CHECK-PPC64-NEXT: blr entry: %conv1 = trunc i16 %m to i8 diff --git a/llvm/test/CodeGen/PowerPC/store-forward-be32.ll b/llvm/test/CodeGen/PowerPC/store-forward-be32.ll --- a/llvm/test/CodeGen/PowerPC/store-forward-be32.ll +++ b/llvm/test/CodeGen/PowerPC/store-forward-be32.ll @@ -101,8 +101,8 @@ define i32 @tc44(ptr noundef byval(%struct.ST) align 4 %s) { ; CHECK-LABEL: tc44: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stw 3, 24(1) ; CHECK-NEXT: stw 4, 28(1) +; CHECK-NEXT: stw 3, 24(1) ; CHECK-NEXT: blr entry: %0 = load i32, ptr %s, align 4 @@ -113,9 +113,10 @@ define i32 @tc41(ptr noundef byval(%struct.ST) align 4 %s) { ; CHECK-LABEL: tc41: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stw 3, 24(1) +; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: srawi 3, 3, 24 ; CHECK-NEXT: stw 4, 28(1) +; CHECK-NEXT: stw 5, 24(1) ; CHECK-NEXT: blr entry: %0 = load i32, ptr %s, align 4 @@ -127,9 +128,10 @@ define i32 @tc42(ptr noundef byval(%struct.ST) align 4 %s) { ; CHECK-LABEL: tc42: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stw 3, 24(1) +; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: srawi 3, 3, 16 ; CHECK-NEXT: stw 4, 28(1) +; CHECK-NEXT: stw 5, 24(1) ; CHECK-NEXT: blr entry: %0 = load i32, ptr %s, align 4 @@ -141,9 +143,10 @@ define i32 @tc43(ptr noundef byval(%struct.ST) align 4 %s) { ; CHECK-LABEL: tc43: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stw 3, 24(1) +; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: srawi 3, 3, 8 ; CHECK-NEXT: stw 4, 28(1) +; CHECK-NEXT: stw 5, 24(1) ; CHECK-NEXT: blr entry: %0 = load i32, ptr %s, align 4 @@ -155,8 +158,8 @@ define i32 @utc44(ptr noundef byval(%struct.UST) align 4 %s) { ; CHECK-LABEL: utc44: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stw 3, 24(1) ; CHECK-NEXT: stw 4, 28(1) +; CHECK-NEXT: stw 3, 24(1) ; CHECK-NEXT: blr entry: %0 = load i32, ptr %s, align 4 @@ -167,9 +170,10 @@ define i32 @utc41(ptr noundef byval(%struct.UST) align 4 %s) { ; CHECK-LABEL: utc41: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stw 3, 24(1) +; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: srwi 3, 3, 24 ; CHECK-NEXT: stw 4, 28(1) +; CHECK-NEXT: stw 5, 24(1) ; CHECK-NEXT: blr entry: %0 = load i32, ptr %s, align 4 @@ -181,9 +185,10 @@ define i32 @utc42(ptr noundef 
byval(%struct.UST) align 4 %s) { ; CHECK-LABEL: utc42: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stw 3, 24(1) +; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: srwi 3, 3, 16 ; CHECK-NEXT: stw 4, 28(1) +; CHECK-NEXT: stw 5, 24(1) ; CHECK-NEXT: blr entry: %0 = load i32, ptr %s, align 4 @@ -195,9 +200,10 @@ define i32 @utc43(ptr noundef byval(%struct.UST) align 4 %s) { ; CHECK-LABEL: utc43: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: stw 3, 24(1) +; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: srwi 3, 3, 8 ; CHECK-NEXT: stw 4, 28(1) +; CHECK-NEXT: stw 5, 24(1) ; CHECK-NEXT: blr entry: %0 = load i32, ptr %s, align 4 diff --git a/llvm/test/CodeGen/PowerPC/store-forward-be64.ll b/llvm/test/CodeGen/PowerPC/store-forward-be64.ll --- a/llvm/test/CodeGen/PowerPC/store-forward-be64.ll +++ b/llvm/test/CodeGen/PowerPC/store-forward-be64.ll @@ -217,8 +217,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: sradi 3, 3, 8 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -232,8 +232,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: sradi 3, 3, 16 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -247,8 +247,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: sradi 3, 3, 24 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -262,8 +262,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: sradi 3, 3, 32 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -277,8 +277,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: sradi 3, 3, 40 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -292,8 +292,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: sradi 3, 3, 48 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -307,8 +307,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: sradi 3, 3, 56 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -320,8 +320,8 @@ define i64 @ultc88(ptr noundef byval(%struct.ULST) align 8 %s) { ; CHECK-LABEL: ultc88: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: std 3, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 3, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -334,8 +334,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: rldicl 3, 3, 56, 8 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -349,8 +349,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: rldicl 3, 3, 48, 16 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -364,8 +364,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: rldicl 3, 3, 40, 24 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -379,8 +379,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: 
rldicl 3, 3, 32, 32 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -394,8 +394,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: rldicl 3, 3, 24, 40 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -409,8 +409,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: rldicl 3, 3, 16, 48 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 @@ -424,8 +424,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: mr 5, 3 ; CHECK-NEXT: rldicl 3, 3, 8, 56 -; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: std 4, 56(1) +; CHECK-NEXT: std 5, 48(1) ; CHECK-NEXT: blr entry: %0 = load i64, ptr %s, align 8 diff --git a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll @@ -70,26 +70,28 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; PPC-LABEL: test_urem_odd_setne: ; PPC: # %bb.0: -; PPC-NEXT: mulli 3, 3, 13 +; PPC-NEXT: clrlwi 4, 3, 28 +; PPC-NEXT: mulli 4, 4, 13 +; PPC-NEXT: srwi 4, 4, 6 +; PPC-NEXT: mulli 4, 4, 5 +; PPC-NEXT: sub 3, 3, 4 ; PPC-NEXT: clrlwi 3, 3, 28 -; PPC-NEXT: li 4, 0 -; PPC-NEXT: cmplwi 3, 3 -; PPC-NEXT: li 3, 1 -; PPC-NEXT: bclr 12, 1, 0 -; PPC-NEXT: # %bb.1: -; PPC-NEXT: ori 3, 4, 0 +; PPC-NEXT: cntlzw 3, 3 +; PPC-NEXT: not 3, 3 +; PPC-NEXT: rlwinm 3, 3, 27, 31, 31 ; PPC-NEXT: blr ; ; PPC64LE-LABEL: test_urem_odd_setne: ; PPC64LE: # %bb.0: -; PPC64LE-NEXT: slwi 5, 3, 1 -; PPC64LE-NEXT: li 4, 0 -; PPC64LE-NEXT: add 3, 3, 5 -; PPC64LE-NEXT: neg 3, 3 +; PPC64LE-NEXT: clrlwi 4, 3, 28 +; PPC64LE-NEXT: mulli 4, 4, 13 +; PPC64LE-NEXT: srwi 4, 4, 6 +; PPC64LE-NEXT: rlwimi 4, 4, 2, 28, 29 +; PPC64LE-NEXT: sub 3, 3, 4 ; PPC64LE-NEXT: clrlwi 3, 3, 28 -; PPC64LE-NEXT: cmplwi 3, 3 -; PPC64LE-NEXT: li 3, 1 -; PPC64LE-NEXT: iselgt 3, 3, 4 +; PPC64LE-NEXT: cntlzw 3, 3 +; PPC64LE-NEXT: not 3, 3 +; PPC64LE-NEXT: rlwinm 3, 3, 27, 31, 31 ; PPC64LE-NEXT: blr %urem = urem i4 %X, 5 %cmp = icmp ne i4 %urem, 0 @@ -217,36 +219,36 @@ ; PPC-NEXT: lis 6, -12795 ; PPC-NEXT: ori 6, 6, 40665 ; PPC-NEXT: mulhwu 7, 5, 6 -; PPC-NEXT: lis 9, 12057 -; PPC-NEXT: ori 9, 9, 37186 -; PPC-NEXT: mullw 11, 4, 6 -; PPC-NEXT: addc 7, 11, 7 -; PPC-NEXT: lis 11, -5526 -; PPC-NEXT: ori 11, 11, 61135 -; PPC-NEXT: mulhwu 8, 4, 6 -; PPC-NEXT: addze 8, 8 -; PPC-NEXT: mulhwu 10, 5, 9 -; PPC-NEXT: mullw 4, 4, 9 -; PPC-NEXT: mullw 9, 5, 9 -; PPC-NEXT: addc 7, 9, 7 -; PPC-NEXT: addze 9, 10 -; PPC-NEXT: rotlwi 10, 7, 31 +; PPC-NEXT: lis 8, 12057 +; PPC-NEXT: ori 8, 8, 37186 +; PPC-NEXT: mullw 10, 4, 6 +; PPC-NEXT: addc 7, 10, 7 ; PPC-NEXT: mullw 3, 3, 6 -; PPC-NEXT: mullw 6, 5, 6 +; PPC-NEXT: mullw 11, 5, 6 +; PPC-NEXT: mulhwu 6, 4, 6 +; PPC-NEXT: addze 6, 6 +; PPC-NEXT: slwi 4, 4, 1 +; PPC-NEXT: mulhwu 9, 5, 8 +; PPC-NEXT: mullw 8, 5, 8 +; PPC-NEXT: addc 7, 8, 7 +; PPC-NEXT: addze 9, 9 ; PPC-NEXT: slwi 5, 5, 1 +; PPC-NEXT: add 6, 6, 9 ; PPC-NEXT: add 3, 5, 3 -; PPC-NEXT: rotlwi 5, 6, 31 -; PPC-NEXT: rlwimi 5, 7, 31, 0, 0 -; PPC-NEXT: add 7, 8, 9 -; PPC-NEXT: add 4, 4, 7 +; PPC-NEXT: rotlwi 8, 11, 31 +; PPC-NEXT: sub 4, 6, 4 +; PPC-NEXT: lis 5, -5526 +; PPC-NEXT: rlwimi 8, 7, 31, 0, 0 +; PPC-NEXT: rotlwi 7, 7, 31 ; PPC-NEXT: add 3, 4, 3 -; PPC-NEXT: rlwimi 10, 3, 31, 0, 0 -; 
PPC-NEXT: cmplw 5, 11 -; PPC-NEXT: cmplwi 1, 10, 13 +; PPC-NEXT: ori 5, 5, 61135 +; PPC-NEXT: rlwimi 7, 3, 31, 0, 0 +; PPC-NEXT: cmplw 8, 5 +; PPC-NEXT: cmplwi 1, 7, 13 ; PPC-NEXT: rlwinm 3, 3, 31, 31, 31 ; PPC-NEXT: crand 20, 6, 0 ; PPC-NEXT: crandc 21, 4, 6 -; PPC-NEXT: rlwimi. 3, 6, 1, 30, 30 +; PPC-NEXT: rlwimi. 3, 11, 1, 30, 30 ; PPC-NEXT: cror 20, 20, 21 ; PPC-NEXT: crnand 20, 2, 20 ; PPC-NEXT: li 3, 1 diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll --- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -354,23 +354,23 @@ ; P9LE-NEXT: mtvsrd v3, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r6, r3, 16 -; P9LE-NEXT: mulhwu r6, r6, r4 +; P9LE-NEXT: clrlwi r3, r3, 16 +; P9LE-NEXT: mulhwu r6, r3, r4 ; P9LE-NEXT: mulli r7, r6, 95 ; P9LE-NEXT: sub r3, r3, r7 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r7, r3, 16 -; P9LE-NEXT: mulhwu r7, r7, r4 +; P9LE-NEXT: clrlwi r3, r3, 16 +; P9LE-NEXT: mulhwu r7, r3, r4 ; P9LE-NEXT: mulli r8, r7, 95 ; P9LE-NEXT: sub r3, r3, r8 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: clrlwi r8, r3, 16 -; P9LE-NEXT: mulhwu r4, r8, r4 +; P9LE-NEXT: clrlwi r3, r3, 16 +; P9LE-NEXT: mulhwu r4, r3, r4 ; P9LE-NEXT: mulli r8, r4, 95 ; P9LE-NEXT: mtvsrd v5, r4 ; P9LE-NEXT: sub r3, r3, r8 @@ -389,18 +389,18 @@ ; P9BE-LABEL: combine_urem_udiv: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r5, 689 +; P9BE-NEXT: lis r4, 689 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r5, r5, 55879 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: mulhwu r4, r4, r5 -; P9BE-NEXT: mulli r6, r4, 95 +; P9BE-NEXT: ori r4, r4, 55879 +; P9BE-NEXT: clrlwi r3, r3, 16 +; P9BE-NEXT: mulhwu r5, r3, r4 +; P9BE-NEXT: mulli r6, r5, 95 ; P9BE-NEXT: sub r3, r3, r6 ; P9BE-NEXT: mtfprwz f0, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r6, r3, 16 -; P9BE-NEXT: mulhwu r6, r6, r5 +; P9BE-NEXT: clrlwi r3, r3, 16 +; P9BE-NEXT: mulhwu r6, r3, r4 ; P9BE-NEXT: mulli r7, r6, 95 ; P9BE-NEXT: sub r3, r3, r7 ; P9BE-NEXT: mtfprwz f1, r3 @@ -409,23 +409,23 @@ ; P9BE-NEXT: lxv vs2, 0(r3) ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r7, r3, 16 +; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: xxperm vs0, vs1, vs2 -; P9BE-NEXT: mulhwu r7, r7, r5 +; P9BE-NEXT: mulhwu r7, r3, r4 ; P9BE-NEXT: mulli r8, r7, 95 ; P9BE-NEXT: sub r3, r3, r8 ; P9BE-NEXT: mtfprwz f1, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r3, r3, 16 -; P9BE-NEXT: mulhwu r5, r3, r5 -; P9BE-NEXT: mulli r8, r5, 95 +; P9BE-NEXT: mulhwu r4, r3, r4 +; P9BE-NEXT: mulli r8, r4, 95 ; P9BE-NEXT: sub r3, r3, r8 ; P9BE-NEXT: mtfprwz f3, r3 ; P9BE-NEXT: xxperm vs1, vs3, vs2 -; P9BE-NEXT: mtfprwz f3, r5 +; P9BE-NEXT: mtfprwz f3, r4 ; P9BE-NEXT: xxmrghw v2, vs1, vs0 -; P9BE-NEXT: mtfprwz f0, r4 +; P9BE-NEXT: mtfprwz f0, r5 ; P9BE-NEXT: mtfprwz f1, r6 ; P9BE-NEXT: xxperm vs0, vs1, vs2 ; P9BE-NEXT: mtfprwz f1, r7 @@ -443,27 +443,27 @@ ; P8LE-NEXT: clrldi r5, r4, 48 ; P8LE-NEXT: rldicl r6, r4, 48, 48 ; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: clrlwi r8, r6, 16 +; P8LE-NEXT: clrlwi r6, r6, 16 ; P8LE-NEXT: rldicl r7, r4, 32, 48 ; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: mulhwu r9, r5, r3 -; P8LE-NEXT: mulhwu r8, r8, r3 -; P8LE-NEXT: clrlwi r10, r7, 16 -; P8LE-NEXT: clrlwi r11, r4, 16 -; P8LE-NEXT: 
mulhwu r10, r10, r3 -; P8LE-NEXT: mulhwu r3, r11, r3 -; P8LE-NEXT: mulli r11, r9, 95 -; P8LE-NEXT: mtvsrd v2, r9 -; P8LE-NEXT: mulli r9, r8, 95 -; P8LE-NEXT: mtvsrd v3, r8 -; P8LE-NEXT: mulli r8, r10, 95 +; P8LE-NEXT: mulhwu r8, r5, r3 +; P8LE-NEXT: mulhwu r9, r6, r3 +; P8LE-NEXT: clrlwi r7, r7, 16 +; P8LE-NEXT: clrlwi r4, r4, 16 +; P8LE-NEXT: mulhwu r10, r7, r3 +; P8LE-NEXT: mulhwu r3, r4, r3 +; P8LE-NEXT: mulli r11, r8, 95 +; P8LE-NEXT: mtvsrd v2, r8 +; P8LE-NEXT: mulli r8, r9, 95 +; P8LE-NEXT: mtvsrd v3, r9 +; P8LE-NEXT: mulli r9, r10, 95 ; P8LE-NEXT: mtvsrd v4, r10 ; P8LE-NEXT: mulli r10, r3, 95 ; P8LE-NEXT: vmrghh v2, v3, v2 ; P8LE-NEXT: sub r5, r5, r11 -; P8LE-NEXT: sub r6, r6, r9 +; P8LE-NEXT: sub r6, r6, r8 ; P8LE-NEXT: mtvsrd v3, r5 -; P8LE-NEXT: sub r5, r7, r8 +; P8LE-NEXT: sub r5, r7, r9 ; P8LE-NEXT: mtvsrd v5, r6 ; P8LE-NEXT: sub r4, r4, r10 ; P8LE-NEXT: mtvsrd v0, r5 @@ -487,15 +487,15 @@ ; P8BE-NEXT: clrldi r5, r4, 48 ; P8BE-NEXT: rldicl r6, r4, 48, 48 ; P8BE-NEXT: lxvw4x v2, 0, r11 -; P8BE-NEXT: clrlwi r8, r5, 16 -; P8BE-NEXT: clrlwi r9, r6, 16 +; P8BE-NEXT: clrlwi r5, r5, 16 +; P8BE-NEXT: clrlwi r6, r6, 16 ; P8BE-NEXT: rldicl r7, r4, 32, 48 ; P8BE-NEXT: rldicl r4, r4, 16, 48 -; P8BE-NEXT: mulhwu r8, r8, r3 -; P8BE-NEXT: mulhwu r9, r9, r3 -; P8BE-NEXT: clrlwi r10, r7, 16 +; P8BE-NEXT: mulhwu r8, r5, r3 +; P8BE-NEXT: mulhwu r9, r6, r3 +; P8BE-NEXT: clrlwi r7, r7, 16 ; P8BE-NEXT: clrlwi r4, r4, 16 -; P8BE-NEXT: mulhwu r10, r10, r3 +; P8BE-NEXT: mulhwu r10, r7, r3 ; P8BE-NEXT: mulhwu r3, r4, r3 ; P8BE-NEXT: mulli r12, r8, 95 ; P8BE-NEXT: mtvsrwz v3, r8 diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll --- a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll @@ -828,91 +828,74 @@ define <16 x i8> @test_v4i32_none(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_v4i32_none: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI10_0@toc@ha ; CHECK-LE-P8-NEXT: lbzx r4, 0, r4 -; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r3 -; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI10_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-LE-P8-NEXT: xxspltw v3, vs0, 1 ; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 -; CHECK-LE-P8-NEXT: xxswapd v4, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_none: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI10_0@toc@ha ; CHECK-LE-P9-NEXT: lxsibzx v3, 0, r4 -; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI10_0@toc@l -; CHECK-LE-P9-NEXT: lxv vs0, 0(r3) +; CHECK-LE-P9-NEXT: lxvwsx v2, 0, r3 ; CHECK-LE-P9-NEXT: vspltb v3, v3, 7 -; CHECK-LE-P9-NEXT: xxperm v2, v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_none: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: lbzx r4, 0, r4 -; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI10_0@toc@ha -; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 -; CHECK-BE-P8-NEXT: addi r4, r5, .LCPI10_0@toc@l -; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-BE-P8-NEXT: xxspltw v3, vs0, 1 ; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 -; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 ; 
CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_none: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI10_0@toc@ha -; CHECK-BE-P9-NEXT: lxsibzx v2, 0, r4 -; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI10_0@toc@l -; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-BE-P9-NEXT: vspltb v2, v2, 7 -; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-BE-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-BE-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-BE-P9-NEXT: vspltb v3, v3, 7 +; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lbzx r4, 0, r4 -; CHECK-AIX-64-P8-NEXT: ld r5, L..C7(r2) # %const.0 -; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 -; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-64-P8-NEXT: xxspltw v3, vs0, 1 ; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 -; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0 -; CHECK-AIX-64-P9-NEXT: lxsibzx v2, 0, r4 -; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-64-P9-NEXT: vspltb v2, v2, 7 -; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: vspltb v3, v3, 7 +; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_none: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lbzx r4, 0, r4 -; CHECK-AIX-32-P8-NEXT: lwz r5, L..C5(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: xxspltw v3, vs0, 1 ; CHECK-AIX-32-P8-NEXT: vspltb v2, v2, 7 -; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_none: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C1(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxsibzx v2, 0, r4 -; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-32-P9-NEXT: vspltb v2, v2, 7 -; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: lxsibzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: vspltb v3, v3, 7 +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <4 x i8>, ptr %a, align 4 @@ -1087,81 +1070,82 @@ define dso_local <16 x i8> @test_1_2(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) local_unnamed_addr { ; CHECK-LE-P8-LABEL: test_1_2: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI13_0@toc@ha ; CHECK-LE-P8-NEXT: lbzx r3, 0, r3 -; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI13_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: lhzx r4, 0, r4 ; CHECK-LE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-LE-P8-NEXT: mtvsrwz v3, r4 ; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 -; CHECK-LE-P8-NEXT: xxswapd v4, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: vsplth v3, v3, 3 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; 
CHECK-LE-P9-LABEL: test_1_2: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsibzx v2, 0, r3 -; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI13_0@toc@ha -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI13_0@toc@l -; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r4 ; CHECK-LE-P9-NEXT: vspltb v2, v2, 7 -; CHECK-LE-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-LE-P9-NEXT: vsplth v3, v3, 3 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_1_2: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: lbzx r3, 0, r3 -; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: lhzx r4, 0, r4 ; CHECK-BE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r4 ; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P8-NEXT: vsplth v3, v3, 3 ; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_1_2: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxsibzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r4 ; CHECK-BE-P9-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P9-NEXT: vsplth v3, v3, 3 ; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_1_2: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lbzx r3, 0, r3 -; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: lhzx r4, 0, r4 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 ; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P8-NEXT: vsplth v3, v3, 3 ; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_1_2: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsibzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r4 ; CHECK-AIX-64-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P9-NEXT: vsplth v3, v3, 3 ; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_1_2: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lbzx r3, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r5, L..C6(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lhzx r4, 0, r4 ; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v3, r4 ; CHECK-AIX-32-P8-NEXT: vspltb v2, v2, 7 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_1_2: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lxsibzx v2, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: vspltb v3, v2, 7 -; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r4 -; CHECK-AIX-32-P9-NEXT: lxv vs0, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxperm v2, v3, vs0 +; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P9-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <1 x i8>, ptr %a, align 4 @@ -1179,81 +1163,82 @@ define <16 x i8> @test_none_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_none_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI14_0@toc@ha ; CHECK-LE-P8-NEXT: lbzx r3, 0, r3 -; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI14_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: 
lhzx r4, 0, r4 ; CHECK-LE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-LE-P8-NEXT: mtvsrwz v3, r4 ; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 -; CHECK-LE-P8-NEXT: xxswapd v4, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: vsplth v3, v3, 3 +; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsibzx v2, 0, r3 -; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI14_0@toc@ha -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI14_0@toc@l -; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r4 ; CHECK-LE-P9-NEXT: vspltb v2, v2, 7 -; CHECK-LE-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-LE-P9-NEXT: vsplth v3, v3, 3 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_none_v2i64: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: lbzx r3, 0, r3 -; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: lhzx r4, 0, r4 ; CHECK-BE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r4 ; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P8-NEXT: vsplth v3, v3, 3 ; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_none_v2i64: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxsibzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r4 ; CHECK-BE-P9-NEXT: vspltb v2, v2, 7 +; CHECK-BE-P9-NEXT: vsplth v3, v3, 3 ; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_none_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lbzx r3, 0, r3 -; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: lhzx r4, 0, r4 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 ; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P8-NEXT: vsplth v3, v3, 3 ; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_none_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsibzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r4 ; CHECK-AIX-64-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-64-P9-NEXT: vsplth v3, v3, 3 ; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_none_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lbzx r3, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r5, L..C7(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lhzx r4, 0, r4 ; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v3, r4 ; CHECK-AIX-32-P8-NEXT: vspltb v2, v2, 7 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_none_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lxsibzx v2, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C3(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: vspltb v3, v2, 7 -; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r4 -; CHECK-AIX-32-P9-NEXT: lxv vs0, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxperm v2, v3, vs0 +; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: vspltb v2, v2, 7 +; CHECK-AIX-32-P9-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <1 x i8>, ptr %a, align 4 @@ -1271,32 +1256,25 @@ define <16 x i8> @test_v2i64_none(ptr nocapture noundef 
readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_v2i64_none: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI15_0@toc@ha ; CHECK-LE-P8-NEXT: lbzx r4, 0, r4 -; CHECK-LE-P8-NEXT: lxsdx v3, 0, r3 -; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI15_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: lxvdsx v3, 0, r3 ; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 ; CHECK-LE-P8-NEXT: vspltb v2, v2, 7 -; CHECK-LE-P8-NEXT: xxswapd v4, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_none: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsd v2, 0(r3) -; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI15_0@toc@ha +; CHECK-LE-P9-NEXT: lxvdsx v2, 0, r3 ; CHECK-LE-P9-NEXT: lxsibzx v3, 0, r4 -; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI15_0@toc@l -; CHECK-LE-P9-NEXT: lxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: vspltb v3, v3, 7 -; CHECK-LE-P9-NEXT: xxperm v2, v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_none: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: lbzx r4, 0, r4 -; CHECK-BE-P8-NEXT: lxsdx v3, 0, r3 +; CHECK-BE-P8-NEXT: lxvdsx v3, 0, r3 ; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 ; CHECK-BE-P8-NEXT: vspltb v2, v2, 7 ; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 @@ -1304,8 +1282,8 @@ ; ; CHECK-BE-P9-LABEL: test_v2i64_none: ; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lxvdsx v2, 0, r3 ; CHECK-BE-P9-NEXT: lxsibzx v3, 0, r4 -; CHECK-BE-P9-NEXT: lxsd v2, 0(r3) ; CHECK-BE-P9-NEXT: vspltb v3, v3, 7 ; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-P9-NEXT: blr @@ -1313,7 +1291,7 @@ ; CHECK-AIX-64-P8-LABEL: test_v2i64_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lbzx r4, 0, r4 -; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxvdsx v3, 0, r3 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 ; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 ; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 @@ -1321,8 +1299,8 @@ ; ; CHECK-AIX-64-P9-LABEL: test_v2i64_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lxvdsx v2, 0, r3 ; CHECK-AIX-64-P9-NEXT: lxsibzx v3, 0, r4 -; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) ; CHECK-AIX-64-P9-NEXT: vspltb v3, v3, 7 ; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-64-P9-NEXT: blr @@ -1394,7 +1372,7 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16rhs: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C8(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C7(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5 @@ -1403,7 +1381,7 @@ ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16rhs: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: ld r5, L..C4(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r5, L..C3(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: mtvsrwz v2, r4 ; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r3 ; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r5) @@ -1751,63 +1729,71 @@ define <16 x i8> @test_v2i64_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_v2i64_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3 -; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P8-NEXT: lhzx r4, 0, r4 +; CHECK-LE-P8-NEXT: lxvdsx v3, 0, r3 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-LE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: 
test_v2i64_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsd v2, 0(r3) -; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) -; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-LE-P9-NEXT: lxvdsx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-LE-P9-NEXT: vsplth v3, v3, 3 +; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v2i64: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3 -; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: lhzx r4, 0, r4 +; CHECK-BE-P8-NEXT: lxvdsx v3, 0, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v2i64_v2i64: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsd v2, 0(r3) -; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: lxvdsx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-BE-P9-NEXT: vsplth v3, v3, 3 ; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 -; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: lhzx r4, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxvdsx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) -; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxvdsx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-AIX-64-P9-NEXT: vsplth v3, v3, 3 ; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, L..C8(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lhzx r4, 0, r4 ; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 -; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw v3, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C4(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r4 -; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <8 x i8>, ptr %a, align 4 @@ -1981,24 +1967,21 @@ ; CHECK-LE-P8-LABEL: test_v4i32_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI24_0@toc@ha -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 -; CHECK-LE-P8-NEXT: addi r3, r5, .LCPI24_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs2, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, f0 -; CHECK-LE-P8-NEXT: xxswapd v3, f1 -; CHECK-LE-P8-NEXT: xxswapd v4, vs2 +; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r3 +; CHECK-LE-P8-NEXT: lxvdsx v3, 0, r4 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI24_0@toc@l +; 
CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: xxspltw v2, vs1, 1 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 ; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P9-NEXT: lxvwsx v2, 0, r3 ; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI24_0@toc@ha +; CHECK-LE-P9-NEXT: lxvdsx v3, 0, r4 ; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI24_0@toc@l -; CHECK-LE-P9-NEXT: xxswapd v2, f0 -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 ; CHECK-LE-P9-NEXT: lxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: xxperm v2, v3, vs0 ; CHECK-LE-P9-NEXT: blr @@ -2007,70 +1990,69 @@ ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI24_0@toc@ha -; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: lxvdsx v3, 0, r4 ; CHECK-BE-P8-NEXT: addi r3, r5, .LCPI24_0@toc@l ; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 -; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: xxspltw v2, vs0, 1 ; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v2i64: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI24_0@toc@ha -; CHECK-BE-P9-NEXT: lxsd v2, 0(r4) +; CHECK-BE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI24_0@toc@l ; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-BE-P9-NEXT: xxsldwi vs0, f0, f0, 1 ; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C9(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C8(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: lxvdsx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: xxspltw v2, vs0, 1 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 ; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0 -; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r4) -; CHECK-AIX-64-P9-NEXT: xxsldwi vs0, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: lxvwsx vs0, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C4(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r4) -; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r5, 0(r4) +; CHECK-AIX-32-P8-NEXT: lwz r4, 4(r4) +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C9(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: xxspltw v3, vs0, 1 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C5(r2) # 
%const.0 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs2, vs1 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw v3, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r4, 4(r4) +; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) ; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C5(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C1(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs2, -32(r1) ; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs2, vs1 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) diff --git a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll --- a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll @@ -833,19 +833,16 @@ define <2 x i64> @test_v16i8_v2i64(i8 %arg1, i64 %arg) { ; CHECK-LE-P8-LABEL: test_v16i8_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: xxspltd v2, vs0, 0 ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 ; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v16i8_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxswapd v2, vs1 +; CHECK-LE-P9-NEXT: mtvsrdd v2, r4, r4 ; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 ; CHECK-LE-P9-NEXT: blr ; @@ -883,31 +880,29 @@ ; ; CHECK-AIX-32-P8-LABEL: test_v16i8_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C0(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: addi r6, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r6 -; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: stw r5, -48(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C1(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: stw r4, -48(r1) ; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v16i8_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -48(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, -48(r1) ; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) 
-; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 8 -; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r5 -; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: blr entry: %lhs.tmp = insertelement <16 x i8> undef, i8 %arg1, i32 0 @@ -1006,16 +1001,17 @@ ; CHECK-LE-P8-LABEL: test_none_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxspltd v3, vs1, 0 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-LE-P8-NEXT: xxmrgld v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxv v2, 0(r3) -; CHECK-LE-P9-NEXT: mtfprd f0, r4 -; CHECK-LE-P9-NEXT: xxpermdi v2, vs0, v2, 1 +; CHECK-LE-P9-NEXT: mtvsrdd v3, r4, r4 +; CHECK-LE-P9-NEXT: xxmrgld v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_none_v2i64: @@ -1048,28 +1044,26 @@ ; ; CHECK-AIX-32-P8-LABEL: test_none_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r6, L..C2(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: stw r5, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 ; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r6 -; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v4, v2 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, v2, vs0 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_none_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r3) -; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 -; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 8 -; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r5 -; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, v2, vs0 ; CHECK-AIX-32-P9-NEXT: blr entry: %lhs = load <2 x i64>, ptr %b, align 4 @@ -1308,19 +1302,16 @@ define <2 x i64> @test_v8i16_v2i64(i16 %arg1, i64 %arg) { ; CHECK-LE-P8-LABEL: test_v8i16_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: xxspltd v2, vs0, 0 ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 ; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxswapd v2, vs1 +; CHECK-LE-P9-NEXT: mtvsrdd v2, r4, r4 ; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 ; CHECK-LE-P9-NEXT: blr ; @@ -1358,31 +1349,29 @@ ; ; 
CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C4(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: addi r6, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r6 -; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: stw r5, -48(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C5(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: stw r4, -48(r1) ; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -48(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, -48(r1) ; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 8 -; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r5 -; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 12 +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: blr entry: %lhs.tmp = insertelement <8 x i16> undef, i16 %arg1, i32 0 @@ -1547,18 +1536,16 @@ define <2 x i64> @test_v4i32_v2i64(i32 %arg1, i64 %arg) { ; CHECK-LE-P8-LABEL: test_v4i32_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: xxspltd v2, vs0, 0 +; CHECK-LE-P8-NEXT: mtfprwz f0, r3 ; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: mtvsrws vs0, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, vs1 +; CHECK-LE-P9-NEXT: mtfprwz f0, r3 +; CHECK-LE-P9-NEXT: mtvsrdd v2, r4, r4 ; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 ; CHECK-LE-P9-NEXT: blr ; @@ -1594,33 +1581,29 @@ ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: stw r3, -48(r1) +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -16 +; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r6 +; CHECK-AIX-32-P8-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: stw r4, -48(r1) ; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 -; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C6(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, 
v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -48(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C0(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -48(r1) -; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: xxmrghw v3, vs1, vs0 -; CHECK-AIX-32-P9-NEXT: lxv vs0, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxperm v2, v3, vs0 +; CHECK-AIX-32-P9-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -48(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, -48(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs2, vs1 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: blr entry: %lhs.tmp = insertelement <4 x i32> undef, i32 %arg1, i32 0 @@ -1633,96 +1616,98 @@ define <2 x i64> @test_v2i64_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_v2i64_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: ld r3, 0(r3) -; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 -; CHECK-LE-P8-NEXT: mtfprd f1, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 -; CHECK-LE-P8-NEXT: xxmrghd v3, vs0, vs1 +; CHECK-LE-P8-NEXT: lxvdsx v2, 0, r3 +; CHECK-LE-P8-NEXT: lxvdsx v3, 0, r4 +; CHECK-LE-P8-NEXT: xxmrgld v3, v3, v2 ; CHECK-LE-P8-NEXT: vaddudm v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: ld r3, 0(r3) -; CHECK-LE-P9-NEXT: lfd f1, 0(r4) -; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: xxmrghd v3, vs1, vs0 +; CHECK-LE-P9-NEXT: lxvdsx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxvdsx v3, 0, r4 +; CHECK-LE-P9-NEXT: xxmrgld v3, v3, v2 ; CHECK-LE-P9-NEXT: vaddudm v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v2i64: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lxsdx v2, 0, r3 -; CHECK-BE-P8-NEXT: lfdx f0, 0, r4 +; CHECK-BE-P8-NEXT: ld r3, 0(r3) +; CHECK-BE-P8-NEXT: ld r4, 0(r4) +; CHECK-BE-P8-NEXT: mtvsrd v2, r3 +; CHECK-BE-P8-NEXT: mtfprd f0, r4 ; CHECK-BE-P8-NEXT: xxmrghd v3, v2, vs0 ; CHECK-BE-P8-NEXT: vaddudm v2, v3, v2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v2i64_v2i64: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsd v2, 0(r3) -; CHECK-BE-P9-NEXT: lfd f0, 0(r4) -; CHECK-BE-P9-NEXT: xxmrghd v3, v2, vs0 +; CHECK-BE-P9-NEXT: ld r3, 0(r3) +; CHECK-BE-P9-NEXT: ld r4, 0(r4) +; CHECK-BE-P9-NEXT: mtvsrd v2, r3 +; CHECK-BE-P9-NEXT: mtvsrdd v3, r3, r4 ; CHECK-BE-P9-NEXT: vaddudm v2, v3, v2 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 -; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r4 +; CHECK-AIX-64-P8-NEXT: ld r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: ld r4, 0(r4) +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r4 ; CHECK-AIX-64-P8-NEXT: xxmrghd v3, v2, vs0 ; CHECK-AIX-64-P8-NEXT: vaddudm v2, v3, v2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) -; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r4) -; 
CHECK-AIX-64-P9-NEXT: xxmrghd v3, v2, vs0 +; CHECK-AIX-64-P9-NEXT: ld r3, 0(r3) +; CHECK-AIX-64-P9-NEXT: ld r4, 0(r4) +; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 +; CHECK-AIX-64-P9-NEXT: mtvsrdd v3, r3, r4 ; CHECK-AIX-64-P9-NEXT: vaddudm v2, v3, v2 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r3) -; CHECK-AIX-32-P8-NEXT: addi r6, r1, -32 -; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 4(r4) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r5 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r6 +; CHECK-AIX-32-P8-NEXT: lwz r5, 0(r4) +; CHECK-AIX-32-P8-NEXT: lwz r4, 4(r4) +; CHECK-AIX-32-P8-NEXT: lwz r6, 0(r3) +; CHECK-AIX-32-P8-NEXT: lwz r3, 4(r3) +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r5, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -48 ; CHECK-AIX-32-P8-NEXT: stw r3, -48(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -64 -; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: stw r3, -64(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -48 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs3, 0, r4 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs3, vs2 +; CHECK-AIX-32-P8-NEXT: stw r6, -64(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r6, r1, -64 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r5 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs3, 0, r6 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxmrghw v2, vs3, vs2 ; CHECK-AIX-32-P8-NEXT: xxmrghd v3, v2, vs0 ; CHECK-AIX-32-P8-NEXT: vaddudm v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r5, 4(r3) -; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: lwz r6, 0(r4) +; CHECK-AIX-32-P9-NEXT: lwz r4, 4(r4) +; CHECK-AIX-32-P9-NEXT: lwz r5, 0(r3) +; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r3) +; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r6, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -48(r1) +; CHECK-AIX-32-P9-NEXT: stw r5, -64(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) ; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -48(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 -; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -64(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -64(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, -64(r1) ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: lxv vs1, -48(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs2, vs1 ; CHECK-AIX-32-P9-NEXT: xxmrghd v3, v2, vs0 ; CHECK-AIX-32-P9-NEXT: vaddudm v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll --- a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll @@ -29,36 +29,38 @@ ; CHECK-LE-P8-LABEL: test_none_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: addis r4, r2, 
.LCPI0_0@toc@ha -; CHECK-LE-P8-NEXT: lxsdx v2, 0, r3 +; CHECK-LE-P8-NEXT: lhzx r5, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r3 ; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: lhz r4, 0(r3) -; CHECK-LE-P8-NEXT: mtvsrd v4, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r5 +; CHECK-LE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-LE-P8-NEXT: xxswapd vs0, v2 ; CHECK-LE-P8-NEXT: stfdx f0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 -; CHECK-LE-P9-NEXT: lfd f1, 0(r3) +; CHECK-LE-P9-NEXT: lfd f0, 0(r3) ; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha ; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l -; CHECK-LE-P9-NEXT: lxv vs2, 0(r3) -; CHECK-LE-P9-NEXT: xxperm vs1, vs0, vs2 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs1 +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P9-NEXT: xxperm vs0, v2, vs1 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P9-NEXT: stfd f0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_none_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lhz r4, 0(r3) +; CHECK-BE-P8-NEXT: lhzx r4, 0, r3 ; CHECK-BE-P8-NEXT: lfdx f0, 0, r3 -; CHECK-BE-P8-NEXT: sldi r3, r4, 48 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-BE-P8-NEXT: stfdx f0, 0, r3 ; CHECK-BE-P8-NEXT: blr ; @@ -73,11 +75,11 @@ ; ; CHECK-AIX-64-P8-LABEL: test_none_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-64-P8-NEXT: lhzx r4, 0, r3 ; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-AIX-64-P8-NEXT: stfdx f0, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; @@ -92,13 +94,12 @@ ; ; CHECK-AIX-32-P8-LABEL: test_none_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 +; CHECK-AIX-32-P8-NEXT: lhzx r4, 0, r3 ; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 ; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 -; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r4 +; CHECK-AIX-32-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r4 ; CHECK-AIX-32-P8-NEXT: stw r3, 0(r3) ; CHECK-AIX-32-P8-NEXT: lwz r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: stw r3, 0(r3) @@ -106,12 +107,11 @@ ; ; CHECK-AIX-32-P9-LABEL: test_none_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) ; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: lxsihzx v2, 0, r3 ; CHECK-AIX-32-P9-NEXT: stw r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: stxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) ; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) ; CHECK-AIX-32-P9-NEXT: stw r3, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr @@ -130,71 +130,79 @@ define void 
@test_v8i16_none(ptr %a) { ; CHECK-LE-P8-LABEL: test_v8i16_none: ; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lhzx r4, 0, r3 ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P8-NEXT: lhz r3, 0(r3) -; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1 +; CHECK-LE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, v2 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_none: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 -; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxv vs0, 0(r3) +; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, v2 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_none: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lhz r4, 0(r3) +; CHECK-BE-P8-NEXT: lhzx r4, 0, r3 ; CHECK-BE-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-BE-P8-NEXT: mtfprwz f1, r4 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P8-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_none: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsihzx f0, 0, r3 -; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxv vs0, 0(r3) +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-64-P8-NEXT: lhzx r4, 0, r3 ; CHECK-AIX-64-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r3) +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_none: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-32-P8-NEXT: lhzx r4, 0, r3 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 -; CHECK-AIX-32-P8-NEXT: mtfprwz f1, r4 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_none: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxv vs0, 0(r3) +; CHECK-AIX-32-P9-NEXT: vsplth v2, v2, 3 +; 
CHECK-AIX-32-P9-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -212,81 +220,67 @@ ; CHECK-LE-P8-LABEL: test_none_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI2_0@toc@ha -; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: xxspltw v3, v2, 3 ; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r3 -; CHECK-LE-P8-NEXT: mffprwz r3, f0 -; CHECK-LE-P8-NEXT: xxswapd v3, vs1 -; CHECK-LE-P8-NEXT: mtvsrwz v4, r3 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-LE-P8-NEXT: xxswapd vs0, v2 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: li r3, 0 -; CHECK-LE-P9-NEXT: vextuwrx r3, r3, v2 -; CHECK-LE-P9-NEXT: mtfprwz f0, r3 ; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-LE-P9-NEXT: xxspltw v3, v2, 3 ; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-LE-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 ; CHECK-LE-P9-NEXT: stxv v2, 0(r5) ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_none_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: xxsldwi vs0, v2, v2, 3 ; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-BE-P8-NEXT: xxspltw v3, v2, 0 ; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-BE-P8-NEXT: mffprwz r4, f0 -; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 -; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r5 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_none_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: li r3, 0 -; CHECK-BE-P9-NEXT: vextuwlx r3, r3, v2 -; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-BE-P9-NEXT: xxspltw v3, v2, 0 ; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-BE-P9-NEXT: xxperm vs0, v2, vs1 -; CHECK-BE-P9-NEXT: stxv vs0, 0(r5) +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: stxv v2, 0(r5) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_none_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, v2, v2, 3 ; CHECK-AIX-64-P8-NEXT: ld r4, L..C0(r2) # %const.0 -; CHECK-AIX-64-P8-NEXT: mffprwz r5, f0 -; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r5 -; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: xxspltw v3, v2, 0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_none_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: li r4, 0 -; CHECK-AIX-64-P9-NEXT: vextuwlx r4, r4, v2 -; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 ; CHECK-AIX-64-P9-NEXT: ld r4, L..C0(r2) # %const.0 -; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r4) -; CHECK-AIX-64-P9-NEXT: xxperm vs0, v2, vs1 -; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxspltw v3, v2, 0 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r4) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) ; 
CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_none_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lwz r4, L..C0(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 -; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r5 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r5 +; CHECK-AIX-32-P8-NEXT: xxspltw v3, v2, 0 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 ; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 @@ -294,13 +288,11 @@ ; ; CHECK-AIX-32-P9-LABEL: test_none_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: addi r4, r1, -16 -; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P9-NEXT: lwz r4, L..C0(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r4) -; CHECK-AIX-32-P9-NEXT: xxperm vs0, v2, vs1 -; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxspltw v3, v2, 0 +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r4) +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = extractelement <2 x i32> %vec, i64 0 @@ -317,23 +309,19 @@ ; CHECK-LE-P8-LABEL: test_v4i32_none: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI3_0@toc@ha -; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: xxspltw v3, v2, 3 ; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI3_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs1, 0, r3 -; CHECK-LE-P8-NEXT: mffprwz r3, f0 -; CHECK-LE-P8-NEXT: xxswapd v3, vs1 -; CHECK-LE-P8-NEXT: mtvsrwz v4, r3 -; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-LE-P8-NEXT: xxswapd vs0, v2 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r5 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_none: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: li r3, 0 -; CHECK-LE-P9-NEXT: vextuwrx r3, r3, v2 -; CHECK-LE-P9-NEXT: mtfprwz f0, r3 ; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-LE-P9-NEXT: xxspltw vs0, v2, 3 ; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l ; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) ; CHECK-LE-P9-NEXT: xxperm vs0, v2, vs1 @@ -342,22 +330,18 @@ ; ; CHECK-BE-P8-LABEL: test_v4i32_none: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: xxsldwi vs0, v2, v2, 3 ; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-BE-P8-NEXT: xxspltw v3, v2, 0 ; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI3_0@toc@l -; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-BE-P8-NEXT: mffprwz r4, f0 -; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 -; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r5 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_none: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: li r3, 0 -; CHECK-BE-P9-NEXT: vextuwlx r3, r3, v2 -; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-BE-P9-NEXT: xxspltw vs0, v2, 0 ; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l ; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) ; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 @@ -366,21 +350,17 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, v2, v2, 3 ; CHECK-AIX-64-P8-NEXT: ld r4, L..C1(r2) # %const.0 -; CHECK-AIX-64-P8-NEXT: mffprwz r5, f0 -; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r5 -; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-64-P8-NEXT: xxspltw v3, v2, 
0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: li r4, 0 -; CHECK-AIX-64-P9-NEXT: vextuwlx r4, r4, v2 -; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 ; CHECK-AIX-64-P9-NEXT: ld r4, L..C1(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: xxspltw vs0, v2, 0 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r4) ; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) @@ -389,9 +369,7 @@ ; CHECK-AIX-32-P8-LABEL: test_v4i32_none: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lwz r4, L..C1(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 -; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r5 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r5 +; CHECK-AIX-32-P8-NEXT: xxspltw v3, v2, 0 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 ; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 @@ -399,10 +377,8 @@ ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_none: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: addi r4, r1, -16 -; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P9-NEXT: lwz r4, L..C1(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: xxspltw vs0, v2, 0 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r4) ; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) @@ -421,101 +397,103 @@ define void @test_none_v2i64(ptr %ptr, i32 %v1, <2 x i32> %vec) local_unnamed_addr #0 { ; CHECK-LE-P8-LABEL: test_none_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha -; CHECK-LE-P8-NEXT: mtvsrwz v4, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI4_1@toc@ha -; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI4_0@toc@l -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI4_1@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 -; CHECK-LE-P8-NEXT: lxsdx v3, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v4, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 -; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: xxsldwi vs0, v2, v2, 1 +; CHECK-LE-P8-NEXT: lfdx f1, 0, r3 +; CHECK-LE-P8-NEXT: mffprwz r3, f0 +; CHECK-LE-P8-NEXT: mffprwz r5, f1 +; CHECK-LE-P8-NEXT: rldimi r3, r4, 32, 0 +; CHECK-LE-P8-NEXT: rldimi r4, r5, 32, 0 +; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxmrghd vs0, vs1, vs0 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; ; CHECK-LE-P9-LABEL: test_none_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lfd f0, 0(r3) -; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha -; CHECK-LE-P9-NEXT: mtfprwz f1, r4 -; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l -; CHECK-LE-P9-NEXT: xxinsertw v2, vs1, 12 -; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-LE-P9-NEXT: xxperm v2, vs0, vs1 -; CHECK-LE-P9-NEXT: stxv v2, 0(r3) +; CHECK-LE-P9-NEXT: li r3, 4 +; CHECK-LE-P9-NEXT: vextuwrx r3, r3, v2 +; CHECK-LE-P9-NEXT: mffprwz r5, f0 +; CHECK-LE-P9-NEXT: rldimi r3, r4, 32, 0 +; CHECK-LE-P9-NEXT: rldimi r4, r5, 32, 0 +; CHECK-LE-P9-NEXT: mtvsrdd vs0, r4, r3 +; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; ; CHECK-BE-P8-LABEL: test_none_v2i64: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI4_0@toc@ha -; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 -; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI4_1@toc@ha -; CHECK-BE-P8-NEXT: addi r5, r5, 
.LCPI4_0@toc@l -; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI4_1@toc@l -; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r5 -; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3 -; CHECK-BE-P8-NEXT: lxsdx v3, 0, r3 -; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r4 -; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 -; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-BE-P8-NEXT: lwz r3, 0(r3) +; CHECK-BE-P8-NEXT: mfvsrwz r5, v2 +; CHECK-BE-P8-NEXT: rldimi r3, r4, 32, 0 +; CHECK-BE-P8-NEXT: rldimi r4, r5, 32, 0 +; CHECK-BE-P8-NEXT: mtfprd f0, r3 +; CHECK-BE-P8-NEXT: mtfprd f1, r4 +; CHECK-BE-P8-NEXT: xxmrghd vs0, vs1, vs0 +; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 ; ; CHECK-BE-P9-LABEL: test_none_v2i64: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lfd f0, 0(r3) -; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha -; CHECK-BE-P9-NEXT: mtfprwz f1, r4 -; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l -; CHECK-BE-P9-NEXT: xxinsertw v2, vs1, 0 -; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-BE-P9-NEXT: xxperm vs0, v2, vs1 +; CHECK-BE-P9-NEXT: lwz r3, 0(r3) +; CHECK-BE-P9-NEXT: mfvsrwz r5, v2 +; CHECK-BE-P9-NEXT: rldimi r3, r4, 32, 0 +; CHECK-BE-P9-NEXT: rldimi r4, r5, 32, 0 +; CHECK-BE-P9-NEXT: mtvsrdd vs0, r4, r3 ; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) ; ; CHECK-AIX-64-P8-LABEL: test_none_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C2(r2) # %const.0 -; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r4 -; CHECK-AIX-64-P8-NEXT: ld r4, L..C3(r2) # %const.1 -; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5 -; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 -; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r3 -; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4 -; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lwz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: mfvsrwz r5, v2 +; CHECK-AIX-64-P8-NEXT: rldimi r3, r4, 32, 0 +; CHECK-AIX-64-P8-NEXT: rldimi r4, r5, 32, 0 +; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrghd vs0, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 ; ; CHECK-AIX-64-P9-LABEL: test_none_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3) -; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0 -; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 -; CHECK-AIX-64-P9-NEXT: xxinsertw v2, vs1, 0 -; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-64-P9-NEXT: xxperm vs0, v2, vs1 +; CHECK-AIX-64-P9-NEXT: lwz r3, 0(r3) +; CHECK-AIX-64-P9-NEXT: mfvsrwz r5, v2 +; CHECK-AIX-64-P9-NEXT: rldimi r3, r4, 32, 0 +; CHECK-AIX-64-P9-NEXT: rldimi r4, r5, 32, 0 +; CHECK-AIX-64-P9-NEXT: mtvsrdd vs0, r4, r3 ; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) ; ; CHECK-AIX-32-P8-LABEL: test_none_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, L..C2(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x v5, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.1 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 -; CHECK-AIX-32-P8-NEXT: vperm v2, v5, v2, v4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 -; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -16 +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r5 +; CHECK-AIX-32-P8-NEXT: stw r4, -48(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -48 +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -64 +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, -12(r1) +; 
CHECK-AIX-32-P8-NEXT: stw r3, -64(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r5 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs1, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs2, vs0 +; CHECK-AIX-32-P8-NEXT: xxmrghd vs0, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; ; CHECK-AIX-32-P9-LABEL: test_none_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: mtfprwz f1, r4 -; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs1, 0 -; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxperm vs0, v2, vs1 +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -48(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -12(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -64(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, -64(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs2, vs0 +; CHECK-AIX-32-P9-NEXT: xxmrghd vs0, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) entry: %0 = load <2 x i32>, ptr %ptr, align 4 @@ -591,74 +569,90 @@ define void @test_v8i16_v8i16(ptr %a) { ; CHECK-LE-P8-LABEL: test_v8i16_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lhz r4, 0(r3) -; CHECK-LE-P8-NEXT: lhz r3, 0(r3) -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: mtfprd f1, r3 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P8-NEXT: lhzx r4, 0, r3 +; CHECK-LE-P8-NEXT: lhzx r3, 0, r3 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-LE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-LE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P8-NEXT: vsplth v3, v3, 3 +; CHECK-LE-P8-NEXT: xxmrglw vs0, v3, v2 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 -; CHECK-LE-P9-NEXT: lxsihzx f1, 0, r3 -; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r3 +; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P9-NEXT: vsplth v3, v3, 3 +; CHECK-LE-P9-NEXT: xxmrglw vs0, v3, v2 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lhz r4, 0(r3) -; CHECK-BE-P8-NEXT: lhz r3, 0(r3) -; CHECK-BE-P8-NEXT: mtfprwz f0, r4 -; CHECK-BE-P8-NEXT: mtfprwz f1, r3 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P8-NEXT: lhzx r4, 0, r3 +; CHECK-BE-P8-NEXT: lhzx r3, 0, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P8-NEXT: vsplth v3, v3, 3 +; CHECK-BE-P8-NEXT: xxmrghw vs0, v2, v3 ; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsihzx f0, 0, r3 -; CHECK-BE-P9-NEXT: lxsihzx f1, 0, r3 -; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r3 +; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P9-NEXT: vsplth v3, v3, 3 +; CHECK-BE-P9-NEXT: xxmrghw vs0, v2, v3 ; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16: ; 
CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r4 -; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: lhzx r4, 0, r3 +; CHECK-AIX-64-P8-NEXT: lhzx r3, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, v2, v3 ; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: lxsihzx f1, 0, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r3 +; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P9-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, v2, v3 ; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: mtfprwz f0, r4 -; CHECK-AIX-32-P8-NEXT: mtfprwz f1, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: lhzx r4, 0, r3 +; CHECK-AIX-32-P8-NEXT: lhzx r3, 0, r3 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-32-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P8-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, v2, v3 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lxsihzx f1, 0, r3 -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r3 +; CHECK-AIX-32-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P9-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, v2, v3 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -678,12 +672,12 @@ define void @test_v8i16_v4i32(ptr %a) { ; CHECK-LE-P8-LABEL: test_v8i16_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lhz r4, 0(r3) +; CHECK-LE-P8-NEXT: lhzx r4, 0, r3 ; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, f0 -; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-LE-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-LE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, v2 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr @@ -691,8 +685,7 @@ ; CHECK-LE-P9-LABEL: test_v8i16_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 +; CHECK-LE-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 ; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, v2 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) @@ -700,20 +693,19 @@ ; ; CHECK-BE-P8-LABEL: test_v8i16_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lhzx r4, 0, r3 ; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P8-NEXT: lhz r4, 0(r3) -; CHECK-BE-P8-NEXT: sldi r3, r4, 48 -; CHECK-BE-P8-NEXT: xxsldwi vs0, f0, f0, 1 -; 
CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-BE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P8-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P9-NEXT: xxsldwi vs0, f0, f0, 1 +; CHECK-BE-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 ; CHECK-BE-P9-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) @@ -721,20 +713,19 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lhzx r4, 0, r3 ; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-64-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: xxsldwi vs0, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 ; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) @@ -742,27 +733,21 @@ ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: lhzx r4, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3 +; CHECK-AIX-32-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -781,12 +766,12 @@ define void @test_v8i16_v2i64(ptr %a) { ; CHECK-LE-P8-LABEL: test_v8i16_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lhz r4, 0(r3) -; CHECK-LE-P8-NEXT: lfdx f0, 0, r3 -; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, f0 -; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1 +; CHECK-LE-P8-NEXT: lhzx r4, 0, r3 +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P8-NEXT: 
mtvsrwz v2, r4 +; CHECK-LE-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-LE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, v2 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr @@ -794,8 +779,7 @@ ; CHECK-LE-P9-LABEL: test_v8i16_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfd f0, 0(r3) -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 +; CHECK-LE-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 ; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, v2 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) @@ -803,18 +787,19 @@ ; ; CHECK-BE-P8-LABEL: test_v8i16_v2i64: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lhz r4, 0(r3) -; CHECK-BE-P8-NEXT: lfdx f0, 0, r3 -; CHECK-BE-P8-NEXT: sldi r3, r4, 48 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-BE-P8-NEXT: lhzx r4, 0, r3 +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-BE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P8-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v2i64: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lfd f0, 0(r3) +; CHECK-BE-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 ; CHECK-BE-P9-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) @@ -822,18 +807,19 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: lhzx r4, 0, r3 +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-64-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3) +; CHECK-AIX-64-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 ; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) @@ -841,27 +827,21 @@ ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: lhzx r4, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv 
vs1, -16(r1) -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3 +; CHECK-AIX-32-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, v2, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -880,79 +860,76 @@ define <16 x i8> @test_v4i32_v4i32(ptr %a, ptr %b) { ; CHECK-LE-P8-LABEL: test_v4i32_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI9_0@toc@ha -; CHECK-LE-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r4 -; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI9_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 -; CHECK-LE-P8-NEXT: xxswapd v4, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r4 +; CHECK-LE-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-LE-P8-NEXT: xxspltd v2, vs0, 1 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI9_0@toc@ha -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI9_0@toc@l -; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-LE-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P9-NEXT: lfiwzx f1, 0, r4 +; CHECK-LE-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-LE-P9-NEXT: xxspltd v2, vs0, 1 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI9_0@toc@ha -; CHECK-BE-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r4 -; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI9_0@toc@l -; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r5 -; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P8-NEXT: lxsiwzx v2, 0, r4 +; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-BE-P8-NEXT: vmrgow v2, v3, v2 +; CHECK-BE-P8-NEXT: xxspltd v2, v2, 0 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI9_0@toc@ha ; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r4 -; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI9_0@toc@l -; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-BE-P9-NEXT: lxsiwzx v3, 0, r3 +; CHECK-BE-P9-NEXT: vmrgow v2, v3, v2 +; CHECK-BE-P9-NEXT: xxspltd v2, v2, 0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C4(r2) # %const.0 -; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 -; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: vmrgow v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, v2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r4 -; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-64-P9-NEXT: vmrgow v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, v2 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, L..C4(r2) # %const.0 -; 
CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: lwz r4, 0(r4) +; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, vs0 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C3(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r4 -; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r4) +; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, vs0 ; CHECK-AIX-32-P9-NEXT: blr entry: %load1 = load <4 x i8>, ptr %a @@ -965,12 +942,12 @@ define void @test_v4i32_v8i16(ptr %a) { ; CHECK-LE-P8-LABEL: test_v4i32_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lhz r4, 0(r3) +; CHECK-LE-P8-NEXT: lhzx r4, 0, r3 ; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, f0 -; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-LE-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-LE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P8-NEXT: xxmrglw vs0, v2, vs0 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr @@ -978,8 +955,7 @@ ; CHECK-LE-P9-LABEL: test_v4i32_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 +; CHECK-LE-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 ; CHECK-LE-P9-NEXT: xxmrglw vs0, v2, vs0 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) @@ -987,20 +963,19 @@ ; ; CHECK-BE-P8-LABEL: test_v4i32_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lhzx r4, 0, r3 ; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P8-NEXT: lhz r4, 0(r3) -; CHECK-BE-P8-NEXT: sldi r3, r4, 48 -; CHECK-BE-P8-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-BE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P9-NEXT: xxsldwi vs0, f0, f0, 1 +; CHECK-BE-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 ; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) @@ -1008,20 +983,19 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lhzx r4, 0, r3 ; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 -; 
CHECK-AIX-64-P8-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-64-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: xxsldwi vs0, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 ; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) @@ -1029,27 +1003,21 @@ ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: lhzx r4, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3 +; CHECK-AIX-32-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1068,59 +1036,51 @@ define void @test_v4i32_v2i64(ptr %a) { ; CHECK-LE-P8-LABEL: test_v4i32_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lfdx f0, 0, r3 +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd vs0, f0 -; CHECK-LE-P8-NEXT: xxswapd vs1, f1 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1 +; CHECK-LE-P8-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lfd f0, 0(r3) +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 ; CHECK-LE-P9-NEXT: lfiwzx f1, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 -; CHECK-LE-P9-NEXT: xxswapd vs1, f1 -; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, vs1 +; CHECK-LE-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v2i64: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lfiwzx f1, 0, r3 -; CHECK-BE-P8-NEXT: lfdx f0, 0, r3 -; CHECK-BE-P8-NEXT: xxsldwi vs1, f1, f1, 1 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0 -; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-BE-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-BE-P8-NEXT: vmrgow v2, v3, v2 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-BE-P8-NEXT: blr 
; ; CHECK-BE-P9-LABEL: test_v4i32_v2i64: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lfiwzx f1, 0, r3 -; CHECK-BE-P9-NEXT: lfd f0, 0(r3) -; CHECK-BE-P9-NEXT: xxsldwi vs1, f1, f1, 1 -; CHECK-BE-P9-NEXT: xxmrghw vs0, vs1, vs0 -; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) +; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsiwzx v3, 0, r3 +; CHECK-BE-P9-NEXT: vmrgow v2, v3, v2 +; CHECK-BE-P9-NEXT: stxv v2, 0(r3) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lfiwzx f1, 0, r3 -; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: xxsldwi vs1, f1, f1, 1 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0 -; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: vmrgow v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lfiwzx f1, 0, r3 -; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3) -; CHECK-AIX-64-P9-NEXT: xxsldwi vs1, f1, f1, 1 -; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs1, vs0 -; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-64-P9-NEXT: vmrgow v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: @@ -1212,32 +1172,38 @@ ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r4, 0(r3) ; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r3) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 -; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) ; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: stw r4, -48(r1) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r5, r1, -32 ; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -48 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: lfiwzx f2, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: xxspltw vs1, vs2, 1 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs2, 0, r5 ; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs1, vs1, vs2 +; CHECK-AIX-32-P8-NEXT: xxmrghd vs0, vs1, vs0 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lwz r4, 4(r3) -; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r3) +; CHECK-AIX-32-P9-NEXT: lwz r5, 0(r3) +; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r3) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, -48(r1) +; CHECK-AIX-32-P9-NEXT: stw r5, -32(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 -; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxv vs1, -48(r1) +; CHECK-AIX-32-P9-NEXT: lxv vs2, -32(r1) ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs1, vs2 +; CHECK-AIX-32-P9-NEXT: xxmrghd vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1251,59 +1217,51 @@ define void 
@test_v2i64_v4i32(ptr %a) { ; CHECK-LE-P8-LABEL: test_v2i64_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lfdx f0, 0, r3 +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd vs0, f0 -; CHECK-LE-P8-NEXT: xxswapd vs1, f1 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lfd f0, 0(r3) +; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 ; CHECK-LE-P9-NEXT: lfiwzx f1, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 -; CHECK-LE-P9-NEXT: xxswapd vs1, f1 -; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lfiwzx f1, 0, r3 -; CHECK-BE-P8-NEXT: lfdx f0, 0, r3 -; CHECK-BE-P8-NEXT: xxsldwi vs1, f1, f1, 1 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-BE-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-BE-P8-NEXT: vmrgow v2, v2, v3 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v2i64_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lfiwzx f1, 0, r3 -; CHECK-BE-P9-NEXT: lfd f0, 0(r3) -; CHECK-BE-P9-NEXT: xxsldwi vs1, f1, f1, 1 -; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) +; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsiwzx v3, 0, r3 +; CHECK-BE-P9-NEXT: vmrgow v2, v3, v2 +; CHECK-BE-P9-NEXT: stxv v2, 0(r3) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lfiwzx f1, 0, r3 -; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: xxsldwi vs1, f1, f1, 1 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: vmrgow v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v2i64_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lfiwzx f1, 0, r3 -; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3) -; CHECK-AIX-64-P9-NEXT: xxsldwi vs1, f1, f1, 1 -; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-64-P9-NEXT: vmrgow v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: @@ -1346,12 +1304,12 @@ define void @test_v2i64_v8i16(ptr %a) { ; CHECK-LE-P8-LABEL: test_v2i64_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lhz r4, 0(r3) -; CHECK-LE-P8-NEXT: lfdx f0, 0, r3 -; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, f0 -; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P8-NEXT: lhzx r4, 0, r3 +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-LE-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-LE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-LE-P8-NEXT: xxmrglw vs0, v2, vs0 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr @@ -1359,8 +1317,7 @@ ; 
CHECK-LE-P9-LABEL: test_v2i64_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfd f0, 0(r3) -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 +; CHECK-LE-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 ; CHECK-LE-P9-NEXT: xxmrglw vs0, v2, vs0 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) @@ -1368,18 +1325,19 @@ ; ; CHECK-BE-P8-LABEL: test_v2i64_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lhz r4, 0(r3) -; CHECK-BE-P8-NEXT: lfdx f0, 0, r3 -; CHECK-BE-P8-NEXT: sldi r3, r4, 48 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P8-NEXT: lhzx r4, 0, r3 +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-BE-P8-NEXT: vsplth v2, v2, 3 +; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v2i64_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lfd f0, 0(r3) +; CHECK-BE-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 ; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) @@ -1387,18 +1345,19 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-64-P8-NEXT: lfdx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: lhzx r4, 0, r3 +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-64-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3) +; CHECK-AIX-64-P9-NEXT: lxvwsx vs0, 0, r3 ; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 ; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) @@ -1406,27 +1365,21 @@ ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: lhzx r4, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 +; CHECK-AIX-32-P8-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, v2 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r3 +; CHECK-AIX-32-P9-NEXT: vsplth v2, v2, 3 +; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, v2 ; 
CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll --- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll @@ -29,83 +29,91 @@ ; CHECK-LE-P8-LABEL: test_none_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha -; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: lhzx r3, 0, r3 ; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: mtvsrd v4, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-LE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-LE-P8-NEXT: vsplth v3, v3, 3 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-LE-P8-NEXT: xxswapd vs0, v2 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r3 ; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha ; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l -; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-LE-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-LE-P9-NEXT: lxv vs0, 0(r3) +; CHECK-LE-P9-NEXT: vsplth v3, v3, 3 +; CHECK-LE-P9-NEXT: xxperm v2, v3, vs0 ; CHECK-LE-P9-NEXT: stxv v2, 0(r3) ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_none_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lhzx r3, 0, r3 ; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha -; CHECK-BE-P8-NEXT: lhz r3, 0(r3) -; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l -; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 -; CHECK-BE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: addi r3, r4, .LCPI0_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vsplth v3, v3, 3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_none_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r3 ; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha ; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l -; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-BE-P9-NEXT: xxperm vs0, v2, vs1 -; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) +; CHECK-BE-P9-NEXT: lxv vs0, 0(r3) +; CHECK-BE-P9-NEXT: vsplth v3, v3, 3 +; CHECK-BE-P9-NEXT: xxperm v3, v2, vs0 +; CHECK-BE-P9-NEXT: stxv v3, 0(r3) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_none_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: lhzx r3, 0, r3 ; CHECK-AIX-64-P8-NEXT: ld r4, L..C0(r2) # %const.0 -; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 -; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-64-P8-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_none_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r3 ; CHECK-AIX-64-P9-NEXT: ld r3, L..C0(r2) # %const.0 -; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-64-P9-NEXT: xxperm vs0, v2, vs1 -; 
CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r3) +; CHECK-AIX-64-P9-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-64-P9-NEXT: xxperm v3, v2, vs0 +; CHECK-AIX-64-P9-NEXT: stxv v3, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_none_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lhzx r3, 0, r3 ; CHECK-AIX-32-P8-NEXT: lwz r4, L..C0(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_none_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r3 ; CHECK-AIX-32-P9-NEXT: lwz r3, L..C0(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxperm vs0, v2, vs1 -; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-32-P9-NEXT: lxv vs0, 0(r3) +; CHECK-AIX-32-P9-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-32-P9-NEXT: xxperm v3, v2, vs0 +; CHECK-AIX-32-P9-NEXT: stxv v3, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: %load0.tmp = load <2 x i8>, ptr %a0 @@ -123,7 +131,7 @@ ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI1_0@toc@ha ; CHECK-LE-P8-NEXT: mtvsrd v4, r9 -; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: lhzx r3, 0, r3 ; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI1_1@toc@ha @@ -131,50 +139,54 @@ ; CHECK-LE-P8-NEXT: xxswapd v3, vs0 ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: mtvsrd v4, r3 -; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-LE-P8-NEXT: xxswapd v4, vs0 +; CHECK-LE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-LE-P8-NEXT: vsplth v3, v3, 3 +; CHECK-LE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-LE-P8-NEXT: xxswapd vs0, v2 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_none: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r3 ; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha -; CHECK-LE-P9-NEXT: mtvsrwz v3, r9 +; CHECK-LE-P9-NEXT: mtvsrwz v4, r9 ; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l -; CHECK-LE-P9-NEXT: vinsertb v2, v3, 15 -; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-LE-P9-NEXT: xxperm vs0, v2, vs1 -; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) +; CHECK-LE-P9-NEXT: vinsertb v2, v4, 15 +; CHECK-LE-P9-NEXT: lxv vs0, 0(r3) +; CHECK-LE-P9-NEXT: vsplth v3, v3, 3 +; CHECK-LE-P9-NEXT: xxperm v3, v2, vs0 +; CHECK-LE-P9-NEXT: stxv v3, 0(r3) ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_none: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI1_0@toc@ha ; CHECK-BE-P8-NEXT: mtvsrwz v4, r9 -; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: lhzx r3, 0, r3 ; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI1_0@toc@l ; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI1_1@toc@ha -; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI1_1@toc@l -; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3 -; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 ; CHECK-BE-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; 
CHECK-BE-P8-NEXT: addis r3, r2, .LCPI1_1@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI1_1@toc@l +; CHECK-BE-P8-NEXT: vsplth v3, v3, 3 +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_none: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-BE-P9-NEXT: lxsihzx v3, 0, r3 ; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha -; CHECK-BE-P9-NEXT: mtvsrwz v3, r9 +; CHECK-BE-P9-NEXT: mtvsrwz v4, r9 ; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l -; CHECK-BE-P9-NEXT: vinsertb v2, v3, 0 -; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-BE-P9-NEXT: vinsertb v2, v4, 0 +; CHECK-BE-P9-NEXT: lxv vs0, 0(r3) +; CHECK-BE-P9-NEXT: vsplth v3, v3, 3 +; CHECK-BE-P9-NEXT: xxperm v2, v3, vs0 ; CHECK-BE-P9-NEXT: stxv v2, 0(r3) ; CHECK-BE-P9-NEXT: blr ; @@ -182,24 +194,26 @@ ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: ld r4, L..C1(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r5 -; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: lhzx r3, 0, r3 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: ld r4, L..C2(r2) # %const.1 ; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 -; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 -; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P8-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsihzx v3, 0, r3 ; CHECK-AIX-64-P9-NEXT: ld r3, L..C1(r2) # %const.0 -; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r5 -; CHECK-AIX-64-P9-NEXT: vinsertb v2, v3, 0 -; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v4, r5 +; CHECK-AIX-64-P9-NEXT: vinsertb v2, v4, 0 +; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r3) +; CHECK-AIX-64-P9-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-64-P9-NEXT: xxperm v2, v3, vs0 ; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; @@ -207,24 +221,26 @@ ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lwz r4, L..C1(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r5 -; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: lhzx r3, 0, r3 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 ; CHECK-AIX-32-P8-NEXT: lwz r4, L..C2(r2) # %const.1 ; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-AIX-32-P8-NEXT: mtvsrwz v4, r3 -; CHECK-AIX-32-P8-NEXT: vperm v2, v4, v2, v3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-32-P8-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_none: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxsihzx v3, 0, r3 ; CHECK-AIX-32-P9-NEXT: lwz r3, L..C1(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: mtvsrwz v3, r5 -; CHECK-AIX-32-P9-NEXT: vinsertb v2, v3, 0 -; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: mtvsrwz v4, r5 +; CHECK-AIX-32-P9-NEXT: vinsertb v2, v4, 0 +; 
CHECK-AIX-32-P9-NEXT: lxv vs0, 0(r3) +; CHECK-AIX-32-P9-NEXT: vsplth v3, v3, 3 +; CHECK-AIX-32-P9-NEXT: xxperm v2, v3, vs0 ; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -556,8 +572,9 @@ ; CHECK-AIX-32-P8-LABEL: test_none_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lwz r5, L..C5(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 ; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 ; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-32-P8-NEXT: xxlxor v3, v3, v3 @@ -567,7 +584,7 @@ ; ; CHECK-AIX-32-P9-LABEL: test_none_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lxvwsx v2, 0, r3 ; CHECK-AIX-32-P9-NEXT: lwz r3, L..C4(r2) # %const.0 ; CHECK-AIX-32-P9-NEXT: lxv vs0, 0(r4) ; CHECK-AIX-32-P9-NEXT: xxlxor v3, v3, v3 @@ -791,82 +808,83 @@ ; CHECK-LE-P8-LABEL: test_v8i16_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: lhz r3, 0(r3) -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-LE-P8-NEXT: mtfprd f1, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, f0 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 -; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: lhz r4, 0(r4) +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: mtvsrd v3, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI7_0@toc@ha +; CHECK-BE-P8-NEXT: lhz r4, 0(r4) ; CHECK-BE-P8-NEXT: lhz r3, 0(r3) -; CHECK-BE-P8-NEXT: sldi r3, r3, 48 -; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI7_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-BE-P9-NEXT: xxsldwi v3, f0, f0, 1 -; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: addis r5, r2, .LCPI7_0@toc@ha +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r4 +; CHECK-BE-P9-NEXT: lxsihzx f1, 0, r3 +; CHECK-BE-P9-NEXT: addi r5, r5, .LCPI7_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs0, 0(r5) +; CHECK-BE-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C9(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r4) ; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, 
v4, v3, v2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-64-P9-NEXT: xxsldwi v3, f0, f0, 1 -; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: ld r5, L..C8(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxsihzx f1, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r5) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) ; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) ; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) -; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <2 x i8>, ptr %a @@ -885,78 +903,83 @@ ; CHECK-LE-P8-LABEL: test_v8i16_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: lhz r3, 0(r3) -; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 -; CHECK-LE-P8-NEXT: mtfprd f1, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, f0 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 -; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: lhz r4, 0(r4) +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: mtvsrd v3, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v2i64: ; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI8_0@toc@ha +; CHECK-BE-P8-NEXT: lhz r4, 0(r4) ; CHECK-BE-P8-NEXT: lhz r3, 0(r3) -; CHECK-BE-P8-NEXT: lxsdx v2, 0, r4 -; CHECK-BE-P8-NEXT: sldi r3, r3, 48 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI8_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v2i64: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) -; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 -; 
CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: addis r5, r2, .LCPI8_0@toc@ha +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r4 +; CHECK-BE-P9-NEXT: lxsihzx f1, 0, r3 +; CHECK-BE-P9-NEXT: addi r5, r5, .LCPI8_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs0, 0(r5) +; CHECK-BE-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C10(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r4) ; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r4 -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) -; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: ld r5, L..C9(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxsihzx f1, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r5) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) ; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) ; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) -; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <2 x i8>, ptr %a @@ -1035,7 +1058,7 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C9(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C11(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 @@ -1048,7 +1071,7 @@ ; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C8(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C10(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) @@ -1092,82 +1115,83 @@ define <16 x i8> @test_v4i32_v8i16(ptr %a, ptr %b) local_unnamed_addr { 
; CHECK-LE-P8-LABEL: test_v4i32_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lhz r4, 0(r4) ; CHECK-LE-P8-NEXT: lhz r3, 0(r3) -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-LE-P8-NEXT: mtfprd f1, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, f0 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 -; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: mtvsrd v2, r4 +; CHECK-LE-P8-NEXT: mtvsrd v3, r3 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r4 +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r3 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI10_0@toc@ha ; CHECK-BE-P8-NEXT: lhz r3, 0(r3) -; CHECK-BE-P8-NEXT: sldi r3, r3, 48 -; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: lhz r4, 0(r4) +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI10_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 +; CHECK-BE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: addis r5, r2, .LCPI10_0@toc@ha ; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-BE-P9-NEXT: xxsldwi v3, f0, f0, 1 -; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: lxsihzx f1, 0, r4 +; CHECK-BE-P9-NEXT: addi r5, r5, .LCPI10_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs0, 0(r5) +; CHECK-BE-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C12(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: ld r5, L..C11(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-64-P9-NEXT: xxsldwi v3, f0, f0, 1 -; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: lxsihzx f1, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r5) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth 
r4, -32(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 ; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1186,70 +1210,84 @@ define <16 x i8> @test_v4i32_v2i64(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-LE-P8-LABEL: test_v4i32_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, f0 -; CHECK-LE-P8-NEXT: xxswapd v3, f1 -; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: lhz r4, 0(r4) +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: mtvsrd v3, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, f0 -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r4 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v2i64: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI11_0@toc@ha +; CHECK-BE-P8-NEXT: lhz r4, 0(r4) +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI11_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r3 +; CHECK-BE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v2i64: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) -; CHECK-BE-P9-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: addis r5, r2, .LCPI11_0@toc@ha +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r4 +; CHECK-BE-P9-NEXT: lxsihzx f1, 0, r3 +; CHECK-BE-P9-NEXT: addi r5, r5, .LCPI11_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs0, 0(r5) +; CHECK-BE-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C13(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: 
test_v4i32_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) -; CHECK-AIX-64-P9-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: ld r5, L..C12(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxsihzx f1, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r5) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, L..C9(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 -; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C8(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r4 -; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <2 x i8>, ptr %a, align 4 @@ -1328,7 +1366,7 @@ ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C10(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C14(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 @@ -1341,7 +1379,7 @@ ; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) -; CHECK-AIX-64-P9-NEXT: ld r3, L..C9(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C13(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r4) ; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) @@ -1352,9 +1390,11 @@ ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, L..C10(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: lwz r5, L..C9(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfiwzx f1, 0, r4 +; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 +; CHECK-AIX-32-P8-NEXT: xxspltw v3, vs1, 1 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 ; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-32-P8-NEXT: xxlxor v3, v3, v3 @@ -1364,9 +1404,9 @@ ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C9(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxvwsx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C8(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxvwsx vs0, 0, r4 ; 
CHECK-AIX-32-P9-NEXT: xxlxor v3, v3, v3 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 @@ -1385,70 +1425,84 @@ define <16 x i8> @test_v2i64_v4i32(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-LE-P8-LABEL: test_v2i64_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, f0 -; CHECK-LE-P8-NEXT: xxswapd v3, f1 -; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: lhz r4, 0(r4) +; CHECK-LE-P8-NEXT: lhz r3, 0(r3) +; CHECK-LE-P8-NEXT: mtvsrd v2, r4 +; CHECK-LE-P8-NEXT: mtvsrd v3, r3 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, f0 -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r4 +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r3 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI13_0@toc@ha +; CHECK-BE-P8-NEXT: lhz r3, 0(r3) +; CHECK-BE-P8-NEXT: lhz r4, 0(r4) +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI13_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 +; CHECK-BE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v2i64_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) -; CHECK-BE-P9-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: addis r5, r2, .LCPI13_0@toc@ha +; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: lxsihzx f1, 0, r4 +; CHECK-BE-P9-NEXT: addi r5, r5, .LCPI13_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs0, 0(r5) +; CHECK-BE-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C15(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v2i64_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) -; CHECK-AIX-64-P9-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: ld r5, L..C14(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: lxsihzx f1, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r5) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lwz r5, L..C11(r2) # %const.0 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r5 -; CHECK-AIX-32-P8-NEXT: vperm 
v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C10(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) +; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <2 x i8>, ptr %a, align 4 @@ -1466,78 +1520,83 @@ define <16 x i8> @test_v2i64_v8i16(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-LE-P8-LABEL: test_v2i64_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: lhz r4, 0(r4) ; CHECK-LE-P8-NEXT: lhz r3, 0(r3) -; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 -; CHECK-LE-P8-NEXT: mtfprd f1, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, f0 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 -; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: mtvsrd v2, r4 +; CHECK-LE-P8-NEXT: mtvsrd v3, r3 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r4 +; CHECK-LE-P9-NEXT: lxsihzx v3, 0, r3 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI14_0@toc@ha ; CHECK-BE-P8-NEXT: lhz r3, 0(r3) -; CHECK-BE-P8-NEXT: lxsdx v2, 0, r4 -; CHECK-BE-P8-NEXT: sldi r3, r3, 48 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: lhz r4, 0(r4) +; CHECK-BE-P8-NEXT: addi r5, r5, .LCPI14_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: mtvsrwz v4, r4 +; CHECK-BE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v2i64_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: addis r5, r2, .LCPI14_0@toc@ha ; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) -; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: lxsihzx f1, 0, r4 +; CHECK-BE-P9-NEXT: addi r5, r5, .LCPI14_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs0, 0(r5) +; CHECK-BE-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C16(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r4 -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r5 +; CHECK-AIX-64-P8-NEXT: 
mtvsrwz v4, r4 +; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: ld r5, L..C15(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) -; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: lxsihzx f1, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r5) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs1, vs0 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 -; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r4 -; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P8-NEXT: sth r4, -32(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: lxvw4x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r4 ; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r4) +; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: diff --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll --- a/llvm/test/CodeGen/RISCV/add-before-shl.ll +++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll @@ -60,11 +60,10 @@ ; ; RV64I-LABEL: add_large_const: ; RV64I: # %bb.0: -; RV64I-NEXT: slli a0, a0, 48 -; RV64I-NEXT: lui a1, 4095 -; RV64I-NEXT: slli a1, a1, 36 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: lui a1, 65520 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: sraiw a0, a0, 16 ; RV64I-NEXT: jalr zero, 0(ra) ; ; RV32C-LABEL: add_large_const: @@ -77,11 +76,10 @@ ; ; RV64C-LABEL: add_large_const: ; RV64C: # %bb.0: -; RV64C-NEXT: c.lui a1, 1 -; RV64C-NEXT: c.addiw a1, -1 +; RV64C-NEXT: c.slli a0, 16 +; RV64C-NEXT: lui a1, 65520 ; RV64C-NEXT: c.add a0, a1 -; RV64C-NEXT: c.slli a0, 48 -; RV64C-NEXT: c.srai a0, 48 +; RV64C-NEXT: sraiw a0, a0, 16 ; RV64C-NEXT: c.jr ra %1 = add i32 %a, 4095 %2 = shl i32 %1, 16 @@ -100,11 +98,10 @@ ; ; RV64I-LABEL: add_huge_const: ; RV64I: # %bb.0: -; RV64I-NEXT: slli a0, a0, 48 -; RV64I-NEXT: lui a1, 32767 -; RV64I-NEXT: slli a1, a1, 36 +; RV64I-NEXT: slli a0, a0, 16 +; RV64I-NEXT: lui a1, 524272 ; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: sraiw a0, a0, 16 ; RV64I-NEXT: jalr zero, 0(ra) ; ; RV32C-LABEL: add_huge_const: @@ -117,11 +114,10 @@ ; ; RV64C-LABEL: add_huge_const: ; RV64C: # %bb.0: -; RV64C-NEXT: c.lui a1, 8 -; RV64C-NEXT: c.addiw a1, -1 +; RV64C-NEXT: c.slli a0, 16 +; RV64C-NEXT: lui a1, 524272 ; RV64C-NEXT: c.add a0, a1 -; RV64C-NEXT: c.slli a0, 48 -; RV64C-NEXT: c.srai a0, 48 +; RV64C-NEXT: sraiw a0, a0, 16 ; RV64C-NEXT: c.jr ra %1 = add i32 %a, 32767 %2 = 
shl i32 %1, 16 diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll --- a/llvm/test/CodeGen/RISCV/alu64.ll +++ b/llvm/test/CodeGen/RISCV/alu64.ll @@ -58,7 +58,8 @@ ; RV32I-LABEL: sltiu: ; RV32I: # %bb.0: ; RV32I-NEXT: sltiu a0, a0, 3 -; RV32I-NEXT: seqz a1, a1 +; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: addi a1, a1, -1 ; RV32I-NEXT: and a0, a1, a0 ; RV32I-NEXT: li a1, 0 ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/bitreverse-shift.ll b/llvm/test/CodeGen/RISCV/bitreverse-shift.ll --- a/llvm/test/CodeGen/RISCV/bitreverse-shift.ll +++ b/llvm/test/CodeGen/RISCV/bitreverse-shift.ll @@ -120,8 +120,8 @@ ; RV32ZBKB: # %bb.0: ; RV32ZBKB-NEXT: rev8 a0, a0 ; RV32ZBKB-NEXT: brev8 a0, a0 -; RV32ZBKB-NEXT: srli a0, a0, 24 -; RV32ZBKB-NEXT: slli a0, a0, 3 +; RV32ZBKB-NEXT: srli a0, a0, 21 +; RV32ZBKB-NEXT: andi a0, a0, 248 ; RV32ZBKB-NEXT: rev8 a0, a0 ; RV32ZBKB-NEXT: brev8 a0, a0 ; RV32ZBKB-NEXT: srli a0, a0, 24 @@ -131,8 +131,8 @@ ; RV64ZBKB: # %bb.0: ; RV64ZBKB-NEXT: rev8 a0, a0 ; RV64ZBKB-NEXT: brev8 a0, a0 -; RV64ZBKB-NEXT: srli a0, a0, 56 -; RV64ZBKB-NEXT: slli a0, a0, 3 +; RV64ZBKB-NEXT: srli a0, a0, 53 +; RV64ZBKB-NEXT: andi a0, a0, 248 ; RV64ZBKB-NEXT: rev8 a0, a0 ; RV64ZBKB-NEXT: brev8 a0, a0 ; RV64ZBKB-NEXT: srli a0, a0, 56 @@ -148,8 +148,10 @@ ; RV32ZBKB: # %bb.0: ; RV32ZBKB-NEXT: rev8 a0, a0 ; RV32ZBKB-NEXT: brev8 a0, a0 -; RV32ZBKB-NEXT: srli a0, a0, 16 -; RV32ZBKB-NEXT: slli a0, a0, 7 +; RV32ZBKB-NEXT: srli a0, a0, 9 +; RV32ZBKB-NEXT: lui a1, 16 +; RV32ZBKB-NEXT: addi a1, a1, -128 +; RV32ZBKB-NEXT: and a0, a0, a1 ; RV32ZBKB-NEXT: rev8 a0, a0 ; RV32ZBKB-NEXT: brev8 a0, a0 ; RV32ZBKB-NEXT: srli a0, a0, 16 @@ -159,8 +161,10 @@ ; RV64ZBKB: # %bb.0: ; RV64ZBKB-NEXT: rev8 a0, a0 ; RV64ZBKB-NEXT: brev8 a0, a0 -; RV64ZBKB-NEXT: srli a0, a0, 48 -; RV64ZBKB-NEXT: slli a0, a0, 7 +; RV64ZBKB-NEXT: srli a0, a0, 41 +; RV64ZBKB-NEXT: lui a1, 16 +; RV64ZBKB-NEXT: addiw a1, a1, -128 +; RV64ZBKB-NEXT: and a0, a0, a1 ; RV64ZBKB-NEXT: rev8 a0, a0 ; RV64ZBKB-NEXT: brev8 a0, a0 ; RV64ZBKB-NEXT: srli a0, a0, 48 @@ -185,7 +189,8 @@ ; RV64ZBKB: # %bb.0: ; RV64ZBKB-NEXT: rev8 a0, a0 ; RV64ZBKB-NEXT: brev8 a0, a0 -; RV64ZBKB-NEXT: srli a0, a0, 32 +; RV64ZBKB-NEXT: srli a0, a0, 17 +; RV64ZBKB-NEXT: srliw a0, a0, 15 ; RV64ZBKB-NEXT: slli a0, a0, 15 ; RV64ZBKB-NEXT: rev8 a0, a0 ; RV64ZBKB-NEXT: brev8 a0, a0 diff --git a/llvm/test/CodeGen/RISCV/bittest.ll b/llvm/test/CodeGen/RISCV/bittest.ll --- a/llvm/test/CodeGen/RISCV/bittest.ll +++ b/llvm/test/CodeGen/RISCV/bittest.ll @@ -1033,9 +1033,9 @@ ; ; RV64-LABEL: bit_31_nz_select_i64: ; RV64: # %bb.0: -; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: srliw a3, a0, 31 ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: bltz a3, .LBB30_2 +; RV64-NEXT: bnez a3, .LBB30_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a0, a2 ; RV64-NEXT: .LBB30_2: @@ -1402,7 +1402,8 @@ define void @bit_31_z_branch_i32(i32 signext %0) { ; RV32-LABEL: bit_31_z_branch_i32: ; RV32: # %bb.0: -; RV32-NEXT: bltz a0, .LBB43_2 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: bnez a0, .LBB43_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: tail bar@plt ; RV32-NEXT: .LBB43_2: @@ -1624,7 +1625,8 @@ define void @bit_31_z_branch_i64(i64 %0) { ; RV32-LABEL: bit_31_z_branch_i64: ; RV32: # %bb.0: -; RV32-NEXT: bltz a0, .LBB51_2 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: bnez a0, .LBB51_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: tail bar@plt ; RV32-NEXT: .LBB51_2: @@ -1802,7 +1804,8 @@ define void @bit_63_z_branch_i64(i64 %0) { ; RV32-LABEL: bit_63_z_branch_i64: ; RV32: # %bb.0: -; RV32-NEXT: bltz a1, .LBB57_2 +; RV32-NEXT: srli 
a1, a1, 31 +; RV32-NEXT: bnez a1, .LBB57_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: tail bar@plt ; RV32-NEXT: .LBB57_2: @@ -1810,7 +1813,8 @@ ; ; RV64-LABEL: bit_63_z_branch_i64: ; RV64: # %bb.0: -; RV64-NEXT: bltz a0, .LBB57_2 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: bnez a0, .LBB57_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: tail bar@plt ; RV64-NEXT: .LBB57_2: diff --git a/llvm/test/CodeGen/RISCV/bswap-shift.ll b/llvm/test/CodeGen/RISCV/bswap-shift.ll --- a/llvm/test/CodeGen/RISCV/bswap-shift.ll +++ b/llvm/test/CodeGen/RISCV/bswap-shift.ll @@ -41,12 +41,16 @@ define i16 @test_bswap_srli_8_bswap_i16(i16 %a) nounwind { ; RV32ZB-LABEL: test_bswap_srli_8_bswap_i16: ; RV32ZB: # %bb.0: -; RV32ZB-NEXT: slli a0, a0, 8 +; RV32ZB-NEXT: andi a0, a0, 255 +; RV32ZB-NEXT: rev8 a0, a0 +; RV32ZB-NEXT: srli a0, a0, 16 ; RV32ZB-NEXT: ret ; ; RV64ZB-LABEL: test_bswap_srli_8_bswap_i16: ; RV64ZB: # %bb.0: -; RV64ZB-NEXT: slli a0, a0, 8 +; RV64ZB-NEXT: andi a0, a0, 255 +; RV64ZB-NEXT: rev8 a0, a0 +; RV64ZB-NEXT: srli a0, a0, 48 ; RV64ZB-NEXT: ret %1 = call i16 @llvm.bswap.i16(i16 %a) %2 = lshr i16 %1, 8 @@ -89,12 +93,15 @@ define i32 @test_bswap_srli_24_bswap_i32(i32 %a) nounwind { ; RV32ZB-LABEL: test_bswap_srli_24_bswap_i32: ; RV32ZB: # %bb.0: -; RV32ZB-NEXT: slli a0, a0, 24 +; RV32ZB-NEXT: andi a0, a0, 255 +; RV32ZB-NEXT: rev8 a0, a0 ; RV32ZB-NEXT: ret ; ; RV64ZB-LABEL: test_bswap_srli_24_bswap_i32: ; RV64ZB: # %bb.0: -; RV64ZB-NEXT: slliw a0, a0, 24 +; RV64ZB-NEXT: andi a0, a0, 255 +; RV64ZB-NEXT: rev8 a0, a0 +; RV64ZB-NEXT: srli a0, a0, 32 ; RV64ZB-NEXT: ret %1 = call i32 @llvm.bswap.i32(i32 %a) %2 = lshr i32 %1, 24 @@ -146,14 +153,16 @@ define i16 @test_bswap_shli_8_bswap_i16(i16 %a) nounwind { ; RV32ZB-LABEL: test_bswap_shli_8_bswap_i16: ; RV32ZB: # %bb.0: -; RV32ZB-NEXT: slli a0, a0, 16 -; RV32ZB-NEXT: srli a0, a0, 24 +; RV32ZB-NEXT: andi a0, a0, -256 +; RV32ZB-NEXT: rev8 a0, a0 +; RV32ZB-NEXT: srli a0, a0, 16 ; RV32ZB-NEXT: ret ; ; RV64ZB-LABEL: test_bswap_shli_8_bswap_i16: ; RV64ZB: # %bb.0: -; RV64ZB-NEXT: slli a0, a0, 48 -; RV64ZB-NEXT: srli a0, a0, 56 +; RV64ZB-NEXT: andi a0, a0, -256 +; RV64ZB-NEXT: rev8 a0, a0 +; RV64ZB-NEXT: srli a0, a0, 48 ; RV64ZB-NEXT: ret %1 = call i16 @llvm.bswap.i16(i16 %a) %2 = shl i16 %1, 8 @@ -196,12 +205,17 @@ define i32 @test_bswap_shli_24_bswap_i32(i32 %a) nounwind { ; RV32ZB-LABEL: test_bswap_shli_24_bswap_i32: ; RV32ZB: # %bb.0: -; RV32ZB-NEXT: srli a0, a0, 24 +; RV32ZB-NEXT: lui a1, 1044480 +; RV32ZB-NEXT: and a0, a0, a1 +; RV32ZB-NEXT: rev8 a0, a0 ; RV32ZB-NEXT: ret ; ; RV64ZB-LABEL: test_bswap_shli_24_bswap_i32: ; RV64ZB: # %bb.0: -; RV64ZB-NEXT: srliw a0, a0, 24 +; RV64ZB-NEXT: lui a1, 1044480 +; RV64ZB-NEXT: and a0, a0, a1 +; RV64ZB-NEXT: rev8 a0, a0 +; RV64ZB-NEXT: srli a0, a0, 32 ; RV64ZB-NEXT: ret %1 = call i32 @llvm.bswap.i32(i32 %a) %2 = shl i32 %1, 24 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll @@ -560,10 +560,10 @@ ; RV32I-FPELIM-NEXT: sw a2, 32(sp) ; RV32I-FPELIM-NEXT: li a3, 4 ; RV32I-FPELIM-NEXT: sw a3, 36(sp) -; RV32I-FPELIM-NEXT: sw a0, 8(sp) -; RV32I-FPELIM-NEXT: sw a1, 12(sp) -; RV32I-FPELIM-NEXT: sw a2, 16(sp) ; RV32I-FPELIM-NEXT: sw a3, 20(sp) +; RV32I-FPELIM-NEXT: sw a2, 16(sp) +; RV32I-FPELIM-NEXT: sw a1, 12(sp) +; RV32I-FPELIM-NEXT: sw a0, 8(sp) ; RV32I-FPELIM-NEXT: addi a0, sp, 8 ; RV32I-FPELIM-NEXT: 
call callee_large_struct@plt ; RV32I-FPELIM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload @@ -584,10 +584,10 @@ ; RV32I-WITHFP-NEXT: sw a2, -16(s0) ; RV32I-WITHFP-NEXT: li a3, 4 ; RV32I-WITHFP-NEXT: sw a3, -12(s0) -; RV32I-WITHFP-NEXT: sw a0, -40(s0) -; RV32I-WITHFP-NEXT: sw a1, -36(s0) -; RV32I-WITHFP-NEXT: sw a2, -32(s0) ; RV32I-WITHFP-NEXT: sw a3, -28(s0) +; RV32I-WITHFP-NEXT: sw a2, -32(s0) +; RV32I-WITHFP-NEXT: sw a1, -36(s0) +; RV32I-WITHFP-NEXT: sw a0, -40(s0) ; RV32I-WITHFP-NEXT: addi a0, s0, -40 ; RV32I-WITHFP-NEXT: call callee_large_struct@plt ; RV32I-WITHFP-NEXT: lw ra, 44(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll --- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll @@ -300,10 +300,10 @@ ; RV64I-NEXT: sd a2, 56(sp) ; RV64I-NEXT: li a3, 4 ; RV64I-NEXT: sd a3, 64(sp) -; RV64I-NEXT: sd a0, 8(sp) -; RV64I-NEXT: sd a1, 16(sp) -; RV64I-NEXT: sd a2, 24(sp) ; RV64I-NEXT: sd a3, 32(sp) +; RV64I-NEXT: sd a2, 24(sp) +; RV64I-NEXT: sd a1, 16(sp) +; RV64I-NEXT: sd a0, 8(sp) ; RV64I-NEXT: addi a0, sp, 8 ; RV64I-NEXT: call callee_large_struct@plt ; RV64I-NEXT: ld ra, 72(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -145,10 +145,12 @@ ; RV32_NOZBB-NEXT: add a0, a2, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 -; RV32_NOZBB-NEXT: andi a1, a0, 15 -; RV32_NOZBB-NEXT: slli a0, a0, 20 -; RV32_NOZBB-NEXT: srli a0, a0, 28 -; RV32_NOZBB-NEXT: add a0, a1, a0 +; RV32_NOZBB-NEXT: lui a1, 1 +; RV32_NOZBB-NEXT: addi a1, a1, -241 +; RV32_NOZBB-NEXT: and a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 8 +; RV32_NOZBB-NEXT: add a0, a0, a1 +; RV32_NOZBB-NEXT: andi a0, a0, 31 ; RV32_NOZBB-NEXT: ret ; RV32_NOZBB-NEXT: .LBB1_2: ; RV32_NOZBB-NEXT: li a0, 16 @@ -175,10 +177,12 @@ ; RV64NOZBB-NEXT: add a0, a2, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 ; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 15 -; RV64NOZBB-NEXT: slli a0, a0, 52 -; RV64NOZBB-NEXT: srli a0, a0, 60 -; RV64NOZBB-NEXT: add a0, a1, a0 +; RV64NOZBB-NEXT: lui a1, 1 +; RV64NOZBB-NEXT: addiw a1, a1, -241 +; RV64NOZBB-NEXT: and a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 8 +; RV64NOZBB-NEXT: add a0, a0, a1 +; RV64NOZBB-NEXT: andi a0, a0, 31 ; RV64NOZBB-NEXT: ret ; RV64NOZBB-NEXT: .LBB1_2: ; RV64NOZBB-NEXT: li a0, 16 @@ -634,10 +638,12 @@ ; RV32_NOZBB-NEXT: add a0, a2, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 -; RV32_NOZBB-NEXT: andi a1, a0, 15 -; RV32_NOZBB-NEXT: slli a0, a0, 20 -; RV32_NOZBB-NEXT: srli a0, a0, 28 -; RV32_NOZBB-NEXT: add a0, a1, a0 +; RV32_NOZBB-NEXT: lui a1, 1 +; RV32_NOZBB-NEXT: addi a1, a1, -241 +; RV32_NOZBB-NEXT: and a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 8 +; RV32_NOZBB-NEXT: add a0, a0, a1 +; RV32_NOZBB-NEXT: andi a0, a0, 31 ; RV32_NOZBB-NEXT: ret ; ; RV64NOZBB-LABEL: test_cttz_i16_zero_undef: @@ -658,10 +664,12 @@ ; RV64NOZBB-NEXT: add a0, a2, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 ; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 15 -; RV64NOZBB-NEXT: slli a0, a0, 52 -; RV64NOZBB-NEXT: srli a0, a0, 60 -; RV64NOZBB-NEXT: add a0, a1, a0 +; RV64NOZBB-NEXT: lui a1, 1 +; RV64NOZBB-NEXT: addiw a1, a1, -241 +; RV64NOZBB-NEXT: and a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, 
a0, 8 +; RV64NOZBB-NEXT: add a0, a0, a1 +; RV64NOZBB-NEXT: andi a0, a0, 31 ; RV64NOZBB-NEXT: ret ; ; RV32ZBB-LABEL: test_cttz_i16_zero_undef: @@ -1076,10 +1084,12 @@ ; RV32_NOZBB-NEXT: add a0, a2, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 -; RV32_NOZBB-NEXT: andi a1, a0, 15 -; RV32_NOZBB-NEXT: slli a0, a0, 20 -; RV32_NOZBB-NEXT: srli a0, a0, 28 -; RV32_NOZBB-NEXT: add a0, a1, a0 +; RV32_NOZBB-NEXT: lui a1, 1 +; RV32_NOZBB-NEXT: addi a1, a1, -241 +; RV32_NOZBB-NEXT: and a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 8 +; RV32_NOZBB-NEXT: add a0, a0, a1 +; RV32_NOZBB-NEXT: andi a0, a0, 31 ; RV32_NOZBB-NEXT: ret ; RV32_NOZBB-NEXT: .LBB9_2: ; RV32_NOZBB-NEXT: li a0, 16 @@ -1115,10 +1125,12 @@ ; RV64NOZBB-NEXT: add a0, a2, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 ; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 15 -; RV64NOZBB-NEXT: slli a0, a0, 52 -; RV64NOZBB-NEXT: srli a0, a0, 60 -; RV64NOZBB-NEXT: add a0, a1, a0 +; RV64NOZBB-NEXT: lui a1, 1 +; RV64NOZBB-NEXT: addiw a1, a1, -241 +; RV64NOZBB-NEXT: and a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 8 +; RV64NOZBB-NEXT: add a0, a0, a1 +; RV64NOZBB-NEXT: andi a0, a0, 31 ; RV64NOZBB-NEXT: ret ; RV64NOZBB-NEXT: .LBB9_2: ; RV64NOZBB-NEXT: li a0, 16 @@ -1757,10 +1769,12 @@ ; RV32_NOZBB-NEXT: add a0, a2, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 -; RV32_NOZBB-NEXT: andi a1, a0, 15 -; RV32_NOZBB-NEXT: slli a0, a0, 20 -; RV32_NOZBB-NEXT: srli a0, a0, 28 -; RV32_NOZBB-NEXT: add a0, a1, a0 +; RV32_NOZBB-NEXT: lui a1, 1 +; RV32_NOZBB-NEXT: addi a1, a1, -241 +; RV32_NOZBB-NEXT: and a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 8 +; RV32_NOZBB-NEXT: add a0, a0, a1 +; RV32_NOZBB-NEXT: andi a0, a0, 31 ; RV32_NOZBB-NEXT: ret ; ; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef: @@ -1791,10 +1805,12 @@ ; RV64NOZBB-NEXT: add a0, a2, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 ; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 15 -; RV64NOZBB-NEXT: slli a0, a0, 52 -; RV64NOZBB-NEXT: srli a0, a0, 60 -; RV64NOZBB-NEXT: add a0, a1, a0 +; RV64NOZBB-NEXT: lui a1, 1 +; RV64NOZBB-NEXT: addiw a1, a1, -241 +; RV64NOZBB-NEXT: and a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 8 +; RV64NOZBB-NEXT: add a0, a0, a1 +; RV64NOZBB-NEXT: andi a0, a0, 31 ; RV64NOZBB-NEXT: ret ; ; RV32ZBB-LABEL: test_ctlz_i16_zero_undef: @@ -2377,10 +2393,12 @@ ; RV32_NOZBB-NEXT: add a0, a2, a0 ; RV32_NOZBB-NEXT: srli a1, a0, 4 ; RV32_NOZBB-NEXT: add a0, a0, a1 -; RV32_NOZBB-NEXT: andi a1, a0, 15 -; RV32_NOZBB-NEXT: slli a0, a0, 20 -; RV32_NOZBB-NEXT: srli a0, a0, 28 -; RV32_NOZBB-NEXT: add a0, a1, a0 +; RV32_NOZBB-NEXT: lui a1, 1 +; RV32_NOZBB-NEXT: addi a1, a1, -241 +; RV32_NOZBB-NEXT: and a0, a0, a1 +; RV32_NOZBB-NEXT: srli a1, a0, 8 +; RV32_NOZBB-NEXT: add a0, a0, a1 +; RV32_NOZBB-NEXT: andi a0, a0, 31 ; RV32_NOZBB-NEXT: ret ; ; RV64NOZBB-LABEL: test_ctpop_i16: @@ -2398,10 +2416,12 @@ ; RV64NOZBB-NEXT: add a0, a2, a0 ; RV64NOZBB-NEXT: srli a1, a0, 4 ; RV64NOZBB-NEXT: add a0, a0, a1 -; RV64NOZBB-NEXT: andi a1, a0, 15 -; RV64NOZBB-NEXT: slli a0, a0, 52 -; RV64NOZBB-NEXT: srli a0, a0, 60 -; RV64NOZBB-NEXT: add a0, a1, a0 +; RV64NOZBB-NEXT: lui a1, 1 +; RV64NOZBB-NEXT: addiw a1, a1, -241 +; RV64NOZBB-NEXT: and a0, a0, a1 +; RV64NOZBB-NEXT: srli a1, a0, 8 +; RV64NOZBB-NEXT: add a0, a0, a1 +; RV64NOZBB-NEXT: andi a0, a0, 31 ; RV64NOZBB-NEXT: ret ; ; RV32ZBB-LABEL: test_ctpop_i16: @@ -2431,10 +2451,12 @@ ; RV32XTHEADBB-NEXT: add a0, a2, a0 ; RV32XTHEADBB-NEXT: srli a1, a0, 4 ; RV32XTHEADBB-NEXT: add a0, a0, a1 -; RV32XTHEADBB-NEXT: andi a1, a0, 15 
-; RV32XTHEADBB-NEXT: slli a0, a0, 20 -; RV32XTHEADBB-NEXT: srli a0, a0, 28 -; RV32XTHEADBB-NEXT: add a0, a1, a0 +; RV32XTHEADBB-NEXT: lui a1, 1 +; RV32XTHEADBB-NEXT: addi a1, a1, -241 +; RV32XTHEADBB-NEXT: and a0, a0, a1 +; RV32XTHEADBB-NEXT: srli a1, a0, 8 +; RV32XTHEADBB-NEXT: add a0, a0, a1 +; RV32XTHEADBB-NEXT: andi a0, a0, 31 ; RV32XTHEADBB-NEXT: ret ; ; RV64XTHEADBB-LABEL: test_ctpop_i16: @@ -2452,10 +2474,12 @@ ; RV64XTHEADBB-NEXT: add a0, a2, a0 ; RV64XTHEADBB-NEXT: srli a1, a0, 4 ; RV64XTHEADBB-NEXT: add a0, a0, a1 -; RV64XTHEADBB-NEXT: andi a1, a0, 15 -; RV64XTHEADBB-NEXT: slli a0, a0, 52 -; RV64XTHEADBB-NEXT: srli a0, a0, 60 -; RV64XTHEADBB-NEXT: add a0, a1, a0 +; RV64XTHEADBB-NEXT: lui a1, 1 +; RV64XTHEADBB-NEXT: addiw a1, a1, -241 +; RV64XTHEADBB-NEXT: and a0, a0, a1 +; RV64XTHEADBB-NEXT: srli a1, a0, 8 +; RV64XTHEADBB-NEXT: add a0, a0, a1 +; RV64XTHEADBB-NEXT: andi a0, a0, 31 ; RV64XTHEADBB-NEXT: ret %1 = call i16 @llvm.ctpop.i16(i16 %a) ret i16 %1 diff --git a/llvm/test/CodeGen/RISCV/div-pow2.ll b/llvm/test/CodeGen/RISCV/div-pow2.ll --- a/llvm/test/CodeGen/RISCV/div-pow2.ll +++ b/llvm/test/CodeGen/RISCV/div-pow2.ll @@ -404,11 +404,13 @@ ; RV32I-LABEL: sdiv64_pow2_8589934592: ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: srli a2, a1, 31 -; RV32I-NEXT: add a2, a1, a2 +; RV32I-NEXT: add a3, a1, a2 ; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: add a1, a0, a1 ; RV32I-NEXT: sltu a0, a1, a0 -; RV32I-NEXT: add a1, a2, a0 +; RV32I-NEXT: add a1, a3, a0 ; RV32I-NEXT: srai a0, a1, 1 ; RV32I-NEXT: srai a1, a1, 31 ; RV32I-NEXT: ret @@ -429,15 +431,16 @@ ; RV32I-LABEL: sdiv64_pow2_negative_8589934592: ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: srli a2, a1, 31 -; RV32I-NEXT: add a2, a1, a2 +; RV32I-NEXT: add a3, a1, a2 ; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: add a1, a0, a1 ; RV32I-NEXT: sltu a0, a1, a0 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: srli a1, a0, 31 ; RV32I-NEXT: srai a0, a0, 1 ; RV32I-NEXT: snez a2, a0 -; RV32I-NEXT: neg a1, a1 ; RV32I-NEXT: sub a1, a1, a2 ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/double-calling-conv.ll b/llvm/test/CodeGen/RISCV/double-calling-conv.ll --- a/llvm/test/CodeGen/RISCV/double-calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/double-calling-conv.ll @@ -211,20 +211,20 @@ ; RV32IFD: # %bb.0: ; RV32IFD-NEXT: addi sp, sp, -32 ; RV32IFD-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32IFD-NEXT: lui a0, 262510 +; RV32IFD-NEXT: lui a0, 262574 ; RV32IFD-NEXT: addi a0, a0, 327 -; RV32IFD-NEXT: sw a0, 4(sp) +; RV32IFD-NEXT: sw a0, 12(sp) ; RV32IFD-NEXT: lui a0, 713032 ; RV32IFD-NEXT: addi a1, a0, -1311 -; RV32IFD-NEXT: sw a1, 0(sp) -; RV32IFD-NEXT: lui a0, 262574 +; RV32IFD-NEXT: sw a1, 8(sp) +; RV32IFD-NEXT: lui a0, 262510 ; RV32IFD-NEXT: addi a0, a0, 327 -; RV32IFD-NEXT: sw a0, 12(sp) +; RV32IFD-NEXT: sw a0, 4(sp) ; RV32IFD-NEXT: li a0, 1 ; RV32IFD-NEXT: li a2, 2 ; RV32IFD-NEXT: li a4, 3 ; RV32IFD-NEXT: li a6, 4 -; RV32IFD-NEXT: sw a1, 8(sp) +; RV32IFD-NEXT: sw a1, 0(sp) ; RV32IFD-NEXT: li a1, 0 ; RV32IFD-NEXT: li a3, 0 ; RV32IFD-NEXT: li a5, 0 @@ -238,20 +238,20 @@ ; RV32IZFINXZDINX: # %bb.0: ; RV32IZFINXZDINX-NEXT: addi sp, sp, -32 ; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32IZFINXZDINX-NEXT: lui a0, 262510 +; RV32IZFINXZDINX-NEXT: lui a0, 262574 ; RV32IZFINXZDINX-NEXT: addi a0, a0, 327 -; RV32IZFINXZDINX-NEXT: sw a0, 4(sp) +; 
RV32IZFINXZDINX-NEXT: sw a0, 12(sp) ; RV32IZFINXZDINX-NEXT: lui a0, 713032 ; RV32IZFINXZDINX-NEXT: addi a1, a0, -1311 -; RV32IZFINXZDINX-NEXT: sw a1, 0(sp) -; RV32IZFINXZDINX-NEXT: lui a0, 262574 +; RV32IZFINXZDINX-NEXT: sw a1, 8(sp) +; RV32IZFINXZDINX-NEXT: lui a0, 262510 ; RV32IZFINXZDINX-NEXT: addi a0, a0, 327 -; RV32IZFINXZDINX-NEXT: sw a0, 12(sp) +; RV32IZFINXZDINX-NEXT: sw a0, 4(sp) ; RV32IZFINXZDINX-NEXT: li a0, 1 ; RV32IZFINXZDINX-NEXT: li a2, 2 ; RV32IZFINXZDINX-NEXT: li a4, 3 ; RV32IZFINXZDINX-NEXT: li a6, 4 -; RV32IZFINXZDINX-NEXT: sw a1, 8(sp) +; RV32IZFINXZDINX-NEXT: sw a1, 0(sp) ; RV32IZFINXZDINX-NEXT: li a1, 0 ; RV32IZFINXZDINX-NEXT: li a3, 0 ; RV32IZFINXZDINX-NEXT: li a5, 0 diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll --- a/llvm/test/CodeGen/RISCV/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/double-convert.ll @@ -144,11 +144,10 @@ define i32 @fcvt_w_d_sat(double %a) nounwind { ; CHECKIFD-LABEL: fcvt_w_d_sat: ; CHECKIFD: # %bb.0: # %start -; CHECKIFD-NEXT: fcvt.w.d a0, fa0, rtz -; CHECKIFD-NEXT: feq.d a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.w.d a1, fa0, rtz +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcvt_w_d_sat: @@ -158,21 +157,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.w.d a2, a0, rtz -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rtz +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcvt_w_d_sat: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: fcvt.w.d a1, a0, rtz -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_d_sat: @@ -382,20 +379,18 @@ define i32 @fcvt_wu_d_sat(double %a) nounwind { ; RV32IFD-LABEL: fcvt_wu_d_sat: ; RV32IFD: # %bb.0: # %start -; RV32IFD-NEXT: fcvt.wu.d a0, fa0, rtz -; RV32IFD-NEXT: feq.d a1, fa0, fa0 -; RV32IFD-NEXT: seqz a1, a1 -; RV32IFD-NEXT: addi a1, a1, -1 -; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: feq.d a0, fa0, fa0 +; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: fcvt.wu.d a1, fa0, rtz +; RV32IFD-NEXT: and a0, a0, a1 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fcvt_wu_d_sat: ; RV64IFD: # %bb.0: # %start -; RV64IFD-NEXT: fcvt.wu.d a0, fa0, rtz -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addiw a1, a1, -1 -; RV64IFD-NEXT: and a0, a0, a1 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: negw a0, a0 +; RV64IFD-NEXT: fcvt.wu.d a1, fa0, rtz +; RV64IFD-NEXT: and a0, a1, a0 ; RV64IFD-NEXT: slli a0, a0, 32 ; RV64IFD-NEXT: srli a0, a0, 32 ; RV64IFD-NEXT: ret @@ -407,21 +402,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: 
fcvt.wu.d a2, a0, rtz -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rtz +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcvt_wu_d_sat: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: fcvt.wu.d a1, a0, rtz -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addiw a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a1, a0 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: negw a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a0, a1 ; RV64IZFINXZDINX-NEXT: slli a0, a0, 32 ; RV64IZFINXZDINX-NEXT: srli a0, a0, 32 ; RV64IZFINXZDINX-NEXT: ret @@ -787,11 +780,10 @@ ; ; RV64IFD-LABEL: fcvt_l_d_sat: ; RV64IFD: # %bb.0: # %start -; RV64IFD-NEXT: fcvt.l.d a0, fa0, rtz -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.l.d a1, fa0, rtz +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcvt_l_d_sat: @@ -839,11 +831,10 @@ ; ; RV64IZFINXZDINX-LABEL: fcvt_l_d_sat: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: fcvt.l.d a1, a0, rtz -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_l_d_sat: @@ -1035,11 +1026,10 @@ ; ; RV64IFD-LABEL: fcvt_lu_d_sat: ; RV64IFD: # %bb.0: # %start -; RV64IFD-NEXT: fcvt.lu.d a0, fa0, rtz -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.lu.d a1, fa0, rtz +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcvt_lu_d_sat: @@ -1073,11 +1063,10 @@ ; ; RV64IZFINXZDINX-LABEL: fcvt_lu_d_sat: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: fcvt.lu.d a1, a0, rtz -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.lu.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_lu_d_sat: @@ -2497,20 +2486,18 @@ define zeroext i32 @fcvt_wu_d_sat_zext(double %a) nounwind { ; RV32IFD-LABEL: fcvt_wu_d_sat_zext: ; RV32IFD: # %bb.0: # %start -; RV32IFD-NEXT: fcvt.wu.d a0, fa0, rtz -; RV32IFD-NEXT: feq.d a1, fa0, fa0 -; RV32IFD-NEXT: seqz a1, a1 -; RV32IFD-NEXT: addi a1, a1, -1 -; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: feq.d a0, fa0, fa0 +; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: fcvt.wu.d a1, fa0, rtz +; RV32IFD-NEXT: and a0, a0, a1 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: fcvt_wu_d_sat_zext: ; RV64IFD: # %bb.0: # %start -; RV64IFD-NEXT: fcvt.wu.d a0, fa0, rtz -; RV64IFD-NEXT: feq.d 
a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addiw a1, a1, -1 -; RV64IFD-NEXT: and a0, a0, a1 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: negw a0, a0 +; RV64IFD-NEXT: fcvt.wu.d a1, fa0, rtz +; RV64IFD-NEXT: and a0, a1, a0 ; RV64IFD-NEXT: slli a0, a0, 32 ; RV64IFD-NEXT: srli a0, a0, 32 ; RV64IFD-NEXT: ret @@ -2522,21 +2509,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.wu.d a2, a0, rtz -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rtz +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcvt_wu_d_sat_zext: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: fcvt.wu.d a1, a0, rtz -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addiw a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a1, a0 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: negw a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a0, a1 ; RV64IZFINXZDINX-NEXT: slli a0, a0, 32 ; RV64IZFINXZDINX-NEXT: srli a0, a0, 32 ; RV64IZFINXZDINX-NEXT: ret @@ -2623,11 +2608,10 @@ define signext i32 @fcvt_w_d_sat_sext(double %a) nounwind { ; CHECKIFD-LABEL: fcvt_w_d_sat_sext: ; CHECKIFD: # %bb.0: # %start -; CHECKIFD-NEXT: fcvt.w.d a0, fa0, rtz -; CHECKIFD-NEXT: feq.d a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.w.d a1, fa0, rtz +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: fcvt_w_d_sat_sext: @@ -2637,21 +2621,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.w.d a2, a0, rtz -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rtz +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcvt_w_d_sat_sext: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: fcvt.w.d a1, a0, rtz -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_d_sat_sext: diff --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll --- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll @@ -11,11 +11,10 @@ define signext i32 @test_floor_si32(double %x) { ; CHECKIFD-LABEL: test_floor_si32: ; CHECKIFD: # %bb.0: -; CHECKIFD-NEXT: fcvt.w.d a0, fa0, rdn -; CHECKIFD-NEXT: feq.d a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; 
CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.w.d a1, fa0, rdn +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_floor_si32: @@ -26,21 +25,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.w.d a2, a0, rdn -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rdn +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: test_floor_si32: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.w.d a1, a0, rdn -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rdn +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.floor.f64(double %x) %b = call i32 @llvm.fptosi.sat.i32.f64(double %a) @@ -89,11 +86,10 @@ ; ; RV64IFD-LABEL: test_floor_si64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fcvt.l.d a0, fa0, rdn -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.l.d a1, fa0, rdn +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_floor_si64: @@ -144,11 +140,10 @@ ; ; RV64IZFINXZDINX-LABEL: test_floor_si64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.l.d a1, a0, rdn -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rdn +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.floor.f64(double %x) %b = call i64 @llvm.fptosi.sat.i64.f64(double %a) @@ -158,11 +153,10 @@ define signext i32 @test_floor_ui32(double %x) { ; CHECKIFD-LABEL: test_floor_ui32: ; CHECKIFD: # %bb.0: -; CHECKIFD-NEXT: fcvt.wu.d a0, fa0, rdn -; CHECKIFD-NEXT: feq.d a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.wu.d a1, fa0, rdn +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_floor_ui32: @@ -173,21 +167,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.wu.d a2, a0, rdn -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rdn +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: 
test_floor_ui32: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.wu.d a1, a0, rdn -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rdn +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.floor.f64(double %x) %b = call i32 @llvm.fptoui.sat.i32.f64(double %a) @@ -222,11 +214,10 @@ ; ; RV64IFD-LABEL: test_floor_ui64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fcvt.lu.d a0, fa0, rdn -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.lu.d a1, fa0, rdn +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_floor_ui64: @@ -263,11 +254,10 @@ ; ; RV64IZFINXZDINX-LABEL: test_floor_ui64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.lu.d a1, a0, rdn -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.lu.d a0, a0, rdn +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.floor.f64(double %x) %b = call i64 @llvm.fptoui.sat.i64.f64(double %a) @@ -277,11 +267,10 @@ define signext i32 @test_ceil_si32(double %x) { ; CHECKIFD-LABEL: test_ceil_si32: ; CHECKIFD: # %bb.0: -; CHECKIFD-NEXT: fcvt.w.d a0, fa0, rup -; CHECKIFD-NEXT: feq.d a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.w.d a1, fa0, rup +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_ceil_si32: @@ -292,21 +281,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.w.d a2, a0, rup -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rup +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: test_ceil_si32: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.w.d a1, a0, rup -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rup +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.ceil.f64(double %x) %b = call i32 @llvm.fptosi.sat.i32.f64(double %a) @@ -355,11 +342,10 @@ ; ; RV64IFD-LABEL: test_ceil_si64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fcvt.l.d a0, fa0, rup -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.l.d a1, 
fa0, rup +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_ceil_si64: @@ -410,11 +396,10 @@ ; ; RV64IZFINXZDINX-LABEL: test_ceil_si64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.l.d a1, a0, rup -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rup +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.ceil.f64(double %x) %b = call i64 @llvm.fptosi.sat.i64.f64(double %a) @@ -424,11 +409,10 @@ define signext i32 @test_ceil_ui32(double %x) { ; CHECKIFD-LABEL: test_ceil_ui32: ; CHECKIFD: # %bb.0: -; CHECKIFD-NEXT: fcvt.wu.d a0, fa0, rup -; CHECKIFD-NEXT: feq.d a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.wu.d a1, fa0, rup +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_ceil_ui32: @@ -439,21 +423,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.wu.d a2, a0, rup -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rup +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: test_ceil_ui32: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.wu.d a1, a0, rup -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rup +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.ceil.f64(double %x) %b = call i32 @llvm.fptoui.sat.i32.f64(double %a) @@ -488,11 +470,10 @@ ; ; RV64IFD-LABEL: test_ceil_ui64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fcvt.lu.d a0, fa0, rup -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.lu.d a1, fa0, rup +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_ceil_ui64: @@ -529,11 +510,10 @@ ; ; RV64IZFINXZDINX-LABEL: test_ceil_ui64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.lu.d a1, a0, rup -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.lu.d a0, a0, rup +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.ceil.f64(double %x) %b = call i64 @llvm.fptoui.sat.i64.f64(double %a) @@ -543,11 +523,10 @@ define signext i32 @test_trunc_si32(double %x) { ; CHECKIFD-LABEL: test_trunc_si32: ; CHECKIFD: # %bb.0: -; CHECKIFD-NEXT: fcvt.w.d a0, fa0, rtz -; CHECKIFD-NEXT: feq.d 
a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.w.d a1, fa0, rtz +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_trunc_si32: @@ -558,21 +537,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.w.d a2, a0, rtz -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rtz +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: test_trunc_si32: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.w.d a1, a0, rtz -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.trunc.f64(double %x) %b = call i32 @llvm.fptosi.sat.i32.f64(double %a) @@ -621,11 +598,10 @@ ; ; RV64IFD-LABEL: test_trunc_si64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fcvt.l.d a0, fa0, rtz -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.l.d a1, fa0, rtz +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_trunc_si64: @@ -676,11 +652,10 @@ ; ; RV64IZFINXZDINX-LABEL: test_trunc_si64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.l.d a1, a0, rtz -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.trunc.f64(double %x) %b = call i64 @llvm.fptosi.sat.i64.f64(double %a) @@ -690,11 +665,10 @@ define signext i32 @test_trunc_ui32(double %x) { ; CHECKIFD-LABEL: test_trunc_ui32: ; CHECKIFD: # %bb.0: -; CHECKIFD-NEXT: fcvt.wu.d a0, fa0, rtz -; CHECKIFD-NEXT: feq.d a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.wu.d a1, fa0, rtz +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_trunc_ui32: @@ -705,21 +679,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.wu.d a2, a0, rtz -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rtz +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; 
RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: test_trunc_ui32: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.wu.d a1, a0, rtz -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.trunc.f64(double %x) %b = call i32 @llvm.fptoui.sat.i32.f64(double %a) @@ -754,11 +726,10 @@ ; ; RV64IFD-LABEL: test_trunc_ui64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fcvt.lu.d a0, fa0, rtz -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.lu.d a1, fa0, rtz +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_trunc_ui64: @@ -795,11 +766,10 @@ ; ; RV64IZFINXZDINX-LABEL: test_trunc_ui64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.lu.d a1, a0, rtz -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.lu.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.trunc.f64(double %x) %b = call i64 @llvm.fptoui.sat.i64.f64(double %a) @@ -809,11 +779,10 @@ define signext i32 @test_round_si32(double %x) { ; CHECKIFD-LABEL: test_round_si32: ; CHECKIFD: # %bb.0: -; CHECKIFD-NEXT: fcvt.w.d a0, fa0, rmm -; CHECKIFD-NEXT: feq.d a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.w.d a1, fa0, rmm +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_round_si32: @@ -824,21 +793,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.w.d a2, a0, rmm -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rmm +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: test_round_si32: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.w.d a1, a0, rmm -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rmm +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.round.f64(double %x) %b = call i32 @llvm.fptosi.sat.i32.f64(double %a) @@ -887,11 +854,10 @@ ; ; RV64IFD-LABEL: test_round_si64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fcvt.l.d a0, fa0, rmm -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 
+; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.l.d a1, fa0, rmm +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_round_si64: @@ -942,11 +908,10 @@ ; ; RV64IZFINXZDINX-LABEL: test_round_si64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.l.d a1, a0, rmm -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rmm +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.round.f64(double %x) %b = call i64 @llvm.fptosi.sat.i64.f64(double %a) @@ -956,11 +921,10 @@ define signext i32 @test_round_ui32(double %x) { ; CHECKIFD-LABEL: test_round_ui32: ; CHECKIFD: # %bb.0: -; CHECKIFD-NEXT: fcvt.wu.d a0, fa0, rmm -; CHECKIFD-NEXT: feq.d a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.wu.d a1, fa0, rmm +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_round_ui32: @@ -971,21 +935,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.wu.d a2, a0, rmm -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rmm +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: test_round_ui32: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.wu.d a1, a0, rmm -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rmm +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.round.f64(double %x) %b = call i32 @llvm.fptoui.sat.i32.f64(double %a) @@ -1020,11 +982,10 @@ ; ; RV64IFD-LABEL: test_round_ui64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fcvt.lu.d a0, fa0, rmm -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.lu.d a1, fa0, rmm +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_round_ui64: @@ -1061,11 +1022,10 @@ ; ; RV64IZFINXZDINX-LABEL: test_round_ui64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.lu.d a1, a0, rmm -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.lu.d a0, a0, rmm +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.round.f64(double %x) %b = call i64 @llvm.fptoui.sat.i64.f64(double %a) @@ -1075,11 +1035,10 @@ define signext i32 @test_roundeven_si32(double %x) { ; CHECKIFD-LABEL: test_roundeven_si32: ; 
CHECKIFD: # %bb.0: -; CHECKIFD-NEXT: fcvt.w.d a0, fa0, rne -; CHECKIFD-NEXT: feq.d a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.w.d a1, fa0, rne +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_roundeven_si32: @@ -1090,21 +1049,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.w.d a2, a0, rne -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; RV32IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rne +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: test_roundeven_si32: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.w.d a1, a0, rne -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.w.d a0, a0, rne +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.roundeven.f64(double %x) %b = call i32 @llvm.fptosi.sat.i32.f64(double %a) @@ -1153,11 +1110,10 @@ ; ; RV64IFD-LABEL: test_roundeven_si64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fcvt.l.d a0, fa0, rne -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.l.d a1, fa0, rne +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_roundeven_si64: @@ -1208,11 +1164,10 @@ ; ; RV64IZFINXZDINX-LABEL: test_roundeven_si64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.l.d a1, a0, rne -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rne +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.roundeven.f64(double %x) %b = call i64 @llvm.fptosi.sat.i64.f64(double %a) @@ -1222,11 +1177,10 @@ define signext i32 @test_roundeven_ui32(double %x) { ; CHECKIFD-LABEL: test_roundeven_ui32: ; CHECKIFD: # %bb.0: -; CHECKIFD-NEXT: fcvt.wu.d a0, fa0, rne -; CHECKIFD-NEXT: feq.d a1, fa0, fa0 -; CHECKIFD-NEXT: seqz a1, a1 -; CHECKIFD-NEXT: addi a1, a1, -1 -; CHECKIFD-NEXT: and a0, a1, a0 +; CHECKIFD-NEXT: feq.d a0, fa0, fa0 +; CHECKIFD-NEXT: neg a0, a0 +; CHECKIFD-NEXT: fcvt.wu.d a1, fa0, rne +; CHECKIFD-NEXT: and a0, a0, a1 ; CHECKIFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_roundeven_ui32: @@ -1237,21 +1191,19 @@ ; RV32IZFINXZDINX-NEXT: sw a1, 12(sp) ; RV32IZFINXZDINX-NEXT: lw a0, 8(sp) ; RV32IZFINXZDINX-NEXT: lw a1, 12(sp) -; RV32IZFINXZDINX-NEXT: fcvt.wu.d a2, a0, rne -; RV32IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV32IZFINXZDINX-NEXT: seqz a0, a0 -; RV32IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV32IZFINXZDINX-NEXT: and a0, a0, a2 +; RV32IZFINXZDINX-NEXT: feq.d a2, a0, a0 +; RV32IZFINXZDINX-NEXT: neg a2, a2 +; 
RV32IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rne +; RV32IZFINXZDINX-NEXT: and a0, a2, a0 ; RV32IZFINXZDINX-NEXT: addi sp, sp, 16 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: test_roundeven_ui32: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.wu.d a1, a0, rne -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.wu.d a0, a0, rne +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.roundeven.f64(double %x) %b = call i32 @llvm.fptoui.sat.i32.f64(double %a) @@ -1286,11 +1238,10 @@ ; ; RV64IFD-LABEL: test_roundeven_ui64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: fcvt.lu.d a0, fa0, rne -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.lu.d a1, fa0, rne +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret ; ; RV32IZFINXZDINX-LABEL: test_roundeven_ui64: @@ -1327,11 +1278,10 @@ ; ; RV64IZFINXZDINX-LABEL: test_roundeven_ui64: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: fcvt.lu.d a1, a0, rne -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: seqz a0, a0 -; RV64IZFINXZDINX-NEXT: addi a0, a0, -1 -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: fcvt.lu.d a0, a0, rne +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret %a = call double @llvm.roundeven.f64(double %x) %b = call i64 @llvm.fptoui.sat.i64.f64(double %a) diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll --- a/llvm/test/CodeGen/RISCV/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/float-convert.ll @@ -47,20 +47,18 @@ define i32 @fcvt_w_s_sat(float %a) nounwind { ; CHECKIF-LABEL: fcvt_w_s_sat: ; CHECKIF: # %bb.0: # %start -; CHECKIF-NEXT: fcvt.w.s a0, fa0, rtz -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: neg a0, a0 +; CHECKIF-NEXT: fcvt.w.s a1, fa0, rtz +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: fcvt_w_s_sat: ; CHECKIZFINX: # %bb.0: # %start -; CHECKIZFINX-NEXT: fcvt.w.s a1, a0, rtz -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.w.s a0, a0, rtz +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_s_sat: @@ -231,40 +229,36 @@ define i32 @fcvt_wu_s_sat(float %a) nounwind { ; RV32IF-LABEL: fcvt_wu_s_sat: ; RV32IF: # %bb.0: # %start -; RV32IF-NEXT: fcvt.wu.s a0, fa0, rtz -; RV32IF-NEXT: feq.s a1, fa0, fa0 -; RV32IF-NEXT: seqz a1, a1 -; RV32IF-NEXT: addi a1, a1, -1 -; RV32IF-NEXT: and a0, a1, a0 +; RV32IF-NEXT: feq.s a0, fa0, fa0 +; RV32IF-NEXT: neg a0, a0 +; RV32IF-NEXT: fcvt.wu.s a1, fa0, rtz +; RV32IF-NEXT: and a0, a0, a1 ; RV32IF-NEXT: ret ; ; RV64IF-LABEL: fcvt_wu_s_sat: ; RV64IF: # %bb.0: # %start -; RV64IF-NEXT: fcvt.wu.s a0, fa0, rtz -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addiw a1, a1, -1 -; RV64IF-NEXT: and a0, a0, a1 
+; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: negw a0, a0 +; RV64IF-NEXT: fcvt.wu.s a1, fa0, rtz +; RV64IF-NEXT: and a0, a1, a0 ; RV64IF-NEXT: slli a0, a0, 32 ; RV64IF-NEXT: srli a0, a0, 32 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: fcvt_wu_s_sat: ; RV32IZFINX: # %bb.0: # %start -; RV32IZFINX-NEXT: fcvt.wu.s a1, a0, rtz -; RV32IZFINX-NEXT: feq.s a0, a0, a0 -; RV32IZFINX-NEXT: seqz a0, a0 -; RV32IZFINX-NEXT: addi a0, a0, -1 -; RV32IZFINX-NEXT: and a0, a0, a1 +; RV32IZFINX-NEXT: feq.s a1, a0, a0 +; RV32IZFINX-NEXT: neg a1, a1 +; RV32IZFINX-NEXT: fcvt.wu.s a0, a0, rtz +; RV32IZFINX-NEXT: and a0, a1, a0 ; RV32IZFINX-NEXT: ret ; ; RV64IZFINX-LABEL: fcvt_wu_s_sat: ; RV64IZFINX: # %bb.0: # %start -; RV64IZFINX-NEXT: fcvt.wu.s a1, a0, rtz -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addiw a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a1, a0 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: negw a1, a1 +; RV64IZFINX-NEXT: fcvt.wu.s a0, a0, rtz +; RV64IZFINX-NEXT: and a0, a0, a1 ; RV64IZFINX-NEXT: slli a0, a0, 32 ; RV64IZFINX-NEXT: srli a0, a0, 32 ; RV64IZFINX-NEXT: ret @@ -651,11 +645,10 @@ ; ; RV64IF-LABEL: fcvt_l_s_sat: ; RV64IF: # %bb.0: # %start -; RV64IF-NEXT: fcvt.l.s a0, fa0, rtz -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.l.s a1, fa0, rtz +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: fcvt_l_s_sat: @@ -698,11 +691,10 @@ ; ; RV64IZFINX-LABEL: fcvt_l_s_sat: ; RV64IZFINX: # %bb.0: # %start -; RV64IZFINX-NEXT: fcvt.l.s a1, a0, rtz -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.l.s a0, a0, rtz +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret ; ; RV32I-LABEL: fcvt_l_s_sat: @@ -885,11 +877,10 @@ ; ; RV64IF-LABEL: fcvt_lu_s_sat: ; RV64IF: # %bb.0: # %start -; RV64IF-NEXT: fcvt.lu.s a0, fa0, rtz -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.lu.s a1, fa0, rtz +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: fcvt_lu_s_sat: @@ -919,11 +910,10 @@ ; ; RV64IZFINX-LABEL: fcvt_lu_s_sat: ; RV64IZFINX: # %bb.0: # %start -; RV64IZFINX-NEXT: fcvt.lu.s a1, a0, rtz -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.lu.s a0, a0, rtz +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret ; ; RV32I-LABEL: fcvt_lu_s_sat: @@ -2045,40 +2035,36 @@ define zeroext i32 @fcvt_wu_s_sat_zext(float %a) nounwind { ; RV32IF-LABEL: fcvt_wu_s_sat_zext: ; RV32IF: # %bb.0: # %start -; RV32IF-NEXT: fcvt.wu.s a0, fa0, rtz -; RV32IF-NEXT: feq.s a1, fa0, fa0 -; RV32IF-NEXT: seqz a1, a1 -; RV32IF-NEXT: addi a1, a1, -1 -; RV32IF-NEXT: and a0, a1, a0 +; RV32IF-NEXT: feq.s a0, fa0, fa0 +; RV32IF-NEXT: neg a0, a0 +; RV32IF-NEXT: fcvt.wu.s a1, fa0, rtz +; RV32IF-NEXT: and a0, a0, a1 ; RV32IF-NEXT: ret ; ; RV64IF-LABEL: fcvt_wu_s_sat_zext: ; RV64IF: # %bb.0: # %start -; RV64IF-NEXT: fcvt.wu.s a0, fa0, rtz -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: 
seqz a1, a1 -; RV64IF-NEXT: addiw a1, a1, -1 -; RV64IF-NEXT: and a0, a0, a1 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: negw a0, a0 +; RV64IF-NEXT: fcvt.wu.s a1, fa0, rtz +; RV64IF-NEXT: and a0, a1, a0 ; RV64IF-NEXT: slli a0, a0, 32 ; RV64IF-NEXT: srli a0, a0, 32 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: fcvt_wu_s_sat_zext: ; RV32IZFINX: # %bb.0: # %start -; RV32IZFINX-NEXT: fcvt.wu.s a1, a0, rtz -; RV32IZFINX-NEXT: feq.s a0, a0, a0 -; RV32IZFINX-NEXT: seqz a0, a0 -; RV32IZFINX-NEXT: addi a0, a0, -1 -; RV32IZFINX-NEXT: and a0, a0, a1 +; RV32IZFINX-NEXT: feq.s a1, a0, a0 +; RV32IZFINX-NEXT: neg a1, a1 +; RV32IZFINX-NEXT: fcvt.wu.s a0, a0, rtz +; RV32IZFINX-NEXT: and a0, a1, a0 ; RV32IZFINX-NEXT: ret ; ; RV64IZFINX-LABEL: fcvt_wu_s_sat_zext: ; RV64IZFINX: # %bb.0: # %start -; RV64IZFINX-NEXT: fcvt.wu.s a1, a0, rtz -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addiw a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a1, a0 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: negw a1, a1 +; RV64IZFINX-NEXT: fcvt.wu.s a0, a0, rtz +; RV64IZFINX-NEXT: and a0, a0, a1 ; RV64IZFINX-NEXT: slli a0, a0, 32 ; RV64IZFINX-NEXT: srli a0, a0, 32 ; RV64IZFINX-NEXT: ret @@ -2154,20 +2140,18 @@ define signext i32 @fcvt_w_s_sat_sext(float %a) nounwind { ; CHECKIF-LABEL: fcvt_w_s_sat_sext: ; CHECKIF: # %bb.0: # %start -; CHECKIF-NEXT: fcvt.w.s a0, fa0, rtz -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: neg a0, a0 +; CHECKIF-NEXT: fcvt.w.s a1, fa0, rtz +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: fcvt_w_s_sat_sext: ; CHECKIZFINX: # %bb.0: # %start -; CHECKIZFINX-NEXT: fcvt.w.s a1, a0, rtz -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.w.s a0, a0, rtz +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_s_sat_sext: diff --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll --- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll @@ -11,20 +11,18 @@ define signext i32 @test_floor_si32(float %x) { ; CHECKIF-LABEL: test_floor_si32: ; CHECKIF: # %bb.0: -; CHECKIF-NEXT: fcvt.w.s a0, fa0, rdn -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: neg a0, a0 +; CHECKIF-NEXT: fcvt.w.s a1, fa0, rdn +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: test_floor_si32: ; CHECKIZFINX: # %bb.0: -; CHECKIZFINX-NEXT: fcvt.w.s a1, a0, rdn -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.w.s a0, a0, rdn +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret %a = call float @llvm.floor.f32(float %x) %b = call i32 @llvm.fptosi.sat.i32.f32(float %a) @@ -83,11 +81,10 @@ ; ; RV64IF-LABEL: test_floor_si64: ; RV64IF: # %bb.0: -; RV64IF-NEXT: fcvt.l.s a0, fa0, rdn -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: 
and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.l.s a1, fa0, rdn +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: test_floor_si64: @@ -141,11 +138,10 @@ ; ; RV64IZFINX-LABEL: test_floor_si64: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: fcvt.l.s a1, a0, rdn -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.l.s a0, a0, rdn +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret %a = call float @llvm.floor.f32(float %x) %b = call i64 @llvm.fptosi.sat.i64.f32(float %a) @@ -155,20 +151,18 @@ define signext i32 @test_floor_ui32(float %x) { ; CHECKIF-LABEL: test_floor_ui32: ; CHECKIF: # %bb.0: -; CHECKIF-NEXT: fcvt.wu.s a0, fa0, rdn -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: neg a0, a0 +; CHECKIF-NEXT: fcvt.wu.s a1, fa0, rdn +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: test_floor_ui32: ; CHECKIZFINX: # %bb.0: -; CHECKIZFINX-NEXT: fcvt.wu.s a1, a0, rdn -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.wu.s a0, a0, rdn +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret %a = call float @llvm.floor.f32(float %x) %b = call i32 @llvm.fptoui.sat.i32.f32(float %a) @@ -214,11 +208,10 @@ ; ; RV64IF-LABEL: test_floor_ui64: ; RV64IF: # %bb.0: -; RV64IF-NEXT: fcvt.lu.s a0, fa0, rdn -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.lu.s a1, fa0, rdn +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: test_floor_ui64: @@ -257,11 +250,10 @@ ; ; RV64IZFINX-LABEL: test_floor_ui64: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: fcvt.lu.s a1, a0, rdn -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.lu.s a0, a0, rdn +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret %a = call float @llvm.floor.f32(float %x) %b = call i64 @llvm.fptoui.sat.i64.f32(float %a) @@ -271,20 +263,18 @@ define signext i32 @test_ceil_si32(float %x) { ; CHECKIF-LABEL: test_ceil_si32: ; CHECKIF: # %bb.0: -; CHECKIF-NEXT: fcvt.w.s a0, fa0, rup -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: neg a0, a0 +; CHECKIF-NEXT: fcvt.w.s a1, fa0, rup +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: test_ceil_si32: ; CHECKIZFINX: # %bb.0: -; CHECKIZFINX-NEXT: fcvt.w.s a1, a0, rup -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.w.s a0, a0, rup +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret %a = call float @llvm.ceil.f32(float 
%x) %b = call i32 @llvm.fptosi.sat.i32.f32(float %a) @@ -343,11 +333,10 @@ ; ; RV64IF-LABEL: test_ceil_si64: ; RV64IF: # %bb.0: -; RV64IF-NEXT: fcvt.l.s a0, fa0, rup -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.l.s a1, fa0, rup +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: test_ceil_si64: @@ -401,11 +390,10 @@ ; ; RV64IZFINX-LABEL: test_ceil_si64: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: fcvt.l.s a1, a0, rup -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.l.s a0, a0, rup +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret %a = call float @llvm.ceil.f32(float %x) %b = call i64 @llvm.fptosi.sat.i64.f32(float %a) @@ -415,20 +403,18 @@ define signext i32 @test_ceil_ui32(float %x) { ; CHECKIF-LABEL: test_ceil_ui32: ; CHECKIF: # %bb.0: -; CHECKIF-NEXT: fcvt.wu.s a0, fa0, rup -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: neg a0, a0 +; CHECKIF-NEXT: fcvt.wu.s a1, fa0, rup +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: test_ceil_ui32: ; CHECKIZFINX: # %bb.0: -; CHECKIZFINX-NEXT: fcvt.wu.s a1, a0, rup -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.wu.s a0, a0, rup +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret %a = call float @llvm.ceil.f32(float %x) %b = call i32 @llvm.fptoui.sat.i32.f32(float %a) @@ -474,11 +460,10 @@ ; ; RV64IF-LABEL: test_ceil_ui64: ; RV64IF: # %bb.0: -; RV64IF-NEXT: fcvt.lu.s a0, fa0, rup -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.lu.s a1, fa0, rup +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: test_ceil_ui64: @@ -517,11 +502,10 @@ ; ; RV64IZFINX-LABEL: test_ceil_ui64: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: fcvt.lu.s a1, a0, rup -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.lu.s a0, a0, rup +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret %a = call float @llvm.ceil.f32(float %x) %b = call i64 @llvm.fptoui.sat.i64.f32(float %a) @@ -531,20 +515,18 @@ define signext i32 @test_trunc_si32(float %x) { ; CHECKIF-LABEL: test_trunc_si32: ; CHECKIF: # %bb.0: -; CHECKIF-NEXT: fcvt.w.s a0, fa0, rtz -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: neg a0, a0 +; CHECKIF-NEXT: fcvt.w.s a1, fa0, rtz +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: test_trunc_si32: ; CHECKIZFINX: # %bb.0: -; CHECKIZFINX-NEXT: fcvt.w.s a1, a0, rtz -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; 
CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.w.s a0, a0, rtz +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret %a = call float @llvm.trunc.f32(float %x) %b = call i32 @llvm.fptosi.sat.i32.f32(float %a) @@ -603,11 +585,10 @@ ; ; RV64IF-LABEL: test_trunc_si64: ; RV64IF: # %bb.0: -; RV64IF-NEXT: fcvt.l.s a0, fa0, rtz -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.l.s a1, fa0, rtz +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: test_trunc_si64: @@ -661,11 +642,10 @@ ; ; RV64IZFINX-LABEL: test_trunc_si64: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: fcvt.l.s a1, a0, rtz -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.l.s a0, a0, rtz +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret %a = call float @llvm.trunc.f32(float %x) %b = call i64 @llvm.fptosi.sat.i64.f32(float %a) @@ -675,20 +655,18 @@ define signext i32 @test_trunc_ui32(float %x) { ; CHECKIF-LABEL: test_trunc_ui32: ; CHECKIF: # %bb.0: -; CHECKIF-NEXT: fcvt.wu.s a0, fa0, rtz -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: neg a0, a0 +; CHECKIF-NEXT: fcvt.wu.s a1, fa0, rtz +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: test_trunc_ui32: ; CHECKIZFINX: # %bb.0: -; CHECKIZFINX-NEXT: fcvt.wu.s a1, a0, rtz -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.wu.s a0, a0, rtz +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret %a = call float @llvm.trunc.f32(float %x) %b = call i32 @llvm.fptoui.sat.i32.f32(float %a) @@ -734,11 +712,10 @@ ; ; RV64IF-LABEL: test_trunc_ui64: ; RV64IF: # %bb.0: -; RV64IF-NEXT: fcvt.lu.s a0, fa0, rtz -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.lu.s a1, fa0, rtz +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: test_trunc_ui64: @@ -777,11 +754,10 @@ ; ; RV64IZFINX-LABEL: test_trunc_ui64: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: fcvt.lu.s a1, a0, rtz -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.lu.s a0, a0, rtz +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret %a = call float @llvm.trunc.f32(float %x) %b = call i64 @llvm.fptoui.sat.i64.f32(float %a) @@ -791,20 +767,18 @@ define signext i32 @test_round_si32(float %x) { ; CHECKIF-LABEL: test_round_si32: ; CHECKIF: # %bb.0: -; CHECKIF-NEXT: fcvt.w.s a0, fa0, rmm -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: 
neg a0, a0 +; CHECKIF-NEXT: fcvt.w.s a1, fa0, rmm +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: test_round_si32: ; CHECKIZFINX: # %bb.0: -; CHECKIZFINX-NEXT: fcvt.w.s a1, a0, rmm -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.w.s a0, a0, rmm +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret %a = call float @llvm.round.f32(float %x) %b = call i32 @llvm.fptosi.sat.i32.f32(float %a) @@ -863,11 +837,10 @@ ; ; RV64IF-LABEL: test_round_si64: ; RV64IF: # %bb.0: -; RV64IF-NEXT: fcvt.l.s a0, fa0, rmm -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.l.s a1, fa0, rmm +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: test_round_si64: @@ -921,11 +894,10 @@ ; ; RV64IZFINX-LABEL: test_round_si64: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: fcvt.l.s a1, a0, rmm -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.l.s a0, a0, rmm +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret %a = call float @llvm.round.f32(float %x) %b = call i64 @llvm.fptosi.sat.i64.f32(float %a) @@ -935,20 +907,18 @@ define signext i32 @test_round_ui32(float %x) { ; CHECKIF-LABEL: test_round_ui32: ; CHECKIF: # %bb.0: -; CHECKIF-NEXT: fcvt.wu.s a0, fa0, rmm -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: neg a0, a0 +; CHECKIF-NEXT: fcvt.wu.s a1, fa0, rmm +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: test_round_ui32: ; CHECKIZFINX: # %bb.0: -; CHECKIZFINX-NEXT: fcvt.wu.s a1, a0, rmm -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.wu.s a0, a0, rmm +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret %a = call float @llvm.round.f32(float %x) %b = call i32 @llvm.fptoui.sat.i32.f32(float %a) @@ -994,11 +964,10 @@ ; ; RV64IF-LABEL: test_round_ui64: ; RV64IF: # %bb.0: -; RV64IF-NEXT: fcvt.lu.s a0, fa0, rmm -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.lu.s a1, fa0, rmm +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: test_round_ui64: @@ -1037,11 +1006,10 @@ ; ; RV64IZFINX-LABEL: test_round_ui64: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: fcvt.lu.s a1, a0, rmm -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.lu.s a0, a0, rmm +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret %a = call float @llvm.round.f32(float %x) %b = call i64 @llvm.fptoui.sat.i64.f32(float %a) @@ -1051,20 +1019,18 @@ define signext i32 
@test_roundeven_si32(float %x) { ; CHECKIF-LABEL: test_roundeven_si32: ; CHECKIF: # %bb.0: -; CHECKIF-NEXT: fcvt.w.s a0, fa0, rne -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: neg a0, a0 +; CHECKIF-NEXT: fcvt.w.s a1, fa0, rne +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: test_roundeven_si32: ; CHECKIZFINX: # %bb.0: -; CHECKIZFINX-NEXT: fcvt.w.s a1, a0, rne -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.w.s a0, a0, rne +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret %a = call float @llvm.roundeven.f32(float %x) %b = call i32 @llvm.fptosi.sat.i32.f32(float %a) @@ -1123,11 +1089,10 @@ ; ; RV64IF-LABEL: test_roundeven_si64: ; RV64IF: # %bb.0: -; RV64IF-NEXT: fcvt.l.s a0, fa0, rne -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.l.s a1, fa0, rne +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: test_roundeven_si64: @@ -1181,11 +1146,10 @@ ; ; RV64IZFINX-LABEL: test_roundeven_si64: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: fcvt.l.s a1, a0, rne -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.l.s a0, a0, rne +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret %a = call float @llvm.roundeven.f32(float %x) %b = call i64 @llvm.fptosi.sat.i64.f32(float %a) @@ -1195,20 +1159,18 @@ define signext i32 @test_roundeven_ui32(float %x) { ; CHECKIF-LABEL: test_roundeven_ui32: ; CHECKIF: # %bb.0: -; CHECKIF-NEXT: fcvt.wu.s a0, fa0, rne -; CHECKIF-NEXT: feq.s a1, fa0, fa0 -; CHECKIF-NEXT: seqz a1, a1 -; CHECKIF-NEXT: addi a1, a1, -1 -; CHECKIF-NEXT: and a0, a1, a0 +; CHECKIF-NEXT: feq.s a0, fa0, fa0 +; CHECKIF-NEXT: neg a0, a0 +; CHECKIF-NEXT: fcvt.wu.s a1, fa0, rne +; CHECKIF-NEXT: and a0, a0, a1 ; CHECKIF-NEXT: ret ; ; CHECKIZFINX-LABEL: test_roundeven_ui32: ; CHECKIZFINX: # %bb.0: -; CHECKIZFINX-NEXT: fcvt.wu.s a1, a0, rne -; CHECKIZFINX-NEXT: feq.s a0, a0, a0 -; CHECKIZFINX-NEXT: seqz a0, a0 -; CHECKIZFINX-NEXT: addi a0, a0, -1 -; CHECKIZFINX-NEXT: and a0, a0, a1 +; CHECKIZFINX-NEXT: feq.s a1, a0, a0 +; CHECKIZFINX-NEXT: neg a1, a1 +; CHECKIZFINX-NEXT: fcvt.wu.s a0, a0, rne +; CHECKIZFINX-NEXT: and a0, a1, a0 ; CHECKIZFINX-NEXT: ret %a = call float @llvm.roundeven.f32(float %x) %b = call i32 @llvm.fptoui.sat.i32.f32(float %a) @@ -1254,11 +1216,10 @@ ; ; RV64IF-LABEL: test_roundeven_ui64: ; RV64IF: # %bb.0: -; RV64IF-NEXT: fcvt.lu.s a0, fa0, rne -; RV64IF-NEXT: feq.s a1, fa0, fa0 -; RV64IF-NEXT: seqz a1, a1 -; RV64IF-NEXT: addi a1, a1, -1 -; RV64IF-NEXT: and a0, a1, a0 +; RV64IF-NEXT: feq.s a0, fa0, fa0 +; RV64IF-NEXT: neg a0, a0 +; RV64IF-NEXT: fcvt.lu.s a1, fa0, rne +; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret ; ; RV32IZFINX-LABEL: test_roundeven_ui64: @@ -1297,11 +1258,10 @@ ; ; RV64IZFINX-LABEL: test_roundeven_ui64: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: fcvt.lu.s a1, a0, rne -; RV64IZFINX-NEXT: feq.s a0, a0, a0 -; RV64IZFINX-NEXT: seqz a0, a0 -; RV64IZFINX-NEXT: 
addi a0, a0, -1 -; RV64IZFINX-NEXT: and a0, a0, a1 +; RV64IZFINX-NEXT: feq.s a1, a0, a0 +; RV64IZFINX-NEXT: neg a1, a1 +; RV64IZFINX-NEXT: fcvt.lu.s a0, a0, rne +; RV64IZFINX-NEXT: and a0, a1, a0 ; RV64IZFINX-NEXT: ret %a = call float @llvm.roundeven.f32(float %x) %b = call i64 @llvm.fptoui.sat.i64.f32(float %a) diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll --- a/llvm/test/CodeGen/RISCV/forced-atomics.ll +++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll @@ -3672,7 +3672,8 @@ ; RV32-NEXT: .LBB52_2: # %atomicrmw.start ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: sltiu a0, a4, 2 -; RV32-NEXT: seqz a2, a1 +; RV32-NEXT: snez a2, a1 +; RV32-NEXT: addi a2, a2, -1 ; RV32-NEXT: and a0, a2, a0 ; RV32-NEXT: mv a2, a4 ; RV32-NEXT: bnez a0, .LBB52_1 diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -73,11 +73,10 @@ ; ; RV32IFD-LABEL: stest_f64i32: ; RV32IFD: # %bb.0: # %entry -; RV32IFD-NEXT: fcvt.w.d a0, fa0, rtz -; RV32IFD-NEXT: feq.d a1, fa0, fa0 -; RV32IFD-NEXT: seqz a1, a1 -; RV32IFD-NEXT: addi a1, a1, -1 -; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: feq.d a0, fa0, fa0 +; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: fcvt.w.d a1, fa0, rtz +; RV32IFD-NEXT: and a0, a0, a1 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: stest_f64i32: @@ -115,7 +114,8 @@ ; RV32IF-NEXT: .cfi_offset ra, -4 ; RV32IF-NEXT: call __fixunsdfdi@plt ; RV32IF-NEXT: sltiu a2, a0, -1 -; RV32IF-NEXT: seqz a1, a1 +; RV32IF-NEXT: snez a1, a1 +; RV32IF-NEXT: addi a1, a1, -1 ; RV32IF-NEXT: and a1, a1, a2 ; RV32IF-NEXT: addi a1, a1, -1 ; RV32IF-NEXT: or a0, a1, a0 @@ -142,11 +142,10 @@ ; ; RV32IFD-LABEL: utest_f64i32: ; RV32IFD: # %bb.0: # %entry -; RV32IFD-NEXT: fcvt.wu.d a0, fa0, rtz -; RV32IFD-NEXT: feq.d a1, fa0, fa0 -; RV32IFD-NEXT: seqz a1, a1 -; RV32IFD-NEXT: addi a1, a1, -1 -; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: feq.d a0, fa0, fa0 +; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: fcvt.wu.d a1, fa0, rtz +; RV32IFD-NEXT: and a0, a0, a1 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: utest_f64i32: @@ -221,11 +220,10 @@ ; ; RV32IFD-LABEL: ustest_f64i32: ; RV32IFD: # %bb.0: # %entry -; RV32IFD-NEXT: fcvt.wu.d a0, fa0, rtz -; RV32IFD-NEXT: feq.d a1, fa0, fa0 -; RV32IFD-NEXT: seqz a1, a1 -; RV32IFD-NEXT: addi a1, a1, -1 -; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: feq.d a0, fa0, fa0 +; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: fcvt.wu.d a1, fa0, rtz +; RV32IFD-NEXT: and a0, a0, a1 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: ustest_f64i32: @@ -254,11 +252,10 @@ define i32 @stest_f32i32(float %x) { ; RV32-LABEL: stest_f32i32: ; RV32: # %bb.0: # %entry -; RV32-NEXT: fcvt.w.s a0, fa0, rtz -; RV32-NEXT: feq.s a1, fa0, fa0 -; RV32-NEXT: seqz a1, a1 -; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: feq.s a0, fa0, fa0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: fcvt.w.s a1, fa0, rtz +; RV32-NEXT: and a0, a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: stest_f32i32: @@ -290,11 +287,10 @@ define i32 @utest_f32i32(float %x) { ; RV32-LABEL: utest_f32i32: ; RV32: # %bb.0: # %entry -; RV32-NEXT: fcvt.wu.s a0, fa0, rtz -; RV32-NEXT: feq.s a1, fa0, fa0 -; RV32-NEXT: seqz a1, a1 -; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: feq.s a0, fa0, fa0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: fcvt.wu.s a1, fa0, rtz +; RV32-NEXT: and a0, a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: utest_f32i32: @@ -318,11 +314,10 @@ define i32 
@ustest_f32i32(float %x) { ; RV32-LABEL: ustest_f32i32: ; RV32: # %bb.0: # %entry -; RV32-NEXT: fcvt.wu.s a0, fa0, rtz -; RV32-NEXT: feq.s a1, fa0, fa0 -; RV32-NEXT: seqz a1, a1 -; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: feq.s a0, fa0, fa0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: fcvt.wu.s a1, fa0, rtz +; RV32-NEXT: and a0, a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: ustest_f32i32: @@ -433,7 +428,8 @@ ; RV32-NEXT: call __extendhfsf2@plt ; RV32-NEXT: call __fixunssfdi@plt ; RV32-NEXT: sltiu a2, a0, -1 -; RV32-NEXT: seqz a1, a1 +; RV32-NEXT: snez a1, a1 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: or a0, a1, a0 @@ -1208,11 +1204,10 @@ ; ; RV64IFD-LABEL: stest_f64i64: ; RV64IFD: # %bb.0: # %entry -; RV64IFD-NEXT: fcvt.l.d a0, fa0, rtz -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.l.d a1, fa0, rtz +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret entry: %conv = fptosi double %x to i128 @@ -1239,8 +1234,10 @@ ; RV32IF-NEXT: lw a1, 20(sp) ; RV32IF-NEXT: lw a2, 12(sp) ; RV32IF-NEXT: lw a3, 8(sp) -; RV32IF-NEXT: or a4, a1, a0 -; RV32IF-NEXT: seqz a4, a4 +; RV32IF-NEXT: seqz a4, a0 +; RV32IF-NEXT: snez a5, a1 +; RV32IF-NEXT: addi a5, a5, -1 +; RV32IF-NEXT: and a4, a5, a4 ; RV32IF-NEXT: xori a0, a0, 1 ; RV32IF-NEXT: or a0, a0, a1 ; RV32IF-NEXT: seqz a0, a0 @@ -1279,8 +1276,10 @@ ; RV32IFD-NEXT: lw a1, 20(sp) ; RV32IFD-NEXT: lw a2, 12(sp) ; RV32IFD-NEXT: lw a3, 8(sp) -; RV32IFD-NEXT: or a4, a1, a0 -; RV32IFD-NEXT: seqz a4, a4 +; RV32IFD-NEXT: seqz a4, a0 +; RV32IFD-NEXT: snez a5, a1 +; RV32IFD-NEXT: addi a5, a5, -1 +; RV32IFD-NEXT: and a4, a5, a4 ; RV32IFD-NEXT: xori a0, a0, 1 ; RV32IFD-NEXT: or a0, a0, a1 ; RV32IFD-NEXT: seqz a0, a0 @@ -1506,11 +1505,10 @@ ; ; RV64-LABEL: stest_f32i64: ; RV64: # %bb.0: # %entry -; RV64-NEXT: fcvt.l.s a0, fa0, rtz -; RV64-NEXT: feq.s a1, fa0, fa0 -; RV64-NEXT: seqz a1, a1 -; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: feq.s a0, fa0, fa0 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: fcvt.l.s a1, fa0, rtz +; RV64-NEXT: and a0, a0, a1 ; RV64-NEXT: ret entry: %conv = fptosi float %x to i128 @@ -1535,8 +1533,10 @@ ; RV32-NEXT: lw a1, 20(sp) ; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: seqz a4, a4 +; RV32-NEXT: seqz a4, a0 +; RV32-NEXT: snez a5, a1 +; RV32-NEXT: addi a5, a5, -1 +; RV32-NEXT: and a4, a5, a4 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: seqz a0, a0 @@ -1787,8 +1787,10 @@ ; RV32-NEXT: lw a1, 20(sp) ; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: seqz a4, a4 +; RV32-NEXT: seqz a4, a0 +; RV32-NEXT: snez a5, a1 +; RV32-NEXT: addi a5, a5, -1 +; RV32-NEXT: and a4, a5, a4 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: seqz a0, a0 @@ -1995,11 +1997,10 @@ ; ; RV32IFD-LABEL: stest_f64i32_mm: ; RV32IFD: # %bb.0: # %entry -; RV32IFD-NEXT: fcvt.w.d a0, fa0, rtz -; RV32IFD-NEXT: feq.d a1, fa0, fa0 -; RV32IFD-NEXT: seqz a1, a1 -; RV32IFD-NEXT: addi a1, a1, -1 -; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: feq.d a0, fa0, fa0 +; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: fcvt.w.d a1, fa0, rtz +; RV32IFD-NEXT: and a0, a0, a1 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: stest_f64i32_mm: @@ -2060,11 +2061,10 @@ ; ; RV32IFD-LABEL: utest_f64i32_mm: ; RV32IFD: # %bb.0: # %entry -; 
RV32IFD-NEXT: fcvt.wu.d a0, fa0, rtz -; RV32IFD-NEXT: feq.d a1, fa0, fa0 -; RV32IFD-NEXT: seqz a1, a1 -; RV32IFD-NEXT: addi a1, a1, -1 -; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: feq.d a0, fa0, fa0 +; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: fcvt.wu.d a1, fa0, rtz +; RV32IFD-NEXT: and a0, a0, a1 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: utest_f64i32_mm: @@ -2129,11 +2129,10 @@ ; ; RV32IFD-LABEL: ustest_f64i32_mm: ; RV32IFD: # %bb.0: # %entry -; RV32IFD-NEXT: fcvt.wu.d a0, fa0, rtz -; RV32IFD-NEXT: feq.d a1, fa0, fa0 -; RV32IFD-NEXT: seqz a1, a1 -; RV32IFD-NEXT: addi a1, a1, -1 -; RV32IFD-NEXT: and a0, a1, a0 +; RV32IFD-NEXT: feq.d a0, fa0, fa0 +; RV32IFD-NEXT: neg a0, a0 +; RV32IFD-NEXT: fcvt.wu.d a1, fa0, rtz +; RV32IFD-NEXT: and a0, a0, a1 ; RV32IFD-NEXT: ret ; ; RV64IFD-LABEL: ustest_f64i32_mm: @@ -2160,11 +2159,10 @@ define i32 @stest_f32i32_mm(float %x) { ; RV32-LABEL: stest_f32i32_mm: ; RV32: # %bb.0: # %entry -; RV32-NEXT: fcvt.w.s a0, fa0, rtz -; RV32-NEXT: feq.s a1, fa0, fa0 -; RV32-NEXT: seqz a1, a1 -; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: feq.s a0, fa0, fa0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: fcvt.w.s a1, fa0, rtz +; RV32-NEXT: and a0, a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: stest_f32i32_mm: @@ -2194,11 +2192,10 @@ define i32 @utest_f32i32_mm(float %x) { ; RV32-LABEL: utest_f32i32_mm: ; RV32: # %bb.0: # %entry -; RV32-NEXT: fcvt.wu.s a0, fa0, rtz -; RV32-NEXT: feq.s a1, fa0, fa0 -; RV32-NEXT: seqz a1, a1 -; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: feq.s a0, fa0, fa0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: fcvt.wu.s a1, fa0, rtz +; RV32-NEXT: and a0, a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: utest_f32i32_mm: @@ -2221,11 +2218,10 @@ define i32 @ustest_f32i32_mm(float %x) { ; RV32-LABEL: ustest_f32i32_mm: ; RV32: # %bb.0: # %entry -; RV32-NEXT: fcvt.wu.s a0, fa0, rtz -; RV32-NEXT: feq.s a1, fa0, fa0 -; RV32-NEXT: seqz a1, a1 -; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: feq.s a0, fa0, fa0 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: fcvt.wu.s a1, fa0, rtz +; RV32-NEXT: and a0, a0, a1 ; RV32-NEXT: ret ; ; RV64-LABEL: ustest_f32i32_mm: @@ -3158,11 +3154,10 @@ ; ; RV64IFD-LABEL: stest_f64i64_mm: ; RV64IFD: # %bb.0: # %entry -; RV64IFD-NEXT: fcvt.l.d a0, fa0, rtz -; RV64IFD-NEXT: feq.d a1, fa0, fa0 -; RV64IFD-NEXT: seqz a1, a1 -; RV64IFD-NEXT: addi a1, a1, -1 -; RV64IFD-NEXT: and a0, a1, a0 +; RV64IFD-NEXT: feq.d a0, fa0, fa0 +; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: fcvt.l.d a1, fa0, rtz +; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret entry: %conv = fptosi double %x to i128 @@ -3187,9 +3182,11 @@ ; RV32IF-NEXT: lw a1, 20(sp) ; RV32IF-NEXT: lw a2, 12(sp) ; RV32IF-NEXT: lw a3, 8(sp) -; RV32IF-NEXT: or a4, a1, a0 -; RV32IF-NEXT: snez a4, a4 -; RV32IF-NEXT: addi a4, a4, -1 +; RV32IF-NEXT: seqz a4, a0 +; RV32IF-NEXT: snez a5, a1 +; RV32IF-NEXT: addi a5, a5, -1 +; RV32IF-NEXT: and a4, a5, a4 +; RV32IF-NEXT: neg a4, a4 ; RV32IF-NEXT: and a3, a4, a3 ; RV32IF-NEXT: xori a0, a0, 1 ; RV32IF-NEXT: or a0, a0, a1 @@ -3232,9 +3229,11 @@ ; RV32IFD-NEXT: lw a1, 20(sp) ; RV32IFD-NEXT: lw a2, 12(sp) ; RV32IFD-NEXT: lw a3, 8(sp) -; RV32IFD-NEXT: or a4, a1, a0 -; RV32IFD-NEXT: snez a4, a4 -; RV32IFD-NEXT: addi a4, a4, -1 +; RV32IFD-NEXT: seqz a4, a0 +; RV32IFD-NEXT: snez a5, a1 +; RV32IFD-NEXT: addi a5, a5, -1 +; RV32IFD-NEXT: and a4, a5, a4 +; RV32IFD-NEXT: neg a4, a4 ; RV32IFD-NEXT: and a3, a4, a3 ; RV32IFD-NEXT: xori a0, a0, 1 ; RV32IFD-NEXT: or a0, a0, a1 @@ -3460,11 +3459,10 @@ ; ; RV64-LABEL: stest_f32i64_mm: 
; RV64: # %bb.0: # %entry -; RV64-NEXT: fcvt.l.s a0, fa0, rtz -; RV64-NEXT: feq.s a1, fa0, fa0 -; RV64-NEXT: seqz a1, a1 -; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: feq.s a0, fa0, fa0 +; RV64-NEXT: neg a0, a0 +; RV64-NEXT: fcvt.l.s a1, fa0, rtz +; RV64-NEXT: and a0, a0, a1 ; RV64-NEXT: ret entry: %conv = fptosi float %x to i128 @@ -3487,9 +3485,11 @@ ; RV32-NEXT: lw a1, 20(sp) ; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: snez a4, a4 -; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: seqz a4, a0 +; RV32-NEXT: snez a5, a1 +; RV32-NEXT: addi a5, a5, -1 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: neg a4, a4 ; RV32-NEXT: and a3, a4, a3 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: or a0, a0, a1 @@ -3764,9 +3764,11 @@ ; RV32-NEXT: lw a1, 20(sp) ; RV32-NEXT: lw a2, 12(sp) ; RV32-NEXT: lw a3, 8(sp) -; RV32-NEXT: or a4, a1, a0 -; RV32-NEXT: snez a4, a4 -; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: seqz a4, a0 +; RV32-NEXT: snez a5, a1 +; RV32-NEXT: addi a5, a5, -1 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: neg a4, a4 ; RV32-NEXT: and a3, a4, a3 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: or a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll --- a/llvm/test/CodeGen/RISCV/half-convert.ll +++ b/llvm/test/CodeGen/RISCV/half-convert.ll @@ -858,47 +858,42 @@ define i32 @fcvt_w_h_sat(half %a) nounwind { ; CHECKIZFH-LABEL: fcvt_w_h_sat: ; CHECKIZFH: # %bb.0: # %start -; CHECKIZFH-NEXT: fcvt.w.h a0, fa0, rtz -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.w.h a1, fa0, rtz +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_w_h_sat: ; RV32IDZFH: # %bb.0: # %start -; RV32IDZFH-NEXT: fcvt.w.h a0, fa0, rtz -; RV32IDZFH-NEXT: feq.h a1, fa0, fa0 -; RV32IDZFH-NEXT: seqz a1, a1 -; RV32IDZFH-NEXT: addi a1, a1, -1 -; RV32IDZFH-NEXT: and a0, a1, a0 +; RV32IDZFH-NEXT: feq.h a0, fa0, fa0 +; RV32IDZFH-NEXT: neg a0, a0 +; RV32IDZFH-NEXT: fcvt.w.h a1, fa0, rtz +; RV32IDZFH-NEXT: and a0, a0, a1 ; RV32IDZFH-NEXT: ret ; ; RV64IDZFH-LABEL: fcvt_w_h_sat: ; RV64IDZFH: # %bb.0: # %start -; RV64IDZFH-NEXT: fcvt.w.h a0, fa0, rtz -; RV64IDZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IDZFH-NEXT: seqz a1, a1 -; RV64IDZFH-NEXT: addi a1, a1, -1 -; RV64IDZFH-NEXT: and a0, a1, a0 +; RV64IDZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IDZFH-NEXT: neg a0, a0 +; RV64IDZFH-NEXT: fcvt.w.h a1, fa0, rtz +; RV64IDZFH-NEXT: and a0, a0, a1 ; RV64IDZFH-NEXT: ret ; ; CHECKIZHINX-LABEL: fcvt_w_h_sat: ; CHECKIZHINX: # %bb.0: # %start -; CHECKIZHINX-NEXT: fcvt.w.h a1, a0, rtz -; CHECKIZHINX-NEXT: feq.h a0, a0, a0 -; CHECKIZHINX-NEXT: seqz a0, a0 -; CHECKIZHINX-NEXT: addi a0, a0, -1 -; CHECKIZHINX-NEXT: and a0, a0, a1 +; CHECKIZHINX-NEXT: feq.h a1, a0, a0 +; CHECKIZHINX-NEXT: neg a1, a1 +; CHECKIZHINX-NEXT: fcvt.w.h a0, a0, rtz +; CHECKIZHINX-NEXT: and a0, a1, a0 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZDINXZHINX-LABEL: fcvt_w_h_sat: ; CHECKIZDINXZHINX: # %bb.0: # %start -; CHECKIZDINXZHINX-NEXT: fcvt.w.h a1, a0, rtz -; CHECKIZDINXZHINX-NEXT: feq.h a0, a0, a0 -; CHECKIZDINXZHINX-NEXT: seqz a0, a0 -; CHECKIZDINXZHINX-NEXT: addi a0, a0, -1 -; CHECKIZDINXZHINX-NEXT: and a0, a0, a1 +; CHECKIZDINXZHINX-NEXT: feq.h a1, a0, a0 +; CHECKIZDINXZHINX-NEXT: neg a1, a1 +; CHECKIZDINXZHINX-NEXT: fcvt.w.h a0, a0, rtz +; CHECKIZDINXZHINX-NEXT: and a0, a1, a0 ; 
CHECKIZDINXZHINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_h_sat: @@ -994,61 +989,55 @@ ; CHECK32-IZFHMIN-LABEL: fcvt_w_h_sat: ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK32-IZFHMIN-NEXT: fcvt.w.s a0, fa5, rtz -; CHECK32-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK32-IZFHMIN-NEXT: seqz a1, a1 -; CHECK32-IZFHMIN-NEXT: addi a1, a1, -1 -; CHECK32-IZFHMIN-NEXT: and a0, a1, a0 +; CHECK32-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK32-IZFHMIN-NEXT: neg a0, a0 +; CHECK32-IZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz +; CHECK32-IZFHMIN-NEXT: and a0, a0, a1 ; CHECK32-IZFHMIN-NEXT: ret ; ; CHECK64-IZFHMIN-LABEL: fcvt_w_h_sat: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: fcvt.w.s a0, fa5, rtz -; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: seqz a1, a1 -; CHECK64-IZFHMIN-NEXT: addi a1, a1, -1 -; CHECK64-IZFHMIN-NEXT: and a0, a1, a0 +; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: neg a0, a0 +; CHECK64-IZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz +; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 ; CHECK64-IZFHMIN-NEXT: ret ; ; CHECK32-IZHINXMIN-LABEL: fcvt_w_h_sat: ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECK32-IZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK32-IZHINXMIN-NEXT: seqz a0, a0 -; CHECK32-IZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK32-IZHINXMIN-NEXT: and a0, a0, a1 +; CHECK32-IZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK32-IZHINXMIN-NEXT: neg a1, a1 +; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 ; CHECK32-IZHINXMIN-NEXT: ret ; ; CHECK64-IZHINXMIN-LABEL: fcvt_w_h_sat: ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECK64-IZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK64-IZHINXMIN-NEXT: and a0, a0, a1 +; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK64-IZHINXMIN-NEXT: neg a1, a1 +; CHECK64-IZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 ; CHECK64-IZHINXMIN-NEXT: ret ; ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_w_h_sat: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: seqz a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a0, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: ret ; ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_w_h_sat: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a0, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: neg a1, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: ret start: %0 = tail call i32 @llvm.fptosi.sat.i32.f16(half %a) @@ -1267,80 +1256,72 @@ define i32 @fcvt_wu_h_sat(half %a) nounwind { 
; RV32IZFH-LABEL: fcvt_wu_h_sat: ; RV32IZFH: # %bb.0: # %start -; RV32IZFH-NEXT: fcvt.wu.h a0, fa0, rtz -; RV32IZFH-NEXT: feq.h a1, fa0, fa0 -; RV32IZFH-NEXT: seqz a1, a1 -; RV32IZFH-NEXT: addi a1, a1, -1 -; RV32IZFH-NEXT: and a0, a1, a0 +; RV32IZFH-NEXT: feq.h a0, fa0, fa0 +; RV32IZFH-NEXT: neg a0, a0 +; RV32IZFH-NEXT: fcvt.wu.h a1, fa0, rtz +; RV32IZFH-NEXT: and a0, a0, a1 ; RV32IZFH-NEXT: ret ; ; RV64IZFH-LABEL: fcvt_wu_h_sat: ; RV64IZFH: # %bb.0: # %start -; RV64IZFH-NEXT: fcvt.wu.h a0, fa0, rtz -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addiw a1, a1, -1 -; RV64IZFH-NEXT: and a0, a0, a1 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: negw a0, a0 +; RV64IZFH-NEXT: fcvt.wu.h a1, fa0, rtz +; RV64IZFH-NEXT: and a0, a1, a0 ; RV64IZFH-NEXT: slli a0, a0, 32 ; RV64IZFH-NEXT: srli a0, a0, 32 ; RV64IZFH-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_wu_h_sat: ; RV32IDZFH: # %bb.0: # %start -; RV32IDZFH-NEXT: fcvt.wu.h a0, fa0, rtz -; RV32IDZFH-NEXT: feq.h a1, fa0, fa0 -; RV32IDZFH-NEXT: seqz a1, a1 -; RV32IDZFH-NEXT: addi a1, a1, -1 -; RV32IDZFH-NEXT: and a0, a1, a0 +; RV32IDZFH-NEXT: feq.h a0, fa0, fa0 +; RV32IDZFH-NEXT: neg a0, a0 +; RV32IDZFH-NEXT: fcvt.wu.h a1, fa0, rtz +; RV32IDZFH-NEXT: and a0, a0, a1 ; RV32IDZFH-NEXT: ret ; ; RV64IDZFH-LABEL: fcvt_wu_h_sat: ; RV64IDZFH: # %bb.0: # %start -; RV64IDZFH-NEXT: fcvt.wu.h a0, fa0, rtz -; RV64IDZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IDZFH-NEXT: seqz a1, a1 -; RV64IDZFH-NEXT: addiw a1, a1, -1 -; RV64IDZFH-NEXT: and a0, a0, a1 +; RV64IDZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IDZFH-NEXT: negw a0, a0 +; RV64IDZFH-NEXT: fcvt.wu.h a1, fa0, rtz +; RV64IDZFH-NEXT: and a0, a1, a0 ; RV64IDZFH-NEXT: slli a0, a0, 32 ; RV64IDZFH-NEXT: srli a0, a0, 32 ; RV64IDZFH-NEXT: ret ; ; RV32IZHINX-LABEL: fcvt_wu_h_sat: ; RV32IZHINX: # %bb.0: # %start -; RV32IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV32IZHINX-NEXT: feq.h a0, a0, a0 -; RV32IZHINX-NEXT: seqz a0, a0 -; RV32IZHINX-NEXT: addi a0, a0, -1 -; RV32IZHINX-NEXT: and a0, a0, a1 +; RV32IZHINX-NEXT: feq.h a1, a0, a0 +; RV32IZHINX-NEXT: neg a1, a1 +; RV32IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV32IZHINX-NEXT: and a0, a1, a0 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: fcvt_wu_h_sat: ; RV64IZHINX: # %bb.0: # %start -; RV64IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addiw a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a1, a0 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: negw a1, a1 +; RV64IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a0, a1 ; RV64IZHINX-NEXT: slli a0, a0, 32 ; RV64IZHINX-NEXT: srli a0, a0, 32 ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_wu_h_sat: ; RV32IZDINXZHINX: # %bb.0: # %start -; RV32IZDINXZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV32IZDINXZHINX-NEXT: feq.h a0, a0, a0 -; RV32IZDINXZHINX-NEXT: seqz a0, a0 -; RV32IZDINXZHINX-NEXT: addi a0, a0, -1 -; RV32IZDINXZHINX-NEXT: and a0, a0, a1 +; RV32IZDINXZHINX-NEXT: feq.h a1, a0, a0 +; RV32IZDINXZHINX-NEXT: neg a1, a1 +; RV32IZDINXZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV32IZDINXZHINX-NEXT: and a0, a1, a0 ; RV32IZDINXZHINX-NEXT: ret ; ; RV64IZDINXZHINX-LABEL: fcvt_wu_h_sat: ; RV64IZDINXZHINX: # %bb.0: # %start -; RV64IZDINXZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV64IZDINXZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZDINXZHINX-NEXT: seqz a0, a0 -; RV64IZDINXZHINX-NEXT: addiw a0, a0, -1 -; RV64IZDINXZHINX-NEXT: and a0, a1, a0 +; RV64IZDINXZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZDINXZHINX-NEXT: negw a1, a1 +; RV64IZDINXZHINX-NEXT: fcvt.wu.h 
a0, a0, rtz +; RV64IZDINXZHINX-NEXT: and a0, a0, a1 ; RV64IZDINXZHINX-NEXT: slli a0, a0, 32 ; RV64IZDINXZHINX-NEXT: srli a0, a0, 32 ; RV64IZDINXZHINX-NEXT: ret @@ -1418,21 +1399,19 @@ ; CHECK32-IZFHMIN-LABEL: fcvt_wu_h_sat: ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK32-IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; CHECK32-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK32-IZFHMIN-NEXT: seqz a1, a1 -; CHECK32-IZFHMIN-NEXT: addi a1, a1, -1 -; CHECK32-IZFHMIN-NEXT: and a0, a1, a0 +; CHECK32-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK32-IZFHMIN-NEXT: neg a0, a0 +; CHECK32-IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; CHECK32-IZFHMIN-NEXT: and a0, a0, a1 ; CHECK32-IZFHMIN-NEXT: ret ; ; CHECK64-IZFHMIN-LABEL: fcvt_wu_h_sat: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: seqz a1, a1 -; CHECK64-IZFHMIN-NEXT: addiw a1, a1, -1 -; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 +; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: negw a0, a0 +; CHECK64-IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; CHECK64-IZFHMIN-NEXT: and a0, a1, a0 ; CHECK64-IZFHMIN-NEXT: slli a0, a0, 32 ; CHECK64-IZFHMIN-NEXT: srli a0, a0, 32 ; CHECK64-IZFHMIN-NEXT: ret @@ -1440,21 +1419,19 @@ ; CHECK32-IZHINXMIN-LABEL: fcvt_wu_h_sat: ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; CHECK32-IZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK32-IZHINXMIN-NEXT: seqz a0, a0 -; CHECK32-IZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK32-IZHINXMIN-NEXT: and a0, a0, a1 +; CHECK32-IZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK32-IZHINXMIN-NEXT: neg a1, a1 +; CHECK32-IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 ; CHECK32-IZHINXMIN-NEXT: ret ; ; CHECK64-IZHINXMIN-LABEL: fcvt_wu_h_sat: ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; CHECK64-IZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZHINXMIN-NEXT: addiw a0, a0, -1 -; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 +; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK64-IZHINXMIN-NEXT: negw a1, a1 +; CHECK64-IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; CHECK64-IZHINXMIN-NEXT: and a0, a0, a1 ; CHECK64-IZHINXMIN-NEXT: slli a0, a0, 32 ; CHECK64-IZHINXMIN-NEXT: srli a0, a0, 32 ; CHECK64-IZHINXMIN-NEXT: ret @@ -1462,21 +1439,19 @@ ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_wu_h_sat: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: seqz a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a0, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: ret ; ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_wu_h_sat: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: addiw a0, a0, -1 -; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 +; 
CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: negw a1, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a0, a1 ; CHECK64-IZDINXZHINXMIN-NEXT: slli a0, a0, 32 ; CHECK64-IZDINXZHINXMIN-NEXT: srli a0, a0, 32 ; CHECK64-IZDINXZHINXMIN-NEXT: ret @@ -1657,11 +1632,10 @@ ; ; RV64IZFH-LABEL: fcvt_l_h_sat: ; RV64IZFH: # %bb.0: # %start -; RV64IZFH-NEXT: fcvt.l.h a0, fa0, rtz -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.l.h a1, fa0, rtz +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_l_h_sat: @@ -1705,11 +1679,10 @@ ; ; RV64IDZFH-LABEL: fcvt_l_h_sat: ; RV64IDZFH: # %bb.0: # %start -; RV64IDZFH-NEXT: fcvt.l.h a0, fa0, rtz -; RV64IDZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IDZFH-NEXT: seqz a1, a1 -; RV64IDZFH-NEXT: addi a1, a1, -1 -; RV64IDZFH-NEXT: and a0, a1, a0 +; RV64IDZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IDZFH-NEXT: neg a0, a0 +; RV64IDZFH-NEXT: fcvt.l.h a1, fa0, rtz +; RV64IDZFH-NEXT: and a0, a0, a1 ; RV64IDZFH-NEXT: ret ; ; RV32IZHINX-LABEL: fcvt_l_h_sat: @@ -1758,11 +1731,10 @@ ; ; RV64IZHINX-LABEL: fcvt_l_h_sat: ; RV64IZHINX: # %bb.0: # %start -; RV64IZHINX-NEXT: fcvt.l.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.l.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_l_h_sat: @@ -1811,11 +1783,10 @@ ; ; RV64IZDINXZHINX-LABEL: fcvt_l_h_sat: ; RV64IZDINXZHINX: # %bb.0: # %start -; RV64IZDINXZHINX-NEXT: fcvt.l.h a1, a0, rtz -; RV64IZDINXZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZDINXZHINX-NEXT: seqz a0, a0 -; RV64IZDINXZHINX-NEXT: addi a0, a0, -1 -; RV64IZDINXZHINX-NEXT: and a0, a0, a1 +; RV64IZDINXZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZDINXZHINX-NEXT: neg a1, a1 +; RV64IZDINXZHINX-NEXT: fcvt.l.h a0, a0, rtz +; RV64IZDINXZHINX-NEXT: and a0, a1, a0 ; RV64IZDINXZHINX-NEXT: ret ; ; RV32I-LABEL: fcvt_l_h_sat: @@ -1961,11 +1932,10 @@ ; CHECK64-IZFHMIN-LABEL: fcvt_l_h_sat: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: fcvt.l.s a0, fa5, rtz -; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: seqz a1, a1 -; CHECK64-IZFHMIN-NEXT: addi a1, a1, -1 -; CHECK64-IZFHMIN-NEXT: and a0, a1, a0 +; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: neg a0, a0 +; CHECK64-IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz +; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 ; CHECK64-IZFHMIN-NEXT: ret ; ; RV32IDZFHMIN-LABEL: fcvt_l_h_sat: @@ -2054,11 +2024,10 @@ ; CHECK64-IZHINXMIN-LABEL: fcvt_l_h_sat: ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a1, a0, rtz -; CHECK64-IZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK64-IZHINXMIN-NEXT: and a0, a0, a1 +; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK64-IZHINXMIN-NEXT: neg a1, a1 +; CHECK64-IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz +; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 ; CHECK64-IZHINXMIN-NEXT: ret ; ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_l_h_sat: @@ -2108,11 +2077,10 @@ ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_l_h_sat: ; CHECK64-IZDINXZHINXMIN: # 
%bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a1, a0, rtz -; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a0, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: neg a1, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz +; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: ret start: %0 = tail call i64 @llvm.fptosi.sat.i64.f16(half %a) @@ -2277,11 +2245,10 @@ ; ; RV64IZFH-LABEL: fcvt_lu_h_sat: ; RV64IZFH: # %bb.0: # %start -; RV64IZFH-NEXT: fcvt.lu.h a0, fa0, rtz -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.lu.h a1, fa0, rtz +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_lu_h_sat: @@ -2311,11 +2278,10 @@ ; ; RV64IDZFH-LABEL: fcvt_lu_h_sat: ; RV64IDZFH: # %bb.0: # %start -; RV64IDZFH-NEXT: fcvt.lu.h a0, fa0, rtz -; RV64IDZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IDZFH-NEXT: seqz a1, a1 -; RV64IDZFH-NEXT: addi a1, a1, -1 -; RV64IDZFH-NEXT: and a0, a1, a0 +; RV64IDZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IDZFH-NEXT: neg a0, a0 +; RV64IDZFH-NEXT: fcvt.lu.h a1, fa0, rtz +; RV64IDZFH-NEXT: and a0, a0, a1 ; RV64IDZFH-NEXT: ret ; ; RV32IZHINX-LABEL: fcvt_lu_h_sat: @@ -2344,11 +2310,10 @@ ; ; RV64IZHINX-LABEL: fcvt_lu_h_sat: ; RV64IZHINX: # %bb.0: # %start -; RV64IZHINX-NEXT: fcvt.lu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.lu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_lu_h_sat: @@ -2377,11 +2342,10 @@ ; ; RV64IZDINXZHINX-LABEL: fcvt_lu_h_sat: ; RV64IZDINXZHINX: # %bb.0: # %start -; RV64IZDINXZHINX-NEXT: fcvt.lu.h a1, a0, rtz -; RV64IZDINXZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZDINXZHINX-NEXT: seqz a0, a0 -; RV64IZDINXZHINX-NEXT: addi a0, a0, -1 -; RV64IZDINXZHINX-NEXT: and a0, a0, a1 +; RV64IZDINXZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZDINXZHINX-NEXT: neg a1, a1 +; RV64IZDINXZHINX-NEXT: fcvt.lu.h a0, a0, rtz +; RV64IZDINXZHINX-NEXT: and a0, a1, a0 ; RV64IZDINXZHINX-NEXT: ret ; ; RV32I-LABEL: fcvt_lu_h_sat: @@ -2476,11 +2440,10 @@ ; CHECK64-IZFHMIN-LABEL: fcvt_lu_h_sat: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: fcvt.lu.s a0, fa5, rtz -; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: seqz a1, a1 -; CHECK64-IZFHMIN-NEXT: addi a1, a1, -1 -; CHECK64-IZFHMIN-NEXT: and a0, a1, a0 +; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: neg a0, a0 +; CHECK64-IZFHMIN-NEXT: fcvt.lu.s a1, fa5, rtz +; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 ; CHECK64-IZFHMIN-NEXT: ret ; ; CHECK32-IZHINXMIN-LABEL: fcvt_lu_h_sat: @@ -2510,11 +2473,10 @@ ; CHECK64-IZHINXMIN-LABEL: fcvt_lu_h_sat: ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZHINXMIN-NEXT: fcvt.lu.s a1, a0, rtz -; CHECK64-IZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK64-IZHINXMIN-NEXT: and a0, a0, a1 +; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 +; 
CHECK64-IZHINXMIN-NEXT: neg a1, a1 +; CHECK64-IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz +; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 ; CHECK64-IZHINXMIN-NEXT: ret ; ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_lu_h_sat: @@ -2544,11 +2506,10 @@ ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_lu_h_sat: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.lu.s a1, a0, rtz -; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a0, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: neg a1, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz +; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: ret start: %0 = tail call i64 @llvm.fptoui.sat.i64.f16(half %a) @@ -5976,80 +5937,72 @@ define zeroext i32 @fcvt_wu_h_sat_zext(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_wu_h_sat_zext: ; RV32IZFH: # %bb.0: # %start -; RV32IZFH-NEXT: fcvt.wu.h a0, fa0, rtz -; RV32IZFH-NEXT: feq.h a1, fa0, fa0 -; RV32IZFH-NEXT: seqz a1, a1 -; RV32IZFH-NEXT: addi a1, a1, -1 -; RV32IZFH-NEXT: and a0, a1, a0 +; RV32IZFH-NEXT: feq.h a0, fa0, fa0 +; RV32IZFH-NEXT: neg a0, a0 +; RV32IZFH-NEXT: fcvt.wu.h a1, fa0, rtz +; RV32IZFH-NEXT: and a0, a0, a1 ; RV32IZFH-NEXT: ret ; ; RV64IZFH-LABEL: fcvt_wu_h_sat_zext: ; RV64IZFH: # %bb.0: # %start -; RV64IZFH-NEXT: fcvt.wu.h a0, fa0, rtz -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addiw a1, a1, -1 -; RV64IZFH-NEXT: and a0, a0, a1 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: negw a0, a0 +; RV64IZFH-NEXT: fcvt.wu.h a1, fa0, rtz +; RV64IZFH-NEXT: and a0, a1, a0 ; RV64IZFH-NEXT: slli a0, a0, 32 ; RV64IZFH-NEXT: srli a0, a0, 32 ; RV64IZFH-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_wu_h_sat_zext: ; RV32IDZFH: # %bb.0: # %start -; RV32IDZFH-NEXT: fcvt.wu.h a0, fa0, rtz -; RV32IDZFH-NEXT: feq.h a1, fa0, fa0 -; RV32IDZFH-NEXT: seqz a1, a1 -; RV32IDZFH-NEXT: addi a1, a1, -1 -; RV32IDZFH-NEXT: and a0, a1, a0 +; RV32IDZFH-NEXT: feq.h a0, fa0, fa0 +; RV32IDZFH-NEXT: neg a0, a0 +; RV32IDZFH-NEXT: fcvt.wu.h a1, fa0, rtz +; RV32IDZFH-NEXT: and a0, a0, a1 ; RV32IDZFH-NEXT: ret ; ; RV64IDZFH-LABEL: fcvt_wu_h_sat_zext: ; RV64IDZFH: # %bb.0: # %start -; RV64IDZFH-NEXT: fcvt.wu.h a0, fa0, rtz -; RV64IDZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IDZFH-NEXT: seqz a1, a1 -; RV64IDZFH-NEXT: addiw a1, a1, -1 -; RV64IDZFH-NEXT: and a0, a0, a1 +; RV64IDZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IDZFH-NEXT: negw a0, a0 +; RV64IDZFH-NEXT: fcvt.wu.h a1, fa0, rtz +; RV64IDZFH-NEXT: and a0, a1, a0 ; RV64IDZFH-NEXT: slli a0, a0, 32 ; RV64IDZFH-NEXT: srli a0, a0, 32 ; RV64IDZFH-NEXT: ret ; ; RV32IZHINX-LABEL: fcvt_wu_h_sat_zext: ; RV32IZHINX: # %bb.0: # %start -; RV32IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV32IZHINX-NEXT: feq.h a0, a0, a0 -; RV32IZHINX-NEXT: seqz a0, a0 -; RV32IZHINX-NEXT: addi a0, a0, -1 -; RV32IZHINX-NEXT: and a0, a0, a1 +; RV32IZHINX-NEXT: feq.h a1, a0, a0 +; RV32IZHINX-NEXT: neg a1, a1 +; RV32IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV32IZHINX-NEXT: and a0, a1, a0 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: fcvt_wu_h_sat_zext: ; RV64IZHINX: # %bb.0: # %start -; RV64IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addiw a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a1, a0 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: negw a1, a1 +; RV64IZHINX-NEXT: fcvt.wu.h a0, a0, 
rtz +; RV64IZHINX-NEXT: and a0, a0, a1 ; RV64IZHINX-NEXT: slli a0, a0, 32 ; RV64IZHINX-NEXT: srli a0, a0, 32 ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_wu_h_sat_zext: ; RV32IZDINXZHINX: # %bb.0: # %start -; RV32IZDINXZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV32IZDINXZHINX-NEXT: feq.h a0, a0, a0 -; RV32IZDINXZHINX-NEXT: seqz a0, a0 -; RV32IZDINXZHINX-NEXT: addi a0, a0, -1 -; RV32IZDINXZHINX-NEXT: and a0, a0, a1 +; RV32IZDINXZHINX-NEXT: feq.h a1, a0, a0 +; RV32IZDINXZHINX-NEXT: neg a1, a1 +; RV32IZDINXZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV32IZDINXZHINX-NEXT: and a0, a1, a0 ; RV32IZDINXZHINX-NEXT: ret ; ; RV64IZDINXZHINX-LABEL: fcvt_wu_h_sat_zext: ; RV64IZDINXZHINX: # %bb.0: # %start -; RV64IZDINXZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV64IZDINXZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZDINXZHINX-NEXT: seqz a0, a0 -; RV64IZDINXZHINX-NEXT: addiw a0, a0, -1 -; RV64IZDINXZHINX-NEXT: and a0, a1, a0 +; RV64IZDINXZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZDINXZHINX-NEXT: negw a1, a1 +; RV64IZDINXZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV64IZDINXZHINX-NEXT: and a0, a0, a1 ; RV64IZDINXZHINX-NEXT: slli a0, a0, 32 ; RV64IZDINXZHINX-NEXT: srli a0, a0, 32 ; RV64IZDINXZHINX-NEXT: ret @@ -6129,21 +6082,19 @@ ; CHECK32-IZFHMIN-LABEL: fcvt_wu_h_sat_zext: ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK32-IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; CHECK32-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK32-IZFHMIN-NEXT: seqz a1, a1 -; CHECK32-IZFHMIN-NEXT: addi a1, a1, -1 -; CHECK32-IZFHMIN-NEXT: and a0, a1, a0 +; CHECK32-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK32-IZFHMIN-NEXT: neg a0, a0 +; CHECK32-IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; CHECK32-IZFHMIN-NEXT: and a0, a0, a1 ; CHECK32-IZFHMIN-NEXT: ret ; ; CHECK64-IZFHMIN-LABEL: fcvt_wu_h_sat_zext: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: seqz a1, a1 -; CHECK64-IZFHMIN-NEXT: addiw a1, a1, -1 -; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 +; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: negw a0, a0 +; CHECK64-IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; CHECK64-IZFHMIN-NEXT: and a0, a1, a0 ; CHECK64-IZFHMIN-NEXT: slli a0, a0, 32 ; CHECK64-IZFHMIN-NEXT: srli a0, a0, 32 ; CHECK64-IZFHMIN-NEXT: ret @@ -6151,21 +6102,19 @@ ; CHECK32-IZHINXMIN-LABEL: fcvt_wu_h_sat_zext: ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; CHECK32-IZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK32-IZHINXMIN-NEXT: seqz a0, a0 -; CHECK32-IZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK32-IZHINXMIN-NEXT: and a0, a0, a1 +; CHECK32-IZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK32-IZHINXMIN-NEXT: neg a1, a1 +; CHECK32-IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 ; CHECK32-IZHINXMIN-NEXT: ret ; ; CHECK64-IZHINXMIN-LABEL: fcvt_wu_h_sat_zext: ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; CHECK64-IZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZHINXMIN-NEXT: addiw a0, a0, -1 -; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 +; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK64-IZHINXMIN-NEXT: negw a1, a1 +; CHECK64-IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; CHECK64-IZHINXMIN-NEXT: and a0, a0, a1 ; CHECK64-IZHINXMIN-NEXT: slli a0, a0, 32 ; CHECK64-IZHINXMIN-NEXT: srli a0, a0, 32 ; CHECK64-IZHINXMIN-NEXT: 
ret @@ -6173,21 +6122,19 @@ ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_wu_h_sat_zext: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: seqz a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a0, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: ret ; ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_wu_h_sat_zext: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: addiw a0, a0, -1 -; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: negw a1, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a0, a1 ; CHECK64-IZDINXZHINXMIN-NEXT: slli a0, a0, 32 ; CHECK64-IZDINXZHINXMIN-NEXT: srli a0, a0, 32 ; CHECK64-IZDINXZHINXMIN-NEXT: ret @@ -6199,47 +6146,42 @@ define signext i32 @fcvt_w_h_sat_sext(half %a) nounwind { ; CHECKIZFH-LABEL: fcvt_w_h_sat_sext: ; CHECKIZFH: # %bb.0: # %start -; CHECKIZFH-NEXT: fcvt.w.h a0, fa0, rtz -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.w.h a1, fa0, rtz +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_w_h_sat_sext: ; RV32IDZFH: # %bb.0: # %start -; RV32IDZFH-NEXT: fcvt.w.h a0, fa0, rtz -; RV32IDZFH-NEXT: feq.h a1, fa0, fa0 -; RV32IDZFH-NEXT: seqz a1, a1 -; RV32IDZFH-NEXT: addi a1, a1, -1 -; RV32IDZFH-NEXT: and a0, a1, a0 +; RV32IDZFH-NEXT: feq.h a0, fa0, fa0 +; RV32IDZFH-NEXT: neg a0, a0 +; RV32IDZFH-NEXT: fcvt.w.h a1, fa0, rtz +; RV32IDZFH-NEXT: and a0, a0, a1 ; RV32IDZFH-NEXT: ret ; ; RV64IDZFH-LABEL: fcvt_w_h_sat_sext: ; RV64IDZFH: # %bb.0: # %start -; RV64IDZFH-NEXT: fcvt.w.h a0, fa0, rtz -; RV64IDZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IDZFH-NEXT: seqz a1, a1 -; RV64IDZFH-NEXT: addi a1, a1, -1 -; RV64IDZFH-NEXT: and a0, a1, a0 +; RV64IDZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IDZFH-NEXT: neg a0, a0 +; RV64IDZFH-NEXT: fcvt.w.h a1, fa0, rtz +; RV64IDZFH-NEXT: and a0, a0, a1 ; RV64IDZFH-NEXT: ret ; ; CHECKIZHINX-LABEL: fcvt_w_h_sat_sext: ; CHECKIZHINX: # %bb.0: # %start -; CHECKIZHINX-NEXT: fcvt.w.h a1, a0, rtz -; CHECKIZHINX-NEXT: feq.h a0, a0, a0 -; CHECKIZHINX-NEXT: seqz a0, a0 -; CHECKIZHINX-NEXT: addi a0, a0, -1 -; CHECKIZHINX-NEXT: and a0, a0, a1 +; CHECKIZHINX-NEXT: feq.h a1, a0, a0 +; CHECKIZHINX-NEXT: neg a1, a1 +; CHECKIZHINX-NEXT: fcvt.w.h a0, a0, rtz +; CHECKIZHINX-NEXT: and a0, a1, a0 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZDINXZHINX-LABEL: fcvt_w_h_sat_sext: ; CHECKIZDINXZHINX: # %bb.0: # %start -; CHECKIZDINXZHINX-NEXT: fcvt.w.h a1, a0, rtz -; CHECKIZDINXZHINX-NEXT: feq.h a0, a0, a0 -; CHECKIZDINXZHINX-NEXT: seqz a0, a0 -; CHECKIZDINXZHINX-NEXT: addi a0, a0, -1 -; CHECKIZDINXZHINX-NEXT: and a0, a0, a1 +; CHECKIZDINXZHINX-NEXT: feq.h a1, a0, a0 +; CHECKIZDINXZHINX-NEXT: neg a1, a1 +; CHECKIZDINXZHINX-NEXT: fcvt.w.h a0, a0, rtz +; 
CHECKIZDINXZHINX-NEXT: and a0, a1, a0 ; CHECKIZDINXZHINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_h_sat_sext: @@ -6336,61 +6278,55 @@ ; CHECK32-IZFHMIN-LABEL: fcvt_w_h_sat_sext: ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK32-IZFHMIN-NEXT: fcvt.w.s a0, fa5, rtz -; CHECK32-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK32-IZFHMIN-NEXT: seqz a1, a1 -; CHECK32-IZFHMIN-NEXT: addi a1, a1, -1 -; CHECK32-IZFHMIN-NEXT: and a0, a1, a0 +; CHECK32-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK32-IZFHMIN-NEXT: neg a0, a0 +; CHECK32-IZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz +; CHECK32-IZFHMIN-NEXT: and a0, a0, a1 ; CHECK32-IZFHMIN-NEXT: ret ; ; CHECK64-IZFHMIN-LABEL: fcvt_w_h_sat_sext: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: fcvt.w.s a0, fa5, rtz -; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: seqz a1, a1 -; CHECK64-IZFHMIN-NEXT: addi a1, a1, -1 -; CHECK64-IZFHMIN-NEXT: and a0, a1, a0 +; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: neg a0, a0 +; CHECK64-IZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz +; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 ; CHECK64-IZFHMIN-NEXT: ret ; ; CHECK32-IZHINXMIN-LABEL: fcvt_w_h_sat_sext: ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECK32-IZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK32-IZHINXMIN-NEXT: seqz a0, a0 -; CHECK32-IZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK32-IZHINXMIN-NEXT: and a0, a0, a1 +; CHECK32-IZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK32-IZHINXMIN-NEXT: neg a1, a1 +; CHECK32-IZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECK32-IZHINXMIN-NEXT: and a0, a1, a0 ; CHECK32-IZHINXMIN-NEXT: ret ; ; CHECK64-IZHINXMIN-LABEL: fcvt_w_h_sat_sext: ; CHECK64-IZHINXMIN: # %bb.0: # %start ; CHECK64-IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECK64-IZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK64-IZHINXMIN-NEXT: and a0, a0, a1 +; CHECK64-IZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK64-IZHINXMIN-NEXT: neg a1, a1 +; CHECK64-IZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECK64-IZHINXMIN-NEXT: and a0, a1, a0 ; CHECK64-IZHINXMIN-NEXT: ret ; ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_w_h_sat_sext: ; CHECK32-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: seqz a0, a0 -; CHECK32-IZDINXZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a0, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK32-IZDINXZHINXMIN-NEXT: neg a1, a1 +; CHECK32-IZDINXZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECK32-IZDINXZHINXMIN-NEXT: and a0, a1, a0 ; CHECK32-IZDINXZHINXMIN-NEXT: ret ; ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_w_h_sat_sext: ; CHECK64-IZDINXZHINXMIN: # %bb.0: # %start ; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: seqz a0, a0 -; CHECK64-IZDINXZHINXMIN-NEXT: addi a0, a0, -1 -; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a0, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECK64-IZDINXZHINXMIN-NEXT: neg a1, a1 +; CHECK64-IZDINXZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECK64-IZDINXZHINXMIN-NEXT: and a0, a1, a0 ; CHECK64-IZDINXZHINXMIN-NEXT: ret start: %0 = tail call i32 
@llvm.fptosi.sat.i32.f16(half %a) diff --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll --- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll @@ -19,11 +19,10 @@ define signext i32 @test_floor_si32(half %x) { ; CHECKIZFH-LABEL: test_floor_si32: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: fcvt.w.h a0, fa0, rdn -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.w.h a1, fa0, rdn +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; CHECKIZHINX-LABEL: test_floor_si32: @@ -38,11 +37,10 @@ ; CHECKIZHINX-NEXT: fcvt.h.w a1, a1, rdn ; CHECKIZHINX-NEXT: fsgnj.h a0, a1, a0 ; CHECKIZHINX-NEXT: .LBB0_2: -; CHECKIZHINX-NEXT: fcvt.w.h a1, a0, rtz -; CHECKIZHINX-NEXT: feq.h a0, a0, a0 -; CHECKIZHINX-NEXT: seqz a0, a0 -; CHECKIZHINX-NEXT: addi a0, a0, -1 -; CHECKIZHINX-NEXT: and a0, a0, a1 +; CHECKIZHINX-NEXT: feq.h a1, a0, a0 +; CHECKIZHINX-NEXT: neg a1, a1 +; CHECKIZHINX-NEXT: fcvt.w.h a0, a0, rtz +; CHECKIZHINX-NEXT: and a0, a1, a0 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: test_floor_si32: @@ -60,11 +58,10 @@ ; CHECKIZFHMIN-NEXT: .LBB0_2: ; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.w.s a0, fa5, rtz -; CHECKIZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECKIZFHMIN-NEXT: seqz a1, a1 -; CHECKIZFHMIN-NEXT: addi a1, a1, -1 -; CHECKIZFHMIN-NEXT: and a0, a1, a0 +; CHECKIZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECKIZFHMIN-NEXT: neg a0, a0 +; CHECKIZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz +; CHECKIZFHMIN-NEXT: and a0, a0, a1 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: test_floor_si32: @@ -81,11 +78,10 @@ ; CHECKIZHINXMIN-NEXT: .LBB0_2: ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKIZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECKIZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECKIZHINXMIN-NEXT: seqz a0, a0 -; CHECKIZHINXMIN-NEXT: addi a0, a0, -1 -; CHECKIZHINXMIN-NEXT: and a0, a0, a1 +; CHECKIZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECKIZHINXMIN-NEXT: neg a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECKIZHINXMIN-NEXT: and a0, a1, a0 ; CHECKIZHINXMIN-NEXT: ret %a = call half @llvm.floor.f16(half %x) %b = call i32 @llvm.fptosi.sat.i32.f16(half %a) @@ -144,11 +140,10 @@ ; ; RV64IZFH-LABEL: test_floor_si64: ; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.l.h a0, fa0, rdn -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.l.h a1, fa0, rdn +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_floor_si64: @@ -213,11 +208,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rdn ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB1_2: -; RV64IZHINX-NEXT: fcvt.l.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.l.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_floor_si64: @@ -286,11 +280,10 @@ ; RV64IZFHMIN-NEXT: .LBB1_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: 
fcvt.l.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a1, a0 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a0, a1 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_floor_si64: @@ -358,11 +351,10 @@ ; RV64IZHINXMIN-NEXT: .LBB1_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.l.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a0, a1 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a1, a0 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.floor.f16(half %x) %b = call i64 @llvm.fptosi.sat.i64.f16(half %a) @@ -372,11 +364,10 @@ define signext i32 @test_floor_ui32(half %x) { ; CHECKIZFH-LABEL: test_floor_ui32: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: fcvt.wu.h a0, fa0, rdn -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.wu.h a1, fa0, rdn +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_floor_ui32: @@ -391,11 +382,10 @@ ; RV32IZHINX-NEXT: fcvt.h.w a1, a1, rdn ; RV32IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV32IZHINX-NEXT: .LBB2_2: -; RV32IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV32IZHINX-NEXT: feq.h a0, a0, a0 -; RV32IZHINX-NEXT: seqz a0, a0 -; RV32IZHINX-NEXT: addi a0, a0, -1 -; RV32IZHINX-NEXT: and a0, a0, a1 +; RV32IZHINX-NEXT: feq.h a1, a0, a0 +; RV32IZHINX-NEXT: neg a1, a1 +; RV32IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV32IZHINX-NEXT: and a0, a1, a0 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: test_floor_ui32: @@ -410,11 +400,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rdn ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB2_2: -; RV64IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a1, a0 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a0, a1 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_floor_ui32: @@ -432,11 +421,10 @@ ; RV32IZFHMIN-NEXT: .LBB2_2: ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; RV32IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV32IZFHMIN-NEXT: seqz a1, a1 -; RV32IZFHMIN-NEXT: addi a1, a1, -1 -; RV32IZFHMIN-NEXT: and a0, a1, a0 +; RV32IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV32IZFHMIN-NEXT: neg a0, a0 +; RV32IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; RV32IZFHMIN-NEXT: and a0, a0, a1 ; RV32IZFHMIN-NEXT: ret ; ; RV64IZFHMIN-LABEL: test_floor_ui32: @@ -454,11 +442,10 @@ ; RV64IZFHMIN-NEXT: .LBB2_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a0, a1 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a1, a0 ; RV64IZFHMIN-NEXT: ret ; 
; RV32IZHINXMIN-LABEL: test_floor_ui32: @@ -475,11 +462,10 @@ ; RV32IZHINXMIN-NEXT: .LBB2_2: ; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV32IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; RV32IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV32IZHINXMIN-NEXT: seqz a0, a0 -; RV32IZHINXMIN-NEXT: addi a0, a0, -1 -; RV32IZHINXMIN-NEXT: and a0, a0, a1 +; RV32IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV32IZHINXMIN-NEXT: neg a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; RV32IZHINXMIN-NEXT: and a0, a1, a0 ; RV32IZHINXMIN-NEXT: ret ; ; RV64IZHINXMIN-LABEL: test_floor_ui32: @@ -496,11 +482,10 @@ ; RV64IZHINXMIN-NEXT: .LBB2_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a1, a0 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a0, a1 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.floor.f16(half %x) %b = call i32 @llvm.fptoui.sat.i32.f16(half %a) @@ -546,11 +531,10 @@ ; ; RV64IZFH-LABEL: test_floor_ui64: ; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.lu.h a0, fa0, rdn -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.lu.h a1, fa0, rdn +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_floor_ui64: @@ -600,11 +584,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rdn ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB3_2: -; RV64IZHINX-NEXT: fcvt.lu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.lu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_floor_ui64: @@ -660,11 +643,10 @@ ; RV64IZFHMIN-NEXT: .LBB3_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.lu.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a1, a0 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.lu.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a0, a1 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_floor_ui64: @@ -717,11 +699,10 @@ ; RV64IZHINXMIN-NEXT: .LBB3_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.lu.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a0, a1 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a1, a0 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.floor.f16(half %x) %b = call i64 @llvm.fptoui.sat.i64.f16(half %a) @@ -731,11 +712,10 @@ define signext i32 @test_ceil_si32(half %x) { ; CHECKIZFH-LABEL: test_ceil_si32: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: fcvt.w.h a0, fa0, rup -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi 
a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.w.h a1, fa0, rup +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; CHECKIZHINX-LABEL: test_ceil_si32: @@ -750,11 +730,10 @@ ; CHECKIZHINX-NEXT: fcvt.h.w a1, a1, rup ; CHECKIZHINX-NEXT: fsgnj.h a0, a1, a0 ; CHECKIZHINX-NEXT: .LBB4_2: -; CHECKIZHINX-NEXT: fcvt.w.h a1, a0, rtz -; CHECKIZHINX-NEXT: feq.h a0, a0, a0 -; CHECKIZHINX-NEXT: seqz a0, a0 -; CHECKIZHINX-NEXT: addi a0, a0, -1 -; CHECKIZHINX-NEXT: and a0, a0, a1 +; CHECKIZHINX-NEXT: feq.h a1, a0, a0 +; CHECKIZHINX-NEXT: neg a1, a1 +; CHECKIZHINX-NEXT: fcvt.w.h a0, a0, rtz +; CHECKIZHINX-NEXT: and a0, a1, a0 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: test_ceil_si32: @@ -772,11 +751,10 @@ ; CHECKIZFHMIN-NEXT: .LBB4_2: ; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.w.s a0, fa5, rtz -; CHECKIZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECKIZFHMIN-NEXT: seqz a1, a1 -; CHECKIZFHMIN-NEXT: addi a1, a1, -1 -; CHECKIZFHMIN-NEXT: and a0, a1, a0 +; CHECKIZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECKIZFHMIN-NEXT: neg a0, a0 +; CHECKIZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz +; CHECKIZFHMIN-NEXT: and a0, a0, a1 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: test_ceil_si32: @@ -793,11 +771,10 @@ ; CHECKIZHINXMIN-NEXT: .LBB4_2: ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKIZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECKIZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECKIZHINXMIN-NEXT: seqz a0, a0 -; CHECKIZHINXMIN-NEXT: addi a0, a0, -1 -; CHECKIZHINXMIN-NEXT: and a0, a0, a1 +; CHECKIZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECKIZHINXMIN-NEXT: neg a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECKIZHINXMIN-NEXT: and a0, a1, a0 ; CHECKIZHINXMIN-NEXT: ret %a = call half @llvm.ceil.f16(half %x) %b = call i32 @llvm.fptosi.sat.i32.f16(half %a) @@ -856,11 +833,10 @@ ; ; RV64IZFH-LABEL: test_ceil_si64: ; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.l.h a0, fa0, rup -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.l.h a1, fa0, rup +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_ceil_si64: @@ -925,11 +901,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rup ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB5_2: -; RV64IZHINX-NEXT: fcvt.l.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.l.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_ceil_si64: @@ -998,11 +973,10 @@ ; RV64IZFHMIN-NEXT: .LBB5_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.l.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a1, a0 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a0, a1 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_ceil_si64: @@ -1070,11 +1044,10 @@ ; RV64IZHINXMIN-NEXT: .LBB5_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h 
a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.l.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a0, a1 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a1, a0 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.ceil.f16(half %x) %b = call i64 @llvm.fptosi.sat.i64.f16(half %a) @@ -1084,11 +1057,10 @@ define signext i32 @test_ceil_ui32(half %x) { ; CHECKIZFH-LABEL: test_ceil_ui32: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: fcvt.wu.h a0, fa0, rup -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.wu.h a1, fa0, rup +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_ceil_ui32: @@ -1103,11 +1075,10 @@ ; RV32IZHINX-NEXT: fcvt.h.w a1, a1, rup ; RV32IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV32IZHINX-NEXT: .LBB6_2: -; RV32IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV32IZHINX-NEXT: feq.h a0, a0, a0 -; RV32IZHINX-NEXT: seqz a0, a0 -; RV32IZHINX-NEXT: addi a0, a0, -1 -; RV32IZHINX-NEXT: and a0, a0, a1 +; RV32IZHINX-NEXT: feq.h a1, a0, a0 +; RV32IZHINX-NEXT: neg a1, a1 +; RV32IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV32IZHINX-NEXT: and a0, a1, a0 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: test_ceil_ui32: @@ -1122,11 +1093,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rup ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB6_2: -; RV64IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a1, a0 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a0, a1 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_ceil_ui32: @@ -1144,11 +1114,10 @@ ; RV32IZFHMIN-NEXT: .LBB6_2: ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; RV32IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV32IZFHMIN-NEXT: seqz a1, a1 -; RV32IZFHMIN-NEXT: addi a1, a1, -1 -; RV32IZFHMIN-NEXT: and a0, a1, a0 +; RV32IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV32IZFHMIN-NEXT: neg a0, a0 +; RV32IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; RV32IZFHMIN-NEXT: and a0, a0, a1 ; RV32IZFHMIN-NEXT: ret ; ; RV64IZFHMIN-LABEL: test_ceil_ui32: @@ -1166,11 +1135,10 @@ ; RV64IZFHMIN-NEXT: .LBB6_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a0, a1 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a1, a0 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_ceil_ui32: @@ -1187,11 +1155,10 @@ ; RV32IZHINXMIN-NEXT: .LBB6_2: ; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV32IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; RV32IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV32IZHINXMIN-NEXT: seqz a0, a0 -; RV32IZHINXMIN-NEXT: addi a0, a0, -1 -; RV32IZHINXMIN-NEXT: and a0, a0, a1 +; RV32IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV32IZHINXMIN-NEXT: neg a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; 
RV32IZHINXMIN-NEXT: and a0, a1, a0 ; RV32IZHINXMIN-NEXT: ret ; ; RV64IZHINXMIN-LABEL: test_ceil_ui32: @@ -1208,11 +1175,10 @@ ; RV64IZHINXMIN-NEXT: .LBB6_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a1, a0 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a0, a1 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.ceil.f16(half %x) %b = call i32 @llvm.fptoui.sat.i32.f16(half %a) @@ -1258,11 +1224,10 @@ ; ; RV64IZFH-LABEL: test_ceil_ui64: ; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.lu.h a0, fa0, rup -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.lu.h a1, fa0, rup +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_ceil_ui64: @@ -1312,11 +1277,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rup ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB7_2: -; RV64IZHINX-NEXT: fcvt.lu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.lu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_ceil_ui64: @@ -1372,11 +1336,10 @@ ; RV64IZFHMIN-NEXT: .LBB7_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.lu.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a1, a0 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.lu.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a0, a1 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_ceil_ui64: @@ -1429,11 +1392,10 @@ ; RV64IZHINXMIN-NEXT: .LBB7_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.lu.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a0, a1 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a1, a0 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.ceil.f16(half %x) %b = call i64 @llvm.fptoui.sat.i64.f16(half %a) @@ -1443,11 +1405,10 @@ define signext i32 @test_trunc_si32(half %x) { ; CHECKIZFH-LABEL: test_trunc_si32: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: fcvt.w.h a0, fa0, rtz -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.w.h a1, fa0, rtz +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; CHECKIZHINX-LABEL: test_trunc_si32: @@ -1462,11 +1423,10 @@ ; CHECKIZHINX-NEXT: fcvt.h.w a1, a1, rtz ; CHECKIZHINX-NEXT: fsgnj.h a0, a1, a0 ; CHECKIZHINX-NEXT: .LBB8_2: -; CHECKIZHINX-NEXT: fcvt.w.h a1, a0, rtz -; CHECKIZHINX-NEXT: feq.h a0, a0, a0 -; CHECKIZHINX-NEXT: 
seqz a0, a0 -; CHECKIZHINX-NEXT: addi a0, a0, -1 -; CHECKIZHINX-NEXT: and a0, a0, a1 +; CHECKIZHINX-NEXT: feq.h a1, a0, a0 +; CHECKIZHINX-NEXT: neg a1, a1 +; CHECKIZHINX-NEXT: fcvt.w.h a0, a0, rtz +; CHECKIZHINX-NEXT: and a0, a1, a0 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: test_trunc_si32: @@ -1484,11 +1444,10 @@ ; CHECKIZFHMIN-NEXT: .LBB8_2: ; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.w.s a0, fa5, rtz -; CHECKIZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECKIZFHMIN-NEXT: seqz a1, a1 -; CHECKIZFHMIN-NEXT: addi a1, a1, -1 -; CHECKIZFHMIN-NEXT: and a0, a1, a0 +; CHECKIZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECKIZFHMIN-NEXT: neg a0, a0 +; CHECKIZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz +; CHECKIZFHMIN-NEXT: and a0, a0, a1 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: test_trunc_si32: @@ -1505,11 +1464,10 @@ ; CHECKIZHINXMIN-NEXT: .LBB8_2: ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKIZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECKIZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECKIZHINXMIN-NEXT: seqz a0, a0 -; CHECKIZHINXMIN-NEXT: addi a0, a0, -1 -; CHECKIZHINXMIN-NEXT: and a0, a0, a1 +; CHECKIZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECKIZHINXMIN-NEXT: neg a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECKIZHINXMIN-NEXT: and a0, a1, a0 ; CHECKIZHINXMIN-NEXT: ret %a = call half @llvm.trunc.f16(half %x) %b = call i32 @llvm.fptosi.sat.i32.f16(half %a) @@ -1568,11 +1526,10 @@ ; ; RV64IZFH-LABEL: test_trunc_si64: ; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.l.h a0, fa0, rtz -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.l.h a1, fa0, rtz +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_trunc_si64: @@ -1637,11 +1594,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rtz ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB9_2: -; RV64IZHINX-NEXT: fcvt.l.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.l.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_trunc_si64: @@ -1710,11 +1666,10 @@ ; RV64IZFHMIN-NEXT: .LBB9_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.l.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a1, a0 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a0, a1 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_trunc_si64: @@ -1782,11 +1737,10 @@ ; RV64IZHINXMIN-NEXT: .LBB9_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.l.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a0, a1 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a1, a0 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.trunc.f16(half %x) %b = call i64 
@llvm.fptosi.sat.i64.f16(half %a) @@ -1796,11 +1750,10 @@ define signext i32 @test_trunc_ui32(half %x) { ; CHECKIZFH-LABEL: test_trunc_ui32: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: fcvt.wu.h a0, fa0, rtz -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.wu.h a1, fa0, rtz +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_trunc_ui32: @@ -1815,11 +1768,10 @@ ; RV32IZHINX-NEXT: fcvt.h.w a1, a1, rtz ; RV32IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV32IZHINX-NEXT: .LBB10_2: -; RV32IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV32IZHINX-NEXT: feq.h a0, a0, a0 -; RV32IZHINX-NEXT: seqz a0, a0 -; RV32IZHINX-NEXT: addi a0, a0, -1 -; RV32IZHINX-NEXT: and a0, a0, a1 +; RV32IZHINX-NEXT: feq.h a1, a0, a0 +; RV32IZHINX-NEXT: neg a1, a1 +; RV32IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV32IZHINX-NEXT: and a0, a1, a0 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: test_trunc_ui32: @@ -1834,11 +1786,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rtz ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB10_2: -; RV64IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a1, a0 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a0, a1 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_trunc_ui32: @@ -1856,11 +1807,10 @@ ; RV32IZFHMIN-NEXT: .LBB10_2: ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; RV32IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV32IZFHMIN-NEXT: seqz a1, a1 -; RV32IZFHMIN-NEXT: addi a1, a1, -1 -; RV32IZFHMIN-NEXT: and a0, a1, a0 +; RV32IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV32IZFHMIN-NEXT: neg a0, a0 +; RV32IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; RV32IZFHMIN-NEXT: and a0, a0, a1 ; RV32IZFHMIN-NEXT: ret ; ; RV64IZFHMIN-LABEL: test_trunc_ui32: @@ -1878,11 +1828,10 @@ ; RV64IZFHMIN-NEXT: .LBB10_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a0, a1 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a1, a0 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_trunc_ui32: @@ -1899,11 +1848,10 @@ ; RV32IZHINXMIN-NEXT: .LBB10_2: ; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV32IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; RV32IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV32IZHINXMIN-NEXT: seqz a0, a0 -; RV32IZHINXMIN-NEXT: addi a0, a0, -1 -; RV32IZHINXMIN-NEXT: and a0, a0, a1 +; RV32IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV32IZHINXMIN-NEXT: neg a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; RV32IZHINXMIN-NEXT: and a0, a1, a0 ; RV32IZHINXMIN-NEXT: ret ; ; RV64IZHINXMIN-LABEL: test_trunc_ui32: @@ -1920,11 +1868,10 @@ ; RV64IZHINXMIN-NEXT: .LBB10_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a1, a0 +; 
RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a0, a1 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.trunc.f16(half %x) %b = call i32 @llvm.fptoui.sat.i32.f16(half %a) @@ -1970,11 +1917,10 @@ ; ; RV64IZFH-LABEL: test_trunc_ui64: ; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.lu.h a0, fa0, rtz -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.lu.h a1, fa0, rtz +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_trunc_ui64: @@ -2024,11 +1970,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rtz ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB11_2: -; RV64IZHINX-NEXT: fcvt.lu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.lu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_trunc_ui64: @@ -2084,11 +2029,10 @@ ; RV64IZFHMIN-NEXT: .LBB11_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.lu.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a1, a0 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.lu.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a0, a1 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_trunc_ui64: @@ -2141,11 +2085,10 @@ ; RV64IZHINXMIN-NEXT: .LBB11_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.lu.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a0, a1 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a1, a0 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.trunc.f16(half %x) %b = call i64 @llvm.fptoui.sat.i64.f16(half %a) @@ -2155,11 +2098,10 @@ define signext i32 @test_round_si32(half %x) { ; CHECKIZFH-LABEL: test_round_si32: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: fcvt.w.h a0, fa0, rmm -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.w.h a1, fa0, rmm +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; CHECKIZHINX-LABEL: test_round_si32: @@ -2174,11 +2116,10 @@ ; CHECKIZHINX-NEXT: fcvt.h.w a1, a1, rmm ; CHECKIZHINX-NEXT: fsgnj.h a0, a1, a0 ; CHECKIZHINX-NEXT: .LBB12_2: -; CHECKIZHINX-NEXT: fcvt.w.h a1, a0, rtz -; CHECKIZHINX-NEXT: feq.h a0, a0, a0 -; CHECKIZHINX-NEXT: seqz a0, a0 -; CHECKIZHINX-NEXT: addi a0, a0, -1 -; CHECKIZHINX-NEXT: and a0, a0, a1 +; CHECKIZHINX-NEXT: feq.h a1, a0, a0 +; CHECKIZHINX-NEXT: neg a1, a1 +; CHECKIZHINX-NEXT: fcvt.w.h a0, a0, rtz +; CHECKIZHINX-NEXT: and a0, a1, a0 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: test_round_si32: @@ -2196,11 +2137,10 @@ ; CHECKIZFHMIN-NEXT: .LBB12_2: ; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.s.h 
fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.w.s a0, fa5, rtz -; CHECKIZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECKIZFHMIN-NEXT: seqz a1, a1 -; CHECKIZFHMIN-NEXT: addi a1, a1, -1 -; CHECKIZFHMIN-NEXT: and a0, a1, a0 +; CHECKIZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECKIZFHMIN-NEXT: neg a0, a0 +; CHECKIZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz +; CHECKIZFHMIN-NEXT: and a0, a0, a1 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: test_round_si32: @@ -2217,11 +2157,10 @@ ; CHECKIZHINXMIN-NEXT: .LBB12_2: ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKIZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECKIZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECKIZHINXMIN-NEXT: seqz a0, a0 -; CHECKIZHINXMIN-NEXT: addi a0, a0, -1 -; CHECKIZHINXMIN-NEXT: and a0, a0, a1 +; CHECKIZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECKIZHINXMIN-NEXT: neg a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECKIZHINXMIN-NEXT: and a0, a1, a0 ; CHECKIZHINXMIN-NEXT: ret %a = call half @llvm.round.f16(half %x) %b = call i32 @llvm.fptosi.sat.i32.f16(half %a) @@ -2280,11 +2219,10 @@ ; ; RV64IZFH-LABEL: test_round_si64: ; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.l.h a0, fa0, rmm -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.l.h a1, fa0, rmm +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_round_si64: @@ -2349,11 +2287,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rmm ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB13_2: -; RV64IZHINX-NEXT: fcvt.l.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.l.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_round_si64: @@ -2422,11 +2359,10 @@ ; RV64IZFHMIN-NEXT: .LBB13_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.l.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a1, a0 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a0, a1 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_round_si64: @@ -2494,11 +2430,10 @@ ; RV64IZHINXMIN-NEXT: .LBB13_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.l.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a0, a1 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a1, a0 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.round.f16(half %x) %b = call i64 @llvm.fptosi.sat.i64.f16(half %a) @@ -2508,11 +2443,10 @@ define signext i32 @test_round_ui32(half %x) { ; CHECKIZFH-LABEL: test_round_ui32: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: fcvt.wu.h a0, fa0, rmm -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.wu.h a1, 
fa0, rmm +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_round_ui32: @@ -2527,11 +2461,10 @@ ; RV32IZHINX-NEXT: fcvt.h.w a1, a1, rmm ; RV32IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV32IZHINX-NEXT: .LBB14_2: -; RV32IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV32IZHINX-NEXT: feq.h a0, a0, a0 -; RV32IZHINX-NEXT: seqz a0, a0 -; RV32IZHINX-NEXT: addi a0, a0, -1 -; RV32IZHINX-NEXT: and a0, a0, a1 +; RV32IZHINX-NEXT: feq.h a1, a0, a0 +; RV32IZHINX-NEXT: neg a1, a1 +; RV32IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV32IZHINX-NEXT: and a0, a1, a0 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: test_round_ui32: @@ -2546,11 +2479,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rmm ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB14_2: -; RV64IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a1, a0 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a0, a1 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_round_ui32: @@ -2568,11 +2500,10 @@ ; RV32IZFHMIN-NEXT: .LBB14_2: ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; RV32IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV32IZFHMIN-NEXT: seqz a1, a1 -; RV32IZFHMIN-NEXT: addi a1, a1, -1 -; RV32IZFHMIN-NEXT: and a0, a1, a0 +; RV32IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV32IZFHMIN-NEXT: neg a0, a0 +; RV32IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; RV32IZFHMIN-NEXT: and a0, a0, a1 ; RV32IZFHMIN-NEXT: ret ; ; RV64IZFHMIN-LABEL: test_round_ui32: @@ -2590,11 +2521,10 @@ ; RV64IZFHMIN-NEXT: .LBB14_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a0, a1 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a1, a0 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_round_ui32: @@ -2611,11 +2541,10 @@ ; RV32IZHINXMIN-NEXT: .LBB14_2: ; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV32IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; RV32IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV32IZHINXMIN-NEXT: seqz a0, a0 -; RV32IZHINXMIN-NEXT: addi a0, a0, -1 -; RV32IZHINXMIN-NEXT: and a0, a0, a1 +; RV32IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV32IZHINXMIN-NEXT: neg a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; RV32IZHINXMIN-NEXT: and a0, a1, a0 ; RV32IZHINXMIN-NEXT: ret ; ; RV64IZHINXMIN-LABEL: test_round_ui32: @@ -2632,11 +2561,10 @@ ; RV64IZHINXMIN-NEXT: .LBB14_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a1, a0 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a0, a1 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.round.f16(half %x) %b = call i32 @llvm.fptoui.sat.i32.f16(half %a) @@ -2682,11 +2610,10 @@ ; ; RV64IZFH-LABEL: test_round_ui64: ; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.lu.h a0, fa0, rmm -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: 
seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.lu.h a1, fa0, rmm +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_round_ui64: @@ -2736,11 +2663,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rmm ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB15_2: -; RV64IZHINX-NEXT: fcvt.lu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.lu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_round_ui64: @@ -2796,11 +2722,10 @@ ; RV64IZFHMIN-NEXT: .LBB15_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.lu.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a1, a0 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.lu.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a0, a1 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_round_ui64: @@ -2853,11 +2778,10 @@ ; RV64IZHINXMIN-NEXT: .LBB15_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.lu.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a0, a1 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a1, a0 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.round.f16(half %x) %b = call i64 @llvm.fptoui.sat.i64.f16(half %a) @@ -2867,11 +2791,10 @@ define signext i32 @test_roundeven_si32(half %x) { ; CHECKIZFH-LABEL: test_roundeven_si32: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: fcvt.w.h a0, fa0, rne -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.w.h a1, fa0, rne +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; CHECKIZHINX-LABEL: test_roundeven_si32: @@ -2886,11 +2809,10 @@ ; CHECKIZHINX-NEXT: fcvt.h.w a1, a1, rne ; CHECKIZHINX-NEXT: fsgnj.h a0, a1, a0 ; CHECKIZHINX-NEXT: .LBB16_2: -; CHECKIZHINX-NEXT: fcvt.w.h a1, a0, rtz -; CHECKIZHINX-NEXT: feq.h a0, a0, a0 -; CHECKIZHINX-NEXT: seqz a0, a0 -; CHECKIZHINX-NEXT: addi a0, a0, -1 -; CHECKIZHINX-NEXT: and a0, a0, a1 +; CHECKIZHINX-NEXT: feq.h a1, a0, a0 +; CHECKIZHINX-NEXT: neg a1, a1 +; CHECKIZHINX-NEXT: fcvt.w.h a0, a0, rtz +; CHECKIZHINX-NEXT: and a0, a1, a0 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: test_roundeven_si32: @@ -2908,11 +2830,10 @@ ; CHECKIZFHMIN-NEXT: .LBB16_2: ; CHECKIZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; CHECKIZFHMIN-NEXT: fcvt.w.s a0, fa5, rtz -; CHECKIZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECKIZFHMIN-NEXT: seqz a1, a1 -; CHECKIZFHMIN-NEXT: addi a1, a1, -1 -; CHECKIZFHMIN-NEXT: and a0, a1, a0 +; CHECKIZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECKIZFHMIN-NEXT: neg a0, a0 +; CHECKIZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz +; CHECKIZFHMIN-NEXT: and a0, a0, a1 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: test_roundeven_si32: 
@@ -2929,11 +2850,10 @@ ; CHECKIZHINXMIN-NEXT: .LBB16_2: ; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 ; CHECKIZHINXMIN-NEXT: fcvt.s.h a0, a0 -; CHECKIZHINXMIN-NEXT: fcvt.w.s a1, a0, rtz -; CHECKIZHINXMIN-NEXT: feq.s a0, a0, a0 -; CHECKIZHINXMIN-NEXT: seqz a0, a0 -; CHECKIZHINXMIN-NEXT: addi a0, a0, -1 -; CHECKIZHINXMIN-NEXT: and a0, a0, a1 +; CHECKIZHINXMIN-NEXT: feq.s a1, a0, a0 +; CHECKIZHINXMIN-NEXT: neg a1, a1 +; CHECKIZHINXMIN-NEXT: fcvt.w.s a0, a0, rtz +; CHECKIZHINXMIN-NEXT: and a0, a1, a0 ; CHECKIZHINXMIN-NEXT: ret %a = call half @llvm.roundeven.f16(half %x) %b = call i32 @llvm.fptosi.sat.i32.f16(half %a) @@ -2992,11 +2912,10 @@ ; ; RV64IZFH-LABEL: test_roundeven_si64: ; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.l.h a0, fa0, rne -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.l.h a1, fa0, rne +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_roundeven_si64: @@ -3061,11 +2980,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rne ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB17_2: -; RV64IZHINX-NEXT: fcvt.l.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.l.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_roundeven_si64: @@ -3134,11 +3052,10 @@ ; RV64IZFHMIN-NEXT: .LBB17_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.l.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a1, a0 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a0, a1 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_roundeven_si64: @@ -3206,11 +3123,10 @@ ; RV64IZHINXMIN-NEXT: .LBB17_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.l.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a0, a1 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.l.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a1, a0 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.roundeven.f16(half %x) %b = call i64 @llvm.fptosi.sat.i64.f16(half %a) @@ -3220,11 +3136,10 @@ define signext i32 @test_roundeven_ui32(half %x) { ; CHECKIZFH-LABEL: test_roundeven_ui32: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: fcvt.wu.h a0, fa0, rne -; CHECKIZFH-NEXT: feq.h a1, fa0, fa0 -; CHECKIZFH-NEXT: seqz a1, a1 -; CHECKIZFH-NEXT: addi a1, a1, -1 -; CHECKIZFH-NEXT: and a0, a1, a0 +; CHECKIZFH-NEXT: feq.h a0, fa0, fa0 +; CHECKIZFH-NEXT: neg a0, a0 +; CHECKIZFH-NEXT: fcvt.wu.h a1, fa0, rne +; CHECKIZFH-NEXT: and a0, a0, a1 ; CHECKIZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_roundeven_ui32: @@ -3239,11 +3154,10 @@ ; RV32IZHINX-NEXT: fcvt.h.w a1, a1, rne ; RV32IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV32IZHINX-NEXT: .LBB18_2: -; RV32IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV32IZHINX-NEXT: feq.h a0, a0, a0 -; RV32IZHINX-NEXT: seqz a0, a0 -; RV32IZHINX-NEXT: addi a0, a0, -1 -; 
RV32IZHINX-NEXT: and a0, a0, a1 +; RV32IZHINX-NEXT: feq.h a1, a0, a0 +; RV32IZHINX-NEXT: neg a1, a1 +; RV32IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV32IZHINX-NEXT: and a0, a1, a0 ; RV32IZHINX-NEXT: ret ; ; RV64IZHINX-LABEL: test_roundeven_ui32: @@ -3258,11 +3172,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rne ; RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB18_2: -; RV64IZHINX-NEXT: fcvt.wu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a1, a0 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.wu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a0, a1 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_roundeven_ui32: @@ -3280,11 +3193,10 @@ ; RV32IZFHMIN-NEXT: .LBB18_2: ; RV32IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; RV32IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV32IZFHMIN-NEXT: seqz a1, a1 -; RV32IZFHMIN-NEXT: addi a1, a1, -1 -; RV32IZFHMIN-NEXT: and a0, a1, a0 +; RV32IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV32IZFHMIN-NEXT: neg a0, a0 +; RV32IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; RV32IZFHMIN-NEXT: and a0, a0, a1 ; RV32IZFHMIN-NEXT: ret ; ; RV64IZFHMIN-LABEL: test_roundeven_ui32: @@ -3302,11 +3214,10 @@ ; RV64IZFHMIN-NEXT: .LBB18_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a0, a1 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.wu.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a1, a0 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_roundeven_ui32: @@ -3323,11 +3234,10 @@ ; RV32IZHINXMIN-NEXT: .LBB18_2: ; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV32IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; RV32IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV32IZHINXMIN-NEXT: seqz a0, a0 -; RV32IZHINXMIN-NEXT: addi a0, a0, -1 -; RV32IZHINXMIN-NEXT: and a0, a0, a1 +; RV32IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV32IZHINXMIN-NEXT: neg a1, a1 +; RV32IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; RV32IZHINXMIN-NEXT: and a0, a1, a0 ; RV32IZHINXMIN-NEXT: ret ; ; RV64IZHINXMIN-LABEL: test_roundeven_ui32: @@ -3344,11 +3254,10 @@ ; RV64IZHINXMIN-NEXT: .LBB18_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.wu.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a1, a0 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.wu.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a0, a1 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.roundeven.f16(half %x) %b = call i32 @llvm.fptoui.sat.i32.f16(half %a) @@ -3394,11 +3303,10 @@ ; ; RV64IZFH-LABEL: test_roundeven_ui64: ; RV64IZFH: # %bb.0: -; RV64IZFH-NEXT: fcvt.lu.h a0, fa0, rne -; RV64IZFH-NEXT: feq.h a1, fa0, fa0 -; RV64IZFH-NEXT: seqz a1, a1 -; RV64IZFH-NEXT: addi a1, a1, -1 -; RV64IZFH-NEXT: and a0, a1, a0 +; RV64IZFH-NEXT: feq.h a0, fa0, fa0 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fcvt.lu.h a1, fa0, rne +; RV64IZFH-NEXT: and a0, a0, a1 ; RV64IZFH-NEXT: ret ; ; RV32IZHINX-LABEL: test_roundeven_ui64: @@ -3448,11 +3356,10 @@ ; RV64IZHINX-NEXT: fcvt.h.w a1, a1, rne ; 
RV64IZHINX-NEXT: fsgnj.h a0, a1, a0 ; RV64IZHINX-NEXT: .LBB19_2: -; RV64IZHINX-NEXT: fcvt.lu.h a1, a0, rtz -; RV64IZHINX-NEXT: feq.h a0, a0, a0 -; RV64IZHINX-NEXT: seqz a0, a0 -; RV64IZHINX-NEXT: addi a0, a0, -1 -; RV64IZHINX-NEXT: and a0, a0, a1 +; RV64IZHINX-NEXT: feq.h a1, a0, a0 +; RV64IZHINX-NEXT: neg a1, a1 +; RV64IZHINX-NEXT: fcvt.lu.h a0, a0, rtz +; RV64IZHINX-NEXT: and a0, a1, a0 ; RV64IZHINX-NEXT: ret ; ; RV32IZFHMIN-LABEL: test_roundeven_ui64: @@ -3508,11 +3415,10 @@ ; RV64IZFHMIN-NEXT: .LBB19_2: ; RV64IZFHMIN-NEXT: fcvt.h.s fa5, fa5 ; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.lu.s a0, fa5, rtz -; RV64IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; RV64IZFHMIN-NEXT: seqz a1, a1 -; RV64IZFHMIN-NEXT: addi a1, a1, -1 -; RV64IZFHMIN-NEXT: and a0, a1, a0 +; RV64IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; RV64IZFHMIN-NEXT: neg a0, a0 +; RV64IZFHMIN-NEXT: fcvt.lu.s a1, fa5, rtz +; RV64IZFHMIN-NEXT: and a0, a0, a1 ; RV64IZFHMIN-NEXT: ret ; ; RV32IZHINXMIN-LABEL: test_roundeven_ui64: @@ -3565,11 +3471,10 @@ ; RV64IZHINXMIN-NEXT: .LBB19_2: ; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0 ; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0 -; RV64IZHINXMIN-NEXT: fcvt.lu.s a1, a0, rtz -; RV64IZHINXMIN-NEXT: feq.s a0, a0, a0 -; RV64IZHINXMIN-NEXT: seqz a0, a0 -; RV64IZHINXMIN-NEXT: addi a0, a0, -1 -; RV64IZHINXMIN-NEXT: and a0, a0, a1 +; RV64IZHINXMIN-NEXT: feq.s a1, a0, a0 +; RV64IZHINXMIN-NEXT: neg a1, a1 +; RV64IZHINXMIN-NEXT: fcvt.lu.s a0, a0, rtz +; RV64IZHINXMIN-NEXT: and a0, a1, a0 ; RV64IZHINXMIN-NEXT: ret %a = call half @llvm.roundeven.f16(half %x) %b = call i64 @llvm.fptoui.sat.i64.f16(half %a) diff --git a/llvm/test/CodeGen/RISCV/legalize-fneg.ll b/llvm/test/CodeGen/RISCV/legalize-fneg.ll --- a/llvm/test/CodeGen/RISCV/legalize-fneg.ll +++ b/llvm/test/CodeGen/RISCV/legalize-fneg.ll @@ -56,15 +56,15 @@ define void @test3(ptr %a, ptr %b) nounwind { ; RV32-LABEL: test3: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lw a2, 4(a1) +; RV32-NEXT: lw a2, 0(a1) ; RV32-NEXT: lw a3, 12(a1) ; RV32-NEXT: lw a4, 8(a1) -; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: lw a1, 4(a1) ; RV32-NEXT: lui a5, 524288 ; RV32-NEXT: xor a3, a3, a5 ; RV32-NEXT: sw a4, 8(a0) -; RV32-NEXT: sw a1, 0(a0) -; RV32-NEXT: sw a2, 4(a0) +; RV32-NEXT: sw a1, 4(a0) +; RV32-NEXT: sw a2, 0(a0) ; RV32-NEXT: sw a3, 12(a0) ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll --- a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll +++ b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll @@ -35,7 +35,7 @@ ; RV32-NEXT: slli a0, a0, 6 ; RV32-NEXT: lui a2, %hi(A) ; RV32-NEXT: addi a2, a2, %lo(A) -; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add a0, a2, a0 ; RV32-NEXT: addi a0, a0, 8 ; RV32-NEXT: li a2, 4 ; RV32-NEXT: li a3, 5 @@ -57,7 +57,7 @@ ; RV64-NEXT: slli a0, a0, 6 ; RV64-NEXT: lui a2, %hi(A) ; RV64-NEXT: addi a2, a2, %lo(A) -; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: add a0, a2, a0 ; RV64-NEXT: addi a2, a0, 4 ; RV64-NEXT: li a3, 2 ; RV64-NEXT: li a4, 4 diff --git a/llvm/test/CodeGen/RISCV/mem.ll b/llvm/test/CodeGen/RISCV/mem.ll --- a/llvm/test/CodeGen/RISCV/mem.ll +++ b/llvm/test/CodeGen/RISCV/mem.ll @@ -124,6 +124,7 @@ ; RV32I-NEXT: lbu a1, 1(a0) ; RV32I-NEXT: lbu a2, 2(a0) ; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: andi a1, a1, 1 ; RV32I-NEXT: sub a0, a2, a1 ; RV32I-NEXT: ret ; sextload i1 @@ -146,6 +147,7 @@ ; RV32I-NEXT: lbu a1, 1(a0) ; RV32I-NEXT: lbu a2, 2(a0) ; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: andi a1, a1, 1 ; RV32I-NEXT: sub a0, a2, a1 ; 
RV32I-NEXT: ret ; sextload i1 diff --git a/llvm/test/CodeGen/RISCV/mem64.ll b/llvm/test/CodeGen/RISCV/mem64.ll --- a/llvm/test/CodeGen/RISCV/mem64.ll +++ b/llvm/test/CodeGen/RISCV/mem64.ll @@ -169,6 +169,7 @@ ; RV64I-NEXT: lbu a1, 1(a0) ; RV64I-NEXT: lbu a2, 2(a0) ; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: andi a1, a1, 1 ; RV64I-NEXT: sub a0, a2, a1 ; RV64I-NEXT: ret ; sextload i1 @@ -191,6 +192,7 @@ ; RV64I-NEXT: lbu a1, 1(a0) ; RV64I-NEXT: lbu a2, 2(a0) ; RV64I-NEXT: lbu a0, 0(a0) +; RV64I-NEXT: andi a1, a1, 1 ; RV64I-NEXT: sub a0, a2, a1 ; RV64I-NEXT: ret ; sextload i1 diff --git a/llvm/test/CodeGen/RISCV/pr58511.ll b/llvm/test/CodeGen/RISCV/pr58511.ll --- a/llvm/test/CodeGen/RISCV/pr58511.ll +++ b/llvm/test/CodeGen/RISCV/pr58511.ll @@ -8,8 +8,8 @@ ; CHECK-NEXT: srai a0, a0, 63 ; CHECK-NEXT: lui a3, 4097 ; CHECK-NEXT: addiw a3, a3, -2047 -; CHECK-NEXT: or a0, a0, a3 ; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: or a0, a0, a3 ; CHECK-NEXT: sw a1, 0(a2) ; CHECK-NEXT: ret BB: @@ -27,8 +27,8 @@ ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: lui a3, 4097 ; CHECK-NEXT: addiw a3, a3, -2047 -; CHECK-NEXT: or a0, a0, a3 ; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: or a0, a0, a3 ; CHECK-NEXT: sw a1, 0(a2) ; CHECK-NEXT: ret BB: @@ -42,11 +42,11 @@ define i32 @h(i1 %0, i32 %1, ptr %2) { ; CHECK-LABEL: h: ; CHECK: # %bb.0: # %BB +; CHECK-NEXT: slli a0, a0, 63 +; CHECK-NEXT: srai a0, a0, 63 ; CHECK-NEXT: lui a3, 4097 ; CHECK-NEXT: addiw a3, a3, -2047 ; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: slli a0, a0, 63 -; CHECK-NEXT: srai a0, a0, 63 ; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: sw a1, 0(a2) ; CHECK-NEXT: ret @@ -62,10 +62,10 @@ ; CHECK-LABEL: i: ; CHECK: # %bb.0: # %BB ; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: lui a3, 4097 ; CHECK-NEXT: addiw a3, a3, -2047 ; CHECK-NEXT: mul a1, a1, a3 -; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a3 ; CHECK-NEXT: sw a1, 0(a2) ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll --- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll +++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll @@ -42,7 +42,10 @@ ; ; RV64ZBB-LABEL: rotl_32: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: rolw a0, a0, a1 +; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: sllw a1, a0, a1 +; RV64ZBB-NEXT: srlw a0, a0, a2 +; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret ; ; RV32XTHEADBB-LABEL: rotl_32: @@ -55,10 +58,10 @@ ; ; RV64XTHEADBB-LABEL: rotl_32: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: sllw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 -; RV64XTHEADBB-NEXT: srlw a0, a0, a1 -; RV64XTHEADBB-NEXT: or a0, a2, a0 +; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: sllw a1, a0, a1 +; RV64XTHEADBB-NEXT: srlw a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 ; RV64XTHEADBB-NEXT: ret %z = sub i32 32, %y %b = shl i32 %x, %y @@ -91,7 +94,10 @@ ; ; RV64ZBB-LABEL: rotr_32: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: rorw a0, a0, a1 +; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: srlw a1, a0, a1 +; RV64ZBB-NEXT: sllw a0, a0, a2 +; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret ; ; RV32XTHEADBB-LABEL: rotr_32: @@ -104,10 +110,10 @@ ; ; RV64XTHEADBB-LABEL: rotr_32: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: srlw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 -; RV64XTHEADBB-NEXT: sllw a0, a0, a1 -; RV64XTHEADBB-NEXT: or a0, a2, a0 +; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: srlw a1, a0, a1 +; RV64XTHEADBB-NEXT: sllw a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 ; RV64XTHEADBB-NEXT: ret %z = sub i32 32, %y %b = lshr i32 %x, %y @@ 
-436,7 +442,10 @@ ; ; RV64ZBB-LABEL: rotl_32_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: rolw a0, a0, a1 +; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: sllw a1, a0, a1 +; RV64ZBB-NEXT: srlw a0, a0, a2 +; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret ; ; RV32XTHEADBB-LABEL: rotl_32_mask: @@ -449,10 +458,10 @@ ; ; RV64XTHEADBB-LABEL: rotl_32_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: sllw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 -; RV64XTHEADBB-NEXT: srlw a0, a0, a1 -; RV64XTHEADBB-NEXT: or a0, a2, a0 +; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: sllw a1, a0, a1 +; RV64XTHEADBB-NEXT: srlw a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 ; RV64XTHEADBB-NEXT: ret %z = sub i32 0, %y %and = and i32 %z, 31 @@ -486,7 +495,10 @@ ; ; RV64ZBB-LABEL: rotl_32_mask_and_63_and_31: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: rolw a0, a0, a1 +; RV64ZBB-NEXT: sllw a2, a0, a1 +; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: srlw a0, a0, a1 +; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret ; ; RV32XTHEADBB-LABEL: rotl_32_mask_and_63_and_31: @@ -529,28 +541,28 @@ ; ; RV32ZBB-LABEL: rotl_32_mask_or_64_or_32: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: rol a0, a0, a1 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: ori a1, a1, 32 +; RV32ZBB-NEXT: srl a0, a0, a1 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: rotl_32_mask_or_64_or_32: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: rolw a0, a0, a1 +; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: srlw a0, a0, a1 ; RV64ZBB-NEXT: ret ; ; RV32XTHEADBB-LABEL: rotl_32_mask_or_64_or_32: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: sll a2, a0, a1 ; RV32XTHEADBB-NEXT: neg a1, a1 +; RV32XTHEADBB-NEXT: ori a1, a1, 32 ; RV32XTHEADBB-NEXT: srl a0, a0, a1 -; RV32XTHEADBB-NEXT: or a0, a2, a0 ; RV32XTHEADBB-NEXT: ret ; ; RV64XTHEADBB-LABEL: rotl_32_mask_or_64_or_32: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: sllw a2, a0, a1 ; RV64XTHEADBB-NEXT: negw a1, a1 ; RV64XTHEADBB-NEXT: srlw a0, a0, a1 -; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret %a = or i32 %y, 64 %b = shl i32 %x, %a @@ -585,7 +597,10 @@ ; ; RV64ZBB-LABEL: rotr_32_mask: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: rorw a0, a0, a1 +; RV64ZBB-NEXT: negw a2, a1 +; RV64ZBB-NEXT: srlw a1, a0, a1 +; RV64ZBB-NEXT: sllw a0, a0, a2 +; RV64ZBB-NEXT: or a0, a1, a0 ; RV64ZBB-NEXT: ret ; ; RV32XTHEADBB-LABEL: rotr_32_mask: @@ -598,10 +613,10 @@ ; ; RV64XTHEADBB-LABEL: rotr_32_mask: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: srlw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 -; RV64XTHEADBB-NEXT: sllw a0, a0, a1 -; RV64XTHEADBB-NEXT: or a0, a2, a0 +; RV64XTHEADBB-NEXT: negw a2, a1 +; RV64XTHEADBB-NEXT: srlw a1, a0, a1 +; RV64XTHEADBB-NEXT: sllw a0, a0, a2 +; RV64XTHEADBB-NEXT: or a0, a1, a0 ; RV64XTHEADBB-NEXT: ret %z = sub i32 0, %y %and = and i32 %z, 31 @@ -635,7 +650,10 @@ ; ; RV64ZBB-LABEL: rotr_32_mask_and_63_and_31: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: rorw a0, a0, a1 +; RV64ZBB-NEXT: srlw a2, a0, a1 +; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: sllw a0, a0, a1 +; RV64ZBB-NEXT: or a0, a2, a0 ; RV64ZBB-NEXT: ret ; ; RV32XTHEADBB-LABEL: rotr_32_mask_and_63_and_31: @@ -676,28 +694,24 @@ ; ; RV32ZBB-LABEL: rotr_32_mask_or_64_or_32: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: ror a0, a0, a1 +; RV32ZBB-NEXT: ori a1, a1, 64 +; RV32ZBB-NEXT: srl a0, a0, a1 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: rotr_32_mask_or_64_or_32: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: rorw a0, a0, a1 +; RV64ZBB-NEXT: srlw a0, a0, a1 ; RV64ZBB-NEXT: ret ; ; RV32XTHEADBB-LABEL: rotr_32_mask_or_64_or_32: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: srl a2, a0, a1 -; 
RV32XTHEADBB-NEXT: neg a1, a1 -; RV32XTHEADBB-NEXT: sll a0, a0, a1 -; RV32XTHEADBB-NEXT: or a0, a2, a0 +; RV32XTHEADBB-NEXT: ori a1, a1, 64 +; RV32XTHEADBB-NEXT: srl a0, a0, a1 ; RV32XTHEADBB-NEXT: ret ; ; RV64XTHEADBB-LABEL: rotr_32_mask_or_64_or_32: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: srlw a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 -; RV64XTHEADBB-NEXT: sllw a0, a0, a1 -; RV64XTHEADBB-NEXT: or a0, a2, a0 +; RV64XTHEADBB-NEXT: srlw a0, a0, a1 ; RV64XTHEADBB-NEXT: ret %a = or i32 %y, 64 %b = lshr i32 %x, %a @@ -1022,7 +1036,9 @@ ; ; RV64ZBB-LABEL: rotl_64_mask_or_128_or_64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: rol a0, a0, a1 +; RV64ZBB-NEXT: negw a1, a1 +; RV64ZBB-NEXT: ori a1, a1, 64 +; RV64ZBB-NEXT: srl a0, a0, a1 ; RV64ZBB-NEXT: ret ; ; RV32XTHEADBB-LABEL: rotl_64_mask_or_128_or_64: @@ -1034,10 +1050,9 @@ ; ; RV64XTHEADBB-LABEL: rotl_64_mask_or_128_or_64: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: sll a2, a0, a1 ; RV64XTHEADBB-NEXT: negw a1, a1 +; RV64XTHEADBB-NEXT: ori a1, a1, 64 ; RV64XTHEADBB-NEXT: srl a0, a0, a1 -; RV64XTHEADBB-NEXT: or a0, a2, a0 ; RV64XTHEADBB-NEXT: ret %a = or i64 %y, 128 %b = shl i64 %x, %a @@ -1359,7 +1374,8 @@ ; ; RV64ZBB-LABEL: rotr_64_mask_or_128_or_64: ; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: ror a0, a0, a1 +; RV64ZBB-NEXT: ori a1, a1, 128 +; RV64ZBB-NEXT: srl a0, a0, a1 ; RV64ZBB-NEXT: ret ; ; RV32XTHEADBB-LABEL: rotr_64_mask_or_128_or_64: @@ -1370,10 +1386,8 @@ ; ; RV64XTHEADBB-LABEL: rotr_64_mask_or_128_or_64: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: srl a2, a0, a1 -; RV64XTHEADBB-NEXT: negw a1, a1 -; RV64XTHEADBB-NEXT: sll a0, a0, a1 -; RV64XTHEADBB-NEXT: or a0, a2, a0 +; RV64XTHEADBB-NEXT: ori a1, a1, 128 +; RV64XTHEADBB-NEXT: srl a0, a0, a1 ; RV64XTHEADBB-NEXT: ret %a = or i64 %y, 128 %b = lshr i64 %x, %a diff --git a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll --- a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll @@ -352,12 +352,12 @@ define i64 @roriw_bug(i64 %x) nounwind { ; CHECK-LABEL: roriw_bug: ; CHECK: # %bb.0: -; CHECK-NEXT: slli a1, a0, 31 -; CHECK-NEXT: andi a2, a0, -2 -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: or a0, a1, a0 -; CHECK-NEXT: sext.w a0, a0 -; CHECK-NEXT: xor a0, a2, a0 +; CHECK-NEXT: andi a1, a0, -2 +; CHECK-NEXT: slli a2, a0, 31 +; CHECK-NEXT: slli a0, a0, 63 +; CHECK-NEXT: or a0, a0, a2 +; CHECK-NEXT: srai a0, a0, 32 +; CHECK-NEXT: xor a0, a1, a0 ; CHECK-NEXT: ret %a = shl i64 %x, 31 %b = and i64 %x, 18446744073709551614 diff --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll --- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll @@ -105,7 +105,7 @@ define i64 @pack_i64_3(ptr %0, ptr %1) { ; RV64I-LABEL: pack_i64_3: ; RV64I: # %bb.0: -; RV64I-NEXT: lw a0, 0(a0) +; RV64I-NEXT: lwu a0, 0(a0) ; RV64I-NEXT: lwu a1, 0(a1) ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a1 @@ -113,7 +113,7 @@ ; ; RV64ZBKB-LABEL: pack_i64_3: ; RV64ZBKB: # %bb.0: -; RV64ZBKB-NEXT: lw a0, 0(a0) +; RV64ZBKB-NEXT: lwu a0, 0(a0) ; RV64ZBKB-NEXT: lwu a1, 0(a1) ; RV64ZBKB-NEXT: pack a0, a1, a0 ; RV64ZBKB-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-store.ll b/llvm/test/CodeGen/RISCV/rvv/combine-store.ll --- a/llvm/test/CodeGen/RISCV/rvv/combine-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-store.ll @@ -16,7 +16,9 @@ define void @combine_zero_stores_4xi8(ptr %p) { ; CHECK-LABEL: combine_zero_stores_4xi8: ; CHECK: # %bb.0: -; CHECK-NEXT: sw zero, 0(a0) +; CHECK-NEXT: sh zero, 
0(a0) +; CHECK-NEXT: sb zero, 2(a0) +; CHECK-NEXT: sb zero, 3(a0) ; CHECK-NEXT: ret store i8 zeroinitializer, ptr %p, align 4 %gep1 = getelementptr i8, ptr %p, i64 1 @@ -29,16 +31,15 @@ } define void @combine_zero_stores_8xi8(ptr %p) { -; RV32-LABEL: combine_zero_stores_8xi8: -; RV32: # %bb.0: -; RV32-NEXT: sw zero, 0(a0) -; RV32-NEXT: sw zero, 4(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: combine_zero_stores_8xi8: -; RV64: # %bb.0: -; RV64-NEXT: sd zero, 0(a0) -; RV64-NEXT: ret +; CHECK-LABEL: combine_zero_stores_8xi8: +; CHECK: # %bb.0: +; CHECK-NEXT: sh zero, 0(a0) +; CHECK-NEXT: sb zero, 2(a0) +; CHECK-NEXT: sb zero, 3(a0) +; CHECK-NEXT: sh zero, 4(a0) +; CHECK-NEXT: sb zero, 6(a0) +; CHECK-NEXT: sb zero, 7(a0) +; CHECK-NEXT: ret store i8 zeroinitializer, ptr %p, align 8 %gep1 = getelementptr i8, ptr %p, i64 1 store i8 zeroinitializer, ptr %gep1 @@ -90,21 +91,15 @@ } define void @combine_zero_stores_8xi16(ptr %p) { -; RV32-LABEL: combine_zero_stores_8xi16: -; RV32: # %bb.0: -; RV32-NEXT: sw zero, 0(a0) -; RV32-NEXT: sh zero, 4(a0) -; RV32-NEXT: sh zero, 6(a0) -; RV32-NEXT: sw zero, 8(a0) -; RV32-NEXT: sh zero, 12(a0) -; RV32-NEXT: sh zero, 14(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: combine_zero_stores_8xi16: -; RV64: # %bb.0: -; RV64-NEXT: sd zero, 0(a0) -; RV64-NEXT: sd zero, 8(a0) -; RV64-NEXT: ret +; CHECK-LABEL: combine_zero_stores_8xi16: +; CHECK: # %bb.0: +; CHECK-NEXT: sw zero, 0(a0) +; CHECK-NEXT: sh zero, 4(a0) +; CHECK-NEXT: sh zero, 6(a0) +; CHECK-NEXT: sw zero, 8(a0) +; CHECK-NEXT: sh zero, 12(a0) +; CHECK-NEXT: sh zero, 14(a0) +; CHECK-NEXT: ret store i16 zeroinitializer, ptr %p, align 16 %gep1 = getelementptr i16, ptr %p, i64 1 store i16 zeroinitializer, ptr %gep1 @@ -166,9 +161,11 @@ define void @combine_zero_stores_8xi32(ptr %p) { ; RV32-LABEL: combine_zero_stores_8xi32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: combine_zero_stores_8xi32: diff --git a/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll b/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll --- a/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll +++ b/llvm/test/CodeGen/RISCV/rvv/common-shuffle-patterns.ll @@ -7,11 +7,19 @@ define dso_local <16 x i16> @interleave(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: interleave: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vwaddu.vv v10, v8, v9 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 -; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v8 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vid.v v14 +; CHECK-NEXT: vrgather.vv v8, v12, v14 +; CHECK-NEXT: vsrl.vi v12, v14, 1 +; CHECK-NEXT: lui a0, 11 +; CHECK-NEXT: addiw a0, a0, -1366 +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v0, a0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; CHECK-NEXT: vrgather.vv v8, v10, v12, v0.t ; CHECK-NEXT: ret entry: %v2 = shufflevector <8 x i16> %v0, <8 x i16> poison, <16 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding.ll b/llvm/test/CodeGen/RISCV/rvv/constant-folding.ll --- a/llvm/test/CodeGen/RISCV/rvv/constant-folding.ll +++ b/llvm/test/CodeGen/RISCV/rvv/constant-folding.ll @@ -14,26 +14,13 @@ ; a constant SPLAT_VECTOR didn't follow suit. 
define <2 x i16> @fixedlen(<2 x i32> %x) { -; RV32-LABEL: fixedlen: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 16 -; RV32-NEXT: lui a0, 1048568 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; RV32-NEXT: vnsrl.wi v8, v8, 0 -; RV32-NEXT: ret -; -; RV64-LABEL: fixedlen: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vsrl.vi v8, v8, 16 -; RV64-NEXT: lui a0, 131071 -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; RV64-NEXT: vnsrl.wi v8, v8, 0 -; RV64-NEXT: ret +; CHECK-LABEL: fixedlen: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 16 +; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: ret %v41 = insertelement <2 x i32> poison, i32 16, i32 0 %v42 = shufflevector <2 x i32> %v41, <2 x i32> poison, <2 x i32> zeroinitializer %v43 = lshr <2 x i32> %x, %v42 @@ -63,3 +50,6 @@ %v48 = and %v44, %v47 ret %v48 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll @@ -803,27 +803,16 @@ } define i32 @extractelt_sdiv_nxv4i32_splat( %x) { -; RV32NOM-LABEL: extractelt_sdiv_nxv4i32_splat: -; RV32NOM: # %bb.0: -; RV32NOM-NEXT: lui a0, 349525 -; RV32NOM-NEXT: addi a0, a0, 1366 -; RV32NOM-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32NOM-NEXT: vmulh.vx v8, v8, a0 -; RV32NOM-NEXT: vsrl.vi v10, v8, 31 -; RV32NOM-NEXT: vadd.vv v8, v8, v10 -; RV32NOM-NEXT: vmv.x.s a0, v8 -; RV32NOM-NEXT: ret -; -; RV32M-LABEL: extractelt_sdiv_nxv4i32_splat: -; RV32M: # %bb.0: -; RV32M-NEXT: vsetivli zero, 0, e32, m2, ta, ma -; RV32M-NEXT: vmv.x.s a0, v8 -; RV32M-NEXT: lui a1, 349525 -; RV32M-NEXT: addi a1, a1, 1366 -; RV32M-NEXT: mulh a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 31 -; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: ret +; CHECK-LABEL: extractelt_sdiv_nxv4i32_splat: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1366 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vmulh.vx v8, v8, a0 +; CHECK-NEXT: vsrl.vi v10, v8, 31 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret %head = insertelement poison, i32 3, i32 0 %splat = shufflevector %head, poison, zeroinitializer %bo = sdiv %x, %splat @@ -832,27 +821,16 @@ } define i32 @extractelt_udiv_nxv4i32_splat( %x) { -; RV32NOM-LABEL: extractelt_udiv_nxv4i32_splat: -; RV32NOM: # %bb.0: -; RV32NOM-NEXT: lui a0, 349525 -; RV32NOM-NEXT: addi a0, a0, 1366 -; RV32NOM-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32NOM-NEXT: vmulh.vx v8, v8, a0 -; RV32NOM-NEXT: vsrl.vi v10, v8, 31 -; RV32NOM-NEXT: vadd.vv v8, v8, v10 -; RV32NOM-NEXT: vmv.x.s a0, v8 -; RV32NOM-NEXT: ret -; -; RV32M-LABEL: extractelt_udiv_nxv4i32_splat: -; RV32M: # %bb.0: -; RV32M-NEXT: vsetivli zero, 0, e32, m2, ta, ma -; RV32M-NEXT: vmv.x.s a0, v8 -; RV32M-NEXT: lui a1, 349525 -; RV32M-NEXT: addi a1, a1, 1366 -; RV32M-NEXT: mulh a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 31 -; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: ret +; CHECK-LABEL: extractelt_udiv_nxv4i32_splat: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 349525 +; CHECK-NEXT: addi a0, a0, 1366 +; CHECK-NEXT: vsetvli a1, zero, 
e32, m2, ta, ma +; CHECK-NEXT: vmulh.vx v8, v8, a0 +; CHECK-NEXT: vsrl.vi v10, v8, 31 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret %head = insertelement poison, i32 3, i32 0 %splat = shufflevector %head, poison, zeroinitializer %bo = sdiv %x, %splat diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll @@ -243,11 +243,25 @@ } define <4 x i8> @vslide1down_4xi8_with_splat(<4 x i8> %v, i8 %b) { -; CHECK-LABEL: vslide1down_4xi8_with_splat: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslide1down.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vslide1down_4xi8_with_splat: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vid.v v10 +; RV32-NEXT: vmv.v.i v0, 7 +; RV32-NEXT: vadd.vi v10, v10, 1 +; RV32-NEXT: vrgather.vv v9, v8, v10, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vslide1down_4xi8_with_splat: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vslide1down.vx v8, v8, a0 +; RV64-NEXT: ret %vb = insertelement <4 x i8> poison, i8 %b, i64 0 %v1 = shufflevector <4 x i8> %vb, <4 x i8> poison, <4 x i32> zeroinitializer %v2 = shufflevector <4 x i8> %v1, <4 x i8> %v, <4 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll @@ -259,12 +259,26 @@ } define <4 x i8> @vslide1up_4xi8_with_splat(<4 x i8> %v, i8 %b) { -; CHECK-LABEL: vslide1up_4xi8_with_splat: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslide1up.vx v9, v8, a0 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vslide1up_4xi8_with_splat: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vid.v v10 +; RV32-NEXT: vmv.v.i v0, 14 +; RV32-NEXT: vadd.vi v10, v10, -1 +; RV32-NEXT: vrgather.vv v9, v8, v10, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vslide1up_4xi8_with_splat: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vslide1up.vx v9, v8, a0 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %vb = insertelement <4 x i8> poison, i8 %b, i64 0 %v1 = shufflevector <4 x i8> %vb, <4 x i8> poison, <4 x i32> zeroinitializer %v2 = shufflevector <4 x i8> %v1, <4 x i8> %v, <4 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll @@ -827,10 +827,10 @@ ; ; LMULMAX1-RV32-LABEL: bitreverse_v4i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v10, (a1) +; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma 
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vle64.v v10, (a0) ; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV32-NEXT: vmv.v.i v9, 0 @@ -929,8 +929,8 @@ ; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v14 ; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v8 ; LMULMAX1-RV32-NEXT: vor.vv v8, v9, v8 -; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v10, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v10, (a0) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bitreverse_v4i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -391,10 +391,10 @@ ; ; LMULMAX1-RV32-LABEL: bswap_v4i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v9, (a1) +; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-RV32-NEXT: vle64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vle64.v v9, (a0) ; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5 ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV32-NEXT: vmv.v.i v10, 0 @@ -448,8 +448,8 @@ ; LMULMAX1-RV32-NEXT: vor.vv v8, v12, v8 ; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 ; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v11 -; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v9, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v9, (a0) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: bswap_v4i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll @@ -847,9 +847,9 @@ ; LMULMAX1-RV32-LABEL: ctpop_v4i64: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle64.v v9, (a0) +; LMULMAX1-RV32-NEXT: vle64.v v9, (a1) ; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 ; LMULMAX1-RV32-NEXT: lui a2, 349525 ; LMULMAX1-RV32-NEXT: addi a2, a2, 1365 @@ -895,16 +895,16 @@ ; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v12 ; LMULMAX1-RV32-NEXT: vmul.vv v9, v9, v13 ; LMULMAX1-RV32-NEXT: vsrl.vx v9, v9, a2 -; LMULMAX1-RV32-NEXT: vse64.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v9, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: ctpop_v4i64: ; LMULMAX1-RV64: # %bb.0: ; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-RV64-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle64.v v8, (a1) -; LMULMAX1-RV64-NEXT: vle64.v v9, (a0) +; LMULMAX1-RV64-NEXT: vle64.v v9, (a1) ; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 ; LMULMAX1-RV64-NEXT: lui a2, 349525 ; LMULMAX1-RV64-NEXT: addiw a2, a2, 1365 @@ -946,8 +946,8 @@ ; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4 ; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a5 ; LMULMAX1-RV64-NEXT: vsrl.vx v9, v9, a6 -; LMULMAX1-RV64-NEXT: vse64.v v9, (a0) -; LMULMAX1-RV64-NEXT: vse64.v v8, (a1) +; LMULMAX1-RV64-NEXT: vse64.v v9, (a1) +; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, ptr %x %b = load <4 x i64>, ptr %y diff 
--git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll @@ -680,40 +680,24 @@ } define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) { -; RV32NOM-LABEL: extractelt_sdiv_v4i32: -; RV32NOM: # %bb.0: -; RV32NOM-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32NOM-NEXT: vmv.v.i v9, -1 -; RV32NOM-NEXT: vmv.v.i v10, 0 -; RV32NOM-NEXT: vslideup.vi v10, v9, 3 -; RV32NOM-NEXT: lui a0, %hi(.LCPI38_0) -; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI38_0) -; RV32NOM-NEXT: vle32.v v9, (a0) -; RV32NOM-NEXT: lui a0, %hi(.LCPI38_1) -; RV32NOM-NEXT: addi a0, a0, %lo(.LCPI38_1) -; RV32NOM-NEXT: vle32.v v11, (a0) -; RV32NOM-NEXT: vand.vv v10, v8, v10 -; RV32NOM-NEXT: vmulh.vv v8, v8, v9 -; RV32NOM-NEXT: vadd.vv v8, v8, v10 -; RV32NOM-NEXT: vsra.vv v9, v8, v11 -; RV32NOM-NEXT: vsrl.vi v8, v8, 31 -; RV32NOM-NEXT: vadd.vv v8, v9, v8 -; RV32NOM-NEXT: vslidedown.vi v8, v8, 2 -; RV32NOM-NEXT: vmv.x.s a0, v8 -; RV32NOM-NEXT: ret -; -; RV32M-LABEL: extractelt_sdiv_v4i32: -; RV32M: # %bb.0: -; RV32M-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32M-NEXT: vslidedown.vi v8, v8, 2 -; RV32M-NEXT: vmv.x.s a0, v8 -; RV32M-NEXT: lui a1, 322639 -; RV32M-NEXT: addi a1, a1, -945 -; RV32M-NEXT: mulh a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 31 -; RV32M-NEXT: srai a0, a0, 2 -; RV32M-NEXT: add a0, a0, a1 -; RV32M-NEXT: ret +; RV32-LABEL: extractelt_sdiv_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v9, -1 +; RV32-NEXT: lui a0, %hi(.LCPI38_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI38_0) +; RV32-NEXT: vle32.v v10, (a0) +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: vslideup.vi v11, v9, 3 +; RV32-NEXT: vand.vv v9, v8, v11 +; RV32-NEXT: vmulh.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 31 +; RV32-NEXT: vsra.vi v8, v8, 2 +; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vslidedown.vi v8, v8, 2 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: ret ; ; RV64-LABEL: extractelt_sdiv_v4i32: ; RV64: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll @@ -65,22 +65,22 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: addi a1, a0, 80 ; LMULMAX1-NEXT: addi a2, a0, 16 -; LMULMAX1-NEXT: addi a3, a0, 48 -; LMULMAX1-NEXT: addi a4, a0, 32 +; LMULMAX1-NEXT: addi a3, a0, 32 +; LMULMAX1-NEXT: addi a4, a0, 48 ; LMULMAX1-NEXT: addi a5, a0, 94 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; LMULMAX1-NEXT: vlse16.v v8, (a5), zero ; LMULMAX1-NEXT: addi a5, a0, 64 -; LMULMAX1-NEXT: addi a6, a0, 112 -; LMULMAX1-NEXT: addi a7, a0, 96 +; LMULMAX1-NEXT: addi a6, a0, 96 +; LMULMAX1-NEXT: addi a7, a0, 112 ; LMULMAX1-NEXT: vse16.v v8, (a7) ; LMULMAX1-NEXT: vse16.v v8, (a6) ; LMULMAX1-NEXT: vse16.v v8, (a5) ; LMULMAX1-NEXT: vse16.v v8, (a1) ; LMULMAX1-NEXT: vse16.v v8, (a4) ; LMULMAX1-NEXT: vse16.v v8, (a3) -; LMULMAX1-NEXT: vse16.v v8, (a0) ; LMULMAX1-NEXT: vse16.v v8, (a2) +; LMULMAX1-NEXT: vse16.v v8, (a0) ; LMULMAX1-NEXT: ret %a = load <64 x half>, ptr %x %b = extractelement <64 x half> %a, i32 47 @@ -104,22 +104,22 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: addi a1, a0, 64 ; LMULMAX1-NEXT: addi a2, a0, 16 -; LMULMAX1-NEXT: addi a3, a0, 48 -; LMULMAX1-NEXT: addi a4, a0, 32 +; LMULMAX1-NEXT: addi a3, a0, 32 +; LMULMAX1-NEXT: addi a4, a0, 48 
; LMULMAX1-NEXT: addi a5, a0, 68 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-NEXT: vlse32.v v8, (a5), zero ; LMULMAX1-NEXT: addi a5, a0, 80 -; LMULMAX1-NEXT: addi a6, a0, 112 -; LMULMAX1-NEXT: addi a7, a0, 96 +; LMULMAX1-NEXT: addi a6, a0, 96 +; LMULMAX1-NEXT: addi a7, a0, 112 ; LMULMAX1-NEXT: vse32.v v8, (a7) ; LMULMAX1-NEXT: vse32.v v8, (a6) -; LMULMAX1-NEXT: vse32.v v8, (a1) ; LMULMAX1-NEXT: vse32.v v8, (a5) +; LMULMAX1-NEXT: vse32.v v8, (a1) ; LMULMAX1-NEXT: vse32.v v8, (a4) ; LMULMAX1-NEXT: vse32.v v8, (a3) -; LMULMAX1-NEXT: vse32.v v8, (a0) ; LMULMAX1-NEXT: vse32.v v8, (a2) +; LMULMAX1-NEXT: vse32.v v8, (a0) ; LMULMAX1-NEXT: ret %a = load <32 x float>, ptr %x %b = extractelement <32 x float> %a, i32 17 @@ -142,21 +142,21 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: addi a1, a0, 80 ; LMULMAX1-NEXT: addi a2, a0, 16 -; LMULMAX1-NEXT: addi a3, a0, 48 -; LMULMAX1-NEXT: addi a4, a0, 32 +; LMULMAX1-NEXT: addi a3, a0, 32 +; LMULMAX1-NEXT: addi a4, a0, 48 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vlse64.v v8, (a1), zero ; LMULMAX1-NEXT: addi a5, a0, 64 -; LMULMAX1-NEXT: addi a6, a0, 112 -; LMULMAX1-NEXT: addi a7, a0, 96 +; LMULMAX1-NEXT: addi a6, a0, 96 +; LMULMAX1-NEXT: addi a7, a0, 112 ; LMULMAX1-NEXT: vse64.v v8, (a7) ; LMULMAX1-NEXT: vse64.v v8, (a6) ; LMULMAX1-NEXT: vse64.v v8, (a5) ; LMULMAX1-NEXT: vse64.v v8, (a1) ; LMULMAX1-NEXT: vse64.v v8, (a4) ; LMULMAX1-NEXT: vse64.v v8, (a3) -; LMULMAX1-NEXT: vse64.v v8, (a0) ; LMULMAX1-NEXT: vse64.v v8, (a2) +; LMULMAX1-NEXT: vse64.v v8, (a0) ; LMULMAX1-NEXT: ret %a = load <16 x double>, ptr %x %b = extractelement <16 x double> %a, i32 10 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -151,23 +151,23 @@ ; LMULMAX8RV64: # %bb.0: ; LMULMAX8RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; LMULMAX8RV64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; LMULMAX8RV64-NEXT: vmv.x.s a1, v9 -; LMULMAX8RV64-NEXT: lui a2, 8 -; LMULMAX8RV64-NEXT: addiw a2, a2, -1 -; LMULMAX8RV64-NEXT: and a1, a1, a2 -; LMULMAX8RV64-NEXT: vslidedown.vi v8, v9, 1 -; LMULMAX8RV64-NEXT: vmv.x.s a3, v8 -; LMULMAX8RV64-NEXT: and a2, a3, a2 -; LMULMAX8RV64-NEXT: slli a2, a2, 15 ; LMULMAX8RV64-NEXT: vslidedown.vi v8, v9, 2 -; LMULMAX8RV64-NEXT: vmv.x.s a3, v8 -; LMULMAX8RV64-NEXT: slli a3, a3, 30 +; LMULMAX8RV64-NEXT: vmv.x.s a1, v8 +; LMULMAX8RV64-NEXT: slli a2, a1, 49 +; LMULMAX8RV64-NEXT: srli a2, a2, 51 +; LMULMAX8RV64-NEXT: sh a2, 4(a0) +; LMULMAX8RV64-NEXT: vmv.x.s a2, v9 +; LMULMAX8RV64-NEXT: lui a3, 8 +; LMULMAX8RV64-NEXT: addiw a3, a3, -1 +; LMULMAX8RV64-NEXT: and a2, a2, a3 +; LMULMAX8RV64-NEXT: vslidedown.vi v8, v9, 1 +; LMULMAX8RV64-NEXT: vmv.x.s a4, v8 +; LMULMAX8RV64-NEXT: and a3, a4, a3 +; LMULMAX8RV64-NEXT: slli a3, a3, 15 +; LMULMAX8RV64-NEXT: slli a1, a1, 30 +; LMULMAX8RV64-NEXT: or a1, a2, a1 ; LMULMAX8RV64-NEXT: or a1, a1, a3 -; LMULMAX8RV64-NEXT: or a1, a1, a2 ; LMULMAX8RV64-NEXT: sw a1, 0(a0) -; LMULMAX8RV64-NEXT: slli a1, a1, 19 -; LMULMAX8RV64-NEXT: srli a1, a1, 51 -; LMULMAX8RV64-NEXT: sh a1, 4(a0) ; LMULMAX8RV64-NEXT: ret ; ; LMULMAX1RV32-LABEL: fp2si_v3f32_v3i15: @@ -197,23 +197,23 @@ ; LMULMAX1RV64: # %bb.0: ; LMULMAX1RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; LMULMAX1RV64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; LMULMAX1RV64-NEXT: vmv.x.s a1, v9 -; LMULMAX1RV64-NEXT: lui a2, 8 -; LMULMAX1RV64-NEXT: addiw a2, a2, -1 -; LMULMAX1RV64-NEXT: and a1, a1, a2 -; 
LMULMAX1RV64-NEXT: vslidedown.vi v8, v9, 1 -; LMULMAX1RV64-NEXT: vmv.x.s a3, v8 -; LMULMAX1RV64-NEXT: and a2, a3, a2 -; LMULMAX1RV64-NEXT: slli a2, a2, 15 ; LMULMAX1RV64-NEXT: vslidedown.vi v8, v9, 2 -; LMULMAX1RV64-NEXT: vmv.x.s a3, v8 -; LMULMAX1RV64-NEXT: slli a3, a3, 30 +; LMULMAX1RV64-NEXT: vmv.x.s a1, v8 +; LMULMAX1RV64-NEXT: slli a2, a1, 49 +; LMULMAX1RV64-NEXT: srli a2, a2, 51 +; LMULMAX1RV64-NEXT: sh a2, 4(a0) +; LMULMAX1RV64-NEXT: vmv.x.s a2, v9 +; LMULMAX1RV64-NEXT: lui a3, 8 +; LMULMAX1RV64-NEXT: addiw a3, a3, -1 +; LMULMAX1RV64-NEXT: and a2, a2, a3 +; LMULMAX1RV64-NEXT: vslidedown.vi v8, v9, 1 +; LMULMAX1RV64-NEXT: vmv.x.s a4, v8 +; LMULMAX1RV64-NEXT: and a3, a4, a3 +; LMULMAX1RV64-NEXT: slli a3, a3, 15 +; LMULMAX1RV64-NEXT: slli a1, a1, 30 +; LMULMAX1RV64-NEXT: or a1, a2, a1 ; LMULMAX1RV64-NEXT: or a1, a1, a3 -; LMULMAX1RV64-NEXT: or a1, a1, a2 ; LMULMAX1RV64-NEXT: sw a1, 0(a0) -; LMULMAX1RV64-NEXT: slli a1, a1, 19 -; LMULMAX1RV64-NEXT: srli a1, a1, 51 -; LMULMAX1RV64-NEXT: sh a1, 4(a0) ; LMULMAX1RV64-NEXT: ret %z = fptosi <3 x float> %x to <3 x i15> ret <3 x i15> %z @@ -248,23 +248,23 @@ ; LMULMAX8RV64: # %bb.0: ; LMULMAX8RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; LMULMAX8RV64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; LMULMAX8RV64-NEXT: vmv.x.s a1, v9 -; LMULMAX8RV64-NEXT: lui a2, 16 -; LMULMAX8RV64-NEXT: addiw a2, a2, -1 -; LMULMAX8RV64-NEXT: and a1, a1, a2 -; LMULMAX8RV64-NEXT: vslidedown.vi v8, v9, 1 -; LMULMAX8RV64-NEXT: vmv.x.s a3, v8 -; LMULMAX8RV64-NEXT: and a2, a3, a2 -; LMULMAX8RV64-NEXT: slli a2, a2, 15 ; LMULMAX8RV64-NEXT: vslidedown.vi v8, v9, 2 -; LMULMAX8RV64-NEXT: vmv.x.s a3, v8 -; LMULMAX8RV64-NEXT: slli a3, a3, 30 +; LMULMAX8RV64-NEXT: vmv.x.s a1, v8 +; LMULMAX8RV64-NEXT: slli a2, a1, 49 +; LMULMAX8RV64-NEXT: srli a2, a2, 51 +; LMULMAX8RV64-NEXT: sh a2, 4(a0) +; LMULMAX8RV64-NEXT: vmv.x.s a2, v9 +; LMULMAX8RV64-NEXT: lui a3, 16 +; LMULMAX8RV64-NEXT: addiw a3, a3, -1 +; LMULMAX8RV64-NEXT: and a2, a2, a3 +; LMULMAX8RV64-NEXT: vslidedown.vi v8, v9, 1 +; LMULMAX8RV64-NEXT: vmv.x.s a4, v8 +; LMULMAX8RV64-NEXT: and a3, a4, a3 +; LMULMAX8RV64-NEXT: slli a3, a3, 15 +; LMULMAX8RV64-NEXT: slli a1, a1, 30 +; LMULMAX8RV64-NEXT: or a1, a2, a1 ; LMULMAX8RV64-NEXT: or a1, a1, a3 -; LMULMAX8RV64-NEXT: or a1, a1, a2 ; LMULMAX8RV64-NEXT: sw a1, 0(a0) -; LMULMAX8RV64-NEXT: slli a1, a1, 19 -; LMULMAX8RV64-NEXT: srli a1, a1, 51 -; LMULMAX8RV64-NEXT: sh a1, 4(a0) ; LMULMAX8RV64-NEXT: ret ; ; LMULMAX1RV32-LABEL: fp2ui_v3f32_v3i15: @@ -294,23 +294,23 @@ ; LMULMAX1RV64: # %bb.0: ; LMULMAX1RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; LMULMAX1RV64-NEXT: vfncvt.rtz.x.f.w v9, v8 -; LMULMAX1RV64-NEXT: vmv.x.s a1, v9 -; LMULMAX1RV64-NEXT: lui a2, 16 -; LMULMAX1RV64-NEXT: addiw a2, a2, -1 -; LMULMAX1RV64-NEXT: and a1, a1, a2 -; LMULMAX1RV64-NEXT: vslidedown.vi v8, v9, 1 -; LMULMAX1RV64-NEXT: vmv.x.s a3, v8 -; LMULMAX1RV64-NEXT: and a2, a3, a2 -; LMULMAX1RV64-NEXT: slli a2, a2, 15 ; LMULMAX1RV64-NEXT: vslidedown.vi v8, v9, 2 -; LMULMAX1RV64-NEXT: vmv.x.s a3, v8 -; LMULMAX1RV64-NEXT: slli a3, a3, 30 +; LMULMAX1RV64-NEXT: vmv.x.s a1, v8 +; LMULMAX1RV64-NEXT: slli a2, a1, 49 +; LMULMAX1RV64-NEXT: srli a2, a2, 51 +; LMULMAX1RV64-NEXT: sh a2, 4(a0) +; LMULMAX1RV64-NEXT: vmv.x.s a2, v9 +; LMULMAX1RV64-NEXT: lui a3, 16 +; LMULMAX1RV64-NEXT: addiw a3, a3, -1 +; LMULMAX1RV64-NEXT: and a2, a2, a3 +; LMULMAX1RV64-NEXT: vslidedown.vi v8, v9, 1 +; LMULMAX1RV64-NEXT: vmv.x.s a4, v8 +; LMULMAX1RV64-NEXT: and a3, a4, a3 +; LMULMAX1RV64-NEXT: slli a3, a3, 15 +; LMULMAX1RV64-NEXT: slli a1, a1, 30 +; 
LMULMAX1RV64-NEXT: or a1, a2, a1 ; LMULMAX1RV64-NEXT: or a1, a1, a3 -; LMULMAX1RV64-NEXT: or a1, a1, a2 ; LMULMAX1RV64-NEXT: sw a1, 0(a0) -; LMULMAX1RV64-NEXT: slli a1, a1, 19 -; LMULMAX1RV64-NEXT: srli a1, a1, 51 -; LMULMAX1RV64-NEXT: sh a1, 4(a0) ; LMULMAX1RV64-NEXT: ret %z = fptoui <3 x float> %x to <3 x i15> ret <3 x i15> %z diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-setcc.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-setcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-setcc.ll @@ -246,14 +246,6 @@ } define void @setge_vx_v8i8(ptr %x, i8 %y, ptr %z) { -; CHECK-LABEL: setge_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vmsle.vv v8, v9, v8 -; CHECK-NEXT: vsm.v v8, (a2) -; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -331,13 +323,6 @@ } define void @setule_vx_v8i8(ptr %x, i8 %y, ptr %z) { -; CHECK-LABEL: setule_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmsleu.vx v8, v8, a1 -; CHECK-NEXT: vsm.v v8, (a2) -; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -414,13 +399,6 @@ } define void @setge_xv_v8i8(ptr %x, i8 %y, ptr %z) { -; CHECK-LABEL: setge_xv_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmsle.vx v8, v8, a1 -; CHECK-NEXT: vsm.v v8, (a2) -; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -498,14 +476,6 @@ } define void @setule_xv_v8i8(ptr %x, i8 %y, ptr %z) { -; CHECK-LABEL: setule_xv_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.v.x v9, a1 -; CHECK-NEXT: vmsleu.vv v8, v9, v8 -; CHECK-NEXT: vsm.v v8, (a2) -; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-splat.ll @@ -761,17 +761,17 @@ ; ; LMULMAX1-RV32-LABEL: splat_allones_with_use_v4i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v9, (a1) +; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-RV32-NEXT: vle64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vle64.v v9, (a0) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV32-NEXT: vmv.v.i v10, -1 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10 ; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v9, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v9, (a0) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: splat_allones_with_use_v4i64: @@ -813,13 
+813,13 @@ ; ; LMULMAX2-RV32-LABEL: vadd_vx_v16i64: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi a4, a0, 64 +; LMULMAX2-RV32-NEXT: addi a4, a0, 96 ; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; LMULMAX2-RV32-NEXT: vle64.v v8, (a4) -; LMULMAX2-RV32-NEXT: addi a4, a0, 96 +; LMULMAX2-RV32-NEXT: addi a4, a0, 64 ; LMULMAX2-RV32-NEXT: vle64.v v10, (a4) -; LMULMAX2-RV32-NEXT: vle64.v v12, (a0) -; LMULMAX2-RV32-NEXT: addi a0, a0, 32 +; LMULMAX2-RV32-NEXT: addi a4, a0, 32 +; LMULMAX2-RV32-NEXT: vle64.v v12, (a4) ; LMULMAX2-RV32-NEXT: vle64.v v14, (a0) ; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; LMULMAX2-RV32-NEXT: vmv.v.x v16, a2 @@ -833,32 +833,32 @@ ; LMULMAX2-RV32-NEXT: vadd.vv v12, v12, v16 ; LMULMAX2-RV32-NEXT: vadd.vv v10, v10, v16 ; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v16 -; LMULMAX2-RV32-NEXT: addi a0, a3, 64 -; LMULMAX2-RV32-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV32-NEXT: addi a0, a3, 96 +; LMULMAX2-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX2-RV32-NEXT: addi a0, a3, 64 ; LMULMAX2-RV32-NEXT: vse64.v v10, (a0) -; LMULMAX2-RV32-NEXT: vse64.v v12, (a3) ; LMULMAX2-RV32-NEXT: addi a0, a3, 32 -; LMULMAX2-RV32-NEXT: vse64.v v14, (a0) +; LMULMAX2-RV32-NEXT: vse64.v v12, (a0) +; LMULMAX2-RV32-NEXT: vse64.v v14, (a3) ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX1-RV32-LABEL: vadd_vx_v16i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: addi a4, a0, 96 +; LMULMAX1-RV32-NEXT: addi a4, a0, 112 ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-RV32-NEXT: vle64.v v8, (a4) -; LMULMAX1-RV32-NEXT: addi a4, a0, 112 +; LMULMAX1-RV32-NEXT: addi a4, a0, 96 ; LMULMAX1-RV32-NEXT: vle64.v v9, (a4) -; LMULMAX1-RV32-NEXT: addi a4, a0, 64 -; LMULMAX1-RV32-NEXT: vle64.v v10, (a4) ; LMULMAX1-RV32-NEXT: addi a4, a0, 80 +; LMULMAX1-RV32-NEXT: vle64.v v10, (a4) +; LMULMAX1-RV32-NEXT: addi a4, a0, 64 ; LMULMAX1-RV32-NEXT: vle64.v v11, (a4) -; LMULMAX1-RV32-NEXT: addi a4, a0, 32 -; LMULMAX1-RV32-NEXT: vle64.v v12, (a4) ; LMULMAX1-RV32-NEXT: addi a4, a0, 48 +; LMULMAX1-RV32-NEXT: vle64.v v12, (a4) +; LMULMAX1-RV32-NEXT: addi a4, a0, 32 ; LMULMAX1-RV32-NEXT: vle64.v v13, (a4) -; LMULMAX1-RV32-NEXT: vle64.v v14, (a0) -; LMULMAX1-RV32-NEXT: addi a0, a0, 16 +; LMULMAX1-RV32-NEXT: addi a4, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v14, (a4) ; LMULMAX1-RV32-NEXT: vle64.v v15, (a0) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV32-NEXT: vmv.v.i v0, 5 @@ -873,20 +873,20 @@ ; LMULMAX1-RV32-NEXT: vadd.vv v10, v10, v16 ; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v16 ; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v16 -; LMULMAX1-RV32-NEXT: addi a0, a3, 96 -; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a0, a3, 112 +; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX1-RV32-NEXT: addi a0, a3, 96 ; LMULMAX1-RV32-NEXT: vse64.v v9, (a0) -; LMULMAX1-RV32-NEXT: addi a0, a3, 64 -; LMULMAX1-RV32-NEXT: vse64.v v10, (a0) ; LMULMAX1-RV32-NEXT: addi a0, a3, 80 +; LMULMAX1-RV32-NEXT: vse64.v v10, (a0) +; LMULMAX1-RV32-NEXT: addi a0, a3, 64 ; LMULMAX1-RV32-NEXT: vse64.v v11, (a0) -; LMULMAX1-RV32-NEXT: addi a0, a3, 32 -; LMULMAX1-RV32-NEXT: vse64.v v12, (a0) ; LMULMAX1-RV32-NEXT: addi a0, a3, 48 +; LMULMAX1-RV32-NEXT: vse64.v v12, (a0) +; LMULMAX1-RV32-NEXT: addi a0, a3, 32 ; LMULMAX1-RV32-NEXT: vse64.v v13, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v14, (a3) -; LMULMAX1-RV32-NEXT: addi a3, a3, 16 +; LMULMAX1-RV32-NEXT: addi a0, a3, 16 +; LMULMAX1-RV32-NEXT: vse64.v v14, (a0) ; LMULMAX1-RV32-NEXT: vse64.v v15, (a3) ; LMULMAX1-RV32-NEXT: ret ; @@ -901,11 +901,11 @@ ; LMULMAX2-RV64-LABEL: 
vadd_vx_v16i64: ; LMULMAX2-RV64: # %bb.0: ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV64-NEXT: addi a3, a0, 96 +; LMULMAX2-RV64-NEXT: addi a3, a0, 64 ; LMULMAX2-RV64-NEXT: vle64.v v8, (a3) ; LMULMAX2-RV64-NEXT: addi a3, a0, 32 ; LMULMAX2-RV64-NEXT: vle64.v v10, (a3) -; LMULMAX2-RV64-NEXT: addi a3, a0, 64 +; LMULMAX2-RV64-NEXT: addi a3, a0, 96 ; LMULMAX2-RV64-NEXT: vle64.v v12, (a3) ; LMULMAX2-RV64-NEXT: vle64.v v14, (a0) ; LMULMAX2-RV64-NEXT: vadd.vx v10, v10, a1 @@ -913,9 +913,9 @@ ; LMULMAX2-RV64-NEXT: vadd.vx v12, v12, a1 ; LMULMAX2-RV64-NEXT: vadd.vx v14, v14, a1 ; LMULMAX2-RV64-NEXT: vse64.v v14, (a2) -; LMULMAX2-RV64-NEXT: addi a0, a2, 64 -; LMULMAX2-RV64-NEXT: vse64.v v12, (a0) ; LMULMAX2-RV64-NEXT: addi a0, a2, 96 +; LMULMAX2-RV64-NEXT: vse64.v v12, (a0) +; LMULMAX2-RV64-NEXT: addi a0, a2, 64 ; LMULMAX2-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX2-RV64-NEXT: addi a0, a2, 32 ; LMULMAX2-RV64-NEXT: vse64.v v10, (a0) @@ -925,18 +925,18 @@ ; LMULMAX1-RV64: # %bb.0: ; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX1-RV64-NEXT: addi a3, a0, 96 -; LMULMAX1-RV64-NEXT: vle64.v v9, (a3) ; LMULMAX1-RV64-NEXT: addi a3, a0, 112 +; LMULMAX1-RV64-NEXT: vle64.v v9, (a3) +; LMULMAX1-RV64-NEXT: addi a3, a0, 96 ; LMULMAX1-RV64-NEXT: vle64.v v10, (a3) -; LMULMAX1-RV64-NEXT: addi a3, a0, 64 +; LMULMAX1-RV64-NEXT: addi a3, a0, 80 ; LMULMAX1-RV64-NEXT: vle64.v v11, (a3) -; LMULMAX1-RV64-NEXT: addi a3, a0, 48 +; LMULMAX1-RV64-NEXT: addi a3, a0, 32 ; LMULMAX1-RV64-NEXT: vle64.v v12, (a3) ; LMULMAX1-RV64-NEXT: addi a3, a0, 16 ; LMULMAX1-RV64-NEXT: vle64.v v13, (a3) -; LMULMAX1-RV64-NEXT: addi a3, a0, 80 -; LMULMAX1-RV64-NEXT: addi a0, a0, 32 +; LMULMAX1-RV64-NEXT: addi a3, a0, 64 +; LMULMAX1-RV64-NEXT: addi a0, a0, 48 ; LMULMAX1-RV64-NEXT: vle64.v v14, (a0) ; LMULMAX1-RV64-NEXT: vle64.v v15, (a3) ; LMULMAX1-RV64-NEXT: vadd.vx v13, v13, a1 @@ -948,17 +948,17 @@ ; LMULMAX1-RV64-NEXT: vadd.vx v9, v9, a1 ; LMULMAX1-RV64-NEXT: vadd.vx v8, v8, a1 ; LMULMAX1-RV64-NEXT: vse64.v v8, (a2) -; LMULMAX1-RV64-NEXT: addi a0, a2, 96 -; LMULMAX1-RV64-NEXT: vse64.v v9, (a0) ; LMULMAX1-RV64-NEXT: addi a0, a2, 112 +; LMULMAX1-RV64-NEXT: vse64.v v9, (a0) +; LMULMAX1-RV64-NEXT: addi a0, a2, 96 ; LMULMAX1-RV64-NEXT: vse64.v v10, (a0) -; LMULMAX1-RV64-NEXT: addi a0, a2, 64 -; LMULMAX1-RV64-NEXT: vse64.v v11, (a0) ; LMULMAX1-RV64-NEXT: addi a0, a2, 80 +; LMULMAX1-RV64-NEXT: vse64.v v11, (a0) +; LMULMAX1-RV64-NEXT: addi a0, a2, 64 ; LMULMAX1-RV64-NEXT: vse64.v v15, (a0) -; LMULMAX1-RV64-NEXT: addi a0, a2, 32 -; LMULMAX1-RV64-NEXT: vse64.v v14, (a0) ; LMULMAX1-RV64-NEXT: addi a0, a2, 48 +; LMULMAX1-RV64-NEXT: vse64.v v14, (a0) +; LMULMAX1-RV64-NEXT: addi a0, a2, 32 ; LMULMAX1-RV64-NEXT: vse64.v v12, (a0) ; LMULMAX1-RV64-NEXT: addi a2, a2, 16 ; LMULMAX1-RV64-NEXT: vse64.v v13, (a2) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll @@ -85,10 +85,10 @@ ; LMULMAX1-NEXT: vlse8.v v8, (a1), zero ; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: addi a3, a0, 48 -; LMULMAX1-NEXT: vse8.v v8, (a1) ; LMULMAX1-NEXT: vse8.v v8, (a3) -; LMULMAX1-NEXT: vse8.v v8, (a0) +; LMULMAX1-NEXT: vse8.v v8, (a1) ; LMULMAX1-NEXT: vse8.v v8, (a2) +; LMULMAX1-NEXT: vse8.v v8, (a0) ; LMULMAX1-NEXT: ret %a = load <64 x i8>, ptr %x %b = extractelement <64 x i8> %a, i32 32 @@ -118,8 
+118,8 @@ ; LMULMAX1-NEXT: addi a3, a0, 32 ; LMULMAX1-NEXT: vse16.v v8, (a3) ; LMULMAX1-NEXT: vse16.v v8, (a1) -; LMULMAX1-NEXT: vse16.v v8, (a0) ; LMULMAX1-NEXT: vse16.v v8, (a2) +; LMULMAX1-NEXT: vse16.v v8, (a0) ; LMULMAX1-NEXT: ret %a = load <32 x i16>, ptr %x %b = extractelement <32 x i16> %a, i32 25 @@ -146,10 +146,10 @@ ; LMULMAX1-NEXT: addi a1, a0, 32 ; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: addi a3, a0, 48 -; LMULMAX1-NEXT: vse32.v v8, (a1) ; LMULMAX1-NEXT: vse32.v v8, (a3) -; LMULMAX1-NEXT: vse32.v v8, (a0) +; LMULMAX1-NEXT: vse32.v v8, (a1) ; LMULMAX1-NEXT: vse32.v v8, (a2) +; LMULMAX1-NEXT: vse32.v v8, (a0) ; LMULMAX1-NEXT: ret %a = load <16 x i32>, ptr %x %b = extractelement <16 x i32> %a, i32 9 @@ -174,8 +174,8 @@ ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vlse64.v v8, (a1), zero ; LMULMAX1-NEXT: addi a1, a0, 16 -; LMULMAX1-NEXT: addi a2, a0, 48 -; LMULMAX1-NEXT: addi a3, a0, 32 +; LMULMAX1-NEXT: addi a2, a0, 32 +; LMULMAX1-NEXT: addi a3, a0, 48 ; LMULMAX1-NEXT: vse64.v v8, (a3) ; LMULMAX1-NEXT: vse64.v v8, (a2) ; LMULMAX1-NEXT: vse64.v v8, (a0) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -5049,17 +5049,17 @@ ; ; LMULMAX1-LABEL: mulhu_v32i8: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; LMULMAX1-NEXT: addi a1, a0, 16 -; LMULMAX1-NEXT: vle8.v v8, (a1) +; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; LMULMAX1-NEXT: vle8.v v8, (a0) ; LMULMAX1-NEXT: lui a2, %hi(.LCPI181_0) ; LMULMAX1-NEXT: addi a2, a2, %lo(.LCPI181_0) ; LMULMAX1-NEXT: vle8.v v9, (a2) -; LMULMAX1-NEXT: vle8.v v10, (a0) +; LMULMAX1-NEXT: vle8.v v10, (a1) ; LMULMAX1-NEXT: vdivu.vv v8, v8, v9 ; LMULMAX1-NEXT: vdivu.vv v9, v10, v9 -; LMULMAX1-NEXT: vse8.v v9, (a0) -; LMULMAX1-NEXT: vse8.v v8, (a1) +; LMULMAX1-NEXT: vse8.v v9, (a1) +; LMULMAX1-NEXT: vse8.v v8, (a0) ; LMULMAX1-NEXT: ret %a = load <32 x i8>, ptr %x %b = udiv <32 x i8> %a, @@ -5148,17 +5148,17 @@ ; ; LMULMAX1-LABEL: mulhu_v16i16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; LMULMAX1-NEXT: addi a1, a0, 16 -; LMULMAX1-NEXT: vle16.v v8, (a1) +; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; LMULMAX1-NEXT: vle16.v v8, (a0) ; LMULMAX1-NEXT: lui a2, %hi(.LCPI182_0) ; LMULMAX1-NEXT: addi a2, a2, %lo(.LCPI182_0) ; LMULMAX1-NEXT: vle16.v v9, (a2) -; LMULMAX1-NEXT: vle16.v v10, (a0) +; LMULMAX1-NEXT: vle16.v v10, (a1) ; LMULMAX1-NEXT: vdivu.vv v8, v8, v9 ; LMULMAX1-NEXT: vdivu.vv v9, v10, v9 -; LMULMAX1-NEXT: vse16.v v9, (a0) -; LMULMAX1-NEXT: vse16.v v8, (a1) +; LMULMAX1-NEXT: vse16.v v9, (a1) +; LMULMAX1-NEXT: vse16.v v8, (a0) ; LMULMAX1-NEXT: ret %a = load <16 x i16>, ptr %x %b = udiv <16 x i16> %a, @@ -5197,10 +5197,10 @@ ; ; LMULMAX1-RV32-LABEL: mulhu_v8i32: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle32.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle32.v v9, (a1) +; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-RV32-NEXT: vle32.v v8, (a1) +; LMULMAX1-RV32-NEXT: vle32.v v9, (a0) ; LMULMAX1-RV32-NEXT: lui a2, 524288 ; LMULMAX1-RV32-NEXT: vmv.s.x v10, a2 ; LMULMAX1-RV32-NEXT: vmv.v.i v11, 0 @@ -5223,23 +5223,23 @@ ; LMULMAX1-RV32-NEXT: vmulhu.vv v8, v8, v11 ; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 ; LMULMAX1-RV32-NEXT: vsrl.vv v8, v8, v13 -; LMULMAX1-RV32-NEXT: vse32.v 
v8, (a0) -; LMULMAX1-RV32-NEXT: vse32.v v9, (a1) +; LMULMAX1-RV32-NEXT: vse32.v v8, (a1) +; LMULMAX1-RV32-NEXT: vse32.v v9, (a0) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: mulhu_v8i32: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle32.v v8, (a1) +; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-RV64-NEXT: vle32.v v8, (a0) ; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI183_0) ; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI183_0) ; LMULMAX1-RV64-NEXT: vle32.v v9, (a2) -; LMULMAX1-RV64-NEXT: vle32.v v10, (a0) +; LMULMAX1-RV64-NEXT: vle32.v v10, (a1) ; LMULMAX1-RV64-NEXT: vdivu.vv v8, v8, v9 ; LMULMAX1-RV64-NEXT: vdivu.vv v9, v10, v9 -; LMULMAX1-RV64-NEXT: vse32.v v9, (a0) -; LMULMAX1-RV64-NEXT: vse32.v v8, (a1) +; LMULMAX1-RV64-NEXT: vse32.v v9, (a1) +; LMULMAX1-RV64-NEXT: vse32.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <8 x i32>, ptr %x %b = udiv <8 x i32> %a, @@ -5304,10 +5304,10 @@ ; ; LMULMAX1-RV32-LABEL: mulhu_v4i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v9, (a1) +; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-RV32-NEXT: vle64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vle64.v v9, (a0) ; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI184_0) ; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI184_0) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -5320,8 +5320,8 @@ ; LMULMAX1-RV32-NEXT: vle32.v v10, (a2) ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-RV32-NEXT: vdivu.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v9, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v9, (a0) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: mulhu_v4i64: @@ -5420,10 +5420,10 @@ ; ; LMULMAX1-RV32-LABEL: mulhs_v32i8: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle8.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle8.v v9, (a1) +; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; LMULMAX1-RV32-NEXT: vle8.v v8, (a1) +; LMULMAX1-RV32-NEXT: vle8.v v9, (a0) ; LMULMAX1-RV32-NEXT: lui a2, 5 ; LMULMAX1-RV32-NEXT: addi a2, a2, -1452 ; LMULMAX1-RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma @@ -5433,16 +5433,16 @@ ; LMULMAX1-RV32-NEXT: vmerge.vim v10, v10, 9, v0 ; LMULMAX1-RV32-NEXT: vdivu.vv v9, v9, v10 ; LMULMAX1-RV32-NEXT: vdivu.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vse8.v v8, (a0) -; LMULMAX1-RV32-NEXT: vse8.v v9, (a1) +; LMULMAX1-RV32-NEXT: vse8.v v8, (a1) +; LMULMAX1-RV32-NEXT: vse8.v v9, (a0) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: mulhs_v32i8: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX1-RV64-NEXT: vle8.v v8, (a0) ; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle8.v v9, (a1) +; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; LMULMAX1-RV64-NEXT: vle8.v v8, (a1) +; LMULMAX1-RV64-NEXT: vle8.v v9, (a0) ; LMULMAX1-RV64-NEXT: lui a2, 5 ; LMULMAX1-RV64-NEXT: addiw a2, a2, -1452 ; LMULMAX1-RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma @@ -5452,8 +5452,8 @@ ; LMULMAX1-RV64-NEXT: vmerge.vim v10, v10, 9, v0 ; LMULMAX1-RV64-NEXT: vdivu.vv v9, v9, v10 ; LMULMAX1-RV64-NEXT: vdivu.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vse8.v v8, (a0) -; LMULMAX1-RV64-NEXT: vse8.v v9, (a1) +; 
LMULMAX1-RV64-NEXT: vse8.v v8, (a1) +; LMULMAX1-RV64-NEXT: vse8.v v9, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <32 x i8>, ptr %x %b = udiv <32 x i8> %a, @@ -5508,18 +5508,18 @@ ; ; LMULMAX1-LABEL: mulhs_v16i16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v8, (a0) ; LMULMAX1-NEXT: addi a1, a0, 16 -; LMULMAX1-NEXT: vle16.v v9, (a1) +; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; LMULMAX1-NEXT: vle16.v v8, (a1) +; LMULMAX1-NEXT: vle16.v v9, (a0) ; LMULMAX1-NEXT: li a2, 105 ; LMULMAX1-NEXT: vmv.v.x v0, a2 ; LMULMAX1-NEXT: vmv.v.i v10, 7 ; LMULMAX1-NEXT: vmerge.vim v10, v10, -7, v0 ; LMULMAX1-NEXT: vdiv.vv v9, v9, v10 ; LMULMAX1-NEXT: vdiv.vv v8, v8, v10 -; LMULMAX1-NEXT: vse16.v v8, (a0) -; LMULMAX1-NEXT: vse16.v v9, (a1) +; LMULMAX1-NEXT: vse16.v v8, (a1) +; LMULMAX1-NEXT: vse16.v v9, (a0) ; LMULMAX1-NEXT: ret %a = load <16 x i16>, ptr %x %b = sdiv <16 x i16> %a, @@ -5567,10 +5567,10 @@ ; ; LMULMAX1-RV32-LABEL: mulhs_v8i32: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle32.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle32.v v9, (a1) +; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-RV32-NEXT: vle32.v v8, (a1) +; LMULMAX1-RV32-NEXT: vle32.v v9, (a0) ; LMULMAX1-RV32-NEXT: lui a2, 419430 ; LMULMAX1-RV32-NEXT: addi a2, a2, 1639 ; LMULMAX1-RV32-NEXT: vmv.v.x v10, a2 @@ -5586,16 +5586,16 @@ ; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 31 ; LMULMAX1-RV32-NEXT: vsra.vi v8, v8, 1 ; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV32-NEXT: vse32.v v9, (a1) +; LMULMAX1-RV32-NEXT: vse32.v v8, (a1) +; LMULMAX1-RV32-NEXT: vse32.v v9, (a0) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: mulhs_v8i32: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-RV64-NEXT: vle32.v v8, (a0) ; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle32.v v9, (a1) +; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-RV64-NEXT: vle32.v v8, (a1) +; LMULMAX1-RV64-NEXT: vle32.v v9, (a0) ; LMULMAX1-RV64-NEXT: li a2, 3 ; LMULMAX1-RV64-NEXT: slli a2, a2, 33 ; LMULMAX1-RV64-NEXT: addi a2, a2, -5 @@ -5604,8 +5604,8 @@ ; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-RV64-NEXT: vdiv.vv v9, v9, v10 ; LMULMAX1-RV64-NEXT: vdiv.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV64-NEXT: vse32.v v9, (a1) +; LMULMAX1-RV64-NEXT: vse32.v v8, (a1) +; LMULMAX1-RV64-NEXT: vse32.v v9, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <8 x i32>, ptr %x %b = sdiv <8 x i32> %a, @@ -5682,10 +5682,10 @@ ; ; LMULMAX1-RV32-LABEL: mulhs_v4i64: ; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV32-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v9, (a1) +; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-RV32-NEXT: vle64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vle64.v v9, (a0) ; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI188_0) ; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI188_0) ; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -5693,22 +5693,22 @@ ; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-RV32-NEXT: vdiv.vv v9, v9, v10 ; LMULMAX1-RV32-NEXT: vdiv.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v9, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v8, (a1) +; LMULMAX1-RV32-NEXT: vse64.v v9, (a0) ; 
LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: mulhs_v4i64: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-RV64-NEXT: vle64.v v8, (a0) ; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-RV64-NEXT: vle64.v v8, (a1) ; LMULMAX1-RV64-NEXT: lui a2, 349525 ; LMULMAX1-RV64-NEXT: addiw a2, a2, 1365 ; LMULMAX1-RV64-NEXT: slli a3, a2, 32 ; LMULMAX1-RV64-NEXT: add a2, a2, a3 ; LMULMAX1-RV64-NEXT: lui a3, %hi(.LCPI188_0) ; LMULMAX1-RV64-NEXT: ld a3, %lo(.LCPI188_0)(a3) -; LMULMAX1-RV64-NEXT: vle64.v v9, (a1) +; LMULMAX1-RV64-NEXT: vle64.v v9, (a0) ; LMULMAX1-RV64-NEXT: vmv.v.x v10, a2 ; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma ; LMULMAX1-RV64-NEXT: vmv.s.x v10, a3 @@ -5726,8 +5726,8 @@ ; LMULMAX1-RV64-NEXT: vsrl.vx v8, v10, a2 ; LMULMAX1-RV64-NEXT: vsra.vv v10, v10, v12 ; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV64-NEXT: vse64.v v9, (a1) +; LMULMAX1-RV64-NEXT: vse64.v v8, (a1) +; LMULMAX1-RV64-NEXT: vse64.v v9, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, ptr %x %b = sdiv <4 x i64> %a, diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-splat.ll @@ -163,16 +163,6 @@ } define void @splat_v32i1(ptr %x, i1 %y) { -; LMULMAX2-LABEL: splat_v32i1: -; LMULMAX2: # %bb.0: -; LMULMAX2-NEXT: andi a1, a1, 1 -; LMULMAX2-NEXT: li a2, 32 -; LMULMAX2-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; LMULMAX2-NEXT: vmv.v.x v8, a1 -; LMULMAX2-NEXT: vmsne.vi v10, v8, 0 -; LMULMAX2-NEXT: vsm.v v10, (a0) -; LMULMAX2-NEXT: ret -; ; LMULMAX1-RV32-LABEL: splat_v32i1: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: andi a1, a1, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -72,10 +72,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB1_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB1_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB1_4 @@ -83,13 +83,13 @@ ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB1_3: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB1_2 ; RV64ZVE32F-NEXT: .LBB1_4: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret @@ -124,24 +124,24 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; 
RV64ZVE32F-NEXT: beqz a3, .LBB2_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB2_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB2_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB2_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB2_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vsext.vf2 v9, v8 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -177,24 +177,24 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB3_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB3_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB3_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB3_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB3_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vzext.vf2 v9, v8 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -230,24 +230,24 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB4_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB4_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB4_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB4_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB4_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vsext.vf4 v9, v8 ; RV64ZVE32F-NEXT: vmv.v.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -283,24 +283,24 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; 
RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB5_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB5_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB5_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB5_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB5_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vzext.vf4 v9, v8 ; RV64ZVE32F-NEXT: vmv.v.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -343,20 +343,20 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB6_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB6_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB6_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB6_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB6_4: # %else2 @@ -404,20 +404,20 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB7_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB7_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lbu a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB7_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB7_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lbu a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB7_4: # %else2 @@ -452,10 +452,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v4i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: bnez a2, .LBB8_5 +; RV64ZVE32F-NEXT: beqz a2, .LBB8_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB8_6 @@ -470,15 +470,16 @@ ; RV64ZVE32F-NEXT: .LBB8_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; 
RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB8_2 ; RV64ZVE32F-NEXT: .LBB8_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB8_3 @@ -518,9 +519,7 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v9 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: li a1, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB9_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -608,8 +607,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: bnez a2, .LBB11_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: beqz a2, .LBB11_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB11_10 @@ -636,15 +636,16 @@ ; RV64ZVE32F-NEXT: .LBB11_9: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB11_2 ; RV64ZVE32F-NEXT: .LBB11_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB11_3 @@ -722,13 +723,14 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB12_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB12_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB12_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -896,10 +898,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB14_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB14_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB14_4 @@ -907,13 +909,13 @@ ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB14_3: # %cond.load ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; 
RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB14_2 ; RV64ZVE32F-NEXT: .LBB14_4: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret @@ -948,24 +950,24 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB15_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB15_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB15_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB15_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB15_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vsext.vf2 v9, v8 ; RV64ZVE32F-NEXT: vmv.v.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -1001,24 +1003,24 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i16_zextload_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB16_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB16_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB16_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB16_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB16_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vzext.vf2 v9, v8 ; RV64ZVE32F-NEXT: vmv.v.v v8, v9 ; RV64ZVE32F-NEXT: ret @@ -1061,20 +1063,20 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB17_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB17_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB17_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 
; RV64ZVE32F-NEXT: beqz a2, .LBB17_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB17_4: # %else2 @@ -1124,28 +1126,29 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i16_zextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB18_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB18_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lh a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB18_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB18_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lh a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB18_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: addiw a1, a1, -1 ; RV64ZVE32F-NEXT: and a0, a0, a1 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: and a1, a2, a1 @@ -1174,10 +1177,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v4i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: bnez a2, .LBB19_5 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB19_6 @@ -1192,15 +1195,16 @@ ; RV64ZVE32F-NEXT: .LBB19_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB19_2 ; RV64ZVE32F-NEXT: .LBB19_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB19_3 @@ -1240,9 +1244,7 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v9 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: li a1, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB20_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -1330,8 +1332,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: bnez a2, 
.LBB22_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: beqz a2, .LBB22_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB22_10 @@ -1358,15 +1361,16 @@ ; RV64ZVE32F-NEXT: .LBB22_9: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB22_2 ; RV64ZVE32F-NEXT: .LBB22_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB22_3 @@ -1446,14 +1450,15 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB23_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB23_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB23_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -1589,14 +1594,15 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB24_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB24_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB24_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -1733,15 +1739,16 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB25_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB25_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB25_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -1885,10 +1892,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB26_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB26_2 
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2068,10 +2076,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB28_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB28_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB28_4 @@ -2079,13 +2087,13 @@ ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB28_3: # %cond.load ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB28_2 ; RV64ZVE32F-NEXT: .LBB28_4: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret @@ -2129,20 +2137,20 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i32_sextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB29_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB29_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB29_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB29_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB29_4: # %else2 @@ -2188,20 +2196,20 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i32_zextload_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB30_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB30_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: lw a0, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: .LBB30_2: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB30_4 ; RV64ZVE32F-NEXT: # %bb.3: # %cond.load1 ; RV64ZVE32F-NEXT: lw a0, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a0 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: .LBB30_4: # %else2 @@ -2238,10 +2246,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v4i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, 
mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: bnez a2, .LBB31_5 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB31_6 @@ -2256,15 +2264,16 @@ ; RV64ZVE32F-NEXT: .LBB31_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB31_2 ; RV64ZVE32F-NEXT: .LBB31_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB31_3 @@ -2303,9 +2312,7 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v9 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: li a1, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB32_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -2393,8 +2400,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: bnez a2, .LBB34_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB34_10 @@ -2421,7 +2429,7 @@ ; RV64ZVE32F-NEXT: .LBB34_9: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB34_2 @@ -2508,14 +2516,15 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB35_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -2656,14 +2665,15 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB36_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB36_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, ma ; 
RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB36_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -2805,15 +2815,16 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB37_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB37_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB37_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -2962,10 +2973,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB38_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB38_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -3111,10 +3123,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB39_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB39_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -3262,11 +3275,12 @@ ; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: addiw a1, a1, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB40_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB40_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 @@ -3420,10 +3434,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB41_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB41_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -3604,10 +3619,10 @@ ; ; RV32ZVE32F-LABEL: mgather_v2i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 ; RV32ZVE32F-NEXT: vmv.x.s a4, v0 -; RV32ZVE32F-NEXT: andi 
a2, a4, 1 -; RV32ZVE32F-NEXT: beqz a2, .LBB43_3 +; RV32ZVE32F-NEXT: bnez a2, .LBB43_3 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 @@ -3639,10 +3654,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a5, v0 ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi a5, a4, 1 -; RV64ZVE32F-NEXT: beqz a5, .LBB43_2 +; RV64ZVE32F-NEXT: bnez a5, .LBB43_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: .LBB43_2: # %else @@ -3677,10 +3692,10 @@ ; ; RV32ZVE32F-LABEL: mgather_v4i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 ; RV32ZVE32F-NEXT: vmv.x.s a6, v0 -; RV32ZVE32F-NEXT: andi a2, a6, 1 -; RV32ZVE32F-NEXT: beqz a2, .LBB44_5 +; RV32ZVE32F-NEXT: bnez a2, .LBB44_5 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 @@ -3742,10 +3757,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v4i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB44_5 +; RV64ZVE32F-NEXT: bnez a3, .LBB44_5 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: ld a3, 0(a1) ; RV64ZVE32F-NEXT: ld a3, 0(a3) @@ -3805,9 +3820,7 @@ ; ; RV32ZVE32F-LABEL: mgather_truemask_v4i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmset.m v9 -; RV32ZVE32F-NEXT: vmv.x.s a6, v9 +; RV32ZVE32F-NEXT: li a6, 15 ; RV32ZVE32F-NEXT: bnez zero, .LBB45_5 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load ; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma @@ -3870,9 +3883,7 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v8 -; RV64ZVE32F-NEXT: vmv.x.s a5, v8 +; RV64ZVE32F-NEXT: li a5, 15 ; RV64ZVE32F-NEXT: bnez zero, .LBB45_5 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: ld a3, 0(a1) @@ -3992,10 +4003,11 @@ ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a2, t0, 1 -; RV32ZVE32F-NEXT: beqz a2, .LBB47_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: bnez a2, .LBB47_9 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a2, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -4120,8 +4132,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi a3, a6, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB47_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB47_9 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: ld a3, 0(a1) ; RV64ZVE32F-NEXT: ld a3, 0(a3) @@ -4239,10 +4252,11 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, 
ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 -; RV32ZVE32F-NEXT: beqz a1, .LBB48_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: bnez a1, .LBB48_9 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -4366,22 +4380,23 @@ ; RV64ZVE32F-LABEL: mgather_baseidx_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi a3, a6, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB48_3 +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB48_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: andi a4, a5, 2 ; RV64ZVE32F-NEXT: bnez a4, .LBB48_4 ; RV64ZVE32F-NEXT: .LBB48_2: ; RV64ZVE32F-NEXT: ld a4, 8(a2) ; RV64ZVE32F-NEXT: j .LBB48_5 ; RV64ZVE32F-NEXT: .LBB48_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: andi a4, a5, 2 ; RV64ZVE32F-NEXT: beqz a4, .LBB48_2 ; RV64ZVE32F-NEXT: .LBB48_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -4392,20 +4407,20 @@ ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB48_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: andi a5, a6, 4 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a5, .LBB48_7 +; RV64ZVE32F-NEXT: beqz a6, .LBB48_7 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a5, v9 -; RV64ZVE32F-NEXT: slli a5, a5, 3 -; RV64ZVE32F-NEXT: add a5, a1, a5 -; RV64ZVE32F-NEXT: ld a5, 0(a5) +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: j .LBB48_8 ; RV64ZVE32F-NEXT: .LBB48_7: -; RV64ZVE32F-NEXT: ld a5, 16(a2) +; RV64ZVE32F-NEXT: ld a6, 16(a2) ; RV64ZVE32F-NEXT: .LBB48_8: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: andi a7, a6, 8 +; RV64ZVE32F-NEXT: andi a7, a5, 8 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: beqz a7, .LBB48_12 ; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 @@ -4415,18 +4430,18 @@ ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a6, 16 +; RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB48_13 ; RV64ZVE32F-NEXT: .LBB48_10: ; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a6, 32 +; RV64ZVE32F-NEXT: andi t1, a5, 32 ; RV64ZVE32F-NEXT: bnez t1, .LBB48_14 ; RV64ZVE32F-NEXT: .LBB48_11: ; RV64ZVE32F-NEXT: ld t1, 40(a2) ; RV64ZVE32F-NEXT: j .LBB48_15 ; RV64ZVE32F-NEXT: .LBB48_12: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a6, 16 +; RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: beqz t0, .LBB48_10 ; RV64ZVE32F-NEXT: .LBB48_13: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma @@ -4434,7 +4449,7 @@ ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a6, 32 +; 
RV64ZVE32F-NEXT: andi t1, a5, 32 ; RV64ZVE32F-NEXT: beqz t1, .LBB48_11 ; RV64ZVE32F-NEXT: .LBB48_14: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -4445,7 +4460,7 @@ ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB48_15: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: andi t2, a6, 64 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: beqz t2, .LBB48_18 ; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 @@ -4453,15 +4468,15 @@ ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: bnez a6, .LBB48_19 +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB48_19 ; RV64ZVE32F-NEXT: .LBB48_17: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB48_20 ; RV64ZVE32F-NEXT: .LBB48_18: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: beqz a6, .LBB48_17 +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB48_17 ; RV64ZVE32F-NEXT: .LBB48_19: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -4471,7 +4486,7 @@ ; RV64ZVE32F-NEXT: .LBB48_20: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) -; RV64ZVE32F-NEXT: sd a5, 16(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) ; RV64ZVE32F-NEXT: sd t1, 40(a0) @@ -4517,10 +4532,11 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 -; RV32ZVE32F-NEXT: beqz a1, .LBB49_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: bnez a1, .LBB49_9 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -4644,22 +4660,23 @@ ; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi a3, a6, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB49_3 +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB49_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: andi a4, a5, 2 ; RV64ZVE32F-NEXT: bnez a4, .LBB49_4 ; RV64ZVE32F-NEXT: .LBB49_2: ; RV64ZVE32F-NEXT: ld a4, 8(a2) ; RV64ZVE32F-NEXT: j .LBB49_5 ; RV64ZVE32F-NEXT: .LBB49_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: andi a4, a5, 2 ; RV64ZVE32F-NEXT: beqz a4, .LBB49_2 ; RV64ZVE32F-NEXT: .LBB49_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -4670,20 +4687,20 @@ ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB49_5: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: andi a5, a6, 4 +; RV64ZVE32F-NEXT: andi a6, a5, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a5, .LBB49_7 +; RV64ZVE32F-NEXT: beqz a6, .LBB49_7 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 
-; RV64ZVE32F-NEXT: vmv.x.s a5, v9 -; RV64ZVE32F-NEXT: slli a5, a5, 3 -; RV64ZVE32F-NEXT: add a5, a1, a5 -; RV64ZVE32F-NEXT: ld a5, 0(a5) +; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) ; RV64ZVE32F-NEXT: j .LBB49_8 ; RV64ZVE32F-NEXT: .LBB49_7: -; RV64ZVE32F-NEXT: ld a5, 16(a2) +; RV64ZVE32F-NEXT: ld a6, 16(a2) ; RV64ZVE32F-NEXT: .LBB49_8: # %else5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: andi a7, a6, 8 +; RV64ZVE32F-NEXT: andi a7, a5, 8 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: beqz a7, .LBB49_12 ; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 @@ -4693,18 +4710,18 @@ ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a6, 16 +; RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: bnez t0, .LBB49_13 ; RV64ZVE32F-NEXT: .LBB49_10: ; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a6, 32 +; RV64ZVE32F-NEXT: andi t1, a5, 32 ; RV64ZVE32F-NEXT: bnez t1, .LBB49_14 ; RV64ZVE32F-NEXT: .LBB49_11: ; RV64ZVE32F-NEXT: ld t1, 40(a2) ; RV64ZVE32F-NEXT: j .LBB49_15 ; RV64ZVE32F-NEXT: .LBB49_12: ; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a6, 16 +; RV64ZVE32F-NEXT: andi t0, a5, 16 ; RV64ZVE32F-NEXT: beqz t0, .LBB49_10 ; RV64ZVE32F-NEXT: .LBB49_13: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma @@ -4712,7 +4729,7 @@ ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a6, 32 +; RV64ZVE32F-NEXT: andi t1, a5, 32 ; RV64ZVE32F-NEXT: beqz t1, .LBB49_11 ; RV64ZVE32F-NEXT: .LBB49_14: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -4723,7 +4740,7 @@ ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: .LBB49_15: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: andi t2, a6, 64 +; RV64ZVE32F-NEXT: andi t2, a5, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 ; RV64ZVE32F-NEXT: beqz t2, .LBB49_18 ; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 @@ -4731,15 +4748,15 @@ ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: bnez a6, .LBB49_19 +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB49_19 ; RV64ZVE32F-NEXT: .LBB49_17: ; RV64ZVE32F-NEXT: ld a1, 56(a2) ; RV64ZVE32F-NEXT: j .LBB49_20 ; RV64ZVE32F-NEXT: .LBB49_18: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: beqz a6, .LBB49_17 +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB49_17 ; RV64ZVE32F-NEXT: .LBB49_19: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -4749,7 +4766,7 @@ ; RV64ZVE32F-NEXT: .LBB49_20: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) -; RV64ZVE32F-NEXT: sd a5, 16(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) ; RV64ZVE32F-NEXT: sd t1, 40(a0) @@ -4796,10 +4813,11 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 -; RV32ZVE32F-NEXT: beqz a1, .LBB50_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: bnez a1, .LBB50_9 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: 
vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -4924,8 +4942,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB50_3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB50_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: andi a3, a3, 255 @@ -5083,10 +5102,11 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 -; RV32ZVE32F-NEXT: beqz a1, .LBB51_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: bnez a1, .LBB51_9 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -5211,10 +5231,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB51_3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB51_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 @@ -5362,10 +5383,11 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 -; RV32ZVE32F-NEXT: beqz a1, .LBB52_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: bnez a1, .LBB52_9 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -5490,10 +5512,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB52_3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB52_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 @@ -5642,10 +5665,11 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 -; RV32ZVE32F-NEXT: beqz a1, .LBB53_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: bnez a1, .LBB53_9 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; 
RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -5771,11 +5795,12 @@ ; RV64ZVE32F-NEXT: lui a5, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi a3, a6, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: addiw a5, a5, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB53_3 +; RV64ZVE32F-NEXT: bnez a3, .LBB53_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a5 ; RV64ZVE32F-NEXT: slli a3, a3, 3 @@ -5930,10 +5955,11 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 -; RV32ZVE32F-NEXT: beqz a1, .LBB54_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: bnez a1, .LBB54_9 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -6058,10 +6084,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB54_3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB54_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 @@ -6203,10 +6230,11 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 -; RV32ZVE32F-NEXT: beqz a1, .LBB55_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: bnez a1, .LBB55_9 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -6331,10 +6359,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB55_3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB55_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 @@ -6477,10 +6506,11 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 -; RV32ZVE32F-NEXT: beqz a1, .LBB56_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: bnez a1, .LBB56_9 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, 
e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a3) ; RV32ZVE32F-NEXT: lw a3, 0(a3) @@ -6605,10 +6635,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi a3, a5, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB56_3 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB56_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 32 ; RV64ZVE32F-NEXT: srli a3, a3, 29 @@ -6775,10 +6806,11 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 -; RV32ZVE32F-NEXT: andi a1, t0, 1 -; RV32ZVE32F-NEXT: beqz a1, .LBB57_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: bnez a1, .LBB57_9 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: lw a1, 4(a2) ; RV32ZVE32F-NEXT: lw a2, 0(a2) @@ -6903,8 +6935,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a7, v0 -; RV64ZVE32F-NEXT: andi a4, a7, 1 -; RV64ZVE32F-NEXT: beqz a4, .LBB57_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a4, v0 +; RV64ZVE32F-NEXT: bnez a4, .LBB57_9 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: ld a4, 0(a2) ; RV64ZVE32F-NEXT: slli a4, a4, 3 @@ -7069,10 +7102,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v2f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB59_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB59_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB59_4 @@ -7080,13 +7113,13 @@ ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB59_3: # %cond.load ; RV64ZVE32F-NEXT: flh fa5, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB59_2 ; RV64ZVE32F-NEXT: .LBB59_4: # %cond.load1 ; RV64ZVE32F-NEXT: flh fa5, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret @@ -7113,10 +7146,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v4f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: bnez a2, .LBB60_5 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB60_6 @@ -7131,15 +7164,16 @@ ; RV64ZVE32F-NEXT: .LBB60_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, 
zero, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB60_2 ; RV64ZVE32F-NEXT: .LBB60_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB60_3 @@ -7179,9 +7213,7 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v9 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: li a1, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB61_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -7269,8 +7301,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: bnez a2, .LBB63_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: beqz a2, .LBB63_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB63_10 @@ -7297,15 +7330,16 @@ ; RV64ZVE32F-NEXT: .LBB63_9: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB63_2 ; RV64ZVE32F-NEXT: .LBB63_10: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB63_3 @@ -7385,14 +7419,15 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB64_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB64_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 ; RV64ZVE32F-NEXT: .LBB64_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -7528,14 +7563,15 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB65_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB65_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 ; RV64ZVE32F-NEXT: .LBB65_2: # %else ; 
RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -7672,15 +7708,16 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB66_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB66_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 ; RV64ZVE32F-NEXT: .LBB66_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -7824,10 +7861,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB67_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB67_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -8007,10 +8045,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v2f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB69_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB69_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB69_4 @@ -8018,13 +8056,13 @@ ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB69_3: # %cond.load ; RV64ZVE32F-NEXT: flw fa5, 0(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB69_2 ; RV64ZVE32F-NEXT: .LBB69_4: # %cond.load1 ; RV64ZVE32F-NEXT: flw fa5, 0(a1) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: ret @@ -8051,10 +8089,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v4f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: bnez a2, .LBB70_5 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB70_6 @@ -8069,15 +8107,16 @@ ; RV64ZVE32F-NEXT: .LBB70_5: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB70_2 ; RV64ZVE32F-NEXT: .LBB70_6: # %cond.load1 ; RV64ZVE32F-NEXT: ld a2, 8(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; 
RV64ZVE32F-NEXT: vfmv.s.f v9, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: beqz a2, .LBB70_3 @@ -8116,9 +8155,7 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v9 -; RV64ZVE32F-NEXT: vmv.x.s a1, v9 +; RV64ZVE32F-NEXT: li a1, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB71_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -8206,8 +8243,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: bnez a2, .LBB73_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: beqz a2, .LBB73_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB73_10 @@ -8234,7 +8272,7 @@ ; RV64ZVE32F-NEXT: .LBB73_9: # %cond.load ; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: andi a2, a1, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB73_2 @@ -8321,14 +8359,15 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB74_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: .LBB74_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -8469,14 +8508,15 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB75_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB75_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: .LBB75_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -8618,15 +8658,16 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB76_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB76_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: .LBB76_2: # %else ; RV64ZVE32F-NEXT: andi 
a2, a1, 2 @@ -8775,10 +8816,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB77_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB77_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -8924,10 +8966,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB78_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB78_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -9075,11 +9118,12 @@ ; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: addiw a1, a1, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB79_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB79_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 @@ -9233,10 +9277,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB80_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB80_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -9415,10 +9460,10 @@ ; ; RV32ZVE32F-LABEL: mgather_v2f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB82_3 +; RV32ZVE32F-NEXT: beqz a1, .LBB82_3 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a0, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB82_4 @@ -9439,10 +9484,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v2f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB82_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB82_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB82_4 @@ -9478,10 +9523,10 @@ ; ; RV32ZVE32F-LABEL: mgather_v4f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, 
mf4, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB83_6 +; RV32ZVE32F-NEXT: beqz a2, .LBB83_6 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB83_7 @@ -9526,10 +9571,10 @@ ; ; RV64ZVE32F-LABEL: mgather_v4f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB83_6 +; RV64ZVE32F-NEXT: beqz a3, .LBB83_6 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a3, a2, 2 ; RV64ZVE32F-NEXT: bnez a3, .LBB83_7 @@ -9584,9 +9629,7 @@ ; ; RV32ZVE32F-LABEL: mgather_truemask_v4f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmset.m v9 -; RV32ZVE32F-NEXT: vmv.x.s a1, v9 +; RV32ZVE32F-NEXT: li a1, 15 ; RV32ZVE32F-NEXT: beqz zero, .LBB84_6 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -9632,9 +9675,7 @@ ; ; RV64ZVE32F-LABEL: mgather_truemask_v4f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v8 -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: li a2, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB84_6 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a3, a2, 2 @@ -9727,8 +9768,9 @@ ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB86_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB86_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB86_11 @@ -9766,7 +9808,7 @@ ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB86_10: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -9819,8 +9861,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB86_10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_10 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a3, a2, 2 ; RV64ZVE32F-NEXT: bnez a3, .LBB86_11 @@ -9923,8 +9966,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB87_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB87_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB87_11 @@ -9962,7 +10006,7 @@ ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB87_10: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -10015,8 +10059,9 @@ ; RV64ZVE32F: # %bb.0: ; 
RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB87_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB87_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 @@ -10140,8 +10185,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB88_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB88_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB88_11 @@ -10179,7 +10225,7 @@ ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB88_10: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -10232,8 +10278,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB88_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB88_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 @@ -10358,8 +10405,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB89_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB89_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB89_11 @@ -10397,7 +10445,7 @@ ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB89_10: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -10450,8 +10498,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB89_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: andi a3, a3, 255 @@ -10584,8 +10633,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB90_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB90_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB90_11 @@ -10623,7 +10673,7 @@ ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB90_10: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, 
zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -10676,10 +10726,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB90_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB90_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 @@ -10802,8 +10853,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB91_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB91_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB91_11 @@ -10841,7 +10893,7 @@ ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB91_10: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -10894,10 +10946,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB91_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB91_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 @@ -11021,8 +11074,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB92_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB92_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB92_11 @@ -11060,7 +11114,7 @@ ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB92_10: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -11114,11 +11168,12 @@ ; RV64ZVE32F-NEXT: lui a2, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi a4, a3, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a4, v0 ; RV64ZVE32F-NEXT: addiw a2, a2, -1 -; RV64ZVE32F-NEXT: beqz a4, .LBB92_2 +; RV64ZVE32F-NEXT: bnez a4, .LBB92_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v8 ; RV64ZVE32F-NEXT: and a4, a4, a2 ; RV64ZVE32F-NEXT: slli a4, a4, 3 @@ -11248,8 
+11303,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB93_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB93_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB93_11 @@ -11287,7 +11343,7 @@ ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB93_10: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -11340,10 +11396,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB93_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB93_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 @@ -11462,8 +11519,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB94_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB94_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB94_11 @@ -11501,7 +11559,7 @@ ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB94_10: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -11554,10 +11612,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB94_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB94_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 @@ -11677,8 +11736,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB95_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB95_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB95_11 @@ -11716,7 +11776,7 @@ ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB95_10: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, 
a1, 2 @@ -11769,10 +11829,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: beqz a3, .LBB95_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: bnez a3, .LBB95_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 32 ; RV64ZVE32F-NEXT: srli a3, a3, 29 @@ -11916,8 +11977,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB96_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB96_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a2, a1, 2 ; RV32ZVE32F-NEXT: bnez a2, .LBB96_11 @@ -11955,7 +12017,7 @@ ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB96_10: # %cond.load -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: fld fa0, 0(a2) ; RV32ZVE32F-NEXT: andi a2, a1, 2 @@ -12008,8 +12070,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi a4, a3, 1 -; RV64ZVE32F-NEXT: bnez a4, .LBB96_10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a4, v0 +; RV64ZVE32F-NEXT: beqz a4, .LBB96_10 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a4, a3, 2 ; RV64ZVE32F-NEXT: bnez a4, .LBB96_11 @@ -12126,14 +12189,14 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_2 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a2 ; RV64ZVE32F-NEXT: .LBB97_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 @@ -12383,15 +12446,15 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_2 +; RV64ZVE32F-NEXT: li a2, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: .LBB98_2: # %else ; RV64ZVE32F-NEXT: andi a2, a1, 2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll @@ -363,38 +363,38 @@ define void @masked_load_v32f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { ; RV32-LABEL: masked_load_v32f64: ; RV32: # %bb.0: -; RV32-NEXT: addi a3, a1, 128 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle64.v v8, (a1) +; RV32-NEXT: addi a1, a1, 128 ; RV32-NEXT: vle64.v v16, (a1) -; RV32-NEXT: vle64.v v24, (a3) ; RV32-NEXT: fcvt.d.w fa5, zero +; RV32-NEXT: vmfeq.vf v0, v8, fa5 ; RV32-NEXT: vmfeq.vf v8, v16, fa5 -; RV32-NEXT: vmfeq.vf v0, v24, fa5 -; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vle64.v v16, (a1), v0.t +; RV32-NEXT: vle64.v v16, (a0), v0.t +; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: vle64.v v8, (a0), v0.t -; RV32-NEXT: vse64.v v8, (a2) ; RV32-NEXT: addi a0, a2, 128 -; RV32-NEXT: vse64.v v16, (a0) +; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: vse64.v v16, (a2) ; RV32-NEXT: ret ; ; RV64-LABEL: masked_load_v32f64: ; RV64: # %bb.0: -; RV64-NEXT: addi a3, a1, 128 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: addi a1, a1, 128 ; RV64-NEXT: vle64.v v16, (a1) -; RV64-NEXT: vle64.v v24, (a3) ; RV64-NEXT: fmv.d.x fa5, zero +; RV64-NEXT: vmfeq.vf v0, v8, fa5 ; RV64-NEXT: vmfeq.vf v8, v16, fa5 -; RV64-NEXT: vmfeq.vf v0, v24, fa5 -; RV64-NEXT: addi a1, a0, 128 -; RV64-NEXT: vle64.v v16, (a1), v0.t +; RV64-NEXT: vle64.v v16, (a0), v0.t +; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: vle64.v v8, (a0), v0.t -; RV64-NEXT: vse64.v v8, (a2) ; RV64-NEXT: addi a0, a2, 128 -; RV64-NEXT: vse64.v v16, (a0) +; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: vse64.v v16, (a2) ; RV64-NEXT: ret %m = load <32 x double>, ptr %m_ptr %mask = fcmp oeq <32 x double> %m, zeroinitializer @@ -426,21 +426,21 @@ define void @masked_load_v64f32(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { ; CHECK-LABEL: masked_load_v64f32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a3, a1, 128 -; CHECK-NEXT: li a4, 32 -; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: addi a1, a1, 128 ; CHECK-NEXT: vle32.v v16, (a1) -; CHECK-NEXT: vle32.v v24, (a3) ; CHECK-NEXT: fmv.w.x fa5, zero +; CHECK-NEXT: vmfeq.vf v0, v8, fa5 ; CHECK-NEXT: vmfeq.vf v8, v16, fa5 -; CHECK-NEXT: vmfeq.vf v0, v24, fa5 -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vle32.v v16, (a1), v0.t +; CHECK-NEXT: vle32.v v16, (a0), v0.t +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vle32.v v8, (a0), v0.t -; CHECK-NEXT: vse32.v v8, (a2) ; CHECK-NEXT: addi a0, a2, 128 -; CHECK-NEXT: vse32.v v16, (a0) +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vse32.v v16, (a2) ; CHECK-NEXT: ret %m = load <64 x float>, ptr %m_ptr %mask = fcmp oeq <64 x float> %m, zeroinitializer @@ -453,21 +453,21 @@ define void @masked_load_v128f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { ; CHECK-LABEL: masked_load_v128f16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a3, a1, 128 -; CHECK-NEXT: li a4, 64 -; CHECK-NEXT: vsetvli zero, a4, e16, m8, ta, ma +; CHECK-NEXT: li a3, 64 +; CHECK-NEXT: vsetvli zero, a3, e16, m8, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: addi a1, a1, 128 ; CHECK-NEXT: vle16.v v16, (a1) -; CHECK-NEXT: vle16.v v24, (a3) ; CHECK-NEXT: fmv.h.x fa5, zero +; CHECK-NEXT: vmfeq.vf v0, v8, fa5 ; CHECK-NEXT: vmfeq.vf v8, v16, fa5 -; CHECK-NEXT: vmfeq.vf v0, v24, fa5 -; CHECK-NEXT: addi a1, 
a0, 128 -; CHECK-NEXT: vle16.v v16, (a1), v0.t +; CHECK-NEXT: vle16.v v16, (a0), v0.t +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vle16.v v8, (a0), v0.t -; CHECK-NEXT: vse16.v v8, (a2) ; CHECK-NEXT: addi a0, a2, 128 -; CHECK-NEXT: vse16.v v16, (a0) +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: vse16.v v16, (a2) ; CHECK-NEXT: ret %m = load <128 x half>, ptr %m_ptr %mask = fcmp oeq <128 x half> %m, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll @@ -399,40 +399,40 @@ define void @masked_load_v32i64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { ; RV32-LABEL: masked_load_v32i64: ; RV32: # %bb.0: -; RV32-NEXT: addi a3, a1, 128 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle64.v v16, (a3) -; RV32-NEXT: vle64.v v0, (a1) +; RV32-NEXT: vle64.v v8, (a1) +; RV32-NEXT: addi a1, a1, 128 +; RV32-NEXT: vle64.v v16, (a1) ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v24, 0 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vmseq.vv v8, v0, v24 -; RV32-NEXT: vmseq.vv v0, v16, v24 -; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vle64.v v16, (a1), v0.t +; RV32-NEXT: vmseq.vv v0, v8, v24 +; RV32-NEXT: vmseq.vv v8, v16, v24 +; RV32-NEXT: vle64.v v16, (a0), v0.t +; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: vle64.v v8, (a0), v0.t -; RV32-NEXT: vse64.v v8, (a2) ; RV32-NEXT: addi a0, a2, 128 -; RV32-NEXT: vse64.v v16, (a0) +; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: vse64.v v16, (a2) ; RV32-NEXT: ret ; ; RV64-LABEL: masked_load_v32i64: ; RV64: # %bb.0: -; RV64-NEXT: addi a3, a1, 128 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: addi a1, a1, 128 ; RV64-NEXT: vle64.v v16, (a1) -; RV64-NEXT: vle64.v v24, (a3) +; RV64-NEXT: vmseq.vi v0, v8, 0 ; RV64-NEXT: vmseq.vi v8, v16, 0 -; RV64-NEXT: vmseq.vi v0, v24, 0 -; RV64-NEXT: addi a1, a0, 128 -; RV64-NEXT: vle64.v v16, (a1), v0.t +; RV64-NEXT: vle64.v v16, (a0), v0.t +; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: vle64.v v8, (a0), v0.t -; RV64-NEXT: vse64.v v8, (a2) ; RV64-NEXT: addi a0, a2, 128 -; RV64-NEXT: vse64.v v16, (a0) +; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: vse64.v v16, (a2) ; RV64-NEXT: ret %m = load <32 x i64>, ptr %m_ptr %mask = icmp eq <32 x i64> %m, zeroinitializer @@ -481,20 +481,20 @@ define void @masked_load_v64i32(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { ; CHECK-LABEL: masked_load_v64i32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a3, a1, 128 -; CHECK-NEXT: li a4, 32 -; CHECK-NEXT: vsetvli zero, a4, e32, m8, ta, ma +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: addi a1, a1, 128 ; CHECK-NEXT: vle32.v v16, (a1) -; CHECK-NEXT: vle32.v v24, (a3) +; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmseq.vi v8, v16, 0 -; CHECK-NEXT: vmseq.vi v0, v24, 0 -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vle32.v v16, (a1), v0.t +; CHECK-NEXT: vle32.v v16, (a0), v0.t +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vle32.v v8, (a0), v0.t -; CHECK-NEXT: vse32.v v8, (a2) ; CHECK-NEXT: addi a0, a2, 128 -; CHECK-NEXT: vse32.v v16, (a0) +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vse32.v v16, (a2) ; CHECK-NEXT: ret %m = load <64 x i32>, 
ptr %m_ptr %mask = icmp eq <64 x i32> %m, zeroinitializer @@ -525,20 +525,20 @@ define void @masked_load_v256i8(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind { ; CHECK-LABEL: masked_load_v256i8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a3, a1, 128 -; CHECK-NEXT: li a4, 128 -; CHECK-NEXT: vsetvli zero, a4, e8, m8, ta, ma +; CHECK-NEXT: li a3, 128 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: addi a1, a1, 128 ; CHECK-NEXT: vle8.v v16, (a1) -; CHECK-NEXT: vle8.v v24, (a3) +; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmseq.vi v8, v16, 0 -; CHECK-NEXT: vmseq.vi v0, v24, 0 -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vle8.v v16, (a1), v0.t +; CHECK-NEXT: vle8.v v16, (a0), v0.t +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vle8.v v8, (a0), v0.t -; CHECK-NEXT: vse8.v v8, (a2) ; CHECK-NEXT: addi a0, a2, 128 -; CHECK-NEXT: vse8.v v16, (a0) +; CHECK-NEXT: vse8.v v8, (a0) +; CHECK-NEXT: vse8.v v16, (a2) ; CHECK-NEXT: ret %m = load <256 x i8>, ptr %m_ptr %mask = icmp eq <256 x i8> %m, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -66,10 +66,10 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i8: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB1_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB1_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB1_4 @@ -114,10 +114,10 @@ ; RV64ZVE32F-LABEL: mscatter_v2i16_truncstore_v2i8: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: vnsrl.wi v8, v8, 0 -; RV64ZVE32F-NEXT: bnez a3, .LBB2_3 +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: beqz a3, .LBB2_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB2_4 @@ -171,10 +171,10 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vnsrl.wi v8, v8, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 ; RV64ZVE32F-NEXT: vnsrl.wi v8, v8, 0 -; RV64ZVE32F-NEXT: bnez a3, .LBB3_3 +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: vmv.x.s a2, v0 +; RV64ZVE32F-NEXT: beqz a3, .LBB3_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB3_4 @@ -234,10 +234,10 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v9, a1 ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 -; RV64ZVE32F-NEXT: vmv.x.s a0, v0 -; RV64ZVE32F-NEXT: andi a1, a0, 1 ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 -; RV64ZVE32F-NEXT: bnez a1, .LBB4_3 +; RV64ZVE32F-NEXT: vfirst.m a1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a0, v0 +; RV64ZVE32F-NEXT: beqz a1, .LBB4_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB4_4 @@ -278,10 +278,10 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: 
vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a5, v0 ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi a5, a3, 1 -; RV64ZVE32F-NEXT: bnez a5, .LBB5_5 +; RV64ZVE32F-NEXT: beqz a5, .LBB5_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB5_6 @@ -338,9 +338,7 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v9 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: li a3, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB6_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 @@ -416,8 +414,9 @@ ; RV64ZVE32F-NEXT: ld t0, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi t1, a3, 1 -; RV64ZVE32F-NEXT: bnez t1, .LBB8_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t1, v0 +; RV64ZVE32F-NEXT: beqz t1, .LBB8_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB8_10 @@ -513,8 +512,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB9_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB9_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -671,10 +671,10 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB11_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB11_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB11_4 @@ -722,8 +722,9 @@ ; RV64ZVE32F-NEXT: vnsrl.wi v8, v8, 0 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB12_3 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 +; RV64ZVE32F-NEXT: beqz a3, .LBB12_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB12_4 @@ -782,8 +783,9 @@ ; RV64ZVE32F-NEXT: vslideup.vi v8, v9, 1 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a0, v0 -; RV64ZVE32F-NEXT: andi a1, a0, 1 -; RV64ZVE32F-NEXT: bnez a1, .LBB13_3 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a1, v0 +; RV64ZVE32F-NEXT: beqz a1, .LBB13_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB13_4 @@ -824,10 +826,10 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a5, v0 ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi a5, a3, 1 -; RV64ZVE32F-NEXT: bnez a5, .LBB14_5 +; RV64ZVE32F-NEXT: beqz a5, .LBB14_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB14_6 @@ -884,9 +886,7 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; 
RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v9 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: li a3, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB15_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 @@ -962,8 +962,9 @@ ; RV64ZVE32F-NEXT: ld t0, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi t1, a3, 1 -; RV64ZVE32F-NEXT: bnez t1, .LBB17_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t1, v0 +; RV64ZVE32F-NEXT: beqz t1, .LBB17_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB17_10 @@ -1061,8 +1062,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB18_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB18_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1192,8 +1194,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB19_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB19_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1324,8 +1327,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB20_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB20_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -1464,10 +1468,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB21_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB21_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -1631,10 +1636,10 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB23_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB23_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB23_4 @@ -1686,8 +1691,9 @@ ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 ; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a0, v0 -; RV64ZVE32F-NEXT: andi a1, a0, 1 -; RV64ZVE32F-NEXT: bnez a1, .LBB24_3 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a1, v0 +; RV64ZVE32F-NEXT: beqz a1, .LBB24_3 ; RV64ZVE32F-NEXT: # 
%bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a0, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB24_4 @@ -1728,10 +1734,10 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a5, v0 ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi a5, a3, 1 -; RV64ZVE32F-NEXT: bnez a5, .LBB25_5 +; RV64ZVE32F-NEXT: beqz a5, .LBB25_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB25_6 @@ -1788,9 +1794,7 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v9 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: li a3, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB26_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 @@ -1866,8 +1870,9 @@ ; RV64ZVE32F-NEXT: ld t0, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi t1, a3, 1 -; RV64ZVE32F-NEXT: bnez t1, .LBB28_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t1, v0 +; RV64ZVE32F-NEXT: beqz t1, .LBB28_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB28_10 @@ -1964,8 +1969,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB29_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB29_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2094,8 +2100,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB30_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB30_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2225,8 +2232,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB31_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -2364,10 +2372,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB32_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB32_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2495,10 +2504,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz 
a2, .LBB33_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB33_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2628,11 +2638,12 @@ ; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: addiw a1, a1, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB34_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB34_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 @@ -2768,10 +2779,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2939,10 +2951,10 @@ ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: lw a2, 12(a0) ; RV32ZVE32F-NEXT: lw a1, 8(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a4, v0 ; RV32ZVE32F-NEXT: vmv.x.s a3, v0 -; RV32ZVE32F-NEXT: andi a4, a3, 1 -; RV32ZVE32F-NEXT: bnez a4, .LBB37_3 +; RV32ZVE32F-NEXT: beqz a4, .LBB37_3 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a3, a3, 2 ; RV32ZVE32F-NEXT: bnez a3, .LBB37_4 @@ -2967,10 +2979,10 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2i64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a5, v0 ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi a5, a4, 1 -; RV64ZVE32F-NEXT: bnez a5, .LBB37_3 +; RV64ZVE32F-NEXT: beqz a5, .LBB37_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a4, a4, 2 ; RV64ZVE32F-NEXT: bnez a4, .LBB37_4 @@ -3010,10 +3022,10 @@ ; RV32ZVE32F-NEXT: lw a4, 16(a0) ; RV32ZVE32F-NEXT: lw a7, 12(a0) ; RV32ZVE32F-NEXT: lw a6, 8(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vfirst.m t0, v0 ; RV32ZVE32F-NEXT: vmv.x.s a5, v0 -; RV32ZVE32F-NEXT: andi t0, a5, 1 -; RV32ZVE32F-NEXT: bnez t0, .LBB38_5 +; RV32ZVE32F-NEXT: beqz t0, .LBB38_5 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a5, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB38_6 @@ -3062,22 +3074,22 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a2, 24(a1) ; RV64ZVE32F-NEXT: ld a4, 16(a1) -; RV64ZVE32F-NEXT: ld a7, 8(a1) +; RV64ZVE32F-NEXT: ld a6, 8(a1) ; RV64ZVE32F-NEXT: ld a3, 24(a0) ; RV64ZVE32F-NEXT: ld a5, 16(a0) ; RV64ZVE32F-NEXT: ld t0, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi t1, a6, 1 -; RV64ZVE32F-NEXT: 
bnez t1, .LBB38_5 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t1, v0 +; RV64ZVE32F-NEXT: vmv.x.s a7, v0 +; RV64ZVE32F-NEXT: beqz t1, .LBB38_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a6, 2 +; RV64ZVE32F-NEXT: andi a0, a7, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB38_6 ; RV64ZVE32F-NEXT: .LBB38_2: # %else2 -; RV64ZVE32F-NEXT: andi a0, a6, 4 +; RV64ZVE32F-NEXT: andi a0, a7, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB38_7 ; RV64ZVE32F-NEXT: .LBB38_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a6, 8 +; RV64ZVE32F-NEXT: andi a0, a7, 8 ; RV64ZVE32F-NEXT: bnez a0, .LBB38_8 ; RV64ZVE32F-NEXT: .LBB38_4: # %else6 ; RV64ZVE32F-NEXT: ret @@ -3085,15 +3097,15 @@ ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: sd a0, 0(a1) -; RV64ZVE32F-NEXT: andi a0, a6, 2 +; RV64ZVE32F-NEXT: andi a0, a7, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB38_2 ; RV64ZVE32F-NEXT: .LBB38_6: # %cond.store1 -; RV64ZVE32F-NEXT: sd t0, 0(a7) -; RV64ZVE32F-NEXT: andi a0, a6, 4 +; RV64ZVE32F-NEXT: sd t0, 0(a6) +; RV64ZVE32F-NEXT: andi a0, a7, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB38_3 ; RV64ZVE32F-NEXT: .LBB38_7: # %cond.store3 ; RV64ZVE32F-NEXT: sd a5, 0(a4) -; RV64ZVE32F-NEXT: andi a0, a6, 8 +; RV64ZVE32F-NEXT: andi a0, a7, 8 ; RV64ZVE32F-NEXT: beqz a0, .LBB38_4 ; RV64ZVE32F-NEXT: .LBB38_8: # %cond.store5 ; RV64ZVE32F-NEXT: sd a3, 0(a2) @@ -3123,9 +3135,7 @@ ; RV32ZVE32F-NEXT: lw a4, 16(a0) ; RV32ZVE32F-NEXT: lw a7, 12(a0) ; RV32ZVE32F-NEXT: lw a6, 8(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmset.m v9 -; RV32ZVE32F-NEXT: vmv.x.s a5, v9 +; RV32ZVE32F-NEXT: li a5, 15 ; RV32ZVE32F-NEXT: beqz zero, .LBB39_5 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a5, 2 @@ -3177,37 +3187,35 @@ ; RV64ZVE32F-NEXT: ld a4, 16(a1) ; RV64ZVE32F-NEXT: ld a7, 8(a1) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a5, 16(a0) +; RV64ZVE32F-NEXT: ld a6, 16(a0) ; RV64ZVE32F-NEXT: ld t0, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v8 -; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: li a5, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB39_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else -; RV64ZVE32F-NEXT: andi a0, a6, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB39_6 ; RV64ZVE32F-NEXT: .LBB39_2: # %else2 -; RV64ZVE32F-NEXT: andi a0, a6, 4 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB39_7 ; RV64ZVE32F-NEXT: .LBB39_3: # %else4 -; RV64ZVE32F-NEXT: andi a0, a6, 8 -; RV64ZVE32F-NEXT: bnez a0, .LBB39_8 +; RV64ZVE32F-NEXT: andi a5, a5, 8 +; RV64ZVE32F-NEXT: bnez a5, .LBB39_8 ; RV64ZVE32F-NEXT: .LBB39_4: # %else6 ; RV64ZVE32F-NEXT: ret ; RV64ZVE32F-NEXT: .LBB39_5: # %cond.store ; RV64ZVE32F-NEXT: ld a1, 0(a1) ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: sd a0, 0(a1) -; RV64ZVE32F-NEXT: andi a0, a6, 2 +; RV64ZVE32F-NEXT: andi a0, a5, 2 ; RV64ZVE32F-NEXT: beqz a0, .LBB39_2 ; RV64ZVE32F-NEXT: .LBB39_6: # %cond.store1 ; RV64ZVE32F-NEXT: sd t0, 0(a7) -; RV64ZVE32F-NEXT: andi a0, a6, 4 +; RV64ZVE32F-NEXT: andi a0, a5, 4 ; RV64ZVE32F-NEXT: beqz a0, .LBB39_3 ; RV64ZVE32F-NEXT: .LBB39_7: # %cond.store3 -; RV64ZVE32F-NEXT: sd a5, 0(a4) -; RV64ZVE32F-NEXT: andi a0, a6, 8 -; RV64ZVE32F-NEXT: beqz a0, .LBB39_4 +; RV64ZVE32F-NEXT: sd a6, 0(a4) +; RV64ZVE32F-NEXT: andi a5, a5, 8 +; RV64ZVE32F-NEXT: beqz a5, .LBB39_4 ; RV64ZVE32F-NEXT: .LBB39_8: # %cond.store5 ; RV64ZVE32F-NEXT: sd a3, 0(a2) ; RV64ZVE32F-NEXT: ret @@ -3266,8 +3274,9 @@ ; RV32ZVE32F-NEXT: lw t6, 8(a0) ; 
RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a6, v0 -; RV32ZVE32F-NEXT: andi s1, a6, 1 -; RV32ZVE32F-NEXT: bnez s1, .LBB41_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m s1, v0 +; RV32ZVE32F-NEXT: beqz s1, .LBB41_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a6, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB41_11 @@ -3304,7 +3313,7 @@ ; RV32ZVE32F-NEXT: .LBB41_10: # %cond.store ; RV32ZVE32F-NEXT: lw s1, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) @@ -3386,8 +3395,9 @@ ; RV64ZVE32F-NEXT: ld s1, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a7, v0 -; RV64ZVE32F-NEXT: andi s2, a7, 1 -; RV64ZVE32F-NEXT: bnez s2, .LBB41_10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m s2, v0 +; RV64ZVE32F-NEXT: beqz s2, .LBB41_10 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a7, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB41_11 @@ -3500,8 +3510,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s1, a1, 1 -; RV32ZVE32F-NEXT: bnez s1, .LBB42_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m s1, v0 +; RV32ZVE32F-NEXT: beqz s1, .LBB42_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB42_11 @@ -3538,7 +3549,7 @@ ; RV32ZVE32F-NEXT: .LBB42_10: # %cond.store ; RV32ZVE32F-NEXT: lw s1, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) @@ -3605,8 +3616,9 @@ ; RV64ZVE32F-NEXT: ld t1, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi t2, a4, 1 -; RV64ZVE32F-NEXT: beqz t2, .LBB42_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t2, v0 +; RV64ZVE32F-NEXT: bnez t2, .LBB42_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -3746,8 +3758,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s1, a1, 1 -; RV32ZVE32F-NEXT: bnez s1, .LBB43_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m s1, v0 +; RV32ZVE32F-NEXT: beqz s1, .LBB43_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB43_11 @@ -3784,7 +3797,7 @@ ; RV32ZVE32F-NEXT: .LBB43_10: # %cond.store ; RV32ZVE32F-NEXT: lw s1, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) @@ -3851,8 +3864,9 @@ ; RV64ZVE32F-NEXT: ld t1, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi t2, a4, 1 -; RV64ZVE32F-NEXT: beqz t2, .LBB43_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t2, v0 +; RV64ZVE32F-NEXT: 
bnez t2, .LBB43_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -3993,8 +4007,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s1, a1, 1 -; RV32ZVE32F-NEXT: bnez s1, .LBB44_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m s1, v0 +; RV32ZVE32F-NEXT: beqz s1, .LBB44_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB44_11 @@ -4031,7 +4046,7 @@ ; RV32ZVE32F-NEXT: .LBB44_10: # %cond.store ; RV32ZVE32F-NEXT: lw s1, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) @@ -4098,8 +4113,9 @@ ; RV64ZVE32F-NEXT: ld t1, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi t2, a4, 1 -; RV64ZVE32F-NEXT: beqz t2, .LBB44_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t2, v0 +; RV64ZVE32F-NEXT: bnez t2, .LBB44_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 @@ -4248,8 +4264,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s1, a1, 1 -; RV32ZVE32F-NEXT: bnez s1, .LBB45_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m s1, v0 +; RV32ZVE32F-NEXT: beqz s1, .LBB45_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB45_11 @@ -4286,7 +4303,7 @@ ; RV32ZVE32F-NEXT: .LBB45_10: # %cond.store ; RV32ZVE32F-NEXT: lw s1, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) @@ -4353,11 +4370,12 @@ ; RV64ZVE32F-NEXT: ld t1, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi t2, a4, 1 -; RV64ZVE32F-NEXT: beqz t2, .LBB45_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t2, v0 +; RV64ZVE32F-NEXT: bnez t2, .LBB45_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 @@ -4495,8 +4513,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s1, a1, 1 -; RV32ZVE32F-NEXT: bnez s1, .LBB46_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m s1, v0 +; RV32ZVE32F-NEXT: beqz s1, .LBB46_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB46_11 @@ -4533,7 +4552,7 @@ ; RV32ZVE32F-NEXT: .LBB46_10: # %cond.store ; RV32ZVE32F-NEXT: lw s1, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw 
s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) @@ -4600,11 +4619,12 @@ ; RV64ZVE32F-NEXT: ld t1, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi t2, a4, 1 -; RV64ZVE32F-NEXT: beqz t2, .LBB46_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t2, v0 +; RV64ZVE32F-NEXT: bnez t2, .LBB46_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 @@ -4743,8 +4763,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s1, a1, 1 -; RV32ZVE32F-NEXT: bnez s1, .LBB47_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m s1, v0 +; RV32ZVE32F-NEXT: beqz s1, .LBB47_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB47_11 @@ -4781,7 +4802,7 @@ ; RV32ZVE32F-NEXT: .LBB47_10: # %cond.store ; RV32ZVE32F-NEXT: lw s1, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) @@ -4849,12 +4870,13 @@ ; RV64ZVE32F-NEXT: lui a4, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a5, v0 -; RV64ZVE32F-NEXT: andi t3, a5, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t3, v0 ; RV64ZVE32F-NEXT: addiw a4, a4, -1 -; RV64ZVE32F-NEXT: beqz t3, .LBB47_2 +; RV64ZVE32F-NEXT: bnez t3, .LBB47_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s t3, v8 ; RV64ZVE32F-NEXT: and t3, t3, a4 ; RV64ZVE32F-NEXT: slli t3, t3, 3 @@ -4999,8 +5021,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s1, a1, 1 -; RV32ZVE32F-NEXT: bnez s1, .LBB48_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m s1, v0 +; RV32ZVE32F-NEXT: beqz s1, .LBB48_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB48_11 @@ -5037,7 +5060,7 @@ ; RV32ZVE32F-NEXT: .LBB48_10: # %cond.store ; RV32ZVE32F-NEXT: lw s1, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) @@ -5104,11 +5127,12 @@ ; RV64ZVE32F-NEXT: ld t1, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi t2, a4, 1 -; RV64ZVE32F-NEXT: beqz t2, .LBB48_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t2, v0 +; RV64ZVE32F-NEXT: bnez t2, .LBB48_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, 
a1, t2 @@ -5242,8 +5266,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s1, a1, 1 -; RV32ZVE32F-NEXT: bnez s1, .LBB49_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m s1, v0 +; RV32ZVE32F-NEXT: beqz s1, .LBB49_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB49_11 @@ -5280,7 +5305,7 @@ ; RV32ZVE32F-NEXT: .LBB49_10: # %cond.store ; RV32ZVE32F-NEXT: lw s1, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) @@ -5347,11 +5372,12 @@ ; RV64ZVE32F-NEXT: ld t1, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi t2, a4, 1 -; RV64ZVE32F-NEXT: beqz t2, .LBB49_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t2, v0 +; RV64ZVE32F-NEXT: bnez t2, .LBB49_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 @@ -5486,8 +5512,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s1, a1, 1 -; RV32ZVE32F-NEXT: bnez s1, .LBB50_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m s1, v0 +; RV32ZVE32F-NEXT: beqz s1, .LBB50_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB50_11 @@ -5524,7 +5551,7 @@ ; RV32ZVE32F-NEXT: .LBB50_10: # %cond.store ; RV32ZVE32F-NEXT: lw s1, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw s1, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) @@ -5591,11 +5618,12 @@ ; RV64ZVE32F-NEXT: ld t1, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a4, v0 -; RV64ZVE32F-NEXT: andi t2, a4, 1 -; RV64ZVE32F-NEXT: beqz t2, .LBB50_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t2, v0 +; RV64ZVE32F-NEXT: bnez t2, .LBB50_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: ld a0, 0(a0) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 32 ; RV64ZVE32F-NEXT: srli t2, t2, 29 @@ -5766,8 +5794,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a2, a1, 1 -; RV32ZVE32F-NEXT: bnez a2, .LBB51_10 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a2, v0 +; RV32ZVE32F-NEXT: beqz a2, .LBB51_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a1, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB51_11 @@ -5810,7 +5839,7 @@ ; RV32ZVE32F-NEXT: .LBB51_10: # %cond.store ; RV32ZVE32F-NEXT: lw a2, 4(a0) ; RV32ZVE32F-NEXT: lw a0, 0(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, 
ta, ma ; RV32ZVE32F-NEXT: vmv.x.s s2, v8 ; RV32ZVE32F-NEXT: sw a2, 4(s2) ; RV32ZVE32F-NEXT: sw a0, 0(s2) @@ -5894,8 +5923,9 @@ ; RV64ZVE32F-NEXT: ld a5, 56(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a7, v0 -; RV64ZVE32F-NEXT: andi s3, a7, 1 -; RV64ZVE32F-NEXT: bnez s3, .LBB51_10 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m s3, v0 +; RV64ZVE32F-NEXT: beqz s3, .LBB51_10 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a7, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB51_11 @@ -6036,10 +6066,10 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2f16: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB53_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB53_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB53_4 @@ -6079,10 +6109,10 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a5, v0 ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi a5, a3, 1 -; RV64ZVE32F-NEXT: bnez a5, .LBB54_5 +; RV64ZVE32F-NEXT: beqz a5, .LBB54_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB54_6 @@ -6139,9 +6169,7 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v9 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: li a3, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB55_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 @@ -6217,8 +6245,9 @@ ; RV64ZVE32F-NEXT: ld t0, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi t1, a3, 1 -; RV64ZVE32F-NEXT: bnez t1, .LBB57_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t1, v0 +; RV64ZVE32F-NEXT: beqz t1, .LBB57_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB57_10 @@ -6316,8 +6345,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB58_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB58_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -6447,8 +6477,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB59_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB59_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -6579,8 +6610,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB60_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: 
vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -6719,10 +6751,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB61_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB61_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -6886,10 +6919,10 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2f32: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; RV64ZVE32F-NEXT: bnez a3, .LBB63_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB63_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB63_4 @@ -6929,10 +6962,10 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a5, v0 ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi a5, a3, 1 -; RV64ZVE32F-NEXT: bnez a5, .LBB64_5 +; RV64ZVE32F-NEXT: beqz a5, .LBB64_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB64_6 @@ -6989,9 +7022,7 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v9 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: li a3, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB65_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 @@ -7067,8 +7098,9 @@ ; RV64ZVE32F-NEXT: ld t0, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi t1, a3, 1 -; RV64ZVE32F-NEXT: bnez t1, .LBB67_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t1, v0 +; RV64ZVE32F-NEXT: beqz t1, .LBB67_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB67_10 @@ -7165,8 +7197,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB68_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB68_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -7295,8 +7328,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB69_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB69_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -7426,8 +7460,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, 
e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB70_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB70_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -7565,10 +7600,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB71_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB71_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -7696,10 +7732,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB72_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB72_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -7829,11 +7866,12 @@ ; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: addiw a1, a1, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB73_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB73_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 @@ -7969,10 +8007,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB74_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -8137,10 +8176,10 @@ ; ; RV32ZVE32F-LABEL: mscatter_v2f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB76_3 +; RV32ZVE32F-NEXT: beqz a1, .LBB76_3 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a0, a0, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB76_4 @@ -8161,10 +8200,10 @@ ; ; RV64ZVE32F-LABEL: mscatter_v2f64: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 -; 
RV64ZVE32F-NEXT: bnez a3, .LBB76_3 +; RV64ZVE32F-NEXT: beqz a3, .LBB76_3 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a2, a2, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB76_4 @@ -8198,10 +8237,10 @@ ; ; RV32ZVE32F-LABEL: mscatter_v4f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB77_5 +; RV32ZVE32F-NEXT: beqz a1, .LBB77_5 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB77_6 @@ -8245,10 +8284,10 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a5, v0 ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi a5, a3, 1 -; RV64ZVE32F-NEXT: bnez a5, .LBB77_5 +; RV64ZVE32F-NEXT: beqz a5, .LBB77_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB77_6 @@ -8295,9 +8334,7 @@ ; ; RV32ZVE32F-LABEL: mscatter_truemask_v4f64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmset.m v9 -; RV32ZVE32F-NEXT: vmv.x.s a0, v9 +; RV32ZVE32F-NEXT: li a0, 15 ; RV32ZVE32F-NEXT: beqz zero, .LBB78_5 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -8342,9 +8379,7 @@ ; RV64ZVE32F-NEXT: ld a1, 24(a0) ; RV64ZVE32F-NEXT: ld a2, 16(a0) ; RV64ZVE32F-NEXT: ld a4, 8(a0) -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmset.m v8 -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: li a3, 15 ; RV64ZVE32F-NEXT: beqz zero, .LBB78_5 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 @@ -8406,8 +8441,9 @@ ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB80_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: beqz a1, .LBB80_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB80_10 @@ -8432,7 +8468,7 @@ ; RV32ZVE32F-NEXT: .LBB80_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB80_9: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -8497,8 +8533,9 @@ ; RV64ZVE32F-NEXT: ld t0, 8(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi t1, a3, 1 -; RV64ZVE32F-NEXT: bnez t1, .LBB80_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t1, v0 +; RV64ZVE32F-NEXT: beqz t1, .LBB80_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a0, a3, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB80_10 @@ -8584,8 +8621,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB81_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: beqz a1, .LBB81_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 
2 ; RV32ZVE32F-NEXT: bnez a1, .LBB81_10 @@ -8610,7 +8648,7 @@ ; RV32ZVE32F-NEXT: .LBB81_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB81_9: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -8668,8 +8706,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB81_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB81_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 @@ -8786,8 +8825,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB82_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: beqz a1, .LBB82_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB82_10 @@ -8812,7 +8852,7 @@ ; RV32ZVE32F-NEXT: .LBB82_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB82_9: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -8870,8 +8910,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB82_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB82_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 @@ -8989,8 +9030,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB83_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: beqz a1, .LBB83_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB83_10 @@ -9015,7 +9057,7 @@ ; RV32ZVE32F-NEXT: .LBB83_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB83_9: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -9073,8 +9115,9 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB83_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -9200,8 +9243,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB84_9 +; RV32ZVE32F-NEXT: vsetivli zero, 
8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: beqz a1, .LBB84_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB84_10 @@ -9226,7 +9270,7 @@ ; RV32ZVE32F-NEXT: .LBB84_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB84_9: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -9284,10 +9328,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB84_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -9403,8 +9448,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB85_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: beqz a1, .LBB85_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB85_10 @@ -9429,7 +9475,7 @@ ; RV32ZVE32F-NEXT: .LBB85_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB85_9: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -9487,10 +9533,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB85_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB85_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -9607,8 +9654,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB86_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: beqz a1, .LBB86_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB86_10 @@ -9633,7 +9681,7 @@ ; RV32ZVE32F-NEXT: .LBB86_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB86_9: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -9692,11 +9740,12 @@ ; RV64ZVE32F-NEXT: lui a1, 16 ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v0 -; RV64ZVE32F-NEXT: andi a3, a2, 1 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; 
RV64ZVE32F-NEXT: vfirst.m a3, v0 ; RV64ZVE32F-NEXT: addiw a1, a1, -1 -; RV64ZVE32F-NEXT: beqz a3, .LBB86_2 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 3 @@ -9819,8 +9868,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB87_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: beqz a1, .LBB87_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB87_10 @@ -9845,7 +9895,7 @@ ; RV32ZVE32F-NEXT: .LBB87_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB87_9: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -9903,10 +9953,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB87_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB87_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -10018,8 +10069,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB88_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: beqz a1, .LBB88_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB88_10 @@ -10044,7 +10096,7 @@ ; RV32ZVE32F-NEXT: .LBB88_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB88_9: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -10102,10 +10154,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB88_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB88_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -10218,8 +10271,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB89_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: beqz a1, .LBB89_9 ; RV32ZVE32F-NEXT: # %bb.1: # 
%else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB89_10 @@ -10244,7 +10298,7 @@ ; RV32ZVE32F-NEXT: .LBB89_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB89_9: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -10302,10 +10356,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB89_2 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB89_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 32 ; RV64ZVE32F-NEXT: srli a2, a2, 29 @@ -10442,8 +10497,9 @@ ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a1, a0, 1 -; RV32ZVE32F-NEXT: bnez a1, .LBB90_9 +; RV32ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vfirst.m a1, v0 +; RV32ZVE32F-NEXT: beqz a1, .LBB90_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else ; RV32ZVE32F-NEXT: andi a1, a0, 2 ; RV32ZVE32F-NEXT: bnez a1, .LBB90_10 @@ -10468,7 +10524,7 @@ ; RV32ZVE32F-NEXT: .LBB90_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB90_9: # %cond.store -; RV32ZVE32F-NEXT: vsetivli zero, 0, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fsd fa0, 0(a1) ; RV32ZVE32F-NEXT: andi a1, a0, 2 @@ -10533,8 +10589,9 @@ ; RV64ZVE32F-NEXT: ld a2, 56(a1) ; RV64ZVE32F-NEXT: vsetivli zero, 0, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a3, v0 -; RV64ZVE32F-NEXT: andi t2, a3, 1 -; RV64ZVE32F-NEXT: bnez t2, .LBB90_9 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m t2, v0 +; RV64ZVE32F-NEXT: beqz t2, .LBB90_9 ; RV64ZVE32F-NEXT: # %bb.1: # %else ; RV64ZVE32F-NEXT: andi a1, a3, 2 ; RV64ZVE32F-NEXT: bnez a1, .LBB90_10 @@ -10634,10 +10691,10 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_2 +; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma @@ -10868,10 +10925,11 @@ ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 0, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a1, v0 -; RV64ZVE32F-NEXT: andi a2, a1, 1 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_2 +; RV64ZVE32F-NEXT: li a2, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vfirst.m a2, v0 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.store -; RV64ZVE32F-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll @@ -1500,9 +1500,12 @@ ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: mv a2, a0 +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a1 ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 ; RV64-NEXT: vid.v v9 -; RV64-NEXT: vmsltu.vx v9, v9, a1 +; RV64-NEXT: vmsltu.vv v9, v9, v10 ; RV64-NEXT: vmand.mm v0, v9, v0 ; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; RV64-NEXT: vmv.v.i v9, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll @@ -106,11 +106,21 @@ } define <8 x i1> @icmp_eq_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_eq_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_eq_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmseq.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_eq_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmseq.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"eq", <8 x i1> %m, i32 %evl) @@ -118,11 +128,21 @@ } define <8 x i1> @icmp_eq_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_eq_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmseq.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_eq_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmseq.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_eq_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmseq.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"eq", <8 x i1> %m, i32 %evl) @@ -164,11 +184,21 @@ } define <8 x i1> @icmp_ne_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_ne_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ne_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ne_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli 
zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"ne", <8 x i1> %m, i32 %evl) @@ -176,11 +206,21 @@ } define <8 x i1> @icmp_ne_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_ne_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsne.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ne_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsne.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ne_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsne.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"ne", <8 x i1> %m, i32 %evl) @@ -222,11 +262,21 @@ } define <8 x i1> @icmp_ugt_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_ugt_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ugt_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ugt_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsltu.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"ugt", <8 x i1> %m, i32 %evl) @@ -234,11 +284,21 @@ } define <8 x i1> @icmp_ugt_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_ugt_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ugt_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsltu.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ugt_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsltu.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"ugt", <8 x i1> %m, i32 %evl) @@ -280,13 +340,23 @@ } define <8 x i1> @icmp_uge_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_uge_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: 
vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsleu.vv v0, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_uge_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsleu.vv v0, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_uge_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsleu.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"uge", <8 x i1> %m, i32 %evl) @@ -294,11 +364,21 @@ } define <8 x i1> @icmp_uge_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_uge_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsleu.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_uge_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsleu.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_uge_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsleu.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"uge", <8 x i1> %m, i32 %evl) @@ -340,11 +420,21 @@ } define <8 x i1> @icmp_ult_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_ult_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ult_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsltu.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ult_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsltu.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"ult", <8 x i1> %m, i32 %evl) @@ -352,11 +442,21 @@ } define <8 x i1> @icmp_ult_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_ult_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgtu.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_ult_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsgtu.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_ult_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, 
ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsltu.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"ult", <8 x i1> %m, i32 %evl) @@ -398,11 +498,21 @@ } define <8 x i1> @icmp_sgt_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sgt_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sgt_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsgt.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_sgt_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmslt.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"sgt", <8 x i1> %m, i32 %evl) @@ -410,11 +520,21 @@ } define <8 x i1> @icmp_sgt_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sgt_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sgt_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmslt.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_sgt_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmslt.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"sgt", <8 x i1> %m, i32 %evl) @@ -456,13 +576,23 @@ } define <8 x i1> @icmp_sge_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sge_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sge_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsle.vv v0, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_sge_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsle.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x 
i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"sge", <8 x i1> %m, i32 %evl) @@ -470,11 +600,21 @@ } define <8 x i1> @icmp_sge_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sge_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sge_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsle.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_sge_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsle.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"sge", <8 x i1> %m, i32 %evl) @@ -516,11 +656,21 @@ } define <8 x i1> @icmp_slt_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_slt_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmslt.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_slt_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmslt.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_slt_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmslt.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"slt", <8 x i1> %m, i32 %evl) @@ -528,11 +678,21 @@ } define <8 x i1> @icmp_slt_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_slt_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsgt.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_slt_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsgt.vx v0, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_slt_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmslt.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"slt", <8 x i1> %m, i32 %evl) @@ -574,11 +734,21 @@ } define <8 x i1> @icmp_sle_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sle_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vx v0, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sle_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsle.vx v0, v8, a0, v0.t +; RV32-NEXT: 
ret +; +; RV64-LABEL: icmp_sle_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsle.vv v0, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %va, <8 x i8> %vb, metadata !"sle", <8 x i1> %m, i32 %evl) @@ -586,13 +756,23 @@ } define <8 x i1> @icmp_sle_vx_swap_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_sle_vx_swap_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmsle.vv v0, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: icmp_sle_vx_swap_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmsle.vv v0, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_sle_vx_swap_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmsle.vv v0, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> %vb, <8 x i8> %va, metadata !"sle", <8 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll @@ -97,10 +97,9 @@ ; CHECK-NEXT: or a3, a3, a4 ; CHECK-NEXT: or a1, a1, a3 ; CHECK-NEXT: andi a3, a2, 16 -; CHECK-NEXT: andi a2, a2, -32 +; CHECK-NEXT: andi a2, a2, 32 ; CHECK-NEXT: or a2, a3, a2 ; CHECK-NEXT: or a1, a1, a2 -; CHECK-NEXT: andi a1, a1, 63 ; CHECK-NEXT: sb a1, 0(a0) ; CHECK-NEXT: ret store <6 x i1> %v, ptr %p diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll @@ -24,17 +24,14 @@ define void @widen_3xv4i16(ptr %x, ptr %z) { ; CHECK-LABEL: widen_3xv4i16: ; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, a0, 16 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a2, a0, 8 -; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) ; CHECK-NEXT: vsetivli zero, 12, e16, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: vslideup.vi v10, v8, 8 +; CHECK-NEXT: vse16.v v10, (a1) ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b.gep = getelementptr i8, ptr %x, i64 8 @@ -51,9 +48,17 @@ define void @widen_4xv4i16(ptr %x, ptr %z) { ; CHECK-LABEL: widen_4xv4i16: ; CHECK: # %bb.0: +; CHECK-NEXT: 
addi a2, a0, 16 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: addi a2, a0, 24 +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vslideup.vi v8, v9, 4 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: vslideup.vi v10, v8, 8 +; CHECK-NEXT: vse16.v v10, (a1) ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b.gep = getelementptr i8, ptr %x, i64 8 @@ -91,9 +96,17 @@ ; ; RV64-MISALIGN-LABEL: widen_4xv4i16_unaligned: ; RV64-MISALIGN: # %bb.0: +; RV64-MISALIGN-NEXT: addi a2, a0, 16 +; RV64-MISALIGN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-MISALIGN-NEXT: vle16.v v8, (a2) +; RV64-MISALIGN-NEXT: addi a2, a0, 24 +; RV64-MISALIGN-NEXT: vle16.v v9, (a2) +; RV64-MISALIGN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-MISALIGN-NEXT: vle16.v v10, (a0) +; RV64-MISALIGN-NEXT: vslideup.vi v8, v9, 4 ; RV64-MISALIGN-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-MISALIGN-NEXT: vle16.v v8, (a0) -; RV64-MISALIGN-NEXT: vse16.v v8, (a1) +; RV64-MISALIGN-NEXT: vslideup.vi v10, v8, 8 +; RV64-MISALIGN-NEXT: vse16.v v10, (a1) ; RV64-MISALIGN-NEXT: ret %a = load <4 x i16>, ptr %x, align 1 %b.gep = getelementptr i8, ptr %x, i64 8 @@ -185,21 +198,19 @@ define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) { ; CHECK-LABEL: strided_constant_mismatch_4xv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a2, a0, 2 -; CHECK-NEXT: vle16.v v10, (a2) ; CHECK-NEXT: addi a2, a0, 6 -; CHECK-NEXT: vle16.v v12, (a2) -; CHECK-NEXT: addi a0, a0, 8 -; CHECK-NEXT: vle16.v v14, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vsetivli zero, 12, e16, m2, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: addi a2, a0, 8 +; CHECK-NEXT: vle16.v v9, (a2) +; CHECK-NEXT: li a2, 2 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vlse64.v v10, (a0), a2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 4 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v14, 12 -; CHECK-NEXT: vse16.v v8, (a1) +; CHECK-NEXT: vslideup.vi v10, v8, 8 +; CHECK-NEXT: vse16.v v10, (a1) ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b.gep = getelementptr i8, ptr %x, i64 2 @@ -233,9 +244,14 @@ define void @strided_runtime_4xv4i16(ptr %x, ptr %z, i64 %s) { ; CHECK-LABEL: strided_runtime_4xv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: add a3, a3, a2 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vlse64.v v8, (a0), a2 -; CHECK-NEXT: vse64.v v8, (a1) +; CHECK-NEXT: vlse64.v v10, (a3), a2 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 8 +; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b.gep = getelementptr i8, ptr %x, i64 %s @@ -255,58 +271,37 @@ define void @strided_runtime_mismatch_4xv4i16(ptr %x, ptr %z, i64 %s, i64 %t) { ; RV32-LABEL: strided_runtime_mismatch_4xv4i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV32-NEXT: vle16.v v8, (a0) -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: vle16.v v10, (a0) -; RV32-NEXT: add a0, a0, a4 -; RV32-NEXT: vle16.v v12, (a0) -; RV32-NEXT: add 
a0, a0, a2 -; RV32-NEXT: vle16.v v14, (a0) -; RV32-NEXT: vsetivli zero, 8, e16, m2, tu, ma -; RV32-NEXT: vslideup.vi v8, v10, 4 -; RV32-NEXT: vsetivli zero, 12, e16, m2, tu, ma -; RV32-NEXT: vslideup.vi v8, v12, 8 +; RV32-NEXT: add a3, a0, a2 +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v8, (a0), a2 +; RV32-NEXT: vlse64.v v10, (a3), a2 ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vslideup.vi v8, v14, 12 +; RV32-NEXT: vslideup.vi v8, v10, 8 ; RV32-NEXT: vse16.v v8, (a1) ; RV32-NEXT: ret ; ; RV64-LABEL: strided_runtime_mismatch_4xv4i16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64-NEXT: vle16.v v8, (a0) -; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: vle16.v v10, (a0) -; RV64-NEXT: add a0, a0, a3 -; RV64-NEXT: vle16.v v12, (a0) -; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: vle16.v v14, (a0) -; RV64-NEXT: vsetivli zero, 8, e16, m2, tu, ma -; RV64-NEXT: vslideup.vi v8, v10, 4 -; RV64-NEXT: vsetivli zero, 12, e16, m2, tu, ma -; RV64-NEXT: vslideup.vi v8, v12, 8 +; RV64-NEXT: add a4, a0, a2 +; RV64-NEXT: add a3, a4, a3 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vlse64.v v8, (a0), a2 +; RV64-NEXT: vlse64.v v10, (a3), a2 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vslideup.vi v8, v14, 12 +; RV64-NEXT: vslideup.vi v8, v10, 8 ; RV64-NEXT: vse16.v v8, (a1) ; RV64-NEXT: ret ; ; ZVE64F-LABEL: strided_runtime_mismatch_4xv4i16: ; ZVE64F: # %bb.0: -; ZVE64F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVE64F-NEXT: vle16.v v8, (a0) -; ZVE64F-NEXT: add a0, a0, a2 -; ZVE64F-NEXT: vle16.v v10, (a0) -; ZVE64F-NEXT: add a0, a0, a3 -; ZVE64F-NEXT: vle16.v v12, (a0) -; ZVE64F-NEXT: add a0, a0, a2 -; ZVE64F-NEXT: vle16.v v14, (a0) -; ZVE64F-NEXT: vsetivli zero, 8, e16, m2, tu, ma -; ZVE64F-NEXT: vslideup.vi v8, v10, 4 -; ZVE64F-NEXT: vsetivli zero, 12, e16, m2, tu, ma -; ZVE64F-NEXT: vslideup.vi v8, v12, 8 +; ZVE64F-NEXT: add a4, a0, a2 +; ZVE64F-NEXT: add a3, a4, a3 +; ZVE64F-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVE64F-NEXT: vlse64.v v8, (a0), a2 +; ZVE64F-NEXT: vlse64.v v10, (a3), a2 ; ZVE64F-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVE64F-NEXT: vslideup.vi v8, v14, 12 +; ZVE64F-NEXT: vslideup.vi v8, v10, 8 ; ZVE64F-NEXT: vse16.v v8, (a1) ; ZVE64F-NEXT: ret %a = load <4 x i16>, ptr %x @@ -326,9 +321,14 @@ define void @strided_runtime_4xv4f16(ptr %x, ptr %z, i64 %s) { ; CHECK-LABEL: strided_runtime_4xv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: add a3, a3, a2 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vlse64.v v8, (a0), a2 -; CHECK-NEXT: vse64.v v8, (a1) +; CHECK-NEXT: vlse64.v v10, (a3), a2 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 8 +; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret %a = load <4 x half>, ptr %x %b.gep = getelementptr i8, ptr %x, i64 %s @@ -347,9 +347,14 @@ define void @strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) { ; CHECK-LABEL: strided_runtime_4xv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: add a3, a3, a2 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vlse64.v v8, (a0), a2 -; CHECK-NEXT: vse64.v v8, (a1) +; CHECK-NEXT: vlse64.v v10, (a3), a2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vse32.v v8, (a1) ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b.gep = getelementptr i8, 
ptr %x, i64 %s diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll @@ -49,10 +49,10 @@ define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> %passthru) { ; RV32-LABEL: mgather_v2i16_align1: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, ma +; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV32-NEXT: vfirst.m a1, v0 ; RV32-NEXT: vmv.x.s a0, v0 -; RV32-NEXT: andi a1, a0, 1 -; RV32-NEXT: bnez a1, .LBB4_3 +; RV32-NEXT: beqz a1, .LBB4_3 ; RV32-NEXT: # %bb.1: # %else ; RV32-NEXT: andi a0, a0, 2 ; RV32-NEXT: bnez a0, .LBB4_4 @@ -86,10 +86,10 @@ ; ; RV64-LABEL: mgather_v2i16_align1: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, ma +; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV64-NEXT: vfirst.m a1, v0 ; RV64-NEXT: vmv.x.s a0, v0 -; RV64-NEXT: andi a1, a0, 1 -; RV64-NEXT: bnez a1, .LBB4_3 +; RV64-NEXT: beqz a1, .LBB4_3 ; RV64-NEXT: # %bb.1: # %else ; RV64-NEXT: andi a0, a0, 2 ; RV64-NEXT: bnez a0, .LBB4_4 @@ -129,10 +129,10 @@ define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %passthru) { ; RV32-LABEL: mgather_v2i64_align4: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, ma +; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV32-NEXT: vfirst.m a1, v0 ; RV32-NEXT: vmv.x.s a0, v0 -; RV32-NEXT: andi a1, a0, 1 -; RV32-NEXT: bnez a1, .LBB5_3 +; RV32-NEXT: beqz a1, .LBB5_3 ; RV32-NEXT: # %bb.1: # %else ; RV32-NEXT: andi a0, a0, 2 ; RV32-NEXT: bnez a0, .LBB5_4 @@ -165,10 +165,10 @@ ; ; RV64-LABEL: mgather_v2i64_align4: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, ma +; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV64-NEXT: vfirst.m a1, v0 ; RV64-NEXT: vmv.x.s a0, v0 -; RV64-NEXT: andi a1, a0, 1 -; RV64-NEXT: bnez a1, .LBB5_3 +; RV64-NEXT: beqz a1, .LBB5_3 ; RV64-NEXT: # %bb.1: # %else ; RV64-NEXT: andi a0, a0, 2 ; RV64-NEXT: bnez a0, .LBB5_4 @@ -210,8 +210,9 @@ ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, ma ; RV32-NEXT: vmv.x.s a0, v0 -; RV32-NEXT: andi a1, a0, 1 -; RV32-NEXT: bnez a1, .LBB6_5 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vfirst.m a1, v0 +; RV32-NEXT: beqz a1, .LBB6_5 ; RV32-NEXT: # %bb.1: # %else ; RV32-NEXT: andi a1, a0, 2 ; RV32-NEXT: bnez a1, .LBB6_6 @@ -224,7 +225,7 @@ ; RV32-NEXT: .LBB6_4: # %else6 ; RV32-NEXT: ret ; RV32-NEXT: .LBB6_5: # %cond.store -; RV32-NEXT: vsetivli zero, 0, e16, mf2, ta, ma +; RV32-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.x.s a2, v9 @@ -273,8 +274,9 @@ ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, ma ; RV64-NEXT: vmv.x.s a0, v0 -; RV64-NEXT: andi a1, a0, 1 -; RV64-NEXT: bnez a1, .LBB6_5 +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vfirst.m a1, v0 +; RV64-NEXT: beqz a1, .LBB6_5 ; RV64-NEXT: # %bb.1: # %else ; RV64-NEXT: andi a1, a0, 2 ; RV64-NEXT: bnez a1, .LBB6_6 @@ -287,7 +289,7 @@ ; RV64-NEXT: .LBB6_4: # %else6 ; RV64-NEXT: ret ; RV64-NEXT: .LBB6_5: # %cond.store -; RV64-NEXT: vsetivli zero, 0, e16, mf2, ta, ma +; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; RV64-NEXT: vmv.x.s a2, v10 @@ -340,10 +342,10 @@ define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x ptr> %ptrs, <2 x i1> %m) { ; RV32-LABEL: 
mscatter_v2i32_align2: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 0, e8, mf8, ta, ma +; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV32-NEXT: vfirst.m a1, v0 ; RV32-NEXT: vmv.x.s a0, v0 -; RV32-NEXT: andi a1, a0, 1 -; RV32-NEXT: bnez a1, .LBB7_3 +; RV32-NEXT: beqz a1, .LBB7_3 ; RV32-NEXT: # %bb.1: # %else ; RV32-NEXT: andi a0, a0, 2 ; RV32-NEXT: bnez a0, .LBB7_4 @@ -371,10 +373,10 @@ ; ; RV64-LABEL: mscatter_v2i32_align2: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 0, e8, mf8, ta, ma +; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV64-NEXT: vfirst.m a1, v0 ; RV64-NEXT: vmv.x.s a0, v0 -; RV64-NEXT: andi a1, a0, 1 -; RV64-NEXT: bnez a1, .LBB7_3 +; RV64-NEXT: beqz a1, .LBB7_3 ; RV64-NEXT: # %bb.1: # %else ; RV64-NEXT: andi a0, a0, 2 ; RV64-NEXT: bnez a0, .LBB7_4 @@ -414,9 +416,10 @@ ; RV32-NEXT: vmseq.vi v8, v8, 0 ; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; RV32-NEXT: vmv.x.s a2, v8 -; RV32-NEXT: andi a3, a2, 1 +; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV32-NEXT: vfirst.m a3, v8 ; RV32-NEXT: # implicit-def: $v8 -; RV32-NEXT: beqz a3, .LBB8_2 +; RV32-NEXT: bnez a3, .LBB8_2 ; RV32-NEXT: # %bb.1: # %cond.load ; RV32-NEXT: lbu a3, 1(a0) ; RV32-NEXT: lbu a4, 0(a0) @@ -428,7 +431,7 @@ ; RV32-NEXT: slli a6, a6, 24 ; RV32-NEXT: or a4, a6, a5 ; RV32-NEXT: or a3, a4, a3 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: .LBB8_2: # %else ; RV32-NEXT: andi a2, a2, 2 @@ -444,11 +447,11 @@ ; RV32-NEXT: slli a0, a0, 24 ; RV32-NEXT: or a0, a0, a4 ; RV32-NEXT: or a0, a0, a2 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vslideup.vi v8, v9, 1 ; RV32-NEXT: .LBB8_4: # %else2 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: ret ; @@ -458,9 +461,10 @@ ; RV64-NEXT: vmseq.vi v8, v8, 0 ; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; RV64-NEXT: vmv.x.s a2, v8 -; RV64-NEXT: andi a3, a2, 1 +; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; RV64-NEXT: vfirst.m a3, v8 ; RV64-NEXT: # implicit-def: $v8 -; RV64-NEXT: beqz a3, .LBB8_2 +; RV64-NEXT: bnez a3, .LBB8_2 ; RV64-NEXT: # %bb.1: # %cond.load ; RV64-NEXT: lbu a3, 1(a0) ; RV64-NEXT: lbu a4, 0(a0) @@ -472,7 +476,7 @@ ; RV64-NEXT: slli a6, a6, 24 ; RV64-NEXT: or a4, a6, a5 ; RV64-NEXT: or a3, a4, a3 -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vmv.v.x v8, a3 ; RV64-NEXT: .LBB8_2: # %else ; RV64-NEXT: andi a2, a2, 2 @@ -488,11 +492,11 @@ ; RV64-NEXT: slli a0, a0, 24 ; RV64-NEXT: or a0, a0, a4 ; RV64-NEXT: or a0, a0, a2 -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vslideup.vi v8, v9, 1 ; RV64-NEXT: .LBB8_4: # %else2 -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vse32.v v8, (a1) ; RV64-NEXT: ret %mask = icmp eq <2 x i32> %m, zeroinitializer @@ -510,8 +514,9 @@ ; CHECK-NEXT: vmseq.vi v9, v9, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; CHECK-NEXT: vmv.x.s a1, v9 -; CHECK-NEXT: andi a2, a1, 1 -; CHECK-NEXT: bnez a2, .LBB9_3 +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vfirst.m a2, v9 +; CHECK-NEXT: beqz a2, .LBB9_3 ; CHECK-NEXT: # %bb.1: # %else ; CHECK-NEXT: andi a1, a1, 2 ; CHECK-NEXT: bnez a1, .LBB9_4 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vadd-vp.ll @@ -117,11 +117,21 @@ } define <4 x i8> @vadd_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -129,11 +139,21 @@ } define <4 x i8> @vadd_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v4i8_commute: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v4i8_commute: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vadd.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v4i8_commute: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.add.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) @@ -141,11 +161,21 @@ } define <4 x i8> @vadd_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vadd.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -281,11 +311,21 @@ } define <8 x i8> @vadd_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vadd.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: 
vadd_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.add.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -293,11 +333,21 @@ } define <8 x i8> @vadd_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -522,11 +572,21 @@ } define <2 x i16> @vadd_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.add.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -534,11 +594,21 @@ } define <2 x i16> @vadd_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vadd.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -598,11 +668,21 @@ } define <4 x i16> @vadd_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v4i16: -; CHECK: # %bb.0: 
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vadd.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.add.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -610,11 +690,21 @@ } define <4 x i16> @vadd_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -826,11 +916,21 @@ } define <2 x i32> @vadd_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vadd.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.add.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -838,11 +938,21 @@ } define <2 x i32> @vadd_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vadd_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vadd_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vadd_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 
0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vand-vp.ll @@ -143,11 +143,21 @@ } define <4 x i8> @vand_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.and.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -155,11 +165,21 @@ } define <4 x i8> @vand_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -219,11 +239,21 @@ } define <8 x i8> @vand_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vand.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.and.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -231,11 +261,21 @@ } define <8 x i8> @vand_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v8i8_unmasked: +; RV32: # 
%bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -371,11 +411,21 @@ } define <2 x i16> @vand_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.and.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -383,11 +433,21 @@ } define <2 x i16> @vand_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -447,11 +507,21 @@ } define <4 x i16> @vand_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vand.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.and.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -459,11 +529,21 @@ 
} define <4 x i16> @vand_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -675,11 +755,21 @@ } define <2 x i32> @vand_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vand.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.and.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -687,11 +777,21 @@ } define <2 x i32> @vand_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vand_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vand_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vand_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdiv-vp.ll @@ -96,11 +96,21 @@ } define <4 x i8> @vdiv_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: 
vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vdiv.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vdiv.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.sdiv.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -108,11 +118,21 @@ } define <4 x i8> @vdiv_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vdiv.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vdiv.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -158,11 +178,21 @@ } define <8 x i8> @vdiv_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vdiv.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vdiv.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.sdiv.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -170,11 +200,21 @@ } define <8 x i8> @vdiv_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vdiv.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vdiv.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -258,11 +298,21 @@ } define <2 x i16> @vdiv_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma 
-; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vdiv.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vdiv.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.sdiv.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -270,11 +320,21 @@ } define <2 x i16> @vdiv_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vdiv.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vdiv.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -308,11 +368,21 @@ } define <4 x i16> @vdiv_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vdiv.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vdiv.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.sdiv.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -320,11 +390,21 @@ } define <4 x i16> @vdiv_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vdiv.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vdiv.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 
x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -458,11 +538,21 @@ } define <2 x i32> @vdiv_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vdiv.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vdiv.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.sdiv.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -470,11 +560,21 @@ } define <2 x i32> @vdiv_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdiv_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vdiv.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdiv_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vdiv.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdiv_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vdiv.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vdivu-vp.ll @@ -95,11 +95,21 @@ } define <4 x i8> @vdivu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vdivu.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vdivu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.udiv.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -107,11 +117,21 @@ } define <4 x i8> @vdivu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v4i8_unmasked: +; RV32: # 
%bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vdivu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vdivu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -157,11 +177,21 @@ } define <8 x i8> @vdivu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vdivu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vdivu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.udiv.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -169,11 +199,21 @@ } define <8 x i8> @vdivu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vdivu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vdivu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -257,11 +297,21 @@ } define <2 x i16> @vdivu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vdivu.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vdivu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.udiv.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -269,11 +319,21 @@ } define 
<2 x i16> @vdivu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vdivu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vdivu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -307,11 +367,21 @@ } define <4 x i16> @vdivu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vdivu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vdivu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.udiv.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -319,11 +389,21 @@ } define <4 x i16> @vdivu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vdivu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vdivu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -457,11 +537,21 @@ } define <2 x i32> @vdivu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vdivu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, 
mf2, ta, ma +; RV64-NEXT: vdivu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.udiv.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -469,11 +559,21 @@ } define <2 x i32> @vdivu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vdivu_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vdivu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vdivu_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vdivu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vdivu_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vdivu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll @@ -141,12 +141,23 @@ } define <4 x i8> @vmacc_vx_nxv4i8(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; RV32-NEXT: vmacc.vv v9, v8, v11, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; RV64-NEXT: vmacc.vx v9, a0, v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %splat = insertelement <4 x i1> poison, i1 -1, i32 0 @@ -158,12 +169,23 @@ } define <4 x i8> @vmacc_vx_nxv4i8_unmasked(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; RV32-NEXT: vmacc.vv v9, v8, v11 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; RV64-NEXT: vmacc.vx v9, a0, v8 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> 
zeroinitializer %splat = insertelement <4 x i1> poison, i1 -1, i32 0 @@ -190,12 +212,23 @@ } define <4 x i8> @vmacc_vx_nxv4i8_ta(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv4i8_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv4i8_ta: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vmacc.vv v9, v8, v11, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv4i8_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vmacc.vx v9, a0, v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %splat = insertelement <4 x i1> poison, i1 -1, i32 0 @@ -242,12 +275,23 @@ } define <8 x i8> @vmacc_vx_nxv8i8(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; RV32-NEXT: vmacc.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; RV64-NEXT: vmacc.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %splat = insertelement <8 x i1> poison, i1 -1, i32 0 @@ -259,12 +303,23 @@ } define <8 x i8> @vmacc_vx_nxv8i8_unmasked(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV32-NEXT: vmacc.vx v9, a0, v8 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV64-NEXT: vmacc.vv v9, v8, v11 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %splat = insertelement <8 x i1> poison, i1 -1, i32 0 @@ -291,12 +346,23 @@ } define <8 x i8> @vmacc_vx_nxv8i8_ta(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv8i8_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; 
RV32-LABEL: vmacc_vx_nxv8i8_ta: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vmacc.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv8i8_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vmacc.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %splat = insertelement <8 x i1> poison, i1 -1, i32 0 @@ -646,12 +712,23 @@ } define <2 x i16> @vmacc_vx_nxv2i16(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV32-NEXT: vmacc.vv v9, v8, v11, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV64-NEXT: vmacc.vx v9, a0, v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 @@ -663,12 +740,23 @@ } define <2 x i16> @vmacc_vx_nxv2i16_unmasked(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, ma +; RV32-NEXT: vmacc.vv v9, v8, v11 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, ma +; RV64-NEXT: vmacc.vx v9, a0, v8 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 @@ -695,12 +783,23 @@ } define <2 x i16> @vmacc_vx_nxv2i16_ta(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv2i16_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv2i16_ta: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vmacc.vv v9, v8, v11, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: 
ret +; +; RV64-LABEL: vmacc_vx_nxv2i16_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vmacc.vx v9, a0, v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 @@ -747,12 +846,23 @@ } define <4 x i16> @vmacc_vx_nxv4i16(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV32-NEXT: vmacc.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV64-NEXT: vmacc.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %splat = insertelement <4 x i1> poison, i1 -1, i32 0 @@ -764,12 +874,23 @@ } define <4 x i16> @vmacc_vx_nxv4i16_unmasked(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, ma +; RV32-NEXT: vmacc.vx v9, a0, v8 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, ma +; RV64-NEXT: vmacc.vv v9, v8, v11 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %splat = insertelement <4 x i1> poison, i1 -1, i32 0 @@ -796,12 +917,23 @@ } define <4 x i16> @vmacc_vx_nxv4i16_ta(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv4i16_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv4i16_ta: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vmacc.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv4i16_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vmacc.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer 
%splat = insertelement <4 x i1> poison, i1 -1, i32 0 @@ -1151,12 +1283,23 @@ } define <2 x i32> @vmacc_vx_nxv2i32(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV32-NEXT: vmacc.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV64-NEXT: vmacc.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 @@ -1168,12 +1311,23 @@ } define <2 x i32> @vmacc_vx_nxv2i32_unmasked(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, ma +; RV32-NEXT: vmacc.vx v9, a0, v8 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, ma +; RV64-NEXT: vmacc.vv v9, v8, v11 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 @@ -1200,12 +1354,23 @@ } define <2 x i32> @vmacc_vx_nxv2i32_ta(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmacc_vx_nxv2i32_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vmacc_vx_nxv2i32_ta: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vmacc.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vmacc_vx_nxv2i32_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vmacc.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmax-vp.ll @@ -96,11 +96,21 @@ } define <4 x i8> 
@vmax_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmax_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmax_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vmax.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmax_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vmax.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.smax.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -108,11 +118,21 @@ } define <4 x i8> @vmax_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmax_vx_v4i8_commute: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmax_vx_v4i8_commute: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vmax.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmax_vx_v4i8_commute: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vmax.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.smax.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) @@ -120,11 +140,21 @@ } define <4 x i8> @vmax_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmax_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmax_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vmax.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vmax_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vmax.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -208,11 +238,21 @@ } define <8 x i8> @vmax_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmax_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmax_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmax.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmax_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmax.vv 
v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.smax.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -220,11 +260,21 @@ } define <8 x i8> @vmax_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmax_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmax_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmax.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmax_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmax.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -397,11 +447,21 @@ } define <2 x i16> @vmax_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmax_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmax_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vmax.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmax_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vmax.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.smax.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -409,11 +469,21 @@ } define <2 x i16> @vmax_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmax_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmax_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vmax.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vmax_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vmax.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -447,11 +517,21 @@ } define <4 x i16> @vmax_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmax_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmax_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vmax.vx v8, v8, a0, v0.t +; RV32-NEXT: ret 
+; +; RV64-LABEL: vmax_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vmax.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.smax.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -459,11 +539,21 @@ } define <4 x i16> @vmax_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmax_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmax_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vmax.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmax_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vmax.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -597,11 +687,21 @@ } define <2 x i32> @vmax_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmax_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmax_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmax.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmax_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmax.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.smax.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -609,11 +709,21 @@ } define <2 x i32> @vmax_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmax_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmax.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmax_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmax.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmax_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmax.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmaxu-vp.ll @@ -95,11 +95,21 @@ } define <4 x i8> @vmaxu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmaxu_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmaxu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmaxu_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vmaxu.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmaxu_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vmaxu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.umax.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -107,11 +117,21 @@ } define <4 x i8> @vmaxu_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmaxu_vx_v4i8_commute: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmaxu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmaxu_vx_v4i8_commute: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vmaxu.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmaxu_vx_v4i8_commute: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vmaxu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.umax.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) @@ -119,11 +139,21 @@ } define <4 x i8> @vmaxu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmaxu_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmaxu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmaxu_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vmaxu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vmaxu_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vmaxu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -207,11 +237,21 @@ } define <8 x i8> @vmaxu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmaxu_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmaxu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmaxu_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmaxu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmaxu_vx_v8i8: +; RV64: # 
%bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmaxu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.umax.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -219,11 +259,21 @@ } define <8 x i8> @vmaxu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmaxu_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmaxu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmaxu_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmaxu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmaxu_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmaxu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -396,11 +446,21 @@ } define <2 x i16> @vmaxu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmaxu_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vmaxu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmaxu_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vmaxu.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmaxu_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vmaxu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.umax.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -408,11 +468,21 @@ } define <2 x i16> @vmaxu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmaxu_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vmaxu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmaxu_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vmaxu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vmaxu_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vmaxu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -446,11 +516,21 @@ } define <4 x i16> @vmaxu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmaxu_vx_v4i16: -; CHECK: # %bb.0: 
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vmaxu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmaxu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vmaxu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmaxu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vmaxu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.umax.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -458,11 +538,21 @@ } define <4 x i16> @vmaxu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmaxu_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vmaxu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmaxu_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vmaxu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmaxu_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vmaxu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -596,11 +686,21 @@ } define <2 x i32> @vmaxu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmaxu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmaxu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmaxu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmaxu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmaxu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmaxu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.umax.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -608,11 +708,21 @@ } define <2 x i32> @vmaxu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmaxu_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmaxu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmaxu_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmaxu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmaxu_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmaxu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement 
<2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmin-vp.ll @@ -96,11 +96,21 @@ } define <4 x i8> @vmin_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmin_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmin_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vmin.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmin_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vmin.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.smin.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -108,11 +118,21 @@ } define <4 x i8> @vmin_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmin_vx_v4i8_commute: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmin_vx_v4i8_commute: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vmin.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmin_vx_v4i8_commute: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vmin.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.smin.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) @@ -120,11 +140,21 @@ } define <4 x i8> @vmin_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmin_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmin_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vmin.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vmin_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vmin.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -208,11 +238,21 @@ } define <8 x i8> @vmin_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmin_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t -; 
CHECK-NEXT: ret +; RV32-LABEL: vmin_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmin.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmin_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmin.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.smin.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -220,11 +260,21 @@ } define <8 x i8> @vmin_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmin_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmin_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmin.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmin_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmin.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -397,11 +447,21 @@ } define <2 x i16> @vmin_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmin_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmin_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vmin.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmin_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vmin.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.smin.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -409,11 +469,21 @@ } define <2 x i16> @vmin_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmin_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmin_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vmin.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vmin_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vmin.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x 
i1> poison, i1 true, i32 0 @@ -447,11 +517,21 @@ } define <4 x i16> @vmin_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmin_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmin_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vmin.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmin_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vmin.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.smin.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -459,11 +539,21 @@ } define <4 x i16> @vmin_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmin_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmin_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vmin.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmin_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vmin.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -597,11 +687,21 @@ } define <2 x i32> @vmin_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmin_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmin_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmin.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmin_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmin.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.smin.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -609,11 +709,21 @@ } define <2 x i32> @vmin_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmin_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmin.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmin_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmin.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmin_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; 
RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmin.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vminu-vp.ll @@ -95,11 +95,21 @@ } define <4 x i8> @vminu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vminu_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vminu_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vminu.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vminu_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vminu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.umin.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -107,11 +117,21 @@ } define <4 x i8> @vminu_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vminu_vx_v4i8_commute: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vminu_vx_v4i8_commute: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vminu.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vminu_vx_v4i8_commute: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vminu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.umin.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) @@ -119,11 +139,21 @@ } define <4 x i8> @vminu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vminu_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vminu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vminu_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vminu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vminu_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vminu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -207,11 +237,21 @@ } define <8 x i8> @vminu_vx_v8i8(<8 x i8> 
%va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vminu_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vminu_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vminu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vminu_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vminu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.umin.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -219,11 +259,21 @@ } define <8 x i8> @vminu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vminu_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vminu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vminu_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vminu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vminu_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vminu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -396,11 +446,21 @@ } define <2 x i16> @vminu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vminu_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vminu_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vminu.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vminu_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vminu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.umin.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -408,11 +468,21 @@ } define <2 x i16> @vminu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vminu_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vminu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vminu_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vminu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vminu_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; 
RV64-NEXT: vminu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -446,11 +516,21 @@ } define <4 x i16> @vminu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vminu_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vminu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vminu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vminu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vminu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.umin.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -458,11 +538,21 @@ } define <4 x i16> @vminu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vminu_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vminu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vminu_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vminu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vminu_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vminu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -596,11 +686,21 @@ } define <2 x i32> @vminu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vminu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vminu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vminu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vminu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vminu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.umin.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -608,11 +708,21 @@ } define <2 x i32> @vminu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vminu_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vminu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vminu_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, 
ta, ma +; RV32-NEXT: vminu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vminu_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vminu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll @@ -91,11 +91,21 @@ } define <4 x i8> @vmul_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.mul.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -103,11 +113,21 @@ } define <4 x i8> @vmul_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -141,11 +161,21 @@ } define <8 x i8> @vmul_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmul.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmul.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x 
i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.mul.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -153,11 +183,21 @@ } define <8 x i8> @vmul_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vmul.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -241,11 +281,21 @@ } define <2 x i16> @vmul_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.mul.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -253,11 +303,21 @@ } define <2 x i16> @vmul_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -291,11 +351,21 @@ } define <4 x i16> @vmul_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vmul.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; 
RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vmul.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.mul.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -303,11 +373,21 @@ } define <4 x i16> @vmul_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vmul.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -503,11 +583,21 @@ } define <2 x i32> @vmul_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmul.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmul.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.mul.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -515,11 +605,21 @@ } define <2 x i32> @vmul_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vmul_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vmul.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vmul_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vmul_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vmul.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll @@ 
-141,12 +141,23 @@ } define <4 x i8> @vnmsac_vx_nxv4i8(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; RV32-NEXT: vnmsac.vv v9, v8, v11, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; RV64-NEXT: vnmsac.vx v9, a0, v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %splat = insertelement <4 x i1> poison, i1 -1, i32 0 @@ -158,12 +169,23 @@ } define <4 x i8> @vnmsac_vx_nxv4i8_unmasked(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; RV32-NEXT: vnmsac.vv v9, v8, v11 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; RV64-NEXT: vnmsac.vx v9, a0, v8 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %splat = insertelement <4 x i1> poison, i1 -1, i32 0 @@ -190,12 +212,23 @@ } define <4 x i8> @vnmsac_vx_nxv4i8_ta(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv4i8_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv4i8_ta: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV32-NEXT: vnmsac.vv v9, v8, v11, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv4i8_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; RV64-NEXT: vnmsac.vx v9, a0, v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %splat = insertelement <4 x i1> poison, i1 -1, i32 0 @@ -242,12 +275,23 @@ } define <8 x i8> @vnmsac_vx_nxv8i8(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv8i8: +; RV32: # %bb.0: +; 
RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; RV32-NEXT: vnmsac.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; RV64-NEXT: vnmsac.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %splat = insertelement <8 x i1> poison, i1 -1, i32 0 @@ -259,12 +303,23 @@ } define <8 x i8> @vnmsac_vx_nxv8i8_unmasked(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV32-NEXT: vnmsac.vx v9, a0, v8 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV64-NEXT: vnmsac.vv v9, v8, v11 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %splat = insertelement <8 x i1> poison, i1 -1, i32 0 @@ -291,12 +346,23 @@ } define <8 x i8> @vnmsac_vx_nxv8i8_ta(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv8i8_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv8i8_ta: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV32-NEXT: vnmsac.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv8i8_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; RV64-NEXT: vnmsac.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %splat = insertelement <8 x i1> poison, i1 -1, i32 0 @@ -646,12 +712,23 @@ } define <2 x i16> @vnmsac_vx_nxv2i16(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV32-NEXT: vnmsac.vv v9, v8, v11, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv2i16: +; RV64: # %bb.0: +; 
RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV64-NEXT: vnmsac.vx v9, a0, v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 @@ -663,12 +740,23 @@ } define <2 x i16> @vnmsac_vx_nxv2i16_unmasked(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, ma +; RV32-NEXT: vnmsac.vv v9, v8, v11 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, ma +; RV64-NEXT: vnmsac.vx v9, a0, v8 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 @@ -695,12 +783,23 @@ } define <2 x i16> @vnmsac_vx_nxv2i16_ta(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv2i16_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv2i16_ta: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v11, v10, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV32-NEXT: vnmsac.vv v9, v8, v11, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv2i16_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; RV64-NEXT: vnmsac.vx v9, a0, v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 @@ -747,12 +846,23 @@ } define <4 x i16> @vnmsac_vx_nxv4i16(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV32-NEXT: vnmsac.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV64-NEXT: vnmsac.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %splat = insertelement <4 x i1> poison, i1 
-1, i32 0 @@ -764,12 +874,23 @@ } define <4 x i16> @vnmsac_vx_nxv4i16_unmasked(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, ma +; RV32-NEXT: vnmsac.vx v9, a0, v8 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, ma +; RV64-NEXT: vnmsac.vv v9, v8, v11 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %splat = insertelement <4 x i1> poison, i1 -1, i32 0 @@ -796,12 +917,23 @@ } define <4 x i16> @vnmsac_vx_nxv4i16_ta(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv4i16_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv4i16_ta: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV32-NEXT: vnmsac.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv4i16_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; RV64-NEXT: vnmsac.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %splat = insertelement <4 x i1> poison, i1 -1, i32 0 @@ -1151,12 +1283,23 @@ } define <2 x i32> @vnmsac_vx_nxv2i32(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV32-NEXT: vnmsac.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV64-NEXT: vnmsac.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 @@ -1168,12 +1311,23 @@ } define <2 x i32> @vnmsac_vx_nxv2i32_unmasked(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 -; CHECK-NEXT: vmv1r.v 
v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, ma +; RV32-NEXT: vnmsac.vx v9, a0, v8 +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, ma +; RV64-NEXT: vnmsac.vv v9, v8, v11 +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 @@ -1200,12 +1354,23 @@ } define <2 x i32> @vnmsac_vx_nxv2i32_ta(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vnmsac_vx_nxv2i32_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: vnmsac_vx_nxv2i32_ta: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV32-NEXT: vnmsac.vx v9, a0, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vnmsac_vx_nxv2i32_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v11, v10, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; RV64-NEXT: vnmsac.vv v9, v8, v11, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %splat = insertelement <2 x i1> poison, i1 -1, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vor-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vor-vp.ll @@ -117,11 +117,21 @@ } define <4 x i8> @vor_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vor.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.or.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -129,11 +139,21 @@ } define <4 x i8> @vor_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v4i8_commute: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v4i8_commute: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: 
vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v4i8_commute: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vor.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.or.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) @@ -141,11 +161,21 @@ } define <4 x i8> @vor_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vor.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -281,11 +311,21 @@ } define <8 x i8> @vor_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.or.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -293,11 +333,21 @@ } define <8 x i8> @vor_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -433,11 +483,21 @@ } define <2 x i16> @vor_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret 
+; RV32-LABEL: vor_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vor.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.or.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -445,11 +505,21 @@ } define <2 x i16> @vor_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vor.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -509,11 +579,21 @@ } define <4 x i16> @vor_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.or.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -521,11 +601,21 @@ } define <4 x i16> @vor_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 
true, i32 0 @@ -737,11 +827,21 @@ } define <2 x i32> @vor_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.or.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -749,11 +849,21 @@ } define <2 x i32> @vor_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vor_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vor_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vor_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll @@ -83,11 +83,21 @@ } define <4 x i8> @vpmerge_vx_v4i8(i8 %a, <4 x i8> %vb, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vpmerge_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, ma -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vpmerge_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %a, i32 0 %va = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.merge.v4i8(<4 x i1> %m, <4 x i8> %va, <4 x i8> %vb, i32 %evl) @@ -194,11 +204,21 @@ } define <8 x i8> @vpmerge_vx_v8i8(i8 %a, <8 x i8> %vb, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vpmerge_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vpmerge_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV32-NEXT: vmerge.vxm v8, v8, a0, v0 
+; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %a, i32 0 %va = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.merge.v8i8(<8 x i1> %m, <8 x i8> %va, <8 x i8> %vb, i32 %evl) @@ -268,11 +288,21 @@ } define <2 x i16> @vpmerge_vx_v2i16(i16 %a, <2 x i16> %vb, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vpmerge_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, ma -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vpmerge_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, ma +; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %a, i32 0 %va = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.merge.v2i16(<2 x i1> %m, <2 x i16> %va, <2 x i16> %vb, i32 %evl) @@ -305,11 +335,21 @@ } define <4 x i16> @vpmerge_vx_v4i16(i16 %a, <4 x i16> %vb, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vpmerge_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, ma -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vpmerge_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, ma +; RV32-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, ma +; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %a, i32 0 %va = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.merge.v4i16(<4 x i1> %m, <4 x i16> %va, <4 x i16> %vb, i32 %evl) @@ -416,11 +456,21 @@ } define <2 x i32> @vpmerge_vx_v2i32(i32 %a, <2 x i32> %vb, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vpmerge_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, ma -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vpmerge_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, ma +; RV32-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, ma +; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %a, i32 0 %va = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.merge.v2i32(<2 x i1> %m, <2 x i32> %va, <2 x i32> %vb, 
i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrem-vp.ll @@ -96,11 +96,21 @@ } define <4 x i8> @vrem_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vrem.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vrem.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.srem.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -108,11 +118,21 @@ } define <4 x i8> @vrem_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vrem.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vrem.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -158,11 +178,21 @@ } define <8 x i8> @vrem_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vrem.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vrem.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.srem.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -170,11 +200,21 @@ } define <8 x i8> @vrem_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vrem.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: 
vrem_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vrem.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -258,11 +298,21 @@ } define <2 x i16> @vrem_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vrem.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vrem.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.srem.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -270,11 +320,21 @@ } define <2 x i16> @vrem_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vrem.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vrem.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -308,11 +368,21 @@ } define <4 x i16> @vrem_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vrem.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vrem.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.srem.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -320,11 +390,21 @@ } define <4 x i16> @vrem_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: 
vrem_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vrem.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vrem.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -458,11 +538,21 @@ } define <2 x i32> @vrem_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vrem.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vrem.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.srem.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -470,11 +560,21 @@ } define <2 x i32> @vrem_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrem_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vrem.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrem_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vrem.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrem_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vrem.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vremu-vp.ll @@ -95,11 +95,21 @@ } define <4 x i8> @vremu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vremu.vv v8, v8, 
v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vremu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.urem.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -107,11 +117,21 @@ } define <4 x i8> @vremu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vremu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vremu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -157,11 +177,21 @@ } define <8 x i8> @vremu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vremu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vremu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.urem.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -169,11 +199,21 @@ } define <8 x i8> @vremu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vremu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vremu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -257,11 +297,21 @@ } define <2 x i16> @vremu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: 
vremu_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vremu.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vremu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.urem.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -269,11 +319,21 @@ } define <2 x i16> @vremu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vremu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vremu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -307,11 +367,21 @@ } define <4 x i16> @vremu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vremu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vremu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.urem.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -319,11 +389,21 @@ } define <4 x i16> @vremu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vremu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vremu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = 
insertelement <4 x i1> poison, i1 true, i32 0 @@ -457,11 +537,21 @@ } define <2 x i32> @vremu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vremu.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vremu.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.urem.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -469,11 +559,21 @@ } define <2 x i32> @vremu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vremu_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vremu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vremu_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vremu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vremu_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vremu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrsub-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrsub-vp.ll @@ -61,11 +61,21 @@ declare <4 x i8> @llvm.vp.sub.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) define <4 x i8> @vrsub_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vsub.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vrsub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.sub.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) @@ -73,11 +83,21 @@ } define <4 x i8> @vrsub_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: 
vrsub_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vsub.vv v8, v10, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vrsub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -115,11 +135,21 @@ declare <8 x i8> @llvm.vp.sub.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32) define <8 x i8> @vrsub_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vrsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.sub.v8i8(<8 x i8> %vb, <8 x i8> %va, <8 x i1> %m, i32 %evl) @@ -127,11 +157,21 @@ } define <8 x i8> @vrsub_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vrsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v10, v8 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -223,11 +263,21 @@ declare <2 x i16> @llvm.vp.sub.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32) define <2 x i16> @vrsub_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vsub.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vrsub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> 
%elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.sub.v2i16(<2 x i16> %vb, <2 x i16> %va, <2 x i1> %m, i32 %evl) @@ -235,11 +285,21 @@ } define <2 x i16> @vrsub_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vsub.vv v8, v10, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vrsub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -277,11 +337,21 @@ declare <4 x i16> @llvm.vp.sub.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32) define <4 x i16> @vrsub_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vrsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.sub.v4i16(<4 x i16> %vb, <4 x i16> %va, <4 x i1> %m, i32 %evl) @@ -289,11 +359,21 @@ } define <4 x i16> @vrsub_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vrsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v10, v8 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -439,11 +519,21 @@ declare <2 x i32> @llvm.vp.sub.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32) define <2 x i32> @vrsub_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, 
ta, ma +; RV32-NEXT: vrsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.sub.v2i32(<2 x i32> %vb, <2 x i32> %va, <2 x i1> %m, i32 %evl) @@ -451,11 +541,21 @@ } define <2 x i32> @vrsub_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vrsub_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vrsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vrsub_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vrsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrsub_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v10, v8 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd.ll @@ -53,11 +53,20 @@ } define <4 x i8> @sadd_v4i8_vx(<4 x i8> %va, i8 %b) { -; CHECK-LABEL: sadd_v4i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vsadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: sadd_v4i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsadd.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: sadd_v4i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb) @@ -89,11 +98,20 @@ } define <8 x i8> @sadd_v8i8_vx(<8 x i8> %va, i8 %b) { -; CHECK-LABEL: sadd_v8i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vsadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: sadd_v8i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vsadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: sadd_v8i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb) @@ -161,11 +179,20 @@ } define <2 x i16> 
@sadd_v2i16_vx(<2 x i16> %va, i16 %b) { -; CHECK-LABEL: sadd_v2i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vsadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: sadd_v2i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsadd.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: sadd_v2i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb) @@ -197,11 +224,20 @@ } define <4 x i16> @sadd_v4i16_vx(<4 x i16> %va, i16 %b) { -; CHECK-LABEL: sadd_v4i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vsadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: sadd_v4i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vsadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: sadd_v4i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb) @@ -305,11 +341,20 @@ } define <2 x i32> @sadd_v2i32_vx(<2 x i32> %va, i32 %b) { -; CHECK-LABEL: sadd_v2i32_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vsadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: sadd_v2i32_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vsadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: sadd_v2i32_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsadd.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb) @@ -317,11 +362,20 @@ } define <2 x i32> @sadd_v2i32_vx_commute(<2 x i32> %va, i32 %b) { -; CHECK-LABEL: sadd_v2i32_vx_commute: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vsadd.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: sadd_v2i32_vx_commute: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vsadd.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: sadd_v2i32_vx_commute: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsadd.vv v8, v10, v8 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %vb, <2 x i32> %va) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu.ll @@ -53,11 +53,20 @@ } define <4 x i8> @uadd_v4i8_vx(<4 x i8> %va, i8 %b) { -; CHECK-LABEL: uadd_v4i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: uadd_v4i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsaddu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: uadd_v4i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb) @@ -89,11 +98,20 @@ } define <8 x i8> @uadd_v8i8_vx(<8 x i8> %va, i8 %b) { -; CHECK-LABEL: uadd_v8i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: uadd_v8i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vsaddu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: uadd_v8i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsaddu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb) @@ -161,11 +179,20 @@ } define <2 x i16> @uadd_v2i16_vx(<2 x i16> %va, i16 %b) { -; CHECK-LABEL: uadd_v2i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: uadd_v2i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsaddu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: uadd_v2i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb) @@ -197,11 +224,20 @@ } define <4 x i16> @uadd_v4i16_vx(<4 x i16> %va, i16 %b) { -; CHECK-LABEL: uadd_v4i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: uadd_v4i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vsaddu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: uadd_v4i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsaddu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> 
@llvm.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb) @@ -305,11 +341,20 @@ } define <2 x i32> @uadd_v2i32_vx(<2 x i32> %va, i32 %b) { -; CHECK-LABEL: uadd_v2i32_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: uadd_v2i32_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vsaddu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: uadd_v2i32_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsaddu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb) @@ -317,11 +362,20 @@ } define <2 x i32> @uadd_v2i32_vx_commute(<2 x i32> %va, i32 %b) { -; CHECK-LABEL: uadd_v2i32_vx_commute: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vsaddu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: uadd_v2i32_vx_commute: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vsaddu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: uadd_v2i32_vx_commute: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsaddu.vv v8, v10, v8 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %vb, <2 x i32> %va) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vshl-vp.ll @@ -132,11 +132,21 @@ } define <4 x i8> @vsll_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vsll.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vsll.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.shl.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -144,11 +154,21 @@ } define <4 x i8> @vsll_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; 
RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -208,11 +228,21 @@ } define <8 x i8> @vsll_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vsll.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vsll.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.shl.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -220,11 +250,21 @@ } define <8 x i8> @vsll_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vsll.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vsll.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -360,11 +400,21 @@ } define <2 x i16> @vsll_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vsll.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vsll.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.shl.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -372,11 +422,21 @@ } define <2 x i16> @vsll_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a0 -; CHECK-NEXT: ret +; 
RV32-LABEL: vsll_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vsll.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -436,11 +496,21 @@ } define <4 x i16> @vsll_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vsll.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vsll.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.shl.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -448,11 +518,21 @@ } define <4 x i16> @vsll_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vsll.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vsll.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -664,11 +744,21 @@ } define <2 x i32> @vsll_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsll.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsll.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.shl.v2i32(<2 x i32> %va, <2 x 
i32> %vb, <2 x i1> %m, i32 %evl) @@ -676,11 +766,21 @@ } define <2 x i32> @vsll_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsll_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vsll.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsll_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsll.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsll_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsll.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsra-vp.ll @@ -122,11 +122,21 @@ } define <4 x i8> @vsra_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vsra.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vsra.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.ashr.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -134,11 +144,21 @@ } define <4 x i8> @vsra_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vsra.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vsra.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -210,11 +230,21 @@ } define <8 x i8> @vsra_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vsra.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: 
vsra_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vsra.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.ashr.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -222,11 +252,21 @@ } define <8 x i8> @vsra_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vsra.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vsra.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -362,11 +402,21 @@ } define <2 x i16> @vsra_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vsra.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vsra.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.ashr.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -374,11 +424,21 @@ } define <2 x i16> @vsra_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vsra.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vsra.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -438,11 +498,21 @@ } define <4 x i16> @vsra_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v4i16: -; CHECK: # %bb.0: 
-; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vsra.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vsra.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.ashr.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -450,11 +520,21 @@ } define <4 x i16> @vsra_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vsra.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vsra.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -666,11 +746,21 @@ } define <2 x i32> @vsra_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsra.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsra.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.ashr.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -678,11 +768,21 @@ } define <2 x i32> @vsra_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsra_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vsra.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsra_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsra.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsra_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsra.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, 
i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsrl-vp.ll @@ -121,11 +121,21 @@ } define <4 x i8> @vsrl_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vsrl.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.lshr.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -133,11 +143,21 @@ } define <4 x i8> @vsrl_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -209,11 +229,21 @@ } define <8 x i8> @vsrl_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vsrl.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.lshr.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -221,11 +251,21 @@ } define <8 x i8> @vsrl_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v8i8_unmasked: +; 
RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vsrl.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -361,11 +401,21 @@ } define <2 x i16> @vsrl_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vsrl.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.lshr.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -373,11 +423,21 @@ } define <2 x i16> @vsrl_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vsrl.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -437,11 +497,21 @@ } define <4 x i16> @vsrl_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vsrl.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.lshr.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -449,11 
+519,21 @@ } define <4 x i16> @vsrl_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vsrl.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -665,11 +745,21 @@ } define <2 x i32> @vsrl_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsrl.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.lshr.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -677,11 +767,21 @@ } define <2 x i32> @vsrl_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsrl_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vsrl.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsrl_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsrl_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsrl.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub.ll @@ -54,11 +54,20 @@ } define <4 x i8> @ssub_v4i8_vx(<4 x i8> %va, i8 %b) { -; CHECK-LABEL: ssub_v4i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vssub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: ssub_v4i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: 
vssub.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: ssub_v4i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb) @@ -91,11 +100,20 @@ } define <8 x i8> @ssub_v8i8_vx(<8 x i8> %va, i8 %b) { -; CHECK-LABEL: ssub_v8i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vssub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: ssub_v8i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vssub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: ssub_v8i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vssub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb) @@ -165,11 +183,20 @@ } define <2 x i16> @ssub_v2i16_vx(<2 x i16> %va, i16 %b) { -; CHECK-LABEL: ssub_v2i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vssub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: ssub_v2i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vssub.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: ssub_v2i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb) @@ -202,11 +229,20 @@ } define <4 x i16> @ssub_v4i16_vx(<4 x i16> %va, i16 %b) { -; CHECK-LABEL: ssub_v4i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vssub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: ssub_v4i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vssub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: ssub_v4i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vssub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb) @@ -313,11 +349,20 @@ } define <2 x i32> @ssub_v2i32_vx(<2 x i32> %va, i32 %b) { -; CHECK-LABEL: ssub_v2i32_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vssub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: ssub_v2i32_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vssub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: ssub_v2i32_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: 
vrgather.vi v10, v9, 0 +; RV64-NEXT: vssub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu.ll @@ -54,11 +54,20 @@ } define <4 x i8> @usub_v4i8_vx(<4 x i8> %va, i8 %b) { -; CHECK-LABEL: usub_v4i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vssubu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: usub_v4i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vssubu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: usub_v4i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb) @@ -91,11 +100,20 @@ } define <8 x i8> @usub_v8i8_vx(<8 x i8> %va, i8 %b) { -; CHECK-LABEL: usub_v8i8_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vssubu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: usub_v8i8_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vssubu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: usub_v8i8_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vssubu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb) @@ -165,11 +183,20 @@ } define <2 x i16> @usub_v2i16_vx(<2 x i16> %va, i16 %b) { -; CHECK-LABEL: usub_v2i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vssubu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: usub_v2i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vssubu.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: usub_v2i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb) @@ -202,11 +229,20 @@ } define <4 x i16> @usub_v4i16_vx(<4 x i16> %va, i16 %b) { -; CHECK-LABEL: usub_v4i16_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vssubu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: usub_v4i16_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vssubu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: usub_v4i16_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli 
zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vssubu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb) @@ -313,11 +349,20 @@ } define <2 x i32> @usub_v2i32_vx(<2 x i32> %va, i32 %b) { -; CHECK-LABEL: usub_v2i32_vx: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vssubu.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: usub_v2i32_vx: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vssubu.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: usub_v2i32_vx: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vssubu.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsub-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsub-vp.ll @@ -141,11 +141,21 @@ } define <4 x i8> @vsub_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vsub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.sub.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -153,11 +163,21 @@ } define <4 x i8> @vsub_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vsub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -191,11 +211,21 @@ } define <8 x i8> @vsub_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v8i8: -; 
CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.sub.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -203,11 +233,21 @@ } define <8 x i8> @vsub_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -291,11 +331,21 @@ } define <2 x i16> @vsub_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vsub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.sub.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -303,11 +353,21 @@ } define <2 x i16> @vsub_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vsub.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vsub.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = 
shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -341,11 +401,21 @@ } define <4 x i16> @vsub_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.sub.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -353,11 +423,21 @@ } define <4 x i16> @vsub_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -491,11 +571,21 @@ } define <2 x i32> @vsub_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsub.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.sub.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -503,11 +593,21 @@ } define <2 x i32> @vsub_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vsub_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vsub.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vsub_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsub.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vsub_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli 
zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -417,12 +417,22 @@ } define <4 x i16> @vwadd_vx_v4i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwadd_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwadd_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwadd.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vwadd.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -433,12 +443,22 @@ } define <2 x i32> @vwadd_vx_v2i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwadd_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwadd_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwadd.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwadd.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -449,12 +469,22 @@ } define <8 x i16> @vwadd_vx_v8i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwadd_vx_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwadd_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwadd.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwadd.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -465,12 +495,22 @@ } define <4 x i32> @vwadd_vx_v4i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwadd_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; 
CHECK-NEXT: vwadd.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwadd_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwadd.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwadd.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -481,12 +521,22 @@ } define <2 x i64> @vwadd_vx_v2i64(ptr %x, i32 %y) { -; CHECK-LABEL: vwadd_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwadd_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwadd.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwadd_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwadd.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -417,12 +417,22 @@ } define <4 x i16> @vwaddu_vx_v4i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwaddu_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwaddu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwaddu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vwaddu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -433,12 +443,22 @@ } define <2 x i32> @vwaddu_vx_v2i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwaddu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwaddu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwaddu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwaddu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = 
shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -449,12 +469,22 @@ } define <8 x i16> @vwaddu_vx_v8i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwaddu_vx_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwaddu_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwaddu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwaddu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -465,12 +495,22 @@ } define <4 x i32> @vwaddu_vx_v4i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwaddu_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwaddu_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwaddu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwaddu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -481,12 +521,22 @@ } define <2 x i64> @vwaddu_vx_v2i64(ptr %x, i32 %y) { -; CHECK-LABEL: vwaddu_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwaddu_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwaddu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwaddu_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwaddu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -447,12 +447,22 @@ } define <4 x i16> @vwmul_vx_v4i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwmul_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmul_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwmul.vv v8, v9, v10 +; 
RV32-NEXT: ret +; +; RV64-LABEL: vwmul_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vwmul.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -463,12 +473,22 @@ } define <2 x i32> @vwmul_vx_v2i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwmul_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmul_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwmul.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmul_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwmul.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -479,12 +499,22 @@ } define <8 x i16> @vwmul_vx_v8i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwmul_vx_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmul_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwmul.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmul_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmul.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -495,12 +525,22 @@ } define <4 x i32> @vwmul_vx_v4i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwmul_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmul_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwmul.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmul_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmul.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -511,12 +551,22 @@ } define <2 x i64> @vwmul_vx_v2i64(ptr %x, i32 %y) { -; CHECK-LABEL: vwmul_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmul_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwmul.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmul_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 
1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmul.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -456,12 +456,22 @@ } define <4 x i16> @vwmulsu_vx_v4i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwmulsu_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmulsu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwmulsu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vwmulsu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -472,12 +482,22 @@ } define <2 x i32> @vwmulsu_vx_v2i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwmulsu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmulsu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwmulsu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwmulsu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -488,12 +508,22 @@ } define <8 x i16> @vwmulsu_vx_v8i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwmulsu_vx_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmulsu_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwmulsu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmulsu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -504,12 +534,22 @@ } define <4 x i32> @vwmulsu_vx_v4i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwmulsu_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: 
vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmulsu_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwmulsu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmulsu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -520,12 +560,22 @@ } define <2 x i64> @vwmulsu_vx_v2i64(ptr %x, i32 %y) { -; CHECK-LABEL: vwmulsu_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmulsu_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwmulsu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwmulsu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -417,12 +417,22 @@ } define <4 x i16> @vwsub_vx_v4i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwsub_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsub_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwsub.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vwsub.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -433,12 +443,22 @@ } define <2 x i32> @vwsub_vx_v2i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwsub_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsub_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwsub.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwsub.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = 
shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -449,12 +469,22 @@ } define <8 x i16> @vwsub_vx_v8i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwsub_vx_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsub_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwsub.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsub.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -465,12 +495,22 @@ } define <4 x i32> @vwsub_vx_v4i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwsub_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsub_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwsub.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsub.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -481,12 +521,22 @@ } define <2 x i64> @vwsub_vx_v2i64(ptr %x, i32 %y) { -; CHECK-LABEL: vwsub_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsub_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwsub.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsub_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsub.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -417,12 +417,22 @@ } define <4 x i16> @vwsubu_vx_v4i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwsubu_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsubu_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwsubu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; 
RV64-LABEL: vwsubu_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vwsubu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 %c = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -433,12 +443,22 @@ } define <2 x i32> @vwsubu_vx_v2i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwsubu_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsubu_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vrgather.vi v10, v8, 0 +; RV32-NEXT: vwsubu.vv v8, v9, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsubu_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vwsubu.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 %c = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer @@ -449,12 +469,22 @@ } define <8 x i16> @vwsubu_vx_v8i16(ptr %x, i8 %y) { -; CHECK-LABEL: vwsubu_vx_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsubu_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vle8.v v9, (a0) +; RV32-NEXT: vwsubu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsubu_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vle8.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsubu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 %c = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -465,12 +495,22 @@ } define <4 x i32> @vwsubu_vx_v4i32(ptr %x, i16 %y) { -; CHECK-LABEL: vwsubu_vx_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsubu_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vle16.v v9, (a0) +; RV32-NEXT: vwsubu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsubu_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vle16.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsubu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 %c = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer @@ -481,12 +521,22 @@ } define <2 x i64> @vwsubu_vx_v2i64(ptr %x, i32 %y) { -; CHECK-LABEL: vwsubu_vx_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwsubu_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vwsubu.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwsubu_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: 
vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vrgather.vi v10, v8, 0 +; RV64-NEXT: vwsubu.vv v8, v9, v10 +; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 %c = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vxor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vxor-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vxor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vxor-vp.ll @@ -155,11 +155,21 @@ } define <4 x i8> @vxor_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vxor.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vxor.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.xor.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) @@ -167,11 +177,21 @@ } define <4 x i8> @vxor_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v4i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v4i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV32-NEXT: vxor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v4i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; RV64-NEXT: vxor.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -257,11 +277,21 @@ } define <8 x i8> @vxor_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vxor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vxor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.xor.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) @@ -269,11 +299,21 @@ } define <8 x i8> @vxor_vx_v8i8_unmasked(<8 
x i8> %va, i8 %b, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v8i8_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v8i8_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV32-NEXT: vxor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v8i8_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; RV64-NEXT: vxor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer %head = insertelement <8 x i1> poison, i1 true, i32 0 @@ -563,11 +603,21 @@ } define <2 x i16> @vxor_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vxor.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vxor.vx v8, v8, a0, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.xor.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) @@ -575,11 +625,21 @@ } define <2 x i16> @vxor_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v2i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v2i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vrgather.vi v10, v9, 0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV32-NEXT: vxor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v2i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; RV64-NEXT: vxor.vx v8, v8, a0 +; RV64-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 @@ -665,11 +725,21 @@ } define <4 x i16> @vxor_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vxor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vxor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head 
= insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.xor.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) @@ -677,11 +747,21 @@ } define <4 x i16> @vxor_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v4i16_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v4i16_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV32-NEXT: vxor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v4i16_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; RV64-NEXT: vxor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer %head = insertelement <4 x i1> poison, i1 true, i32 0 @@ -971,11 +1051,21 @@ } define <2 x i32> @vxor_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vxor.vx v8, v8, a0, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vxor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.xor.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) @@ -983,11 +1073,21 @@ } define <2 x i32> @vxor_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { -; CHECK-LABEL: vxor_vx_v2i32_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vxor.vx v8, v8, a0 -; CHECK-NEXT: ret +; RV32-LABEL: vxor_vx_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vxor.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vxor_vx_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vrgather.vi v10, v9, 0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vxor.vv v8, v8, v10 +; RV64-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer %head = insertelement <2 x i1> poison, i1 true, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll --- a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll @@ -6,29 +6,43 @@ ; RV64-1024-LABEL: interleave256: ; RV64-1024: # %bb.0: # %entry ; RV64-1024-NEXT: li a3, 128 -; RV64-1024-NEXT: vsetvli zero, a3, e16, m2, ta, ma -; RV64-1024-NEXT: vle16.v v8, (a1) -; RV64-1024-NEXT: vle16.v 
v10, (a2) -; RV64-1024-NEXT: vwaddu.vv v12, v8, v10 -; RV64-1024-NEXT: li a1, -1 -; RV64-1024-NEXT: vwmaccu.vx v12, a1, v10 +; RV64-1024-NEXT: vsetvli zero, a3, e32, m4, ta, ma +; RV64-1024-NEXT: vle16.v v10, (a1) +; RV64-1024-NEXT: vle16.v v8, (a2) +; RV64-1024-NEXT: vzext.vf2 v16, v10 ; RV64-1024-NEXT: li a1, 256 ; RV64-1024-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; RV64-1024-NEXT: vid.v v20 +; RV64-1024-NEXT: vrgather.vv v12, v16, v20 +; RV64-1024-NEXT: vsrl.vi v16, v20, 1 +; RV64-1024-NEXT: lui a2, %hi(.LCPI0_0) +; RV64-1024-NEXT: addi a2, a2, %lo(.LCPI0_0) +; RV64-1024-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; RV64-1024-NEXT: vlse64.v v0, (a2), zero +; RV64-1024-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; RV64-1024-NEXT: vrgather.vv v12, v8, v16, v0.t ; RV64-1024-NEXT: vse16.v v12, (a0) ; RV64-1024-NEXT: ret ; ; RV64-2048-LABEL: interleave256: ; RV64-2048: # %bb.0: # %entry ; RV64-2048-NEXT: li a3, 128 -; RV64-2048-NEXT: vsetvli zero, a3, e16, m1, ta, ma +; RV64-2048-NEXT: vsetvli zero, a3, e32, m2, ta, ma ; RV64-2048-NEXT: vle16.v v8, (a1) -; RV64-2048-NEXT: vle16.v v9, (a2) -; RV64-2048-NEXT: vwaddu.vv v10, v8, v9 -; RV64-2048-NEXT: li a1, -1 -; RV64-2048-NEXT: vwmaccu.vx v10, a1, v9 +; RV64-2048-NEXT: vle16.v v10, (a2) +; RV64-2048-NEXT: vzext.vf2 v12, v8 ; RV64-2048-NEXT: li a1, 256 ; RV64-2048-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; RV64-2048-NEXT: vse16.v v10, (a0) +; RV64-2048-NEXT: vid.v v8 +; RV64-2048-NEXT: vrgather.vv v14, v12, v8 +; RV64-2048-NEXT: vsrl.vi v8, v8, 1 +; RV64-2048-NEXT: lui a2, %hi(.LCPI0_0) +; RV64-2048-NEXT: addi a2, a2, %lo(.LCPI0_0) +; RV64-2048-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; RV64-2048-NEXT: vlse64.v v0, (a2), zero +; RV64-2048-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; RV64-2048-NEXT: vrgather.vv v14, v10, v8, v0.t +; RV64-2048-NEXT: vse16.v v14, (a0) ; RV64-2048-NEXT: ret entry: %ve = load <128 x i16>, ptr %0, align 256 @@ -44,28 +58,42 @@ ; RV64-1024-LABEL: interleave512: ; RV64-1024: # %bb.0: # %entry ; RV64-1024-NEXT: li a3, 256 -; RV64-1024-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; RV64-1024-NEXT: vle16.v v8, (a1) -; RV64-1024-NEXT: vle16.v v12, (a2) -; RV64-1024-NEXT: vwaddu.vv v16, v8, v12 -; RV64-1024-NEXT: li a1, -1 -; RV64-1024-NEXT: vwmaccu.vx v16, a1, v12 +; RV64-1024-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV64-1024-NEXT: vle16.v v12, (a1) +; RV64-1024-NEXT: vle16.v v8, (a2) +; RV64-1024-NEXT: vzext.vf2 v24, v12 ; RV64-1024-NEXT: li a1, 512 ; RV64-1024-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; RV64-1024-NEXT: vid.v v0 +; RV64-1024-NEXT: vrgather.vv v16, v24, v0 +; RV64-1024-NEXT: vsrl.vi v24, v0, 1 +; RV64-1024-NEXT: lui a2, %hi(.LCPI1_0) +; RV64-1024-NEXT: addi a2, a2, %lo(.LCPI1_0) +; RV64-1024-NEXT: vsetivli zero, 8, e64, m1, ta, ma +; RV64-1024-NEXT: vlse64.v v0, (a2), zero +; RV64-1024-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; RV64-1024-NEXT: vrgather.vv v16, v8, v24, v0.t ; RV64-1024-NEXT: vse16.v v16, (a0) ; RV64-1024-NEXT: ret ; ; RV64-2048-LABEL: interleave512: ; RV64-2048: # %bb.0: # %entry ; RV64-2048-NEXT: li a3, 256 -; RV64-2048-NEXT: vsetvli zero, a3, e16, m2, ta, ma -; RV64-2048-NEXT: vle16.v v8, (a1) -; RV64-2048-NEXT: vle16.v v10, (a2) -; RV64-2048-NEXT: vwaddu.vv v12, v8, v10 -; RV64-2048-NEXT: li a1, -1 -; RV64-2048-NEXT: vwmaccu.vx v12, a1, v10 +; RV64-2048-NEXT: vsetvli zero, a3, e32, m4, ta, ma +; RV64-2048-NEXT: vle16.v v10, (a1) +; RV64-2048-NEXT: vle16.v v8, (a2) +; RV64-2048-NEXT: vzext.vf2 v16, v10 ; RV64-2048-NEXT: li a1, 512 ; RV64-2048-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; 
RV64-2048-NEXT: vid.v v20 +; RV64-2048-NEXT: vrgather.vv v12, v16, v20 +; RV64-2048-NEXT: vsrl.vi v16, v20, 1 +; RV64-2048-NEXT: lui a2, %hi(.LCPI1_0) +; RV64-2048-NEXT: addi a2, a2, %lo(.LCPI1_0) +; RV64-2048-NEXT: vsetivli zero, 8, e64, m1, ta, ma +; RV64-2048-NEXT: vlse64.v v0, (a2), zero +; RV64-2048-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; RV64-2048-NEXT: vrgather.vv v12, v8, v16, v0.t ; RV64-2048-NEXT: vse16.v v12, (a0) ; RV64-2048-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll b/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll --- a/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll +++ b/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll @@ -20,24 +20,24 @@ ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: andi sp, sp, -16 ; CHECK-NEXT: mv s1, sp -; CHECK-NEXT: lw t0, 44(s1) +; CHECK-NEXT: sw a0, 52(s1) +; CHECK-NEXT: lw a1, 52(s1) +; CHECK-NEXT: lw a0, 44(s1) ; CHECK-NEXT: lw a2, 40(s1) ; CHECK-NEXT: lw a3, 36(s1) ; CHECK-NEXT: lw a4, 32(s1) ; CHECK-NEXT: lw a5, 28(s1) ; CHECK-NEXT: lw a6, 24(s1) ; CHECK-NEXT: lw a7, 20(s1) -; CHECK-NEXT: lw t1, 16(s1) -; CHECK-NEXT: lw a1, 12(s1) +; CHECK-NEXT: lw t0, 16(s1) +; CHECK-NEXT: lw t1, 12(s1) ; CHECK-NEXT: lw t2, 8(s1) -; CHECK-NEXT: sw a0, 52(s1) -; CHECK-NEXT: sw a0, 48(s1) +; CHECK-NEXT: sw a1, 48(s1) ; CHECK-NEXT: addi sp, sp, -32 ; CHECK-NEXT: sd t2, 16(sp) -; CHECK-NEXT: sd a1, 8(sp) +; CHECK-NEXT: sd t1, 8(sp) ; CHECK-NEXT: addi a1, s1, 48 -; CHECK-NEXT: sd t1, 0(sp) -; CHECK-NEXT: mv a0, t0 +; CHECK-NEXT: sd t0, 0(sp) ; CHECK-NEXT: call gfunc@plt ; CHECK-NEXT: addi sp, sp, 32 ; CHECK-NEXT: li a0, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -4136,12 +4136,15 @@ ; CHECK-LABEL: sink_splat_mul_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB81_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmul.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vmul.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB81_1 @@ -4170,12 +4173,15 @@ ; CHECK-LABEL: sink_splat_add_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB82_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vadd.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vadd.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB82_1 @@ -4204,12 +4210,15 @@ ; CHECK-LABEL: sink_splat_sub_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB83_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, 
(a0) -; CHECK-NEXT: vsub.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsub.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB83_1 @@ -4238,12 +4247,15 @@ ; CHECK-LABEL: sink_splat_rsub_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB84_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vrsub.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB84_1 @@ -4272,12 +4284,15 @@ ; CHECK-LABEL: sink_splat_and_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB85_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vand.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB85_1 @@ -4306,12 +4321,15 @@ ; CHECK-LABEL: sink_splat_or_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB86_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vor.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vor.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB86_1 @@ -4340,12 +4358,15 @@ ; CHECK-LABEL: sink_splat_xor_lmulmf2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; CHECK-NEXT: vmv.s.x v9, a1 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vrgather.vi v8, v9, 0 ; CHECK-NEXT: .LBB87_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vxor.vx v8, v8, a1 -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vxor.vv v9, v9, v8 +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: addi a2, a2, -4 ; CHECK-NEXT: addi a0, a0, 32 ; CHECK-NEXT: bnez a2, .LBB87_1 diff --git a/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll b/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll --- a/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll @@ -8,13 +8,11 @@ ; RV32-NEXT: lui a0, 1048571 ; RV32-NEXT: addi a0, a0, -1365 ; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsll.vi v9, v8, 15 -; RV32-NEXT: vsrl.vi v8, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, -1366 -; RV32-NEXT: vmsgtu.vx v0, v8, a0 +; RV32-NEXT: vmulhu.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v9, v9, 2 +; RV32-NEXT: li a0, 6 +; 
RV32-NEXT: vnmsub.vx v9, a0, v8 +; RV32-NEXT: vmsne.vi v0, v9, 0 ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vmerge.vim v8, v8, -1, v0 ; RV32-NEXT: ret @@ -24,13 +22,11 @@ ; RV64-NEXT: lui a0, 1048571 ; RV64-NEXT: addiw a0, a0, -1365 ; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsll.vi v9, v8, 15 -; RV64-NEXT: vsrl.vi v8, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, -1366 -; RV64-NEXT: vmsgtu.vx v0, v8, a0 +; RV64-NEXT: vmulhu.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v9, v9, 2 +; RV64-NEXT: li a0, 6 +; RV64-NEXT: vnmsub.vx v9, a0, v8 +; RV64-NEXT: vmsne.vi v0, v9, 0 ; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vmerge.vim v8, v8, -1, v0 ; RV64-NEXT: ret @@ -50,10 +46,11 @@ ; RV32-NEXT: lui a0, 1048573 ; RV32-NEXT: addi a0, a0, -819 ; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vmsgtu.vx v0, v8, a0 +; RV32-NEXT: vmulhu.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v9, v9, 2 +; RV32-NEXT: li a0, 5 +; RV32-NEXT: vnmsub.vx v9, a0, v8 +; RV32-NEXT: vmsne.vi v0, v9, 0 ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vmerge.vim v8, v8, -1, v0 ; RV32-NEXT: ret @@ -63,10 +60,11 @@ ; RV64-NEXT: lui a0, 1048573 ; RV64-NEXT: addiw a0, a0, -819 ; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vmsgtu.vx v0, v8, a0 +; RV64-NEXT: vmulhu.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v9, v9, 2 +; RV64-NEXT: li a0, 5 +; RV64-NEXT: vnmsub.vx v9, a0, v8 +; RV64-NEXT: vmsne.vi v0, v9, 0 ; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vmerge.vim v8, v8, -1, v0 ; RV64-NEXT: ret @@ -83,36 +81,28 @@ define @test_urem_vec_even_divisor_eq1( %x) nounwind { ; RV32-LABEL: test_urem_vec_even_divisor_eq1: ; RV32: # %bb.0: -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; RV32-NEXT: vsub.vx v8, v8, a0 ; RV32-NEXT: lui a0, 1048571 ; RV32-NEXT: addi a0, a0, -1365 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsll.vi v9, v8, 15 -; RV32-NEXT: vsrl.vi v8, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, -1366 -; RV32-NEXT: vmsgtu.vx v0, v8, a0 +; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; RV32-NEXT: vmulhu.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v9, v9, 2 +; RV32-NEXT: li a0, 6 +; RV32-NEXT: vnmsub.vx v9, a0, v8 +; RV32-NEXT: vmsne.vi v0, v9, 1 ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vmerge.vim v8, v8, -1, v0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_vec_even_divisor_eq1: ; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; RV64-NEXT: vsub.vx v8, v8, a0 ; RV64-NEXT: lui a0, 1048571 ; RV64-NEXT: addiw a0, a0, -1365 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsll.vi v9, v8, 15 -; RV64-NEXT: vsrl.vi v8, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, -1366 -; RV64-NEXT: vmsgtu.vx v0, v8, a0 +; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; RV64-NEXT: vmulhu.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v9, v9, 2 +; RV64-NEXT: li a0, 6 +; RV64-NEXT: vnmsub.vx v9, a0, v8 +; RV64-NEXT: vmsne.vi v0, v9, 1 ; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vmerge.vim v8, v8, -1, v0 ; RV64-NEXT: ret @@ -129,30 +119,28 @@ define @test_urem_vec_odd_divisor_eq1( %x) nounwind { ; RV32-LABEL: test_urem_vec_odd_divisor_eq1: ; RV32: # %bb.0: -; RV32-NEXT: li a0, 1 -; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; RV32-NEXT: vsub.vx v8, v8, a0 ; RV32-NEXT: lui a0, 
1048573 ; RV32-NEXT: addi a0, a0, -819 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 818 -; RV32-NEXT: vmsgtu.vx v0, v8, a0 +; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; RV32-NEXT: vmulhu.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v9, v9, 2 +; RV32-NEXT: li a0, 5 +; RV32-NEXT: vnmsub.vx v9, a0, v8 +; RV32-NEXT: vmsne.vi v0, v9, 1 ; RV32-NEXT: vmv.v.i v8, 0 ; RV32-NEXT: vmerge.vim v8, v8, -1, v0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_vec_odd_divisor_eq1: ; RV64: # %bb.0: -; RV64-NEXT: li a0, 1 -; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma -; RV64-NEXT: vsub.vx v8, v8, a0 ; RV64-NEXT: lui a0, 1048573 ; RV64-NEXT: addiw a0, a0, -819 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 818 -; RV64-NEXT: vmsgtu.vx v0, v8, a0 +; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; RV64-NEXT: vmulhu.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v9, v9, 2 +; RV64-NEXT: li a0, 5 +; RV64-NEXT: vnmsub.vx v9, a0, v8 +; RV64-NEXT: vmsne.vi v0, v9, 1 ; RV64-NEXT: vmv.v.i v8, 0 ; RV64-NEXT: vmerge.vim v8, v8, -1, v0 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/variant-cc.ll b/llvm/test/CodeGen/RISCV/rvv/variant-cc.ll --- a/llvm/test/CodeGen/RISCV/rvv/variant-cc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/variant-cc.ll @@ -1,50 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=riscv64 -mattr=+v -o - %s | FileCheck %s --check-prefix=CHECK-ASM ; RUN: llc -mtriple=riscv64 -mattr=+v -filetype=obj -o - %s \ ; RUN: | llvm-readobj --symbols - | FileCheck %s --check-prefix=CHECK-OBJ define i32 @base_cc() { ; CHECK-ASM-LABEL: base_cc: -; CHECK-ASM-NOT: .variant_cc +; CHECK-ASM: # %bb.0: +; CHECK-ASM-NEXT: li a0, 42 +; CHECK-ASM-NEXT: ret ; CHECK-OBJ-LABEL: Name: base_cc ; CHECK-OBJ: Other: 0 ret i32 42 } define <4 x i32> @fixed_vector_cc_1(<4 x i32> %arg) { -; CHECK-ASM: .variant_cc fixed_vector_cc_1 -; CHECK-ASM-NEXT: fixed_vector_cc_1: +; CHECK-ASM-LABEL: fixed_vector_cc_1: +; CHECK-ASM: # %bb.0: +; CHECK-ASM-NEXT: ret ; CHECK-OBJ-LABEL: Name: fixed_vector_cc_1 ; CHECK-OBJ: Other [ (0x80) ret <4 x i32> %arg } define @rvv_vector_cc_1() { -; CHECK-ASM: .variant_cc rvv_vector_cc_1 -; CHECK-ASM-NEXT: rvv_vector_cc_1: +; CHECK-ASM-LABEL: rvv_vector_cc_1: +; CHECK-ASM: # %bb.0: +; CHECK-ASM-NEXT: ret ; CHECK-OBJ-LABEL: Name: rvv_vector_cc_1 ; CHECK-OBJ: Other [ (0x80) ret undef } define @rvv_vector_cc_2() { -; CHECK-ASM: .variant_cc rvv_vector_cc_2 -; CHECK-ASM-NEXT: rvv_vector_cc_2: +; CHECK-ASM-LABEL: rvv_vector_cc_2: +; CHECK-ASM: # %bb.0: +; CHECK-ASM-NEXT: ret ; CHECK-OBJ-LABEL: Name: rvv_vector_cc_2 ; CHECK-OBJ: Other [ (0x80) ret undef } define void @rvv_vector_cc_3( %arg) { -; CHECK-ASM: .variant_cc rvv_vector_cc_3 -; CHECK-ASM-NEXT: rvv_vector_cc_3: +; CHECK-ASM-LABEL: rvv_vector_cc_3: +; CHECK-ASM: # %bb.0: +; CHECK-ASM-NEXT: ret ; CHECK-OBJ-LABEL: Name: rvv_vector_cc_3 ; CHECK-OBJ: Other [ (0x80) ret void } define void @rvv_vector_cc_4( %arg) { -; CHECK-ASM: .variant_cc rvv_vector_cc_4 -; CHECK-ASM-NEXT: rvv_vector_cc_4: +; CHECK-ASM-LABEL: rvv_vector_cc_4: +; CHECK-ASM: # %bb.0: +; CHECK-ASM-NEXT: ret ; CHECK-OBJ-LABEL: Name: rvv_vector_cc_4 ; CHECK-OBJ: Other [ (0x80) ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll @@ -7,15 +7,14 @@ define @srem_eq_fold_nxv4i8( %va) { ; CHECK-LABEL: 
srem_eq_fold_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 42 +; CHECK-NEXT: li a0, 43 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: li a1, -85 -; CHECK-NEXT: vmacc.vx v9, a1, v8 -; CHECK-NEXT: vsll.vi v8, v9, 7 -; CHECK-NEXT: vsrl.vi v9, v9, 1 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vmsleu.vx v0, v8, a0 +; CHECK-NEXT: vmulh.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v10, v9, 7 +; CHECK-NEXT: vadd.vv v9, v9, v10 +; CHECK-NEXT: li a0, 6 +; CHECK-NEXT: vnmsub.vx v9, a0, v8 +; CHECK-NEXT: vmseq.vi v0, v9, 0 ; CHECK-NEXT: ret %head_six = insertelement poison, i8 6, i32 0 %splat_six = shufflevector %head_six, poison, zeroinitializer @@ -90,9 +89,10 @@ define @vmulh_vi_nxv1i32_1( %va) { ; CHECK-LABEL: vmulh_vi_nxv1i32_1: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 16 -; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; CHECK-NEXT: vmulh.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v9, 28 ; CHECK-NEXT: ret %head1 = insertelement poison, i32 16, i32 0 %splat1 = shufflevector %head1, poison, zeroinitializer @@ -171,9 +171,10 @@ define @vmulh_vi_nxv2i32_1( %va) { ; CHECK-LABEL: vmulh_vi_nxv2i32_1: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 16 -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-NEXT: vmulh.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v10, 28 ; CHECK-NEXT: ret %head1 = insertelement poison, i32 16, i32 0 %splat1 = shufflevector %head1, poison, zeroinitializer @@ -252,9 +253,10 @@ define @vmulh_vi_nxv4i32_1( %va) { ; CHECK-LABEL: vmulh_vi_nxv4i32_1: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 16 -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; CHECK-NEXT: vmulh.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v12, 28 ; CHECK-NEXT: ret %head1 = insertelement poison, i32 16, i32 0 %splat1 = shufflevector %head1, poison, zeroinitializer @@ -333,9 +335,10 @@ define @vmulh_vi_nxv8i32_1( %va) { ; CHECK-LABEL: vmulh_vi_nxv8i32_1: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 16 -; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; CHECK-NEXT: vmulh.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v16, 28 ; CHECK-NEXT: ret %head1 = insertelement poison, i32 16, i32 0 %splat1 = shufflevector %head1, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll @@ -65,18 +65,13 @@ } define @vmulhu_vi_nxv1i32_1( %va) { -; RV32-LABEL: vmulhu_vi_nxv1i32_1: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 28 -; RV32-NEXT: ret -; -; RV64-LABEL: vmulhu_vi_nxv1i32_1: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 16 -; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vmulhu_vi_nxv1i32_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v9, 28 +; CHECK-NEXT: ret 
%head1 = insertelement poison, i32 16, i32 0 %splat1 = shufflevector %head1, poison, zeroinitializer %vb = zext %splat1 to @@ -152,18 +147,13 @@ } define @vmulhu_vi_nxv2i32_1( %va) { -; RV32-LABEL: vmulhu_vi_nxv2i32_1: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 28 -; RV32-NEXT: ret -; -; RV64-LABEL: vmulhu_vi_nxv2i32_1: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 16 -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vmulhu_vi_nxv2i32_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v10, 28 +; CHECK-NEXT: ret %head1 = insertelement poison, i32 16, i32 0 %splat1 = shufflevector %head1, poison, zeroinitializer %vb = zext %splat1 to @@ -239,18 +229,13 @@ } define @vmulhu_vi_nxv4i32_1( %va) { -; RV32-LABEL: vmulhu_vi_nxv4i32_1: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 28 -; RV32-NEXT: ret -; -; RV64-LABEL: vmulhu_vi_nxv4i32_1: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 16 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vmulhu_vi_nxv4i32_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v12, 28 +; CHECK-NEXT: ret %head1 = insertelement poison, i32 16, i32 0 %splat1 = shufflevector %head1, poison, zeroinitializer %vb = zext %splat1 to @@ -326,18 +311,13 @@ } define @vmulhu_vi_nxv8i32_1( %va) { -; RV32-LABEL: vmulhu_vi_nxv8i32_1: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 28 -; RV32-NEXT: ret -; -; RV64-LABEL: vmulhu_vi_nxv8i32_1: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 16 -; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, ma -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vmulhu_vi_nxv8i32_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v16, 28 +; CHECK-NEXT: ret %head1 = insertelement poison, i32 16, i32 0 %splat1 = shufflevector %head1, poison, zeroinitializer %vb = zext %splat1 to diff --git a/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll b/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll --- a/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll @@ -9,14 +9,14 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a0, %hi(c) ; CHECK-NEXT: addi a0, a0, %lo(c) -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: addi a1, a0, 16 -; CHECK-NEXT: vle64.v v9, (a1) -; CHECK-NEXT: addi a1, a0, 8 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: addi a1, a0, 24 ; CHECK-NEXT: vse64.v v8, (a1) -; CHECK-NEXT: addi a0, a0, 24 -; CHECK-NEXT: vse64.v v9, (a0) +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 8 +; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret entry: ; this thing is "__builtin_memmove(&c[1], &c[0], sizeof(c[0]) * 4);" diff --git a/llvm/test/CodeGen/RISCV/select-constant-xor.ll b/llvm/test/CodeGen/RISCV/select-constant-xor.ll --- a/llvm/test/CodeGen/RISCV/select-constant-xor.ll +++ b/llvm/test/CodeGen/RISCV/select-constant-xor.ll @@ -56,10 
+56,10 @@ ; ; RV64-LABEL: selecti64i32: ; RV64: # %bb.0: -; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: slti a0, a0, 0 +; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: lui a1, 524288 -; RV64-NEXT: addiw a1, a1, -1 -; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: subw a0, a1, a0 ; RV64-NEXT: ret %c = icmp sgt i64 %a, -1 %s = select i1 %c, i32 2147483647, i32 -2147483648 diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll --- a/llvm/test/CodeGen/RISCV/select.ll +++ b/llvm/test/CodeGen/RISCV/select.ll @@ -808,8 +808,8 @@ ; RV32IM-LABEL: select_sub_3: ; RV32IM: # %bb.0: # %entry ; RV32IM-NEXT: addi a0, a0, -1 -; RV32IM-NEXT: andi a0, a0, 42 -; RV32IM-NEXT: sub a0, a1, a0 +; RV32IM-NEXT: andi a0, a0, -42 +; RV32IM-NEXT: add a0, a1, a0 ; RV32IM-NEXT: ret ; ; RV64IM-LABEL: select_sub_3: @@ -831,9 +831,9 @@ ; ; RV32IMZICOND-LABEL: select_sub_3: ; RV32IMZICOND: # %bb.0: # %entry -; RV32IMZICOND-NEXT: li a2, 42 +; RV32IMZICOND-NEXT: li a2, -42 ; RV32IMZICOND-NEXT: czero.nez a0, a2, a0 -; RV32IMZICOND-NEXT: sub a0, a1, a0 +; RV32IMZICOND-NEXT: add a0, a1, a0 ; RV32IMZICOND-NEXT: ret ; ; RV64IMZICOND-LABEL: select_sub_3: diff --git a/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll b/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll --- a/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll +++ b/llvm/test/CodeGen/RISCV/selectcc-to-shiftand.ll @@ -18,7 +18,7 @@ ; ; RV64-LABEL: neg_sel_constants: ; RV64: # %bb.0: -; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: srli a0, a0, 31 ; RV64-NEXT: andi a0, a0, 5 ; RV64-NEXT: ret %tmp.1 = icmp slt i32 %a, 0 @@ -108,8 +108,8 @@ ; ; RV64-LABEL: pos_sel_special_constant: ; RV64: # %bb.0: -; RV64-NEXT: slti a0, a0, 0 -; RV64-NEXT: xori a0, a0, 1 +; RV64-NEXT: not a0, a0 +; RV64-NEXT: srliw a0, a0, 31 ; RV64-NEXT: slli a0, a0, 9 ; RV64-NEXT: ret %tmp.1 = icmp sgt i32 %a, -1 @@ -216,19 +216,10 @@ } define i8 @sel_shift_bool_i8(i1 %t) { -; RV32-LABEL: sel_shift_bool_i8: -; RV32: # %bb.0: -; RV32-NEXT: slli a0, a0, 31 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: andi a0, a0, -128 -; RV32-NEXT: ret -; -; RV64-LABEL: sel_shift_bool_i8: -; RV64: # %bb.0: -; RV64-NEXT: slli a0, a0, 63 -; RV64-NEXT: srai a0, a0, 63 -; RV64-NEXT: andi a0, a0, -128 -; RV64-NEXT: ret +; CHECK-LABEL: sel_shift_bool_i8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 7 +; CHECK-NEXT: ret %shl = select i1 %t, i8 128, i8 0 ret i8 %shl } diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll --- a/llvm/test/CodeGen/RISCV/sextw-removal.ll +++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll @@ -1086,14 +1086,14 @@ ; NOREMOVAL-NEXT: li a1, 32 ; NOREMOVAL-NEXT: j .LBB18_4 ; NOREMOVAL-NEXT: .LBB18_3: -; NOREMOVAL-NEXT: slli a0, a0, 16 +; NOREMOVAL-NEXT: slliw a0, a0, 16 ; NOREMOVAL-NEXT: li a1, 16 ; NOREMOVAL-NEXT: .LBB18_4: # %if.end ; NOREMOVAL-NEXT: srliw a3, a0, 24 ; NOREMOVAL-NEXT: snez a2, a3 ; NOREMOVAL-NEXT: bnez a3, .LBB18_6 ; NOREMOVAL-NEXT: # %bb.5: -; NOREMOVAL-NEXT: slli a0, a0, 8 +; NOREMOVAL-NEXT: slliw a0, a0, 8 ; NOREMOVAL-NEXT: .LBB18_6: # %if.end ; NOREMOVAL-NEXT: addiw a2, a2, -1 ; NOREMOVAL-NEXT: andi a2, a2, -8 @@ -1102,7 +1102,7 @@ ; NOREMOVAL-NEXT: snez a2, a3 ; NOREMOVAL-NEXT: bnez a3, .LBB18_8 ; NOREMOVAL-NEXT: # %bb.7: -; NOREMOVAL-NEXT: slli a0, a0, 4 +; NOREMOVAL-NEXT: slliw a0, a0, 4 ; NOREMOVAL-NEXT: .LBB18_8: # %if.end ; NOREMOVAL-NEXT: addiw a2, a2, -1 ; NOREMOVAL-NEXT: andi a2, a2, -4 @@ -1111,9 +1111,8 @@ ; NOREMOVAL-NEXT: snez a2, a3 ; NOREMOVAL-NEXT: bnez a3, .LBB18_10 ; NOREMOVAL-NEXT: # %bb.9: -; NOREMOVAL-NEXT: slli a0, a0, 2 
+; NOREMOVAL-NEXT: slliw a0, a0, 2 ; NOREMOVAL-NEXT: .LBB18_10: # %if.end -; NOREMOVAL-NEXT: sext.w a0, a0 ; NOREMOVAL-NEXT: addiw a2, a2, -1 ; NOREMOVAL-NEXT: andi a2, a2, -2 ; NOREMOVAL-NEXT: not a0, a0 diff --git a/llvm/test/CodeGen/RISCV/signbit-test.ll b/llvm/test/CodeGen/RISCV/signbit-test.ll --- a/llvm/test/CodeGen/RISCV/signbit-test.ll +++ b/llvm/test/CodeGen/RISCV/signbit-test.ll @@ -5,7 +5,8 @@ define i64 @test_clear_mask_i64_i32(i64 %x) nounwind { ; RV32-LABEL: test_clear_mask_i64_i32: ; RV32: # %bb.0: # %entry -; RV32-NEXT: bltz a0, .LBB0_2 +; RV32-NEXT: srli a2, a0, 31 +; RV32-NEXT: bnez a2, .LBB0_2 ; RV32-NEXT: # %bb.1: # %t ; RV32-NEXT: li a1, 0 ; RV32-NEXT: li a0, 42 diff --git a/llvm/test/CodeGen/RISCV/signed-truncation-check.ll b/llvm/test/CodeGen/RISCV/signed-truncation-check.ll --- a/llvm/test/CodeGen/RISCV/signed-truncation-check.ll +++ b/llvm/test/CodeGen/RISCV/signed-truncation-check.ll @@ -402,7 +402,8 @@ ; RV32I-NEXT: lui a1, 1048560 ; RV32I-NEXT: addi a1, a1, -1 ; RV32I-NEXT: sltu a1, a1, a2 -; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: addi a0, a0, -1 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: ret ; @@ -442,7 +443,8 @@ ; RV32I-NEXT: addi a2, a0, -128 ; RV32I-NEXT: sltu a0, a2, a0 ; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: addi a0, a0, -1 ; RV32I-NEXT: sltiu a1, a2, -256 ; RV32I-NEXT: xori a1, a1, 1 ; RV32I-NEXT: and a0, a0, a1 @@ -651,7 +653,8 @@ ; RV32I-NEXT: addi a2, a0, 128 ; RV32I-NEXT: sltu a0, a2, a0 ; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: addi a0, a0, -1 ; RV32I-NEXT: sltiu a1, a2, 256 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -389,23 +389,23 @@ ; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: mv s0, a0 -; RV64-NEXT: lbu a0, 12(a0) -; RV64-NEXT: lwu a1, 8(s0) -; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: ld a2, 0(s0) -; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ld a1, 0(a0) +; RV64-NEXT: lwu a0, 8(a0) +; RV64-NEXT: srli a2, a1, 2 +; RV64-NEXT: lbu a3, 12(s0) +; RV64-NEXT: slli a4, a0, 62 +; RV64-NEXT: or a2, a4, a2 +; RV64-NEXT: srai s1, a2, 31 +; RV64-NEXT: slli a3, a3, 32 +; RV64-NEXT: or a0, a0, a3 ; RV64-NEXT: slli a0, a0, 29 -; RV64-NEXT: srai s1, a0, 31 -; RV64-NEXT: srli a0, a2, 2 -; RV64-NEXT: slli a1, a1, 62 -; RV64-NEXT: or a0, a1, a0 ; RV64-NEXT: srai a0, a0, 31 -; RV64-NEXT: slli a2, a2, 31 -; RV64-NEXT: srai s2, a2, 31 -; RV64-NEXT: li a1, 7 +; RV64-NEXT: slli a1, a1, 31 +; RV64-NEXT: srai s2, a1, 31 +; RV64-NEXT: li a1, -5 ; RV64-NEXT: call __moddi3@plt ; RV64-NEXT: mv s3, a0 -; RV64-NEXT: li a1, -5 +; RV64-NEXT: li a1, 7 ; RV64-NEXT: mv a0, s1 ; RV64-NEXT: call __moddi3@plt ; RV64-NEXT: mv s1, a0 @@ -422,26 +422,26 @@ ; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: sltu a0, a1, a0 -; RV64-NEXT: addi s1, s1, -2 +; RV64-NEXT: addi s1, s1, -1 ; RV64-NEXT: seqz a1, s1 -; RV64-NEXT: addi s3, s3, -1 +; RV64-NEXT: addi s3, s3, -2 ; RV64-NEXT: seqz a2, s3 ; RV64-NEXT: neg a0, a0 -; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: slli a2, a2, 2 ; RV64-NEXT: addi a1, a1, -1 -; RV64-NEXT: slli a3, a1, 2 -; RV64-NEXT: slli a4, a2, 31 -; RV64-NEXT: srli a4, a4, 62 -; RV64-NEXT: or a3, a4, a3 -; RV64-NEXT: sw a3, 
8(s0) -; RV64-NEXT: slli a1, a1, 29 -; RV64-NEXT: srli a1, a1, 61 -; RV64-NEXT: sb a1, 12(s0) +; RV64-NEXT: slli a3, a1, 31 +; RV64-NEXT: srli a3, a3, 62 +; RV64-NEXT: or a2, a2, a3 +; RV64-NEXT: addi a2, a2, -4 +; RV64-NEXT: sw a2, 8(s0) ; RV64-NEXT: slli a0, a0, 31 ; RV64-NEXT: srli a0, a0, 31 -; RV64-NEXT: slli a2, a2, 33 -; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: or a0, a0, a1 ; RV64-NEXT: sd a0, 0(s0) +; RV64-NEXT: slli a2, a2, 29 +; RV64-NEXT: srli a2, a2, 61 +; RV64-NEXT: sb a2, 12(s0) ; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -562,38 +562,35 @@ ; RV64M-NEXT: srli a5, a4, 63 ; RV64M-NEXT: srai a4, a4, 1 ; RV64M-NEXT: add a4, a4, a5 -; RV64M-NEXT: slli a5, a4, 3 +; RV64M-NEXT: lui a5, %hi(.LCPI3_2) +; RV64M-NEXT: ld a5, %lo(.LCPI3_2)(a5) ; RV64M-NEXT: add a3, a3, a4 -; RV64M-NEXT: sub a3, a3, a5 +; RV64M-NEXT: slli a4, a4, 3 +; RV64M-NEXT: sub a3, a3, a4 +; RV64M-NEXT: mulh a4, a1, a5 +; RV64M-NEXT: srli a5, a4, 63 +; RV64M-NEXT: add a4, a4, a5 +; RV64M-NEXT: li a5, 6 +; RV64M-NEXT: mul a4, a4, a5 +; RV64M-NEXT: sub a1, a1, a4 ; RV64M-NEXT: addi a3, a3, -1 ; RV64M-NEXT: seqz a3, a3 -; RV64M-NEXT: lui a4, 699051 -; RV64M-NEXT: addiw a4, a4, -1365 -; RV64M-NEXT: slli a5, a4, 32 -; RV64M-NEXT: add a4, a4, a5 -; RV64M-NEXT: lui a5, %hi(.LCPI3_2) -; RV64M-NEXT: ld a5, %lo(.LCPI3_2)(a5) ; RV64M-NEXT: addi a2, a2, -2 ; RV64M-NEXT: seqz a2, a2 -; RV64M-NEXT: mul a1, a1, a4 -; RV64M-NEXT: add a1, a1, a5 -; RV64M-NEXT: slli a4, a1, 63 -; RV64M-NEXT: srli a1, a1, 1 -; RV64M-NEXT: or a1, a1, a4 -; RV64M-NEXT: sltu a1, a5, a1 -; RV64M-NEXT: addi a2, a2, -1 +; RV64M-NEXT: seqz a1, a1 +; RV64M-NEXT: slli a2, a2, 2 ; RV64M-NEXT: addi a3, a3, -1 -; RV64M-NEXT: neg a1, a1 +; RV64M-NEXT: addi a1, a1, -1 ; RV64M-NEXT: slli a4, a3, 33 ; RV64M-NEXT: slli a1, a1, 31 ; RV64M-NEXT: srli a1, a1, 31 ; RV64M-NEXT: or a1, a1, a4 ; RV64M-NEXT: sd a1, 0(a0) -; RV64M-NEXT: slli a1, a2, 2 ; RV64M-NEXT: slli a3, a3, 31 ; RV64M-NEXT: srli a3, a3, 62 -; RV64M-NEXT: or a1, a3, a1 -; RV64M-NEXT: sw a1, 8(a0) +; RV64M-NEXT: or a2, a2, a3 +; RV64M-NEXT: addi a2, a2, -4 +; RV64M-NEXT: sw a2, 8(a0) ; RV64M-NEXT: slli a2, a2, 29 ; RV64M-NEXT: srli a2, a2, 61 ; RV64M-NEXT: sb a2, 12(a0) diff --git a/llvm/test/CodeGen/RISCV/stack-store-check.ll b/llvm/test/CodeGen/RISCV/stack-store-check.ll --- a/llvm/test/CodeGen/RISCV/stack-store-check.ll +++ b/llvm/test/CodeGen/RISCV/stack-store-check.ll @@ -277,15 +277,15 @@ ; CHECK-NEXT: addi a2, sp, 56 ; CHECK-NEXT: sw a3, 72(sp) ; CHECK-NEXT: call __addtf3@plt -; CHECK-NEXT: lw a0, 96(sp) -; CHECK-NEXT: lw a1, 100(sp) -; CHECK-NEXT: lw a2, 88(sp) -; CHECK-NEXT: lw a3, 92(sp) +; CHECK-NEXT: lw a0, 100(sp) +; CHECK-NEXT: lw a1, 96(sp) +; CHECK-NEXT: lw a2, 92(sp) +; CHECK-NEXT: lw a3, 88(sp) ; CHECK-NEXT: lui a4, %hi(Y1) -; CHECK-NEXT: sw a0, %lo(Y1+8)(a4) -; CHECK-NEXT: sw a1, %lo(Y1+12)(a4) -; CHECK-NEXT: sw a2, %lo(Y1)(a4) -; CHECK-NEXT: sw a3, %lo(Y1+4)(a4) +; CHECK-NEXT: sw a0, %lo(Y1+12)(a4) +; CHECK-NEXT: sw a1, %lo(Y1+8)(a4) +; CHECK-NEXT: sw a2, %lo(Y1+4)(a4) +; CHECK-NEXT: sw a3, %lo(Y1)(a4) ; CHECK-NEXT: lw ra, 684(sp) # 4-byte Folded Reload ; CHECK-NEXT: lw s0, 680(sp) # 4-byte Folded Reload ; CHECK-NEXT: lw s1, 676(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ 
b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -211,42 +211,54 @@ ; ; RV32M-LABEL: test_urem_odd_setne: ; RV32M: # %bb.0: -; RV32M-NEXT: slli a1, a0, 1 -; RV32M-NEXT: neg a0, a0 +; RV32M-NEXT: andi a1, a0, 15 +; RV32M-NEXT: li a2, 13 +; RV32M-NEXT: mul a1, a1, a2 +; RV32M-NEXT: srli a1, a1, 6 +; RV32M-NEXT: slli a2, a1, 2 +; RV32M-NEXT: or a1, a2, a1 ; RV32M-NEXT: sub a0, a0, a1 ; RV32M-NEXT: andi a0, a0, 15 -; RV32M-NEXT: sltiu a0, a0, 4 -; RV32M-NEXT: xori a0, a0, 1 +; RV32M-NEXT: snez a0, a0 ; RV32M-NEXT: ret ; ; RV64M-LABEL: test_urem_odd_setne: ; RV64M: # %bb.0: -; RV64M-NEXT: slli a1, a0, 1 -; RV64M-NEXT: negw a0, a0 +; RV64M-NEXT: andi a1, a0, 15 +; RV64M-NEXT: li a2, 13 +; RV64M-NEXT: mul a1, a1, a2 +; RV64M-NEXT: srli a1, a1, 6 +; RV64M-NEXT: slli a2, a1, 2 +; RV64M-NEXT: or a1, a2, a1 ; RV64M-NEXT: subw a0, a0, a1 ; RV64M-NEXT: andi a0, a0, 15 -; RV64M-NEXT: sltiu a0, a0, 4 -; RV64M-NEXT: xori a0, a0, 1 +; RV64M-NEXT: snez a0, a0 ; RV64M-NEXT: ret ; ; RV32MV-LABEL: test_urem_odd_setne: ; RV32MV: # %bb.0: -; RV32MV-NEXT: slli a1, a0, 1 -; RV32MV-NEXT: neg a0, a0 +; RV32MV-NEXT: andi a1, a0, 15 +; RV32MV-NEXT: li a2, 13 +; RV32MV-NEXT: mul a1, a1, a2 +; RV32MV-NEXT: srli a1, a1, 6 +; RV32MV-NEXT: slli a2, a1, 2 +; RV32MV-NEXT: or a1, a2, a1 ; RV32MV-NEXT: sub a0, a0, a1 ; RV32MV-NEXT: andi a0, a0, 15 -; RV32MV-NEXT: sltiu a0, a0, 4 -; RV32MV-NEXT: xori a0, a0, 1 +; RV32MV-NEXT: snez a0, a0 ; RV32MV-NEXT: ret ; ; RV64MV-LABEL: test_urem_odd_setne: ; RV64MV: # %bb.0: -; RV64MV-NEXT: slli a1, a0, 1 -; RV64MV-NEXT: negw a0, a0 +; RV64MV-NEXT: andi a1, a0, 15 +; RV64MV-NEXT: li a2, 13 +; RV64MV-NEXT: mul a1, a1, a2 +; RV64MV-NEXT: srli a1, a1, 6 +; RV64MV-NEXT: slli a2, a1, 2 +; RV64MV-NEXT: or a1, a2, a1 ; RV64MV-NEXT: subw a0, a0, a1 ; RV64MV-NEXT: andi a0, a0, 15 -; RV64MV-NEXT: sltiu a0, a0, 4 -; RV64MV-NEXT: xori a0, a0, 1 +; RV64MV-NEXT: snez a0, a0 ; RV64MV-NEXT: ret %urem = urem i4 %X, 5 %cmp = icmp ne i4 %urem, 0 @@ -391,8 +403,8 @@ ; RV64-NEXT: lwu a1, 0(s0) ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: or a0, a1, a0 -; RV64-NEXT: srli s1, a0, 22 -; RV64-NEXT: srli s2, a0, 11 +; RV64-NEXT: srli s1, a0, 11 +; RV64-NEXT: srli s2, a0, 22 ; RV64-NEXT: andi a0, a0, 2047 ; RV64-NEXT: li a1, 683 ; RV64-NEXT: call __muldi3@plt @@ -402,31 +414,31 @@ ; RV64-NEXT: or a0, a0, a1 ; RV64-NEXT: andi a0, a0, 2047 ; RV64-NEXT: sltiu s3, a0, 342 -; RV64-NEXT: li a1, 1463 +; RV64-NEXT: li a1, 819 ; RV64-NEXT: mv a0, s2 ; RV64-NEXT: call __muldi3@plt -; RV64-NEXT: addiw a0, a0, -1463 +; RV64-NEXT: addiw a0, a0, -1638 ; RV64-NEXT: andi a0, a0, 2047 -; RV64-NEXT: sltiu s2, a0, 293 -; RV64-NEXT: li a1, 819 +; RV64-NEXT: sltiu s2, a0, 2 +; RV64-NEXT: li a1, 1463 ; RV64-NEXT: mv a0, s1 ; RV64-NEXT: call __muldi3@plt -; RV64-NEXT: addiw a0, a0, -1638 +; RV64-NEXT: addiw a0, a0, -1463 ; RV64-NEXT: andi a0, a0, 2047 -; RV64-NEXT: sltiu a0, a0, 2 +; RV64-NEXT: sltiu a0, a0, 293 ; RV64-NEXT: addiw s3, s3, -1 -; RV64-NEXT: addi a0, a0, -1 -; RV64-NEXT: addiw s2, s2, -1 +; RV64-NEXT: addiw a0, a0, -1 +; RV64-NEXT: addi s2, s2, -1 +; RV64-NEXT: slli a1, s2, 31 +; RV64-NEXT: srli a1, a1, 63 +; RV64-NEXT: sb a1, 4(s0) ; RV64-NEXT: andi a1, s3, 2047 -; RV64-NEXT: andi a2, s2, 2047 -; RV64-NEXT: slli a2, a2, 11 -; RV64-NEXT: slli a0, a0, 22 -; RV64-NEXT: or a0, a2, a0 +; RV64-NEXT: andi a0, a0, 2047 +; RV64-NEXT: slli a0, a0, 11 +; RV64-NEXT: slli s2, s2, 22 +; RV64-NEXT: or a0, a0, s2 ; RV64-NEXT: or a0, a1, a0 ; RV64-NEXT: sw a0, 0(s0) -; RV64-NEXT: slli a0, a0, 31 -; RV64-NEXT: srli a0, a0, 63 -; 
RV64-NEXT: sb a0, 4(s0) ; RV64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -483,8 +495,8 @@ ; RV64M-NEXT: lwu a2, 0(a0) ; RV64M-NEXT: slli a1, a1, 32 ; RV64M-NEXT: or a1, a2, a1 -; RV64M-NEXT: srli a2, a1, 22 -; RV64M-NEXT: srli a3, a1, 11 +; RV64M-NEXT: srli a2, a1, 11 +; RV64M-NEXT: srli a3, a1, 22 ; RV64M-NEXT: andi a1, a1, 2047 ; RV64M-NEXT: li a4, 683 ; RV64M-NEXT: mul a1, a1, a4 @@ -494,29 +506,29 @@ ; RV64M-NEXT: or a1, a1, a4 ; RV64M-NEXT: andi a1, a1, 2047 ; RV64M-NEXT: sltiu a1, a1, 342 -; RV64M-NEXT: li a4, 1463 +; RV64M-NEXT: li a4, 819 ; RV64M-NEXT: mul a3, a3, a4 -; RV64M-NEXT: addiw a3, a3, -1463 +; RV64M-NEXT: addiw a3, a3, -1638 ; RV64M-NEXT: andi a3, a3, 2047 -; RV64M-NEXT: sltiu a3, a3, 293 -; RV64M-NEXT: li a4, 819 +; RV64M-NEXT: sltiu a3, a3, 2 +; RV64M-NEXT: li a4, 1463 ; RV64M-NEXT: mul a2, a2, a4 -; RV64M-NEXT: addiw a2, a2, -1638 +; RV64M-NEXT: addiw a2, a2, -1463 ; RV64M-NEXT: andi a2, a2, 2047 -; RV64M-NEXT: sltiu a2, a2, 2 +; RV64M-NEXT: sltiu a2, a2, 293 ; RV64M-NEXT: addiw a1, a1, -1 -; RV64M-NEXT: addi a2, a2, -1 -; RV64M-NEXT: addiw a3, a3, -1 +; RV64M-NEXT: addiw a2, a2, -1 +; RV64M-NEXT: addi a3, a3, -1 +; RV64M-NEXT: slli a4, a3, 31 +; RV64M-NEXT: srli a4, a4, 63 +; RV64M-NEXT: sb a4, 4(a0) ; RV64M-NEXT: andi a1, a1, 2047 -; RV64M-NEXT: andi a3, a3, 2047 -; RV64M-NEXT: slli a3, a3, 11 -; RV64M-NEXT: slli a2, a2, 22 -; RV64M-NEXT: or a2, a3, a2 +; RV64M-NEXT: andi a2, a2, 2047 +; RV64M-NEXT: slli a2, a2, 11 +; RV64M-NEXT: slli a3, a3, 22 +; RV64M-NEXT: or a2, a2, a3 ; RV64M-NEXT: or a1, a1, a2 ; RV64M-NEXT: sw a1, 0(a0) -; RV64M-NEXT: slli a1, a1, 31 -; RV64M-NEXT: srli a1, a1, 63 -; RV64M-NEXT: sb a1, 4(a0) ; RV64M-NEXT: ret ; ; RV32MV-LABEL: test_urem_vec: @@ -625,21 +637,21 @@ ; RV64MV-NEXT: vand.vx v8, v8, a1 ; RV64MV-NEXT: vmsltu.vv v0, v12, v8 ; RV64MV-NEXT: vmerge.vim v8, v10, -1, v0 -; RV64MV-NEXT: vmv.x.s a1, v8 -; RV64MV-NEXT: andi a1, a1, 2047 -; RV64MV-NEXT: vslidedown.vi v9, v8, 1 -; RV64MV-NEXT: vmv.x.s a2, v9 +; RV64MV-NEXT: vslidedown.vi v9, v8, 2 +; RV64MV-NEXT: vmv.x.s a1, v9 +; RV64MV-NEXT: slli a2, a1, 53 +; RV64MV-NEXT: srli a2, a2, 63 +; RV64MV-NEXT: sb a2, 4(a0) +; RV64MV-NEXT: vmv.x.s a2, v8 ; RV64MV-NEXT: andi a2, a2, 2047 -; RV64MV-NEXT: slli a2, a2, 11 -; RV64MV-NEXT: vslidedown.vi v8, v8, 2 +; RV64MV-NEXT: vslidedown.vi v8, v8, 1 ; RV64MV-NEXT: vmv.x.s a3, v8 -; RV64MV-NEXT: slli a3, a3, 22 +; RV64MV-NEXT: andi a3, a3, 2047 +; RV64MV-NEXT: slli a3, a3, 11 +; RV64MV-NEXT: slli a1, a1, 22 +; RV64MV-NEXT: or a1, a2, a1 ; RV64MV-NEXT: or a1, a1, a3 -; RV64MV-NEXT: or a1, a1, a2 ; RV64MV-NEXT: sw a1, 0(a0) -; RV64MV-NEXT: slli a1, a1, 31 -; RV64MV-NEXT: srli a1, a1, 63 -; RV64MV-NEXT: sb a1, 4(a0) ; RV64MV-NEXT: ret %ld = load <3 x i11>, ptr %X %urem = urem <3 x i11> %ld, diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -19,10 +19,10 @@ ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; 
RV64I-NEXT: ret @@ -52,10 +52,10 @@ ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: ret @@ -83,10 +83,10 @@ ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: sllw a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ret @@ -116,10 +116,10 @@ ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: ret @@ -147,10 +147,10 @@ ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: sraw a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ret @@ -180,10 +180,10 @@ ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: ret @@ -245,18 +245,18 @@ ; RV64I-NEXT: or a1, a3, a1 ; RV64I-NEXT: srl a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 48 -; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 56 ; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 40 ; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 4(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ret @@ -310,17 +310,17 @@ ; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: and a1, a4, a1 ; RV32I-NEXT: sb a1, 4(a2) -; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a3, a1, 24 ; RV32I-NEXT: sb a3, 7(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a1, a1, 8 ; RV32I-NEXT: sb a1, 5(a2) ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: ret @@ -381,18 +381,18 @@ ; RV64I-NEXT: or a1, a3, a1 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 48 -; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 56 ; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 40 ; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a0, 16 -; 
RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 4(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ret @@ -447,16 +447,16 @@ ; RV32I-NEXT: and a1, a4, a1 ; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: sb a3, 2(a2) ; RV32I-NEXT: srli a3, a1, 24 ; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) ; RV32I-NEXT: srli a1, a1, 8 ; RV32I-NEXT: sb a1, 1(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: sb a1, 6(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: ret @@ -517,18 +517,18 @@ ; RV64I-NEXT: or a1, a3, a1 ; RV64I-NEXT: sra a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 48 -; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 56 ; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 40 ; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 4(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ret @@ -582,17 +582,17 @@ ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: .LBB5_3: ; RV32I-NEXT: sb a1, 4(a2) -; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a3, a1, 24 ; RV32I-NEXT: sb a3, 7(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a1, a1, 8 ; RV32I-NEXT: sb a1, 5(a2) ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: ret @@ -779,38 +779,38 @@ ; RV32I-NEXT: andi a1, a1, 15 ; RV32I-NEXT: addi a0, sp, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lbu a1, 5(a0) -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 7(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 12(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu t5, 14(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) -; RV32I-NEXT: lbu s1, 8(a0) -; RV32I-NEXT: lbu a0, 9(a0) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb a0, 9(a2) -; RV32I-NEXT: sb t5, 14(a2) -; RV32I-NEXT: sb t4, 15(a2) -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a6, 1(a2) -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a6, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu t1, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t4, 10(a0) +; RV32I-NEXT: lbu t5, 11(a0) 
+; RV32I-NEXT: lbu t6, 15(a0) +; RV32I-NEXT: lbu s0, 14(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu a0, 12(a0) +; RV32I-NEXT: sb t6, 15(a2) +; RV32I-NEXT: sb s0, 14(a2) +; RV32I-NEXT: sb s1, 13(a2) +; RV32I-NEXT: sb a0, 12(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb a6, 4(a2) +; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload @@ -998,38 +998,38 @@ ; RV32I-NEXT: andi a1, a1, 15 ; RV32I-NEXT: addi a0, sp, 20 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lbu a1, 5(a0) -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 7(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 12(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu t5, 14(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) -; RV32I-NEXT: lbu s1, 8(a0) -; RV32I-NEXT: lbu a0, 9(a0) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb a0, 9(a2) -; RV32I-NEXT: sb t5, 14(a2) -; RV32I-NEXT: sb t4, 15(a2) -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a6, 1(a2) -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a6, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu t1, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t4, 10(a0) +; RV32I-NEXT: lbu t5, 11(a0) +; RV32I-NEXT: lbu t6, 15(a0) +; RV32I-NEXT: lbu s0, 14(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu a0, 12(a0) +; RV32I-NEXT: sb t6, 15(a2) +; RV32I-NEXT: sb s0, 14(a2) +; RV32I-NEXT: sb s1, 13(a2) +; RV32I-NEXT: sb a0, 12(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb a6, 4(a2) +; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload @@ -1223,38 +1223,38 @@ ; RV32I-NEXT: andi a1, a1, 15 ; RV32I-NEXT: mv a0, sp ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lbu a1, 5(a0) -; RV32I-NEXT: lbu a3, 4(a0) -; RV32I-NEXT: lbu a4, 7(a0) -; RV32I-NEXT: lbu a5, 6(a0) -; RV32I-NEXT: lbu a6, 1(a0) -; RV32I-NEXT: lbu a7, 0(a0) -; RV32I-NEXT: lbu t0, 3(a0) -; RV32I-NEXT: lbu t1, 2(a0) -; RV32I-NEXT: lbu t2, 13(a0) -; RV32I-NEXT: lbu t3, 12(a0) -; RV32I-NEXT: lbu t4, 15(a0) -; RV32I-NEXT: lbu t5, 14(a0) -; RV32I-NEXT: lbu t6, 10(a0) -; RV32I-NEXT: lbu s0, 11(a0) -; RV32I-NEXT: lbu s1, 8(a0) -; RV32I-NEXT: lbu a0, 9(a0) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb a0, 9(a2) -; RV32I-NEXT: sb t5, 
14(a2) -; RV32I-NEXT: sb t4, 15(a2) -; RV32I-NEXT: sb t3, 12(a2) -; RV32I-NEXT: sb t2, 13(a2) -; RV32I-NEXT: sb t1, 2(a2) -; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a6, 1(a2) -; RV32I-NEXT: sb a5, 6(a2) -; RV32I-NEXT: sb a4, 7(a2) -; RV32I-NEXT: sb a3, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: lbu a1, 0(a0) +; RV32I-NEXT: lbu a3, 1(a0) +; RV32I-NEXT: lbu a4, 2(a0) +; RV32I-NEXT: lbu a5, 3(a0) +; RV32I-NEXT: lbu a6, 4(a0) +; RV32I-NEXT: lbu a7, 5(a0) +; RV32I-NEXT: lbu t0, 6(a0) +; RV32I-NEXT: lbu t1, 7(a0) +; RV32I-NEXT: lbu t2, 8(a0) +; RV32I-NEXT: lbu t3, 9(a0) +; RV32I-NEXT: lbu t4, 10(a0) +; RV32I-NEXT: lbu t5, 11(a0) +; RV32I-NEXT: lbu t6, 15(a0) +; RV32I-NEXT: lbu s0, 14(a0) +; RV32I-NEXT: lbu s1, 13(a0) +; RV32I-NEXT: lbu a0, 12(a0) +; RV32I-NEXT: sb t6, 15(a2) +; RV32I-NEXT: sb s0, 14(a2) +; RV32I-NEXT: sb s1, 13(a2) +; RV32I-NEXT: sb a0, 12(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb a6, 4(a2) +; RV32I-NEXT: sb a5, 3(a2) +; RV32I-NEXT: sb a4, 2(a2) +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: lw s0, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 40(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 36(sp) # 4-byte Folded Reload @@ -1398,80 +1398,80 @@ ; RV64I-NEXT: andi a1, a1, 31 ; RV64I-NEXT: addi a0, sp, 56 ; RV64I-NEXT: add a5, a0, a1 -; RV64I-NEXT: lbu a0, 8(a5) +; RV64I-NEXT: lbu a0, 0(a5) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a5) +; RV64I-NEXT: lbu a0, 1(a5) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a5) +; RV64I-NEXT: lbu a0, 2(a5) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a5) +; RV64I-NEXT: lbu a0, 3(a5) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a5) +; RV64I-NEXT: lbu a0, 4(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a5) -; RV64I-NEXT: lbu t0, 14(a5) -; RV64I-NEXT: lbu t1, 15(a5) -; RV64I-NEXT: lbu t2, 0(a5) -; RV64I-NEXT: lbu t3, 1(a5) -; RV64I-NEXT: lbu t4, 2(a5) -; RV64I-NEXT: lbu t5, 3(a5) -; RV64I-NEXT: lbu t6, 4(a5) -; RV64I-NEXT: lbu s0, 5(a5) -; RV64I-NEXT: lbu s1, 6(a5) -; RV64I-NEXT: lbu s2, 7(a5) -; RV64I-NEXT: lbu s3, 24(a5) -; RV64I-NEXT: lbu s4, 25(a5) -; RV64I-NEXT: lbu s5, 26(a5) -; RV64I-NEXT: lbu s6, 27(a5) -; RV64I-NEXT: lbu s7, 28(a5) -; RV64I-NEXT: lbu s8, 29(a5) -; RV64I-NEXT: lbu s9, 30(a5) -; RV64I-NEXT: lbu s10, 31(a5) -; RV64I-NEXT: lbu s11, 16(a5) -; RV64I-NEXT: lbu ra, 17(a5) -; RV64I-NEXT: lbu a6, 18(a5) -; RV64I-NEXT: lbu a4, 19(a5) -; RV64I-NEXT: lbu a0, 23(a5) -; RV64I-NEXT: lbu a1, 22(a5) -; RV64I-NEXT: lbu a3, 21(a5) -; RV64I-NEXT: lbu a5, 20(a5) -; RV64I-NEXT: sb a0, 23(a2) -; RV64I-NEXT: sb a1, 22(a2) -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a5, 20(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a6, 18(a2) -; RV64I-NEXT: sb ra, 17(a2) -; RV64I-NEXT: sb s11, 16(a2) -; RV64I-NEXT: sb s10, 31(a2) -; RV64I-NEXT: sb s9, 30(a2) -; RV64I-NEXT: sb s8, 29(a2) -; RV64I-NEXT: sb s7, 28(a2) -; RV64I-NEXT: sb s6, 27(a2) -; RV64I-NEXT: sb s5, 26(a2) -; RV64I-NEXT: sb s4, 25(a2) -; RV64I-NEXT: sb s3, 24(a2) -; RV64I-NEXT: sb s2, 7(a2) -; RV64I-NEXT: sb s1, 6(a2) -; RV64I-NEXT: sb s0, 5(a2) -; RV64I-NEXT: sb t6, 4(a2) -; RV64I-NEXT: sb t5, 3(a2) -; RV64I-NEXT: sb t4, 2(a2) -; RV64I-NEXT: sb t3, 1(a2) -; RV64I-NEXT: sb t2, 0(a2) -; 
RV64I-NEXT: sb t1, 15(a2) -; RV64I-NEXT: sb t0, 14(a2) -; RV64I-NEXT: sb a7, 13(a2) +; RV64I-NEXT: lbu a7, 5(a5) +; RV64I-NEXT: lbu t0, 6(a5) +; RV64I-NEXT: lbu t1, 7(a5) +; RV64I-NEXT: lbu t2, 8(a5) +; RV64I-NEXT: lbu t3, 9(a5) +; RV64I-NEXT: lbu t4, 10(a5) +; RV64I-NEXT: lbu t5, 11(a5) +; RV64I-NEXT: lbu t6, 12(a5) +; RV64I-NEXT: lbu s0, 13(a5) +; RV64I-NEXT: lbu s1, 14(a5) +; RV64I-NEXT: lbu s2, 15(a5) +; RV64I-NEXT: lbu s3, 16(a5) +; RV64I-NEXT: lbu s4, 17(a5) +; RV64I-NEXT: lbu s5, 18(a5) +; RV64I-NEXT: lbu s6, 19(a5) +; RV64I-NEXT: lbu s7, 20(a5) +; RV64I-NEXT: lbu s8, 21(a5) +; RV64I-NEXT: lbu s9, 22(a5) +; RV64I-NEXT: lbu s10, 23(a5) +; RV64I-NEXT: lbu s11, 24(a5) +; RV64I-NEXT: lbu ra, 25(a5) +; RV64I-NEXT: lbu a6, 26(a5) +; RV64I-NEXT: lbu a4, 27(a5) +; RV64I-NEXT: lbu a0, 31(a5) +; RV64I-NEXT: lbu a1, 30(a5) +; RV64I-NEXT: lbu a3, 29(a5) +; RV64I-NEXT: lbu a5, 28(a5) +; RV64I-NEXT: sb a0, 31(a2) +; RV64I-NEXT: sb a1, 30(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb a5, 28(a2) +; RV64I-NEXT: sb a4, 27(a2) +; RV64I-NEXT: sb a6, 26(a2) +; RV64I-NEXT: sb ra, 25(a2) +; RV64I-NEXT: sb s11, 24(a2) +; RV64I-NEXT: sb s10, 23(a2) +; RV64I-NEXT: sb s9, 22(a2) +; RV64I-NEXT: sb s8, 21(a2) +; RV64I-NEXT: sb s7, 20(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: sb s5, 18(a2) +; RV64I-NEXT: sb s4, 17(a2) +; RV64I-NEXT: sb s3, 16(a2) +; RV64I-NEXT: sb s2, 15(a2) +; RV64I-NEXT: sb s1, 14(a2) +; RV64I-NEXT: sb s0, 13(a2) +; RV64I-NEXT: sb t6, 12(a2) +; RV64I-NEXT: sb t5, 11(a2) +; RV64I-NEXT: sb t4, 10(a2) +; RV64I-NEXT: sb t3, 9(a2) +; RV64I-NEXT: sb t2, 8(a2) +; RV64I-NEXT: sb t1, 7(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb a7, 5(a2) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 12(a2) +; RV64I-NEXT: sb a0, 4(a2) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 11(a2) +; RV64I-NEXT: sb a0, 3(a2) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 10(a2) +; RV64I-NEXT: sb a0, 2(a2) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload @@ -1616,80 +1616,80 @@ ; RV32I-NEXT: andi a1, a1, 31 ; RV32I-NEXT: addi a0, sp, 28 ; RV32I-NEXT: add a5, a0, a1 -; RV32I-NEXT: lbu a0, 6(a5) +; RV32I-NEXT: lbu a0, 0(a5) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a5) +; RV32I-NEXT: lbu a0, 1(a5) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a5) +; RV32I-NEXT: lbu a0, 2(a5) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a0, 3(a5) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a5) +; RV32I-NEXT: lbu a0, 4(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a5) -; RV32I-NEXT: lbu t0, 2(a5) -; RV32I-NEXT: lbu t1, 3(a5) -; RV32I-NEXT: lbu t2, 14(a5) -; RV32I-NEXT: lbu t3, 15(a5) -; RV32I-NEXT: lbu t4, 12(a5) -; RV32I-NEXT: lbu t5, 13(a5) -; RV32I-NEXT: lbu t6, 10(a5) -; RV32I-NEXT: lbu s0, 11(a5) -; RV32I-NEXT: lbu s1, 8(a5) -; RV32I-NEXT: lbu s2, 9(a5) -; RV32I-NEXT: lbu s3, 22(a5) -; RV32I-NEXT: lbu s4, 23(a5) -; RV32I-NEXT: lbu s5, 20(a5) -; RV32I-NEXT: lbu s6, 21(a5) -; RV32I-NEXT: lbu s7, 18(a5) -; RV32I-NEXT: lbu s8, 19(a5) -; RV32I-NEXT: lbu s9, 16(a5) -; 
RV32I-NEXT: lbu s10, 17(a5) -; RV32I-NEXT: lbu s11, 30(a5) -; RV32I-NEXT: lbu ra, 31(a5) -; RV32I-NEXT: lbu a6, 28(a5) -; RV32I-NEXT: lbu a4, 29(a5) -; RV32I-NEXT: lbu a0, 25(a5) -; RV32I-NEXT: lbu a1, 24(a5) -; RV32I-NEXT: lbu a3, 27(a5) -; RV32I-NEXT: lbu a5, 26(a5) -; RV32I-NEXT: sb a0, 25(a2) -; RV32I-NEXT: sb a1, 24(a2) -; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a5, 26(a2) -; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a6, 28(a2) -; RV32I-NEXT: sb ra, 31(a2) -; RV32I-NEXT: sb s11, 30(a2) -; RV32I-NEXT: sb s10, 17(a2) -; RV32I-NEXT: sb s9, 16(a2) -; RV32I-NEXT: sb s8, 19(a2) -; RV32I-NEXT: sb s7, 18(a2) -; RV32I-NEXT: sb s6, 21(a2) -; RV32I-NEXT: sb s5, 20(a2) -; RV32I-NEXT: sb s4, 23(a2) -; RV32I-NEXT: sb s3, 22(a2) -; RV32I-NEXT: sb s2, 9(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb t4, 12(a2) -; RV32I-NEXT: sb t3, 15(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a7, 1(a2) +; RV32I-NEXT: lbu a7, 5(a5) +; RV32I-NEXT: lbu t0, 6(a5) +; RV32I-NEXT: lbu t1, 7(a5) +; RV32I-NEXT: lbu t2, 8(a5) +; RV32I-NEXT: lbu t3, 9(a5) +; RV32I-NEXT: lbu t4, 10(a5) +; RV32I-NEXT: lbu t5, 11(a5) +; RV32I-NEXT: lbu t6, 12(a5) +; RV32I-NEXT: lbu s0, 13(a5) +; RV32I-NEXT: lbu s1, 14(a5) +; RV32I-NEXT: lbu s2, 15(a5) +; RV32I-NEXT: lbu s3, 16(a5) +; RV32I-NEXT: lbu s4, 17(a5) +; RV32I-NEXT: lbu s5, 18(a5) +; RV32I-NEXT: lbu s6, 19(a5) +; RV32I-NEXT: lbu s7, 20(a5) +; RV32I-NEXT: lbu s8, 21(a5) +; RV32I-NEXT: lbu s9, 22(a5) +; RV32I-NEXT: lbu s10, 23(a5) +; RV32I-NEXT: lbu s11, 24(a5) +; RV32I-NEXT: lbu ra, 25(a5) +; RV32I-NEXT: lbu a6, 26(a5) +; RV32I-NEXT: lbu a4, 27(a5) +; RV32I-NEXT: lbu a0, 31(a5) +; RV32I-NEXT: lbu a1, 30(a5) +; RV32I-NEXT: lbu a3, 29(a5) +; RV32I-NEXT: lbu a5, 28(a5) +; RV32I-NEXT: sb a0, 31(a2) +; RV32I-NEXT: sb a1, 30(a2) +; RV32I-NEXT: sb a3, 29(a2) +; RV32I-NEXT: sb a5, 28(a2) +; RV32I-NEXT: sb a4, 27(a2) +; RV32I-NEXT: sb a6, 26(a2) +; RV32I-NEXT: sb ra, 25(a2) +; RV32I-NEXT: sb s11, 24(a2) +; RV32I-NEXT: sb s10, 23(a2) +; RV32I-NEXT: sb s9, 22(a2) +; RV32I-NEXT: sb s8, 21(a2) +; RV32I-NEXT: sb s7, 20(a2) +; RV32I-NEXT: sb s6, 19(a2) +; RV32I-NEXT: sb s5, 18(a2) +; RV32I-NEXT: sb s4, 17(a2) +; RV32I-NEXT: sb s3, 16(a2) +; RV32I-NEXT: sb s2, 15(a2) +; RV32I-NEXT: sb s1, 14(a2) +; RV32I-NEXT: sb s0, 13(a2) +; RV32I-NEXT: sb t6, 12(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb a7, 5(a2) ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: sb a0, 3(a2) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: sb a0, 2(a2) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: sb a0, 0(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload @@ -1841,80 +1841,80 @@ ; RV64I-NEXT: andi a1, a1, 31 ; RV64I-NEXT: addi a0, sp, 88 ; RV64I-NEXT: sub a5, a0, a1 -; RV64I-NEXT: lbu a0, 8(a5) +; RV64I-NEXT: lbu a0, 0(a5) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; 
RV64I-NEXT: lbu a0, 9(a5) +; RV64I-NEXT: lbu a0, 1(a5) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a5) +; RV64I-NEXT: lbu a0, 2(a5) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a5) +; RV64I-NEXT: lbu a0, 3(a5) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a5) +; RV64I-NEXT: lbu a0, 4(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a5) -; RV64I-NEXT: lbu t0, 14(a5) -; RV64I-NEXT: lbu t1, 15(a5) -; RV64I-NEXT: lbu t2, 0(a5) -; RV64I-NEXT: lbu t3, 1(a5) -; RV64I-NEXT: lbu t4, 2(a5) -; RV64I-NEXT: lbu t5, 3(a5) -; RV64I-NEXT: lbu t6, 4(a5) -; RV64I-NEXT: lbu s0, 5(a5) -; RV64I-NEXT: lbu s1, 6(a5) -; RV64I-NEXT: lbu s2, 7(a5) -; RV64I-NEXT: lbu s3, 24(a5) -; RV64I-NEXT: lbu s4, 25(a5) -; RV64I-NEXT: lbu s5, 26(a5) -; RV64I-NEXT: lbu s6, 27(a5) -; RV64I-NEXT: lbu s7, 28(a5) -; RV64I-NEXT: lbu s8, 29(a5) -; RV64I-NEXT: lbu s9, 30(a5) -; RV64I-NEXT: lbu s10, 31(a5) -; RV64I-NEXT: lbu s11, 16(a5) -; RV64I-NEXT: lbu ra, 17(a5) -; RV64I-NEXT: lbu a6, 18(a5) -; RV64I-NEXT: lbu a4, 19(a5) -; RV64I-NEXT: lbu a0, 23(a5) -; RV64I-NEXT: lbu a1, 22(a5) -; RV64I-NEXT: lbu a3, 21(a5) -; RV64I-NEXT: lbu a5, 20(a5) -; RV64I-NEXT: sb a0, 23(a2) -; RV64I-NEXT: sb a1, 22(a2) -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a5, 20(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a6, 18(a2) -; RV64I-NEXT: sb ra, 17(a2) -; RV64I-NEXT: sb s11, 16(a2) -; RV64I-NEXT: sb s10, 31(a2) -; RV64I-NEXT: sb s9, 30(a2) -; RV64I-NEXT: sb s8, 29(a2) -; RV64I-NEXT: sb s7, 28(a2) -; RV64I-NEXT: sb s6, 27(a2) -; RV64I-NEXT: sb s5, 26(a2) -; RV64I-NEXT: sb s4, 25(a2) -; RV64I-NEXT: sb s3, 24(a2) -; RV64I-NEXT: sb s2, 7(a2) -; RV64I-NEXT: sb s1, 6(a2) -; RV64I-NEXT: sb s0, 5(a2) -; RV64I-NEXT: sb t6, 4(a2) -; RV64I-NEXT: sb t5, 3(a2) -; RV64I-NEXT: sb t4, 2(a2) -; RV64I-NEXT: sb t3, 1(a2) -; RV64I-NEXT: sb t2, 0(a2) -; RV64I-NEXT: sb t1, 15(a2) -; RV64I-NEXT: sb t0, 14(a2) -; RV64I-NEXT: sb a7, 13(a2) +; RV64I-NEXT: lbu a7, 5(a5) +; RV64I-NEXT: lbu t0, 6(a5) +; RV64I-NEXT: lbu t1, 7(a5) +; RV64I-NEXT: lbu t2, 8(a5) +; RV64I-NEXT: lbu t3, 9(a5) +; RV64I-NEXT: lbu t4, 10(a5) +; RV64I-NEXT: lbu t5, 11(a5) +; RV64I-NEXT: lbu t6, 12(a5) +; RV64I-NEXT: lbu s0, 13(a5) +; RV64I-NEXT: lbu s1, 14(a5) +; RV64I-NEXT: lbu s2, 15(a5) +; RV64I-NEXT: lbu s3, 16(a5) +; RV64I-NEXT: lbu s4, 17(a5) +; RV64I-NEXT: lbu s5, 18(a5) +; RV64I-NEXT: lbu s6, 19(a5) +; RV64I-NEXT: lbu s7, 20(a5) +; RV64I-NEXT: lbu s8, 21(a5) +; RV64I-NEXT: lbu s9, 22(a5) +; RV64I-NEXT: lbu s10, 23(a5) +; RV64I-NEXT: lbu s11, 24(a5) +; RV64I-NEXT: lbu ra, 25(a5) +; RV64I-NEXT: lbu a6, 26(a5) +; RV64I-NEXT: lbu a4, 27(a5) +; RV64I-NEXT: lbu a0, 31(a5) +; RV64I-NEXT: lbu a1, 30(a5) +; RV64I-NEXT: lbu a3, 29(a5) +; RV64I-NEXT: lbu a5, 28(a5) +; RV64I-NEXT: sb a0, 31(a2) +; RV64I-NEXT: sb a1, 30(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb a5, 28(a2) +; RV64I-NEXT: sb a4, 27(a2) +; RV64I-NEXT: sb a6, 26(a2) +; RV64I-NEXT: sb ra, 25(a2) +; RV64I-NEXT: sb s11, 24(a2) +; RV64I-NEXT: sb s10, 23(a2) +; RV64I-NEXT: sb s9, 22(a2) +; RV64I-NEXT: sb s8, 21(a2) +; RV64I-NEXT: sb s7, 20(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: sb s5, 18(a2) +; RV64I-NEXT: sb s4, 17(a2) +; RV64I-NEXT: sb s3, 16(a2) +; RV64I-NEXT: sb s2, 15(a2) +; RV64I-NEXT: sb s1, 14(a2) +; RV64I-NEXT: sb s0, 13(a2) +; RV64I-NEXT: sb t6, 12(a2) +; RV64I-NEXT: sb t5, 11(a2) +; RV64I-NEXT: sb t4, 10(a2) +; RV64I-NEXT: sb t3, 9(a2) +; RV64I-NEXT: sb t2, 8(a2) +; RV64I-NEXT: sb t1, 7(a2) +; 
RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb a7, 5(a2) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 12(a2) +; RV64I-NEXT: sb a0, 4(a2) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 11(a2) +; RV64I-NEXT: sb a0, 3(a2) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 10(a2) +; RV64I-NEXT: sb a0, 2(a2) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload @@ -2059,80 +2059,80 @@ ; RV32I-NEXT: andi a1, a1, 31 ; RV32I-NEXT: addi a0, sp, 60 ; RV32I-NEXT: sub a5, a0, a1 -; RV32I-NEXT: lbu a0, 6(a5) +; RV32I-NEXT: lbu a0, 0(a5) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a5) +; RV32I-NEXT: lbu a0, 1(a5) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a5) +; RV32I-NEXT: lbu a0, 2(a5) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a0, 3(a5) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a5) +; RV32I-NEXT: lbu a0, 4(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a5) -; RV32I-NEXT: lbu t0, 2(a5) -; RV32I-NEXT: lbu t1, 3(a5) -; RV32I-NEXT: lbu t2, 14(a5) -; RV32I-NEXT: lbu t3, 15(a5) -; RV32I-NEXT: lbu t4, 12(a5) -; RV32I-NEXT: lbu t5, 13(a5) -; RV32I-NEXT: lbu t6, 10(a5) -; RV32I-NEXT: lbu s0, 11(a5) -; RV32I-NEXT: lbu s1, 8(a5) -; RV32I-NEXT: lbu s2, 9(a5) -; RV32I-NEXT: lbu s3, 22(a5) -; RV32I-NEXT: lbu s4, 23(a5) -; RV32I-NEXT: lbu s5, 20(a5) -; RV32I-NEXT: lbu s6, 21(a5) -; RV32I-NEXT: lbu s7, 18(a5) -; RV32I-NEXT: lbu s8, 19(a5) -; RV32I-NEXT: lbu s9, 16(a5) -; RV32I-NEXT: lbu s10, 17(a5) -; RV32I-NEXT: lbu s11, 30(a5) -; RV32I-NEXT: lbu ra, 31(a5) -; RV32I-NEXT: lbu a6, 28(a5) -; RV32I-NEXT: lbu a4, 29(a5) -; RV32I-NEXT: lbu a0, 25(a5) -; RV32I-NEXT: lbu a1, 24(a5) -; RV32I-NEXT: lbu a3, 27(a5) -; RV32I-NEXT: lbu a5, 26(a5) -; RV32I-NEXT: sb a0, 25(a2) -; RV32I-NEXT: sb a1, 24(a2) -; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a5, 26(a2) -; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a6, 28(a2) -; RV32I-NEXT: sb ra, 31(a2) -; RV32I-NEXT: sb s11, 30(a2) -; RV32I-NEXT: sb s10, 17(a2) -; RV32I-NEXT: sb s9, 16(a2) -; RV32I-NEXT: sb s8, 19(a2) -; RV32I-NEXT: sb s7, 18(a2) -; RV32I-NEXT: sb s6, 21(a2) -; RV32I-NEXT: sb s5, 20(a2) -; RV32I-NEXT: sb s4, 23(a2) -; RV32I-NEXT: sb s3, 22(a2) -; RV32I-NEXT: sb s2, 9(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb t4, 12(a2) -; RV32I-NEXT: sb t3, 15(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a7, 1(a2) +; RV32I-NEXT: lbu a7, 5(a5) +; RV32I-NEXT: lbu t0, 6(a5) +; RV32I-NEXT: lbu t1, 7(a5) +; RV32I-NEXT: lbu t2, 8(a5) +; RV32I-NEXT: lbu t3, 9(a5) +; RV32I-NEXT: lbu t4, 10(a5) +; RV32I-NEXT: lbu t5, 11(a5) +; RV32I-NEXT: lbu t6, 12(a5) +; RV32I-NEXT: lbu s0, 13(a5) +; RV32I-NEXT: lbu s1, 14(a5) +; RV32I-NEXT: lbu s2, 15(a5) +; RV32I-NEXT: lbu s3, 16(a5) +; RV32I-NEXT: lbu s4, 17(a5) +; RV32I-NEXT: lbu s5, 18(a5) +; RV32I-NEXT: lbu s6, 19(a5) +; RV32I-NEXT: lbu s7, 20(a5) +; RV32I-NEXT: lbu s8, 21(a5) +; RV32I-NEXT: lbu s9, 22(a5) +; RV32I-NEXT: lbu s10, 23(a5) +; 
RV32I-NEXT: lbu s11, 24(a5) +; RV32I-NEXT: lbu ra, 25(a5) +; RV32I-NEXT: lbu a6, 26(a5) +; RV32I-NEXT: lbu a4, 27(a5) +; RV32I-NEXT: lbu a0, 31(a5) +; RV32I-NEXT: lbu a1, 30(a5) +; RV32I-NEXT: lbu a3, 29(a5) +; RV32I-NEXT: lbu a5, 28(a5) +; RV32I-NEXT: sb a0, 31(a2) +; RV32I-NEXT: sb a1, 30(a2) +; RV32I-NEXT: sb a3, 29(a2) +; RV32I-NEXT: sb a5, 28(a2) +; RV32I-NEXT: sb a4, 27(a2) +; RV32I-NEXT: sb a6, 26(a2) +; RV32I-NEXT: sb ra, 25(a2) +; RV32I-NEXT: sb s11, 24(a2) +; RV32I-NEXT: sb s10, 23(a2) +; RV32I-NEXT: sb s9, 22(a2) +; RV32I-NEXT: sb s8, 21(a2) +; RV32I-NEXT: sb s7, 20(a2) +; RV32I-NEXT: sb s6, 19(a2) +; RV32I-NEXT: sb s5, 18(a2) +; RV32I-NEXT: sb s4, 17(a2) +; RV32I-NEXT: sb s3, 16(a2) +; RV32I-NEXT: sb s2, 15(a2) +; RV32I-NEXT: sb s1, 14(a2) +; RV32I-NEXT: sb s0, 13(a2) +; RV32I-NEXT: sb t6, 12(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb a7, 5(a2) ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: sb a0, 3(a2) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: sb a0, 2(a2) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: sb a0, 0(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload @@ -2294,80 +2294,80 @@ ; RV64I-NEXT: andi a0, t1, 31 ; RV64I-NEXT: addi a1, sp, 56 ; RV64I-NEXT: add a5, a1, a0 -; RV64I-NEXT: lbu a0, 8(a5) +; RV64I-NEXT: lbu a0, 0(a5) ; RV64I-NEXT: sd a0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 9(a5) +; RV64I-NEXT: lbu a0, 1(a5) ; RV64I-NEXT: sd a0, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 10(a5) +; RV64I-NEXT: lbu a0, 2(a5) ; RV64I-NEXT: sd a0, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 11(a5) +; RV64I-NEXT: lbu a0, 3(a5) ; RV64I-NEXT: sd a0, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a0, 12(a5) +; RV64I-NEXT: lbu a0, 4(a5) ; RV64I-NEXT: sd a0, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: lbu a7, 13(a5) -; RV64I-NEXT: lbu t0, 14(a5) -; RV64I-NEXT: lbu t1, 15(a5) -; RV64I-NEXT: lbu t2, 0(a5) -; RV64I-NEXT: lbu t3, 1(a5) -; RV64I-NEXT: lbu t4, 2(a5) -; RV64I-NEXT: lbu t5, 3(a5) -; RV64I-NEXT: lbu t6, 4(a5) -; RV64I-NEXT: lbu s0, 5(a5) -; RV64I-NEXT: lbu s1, 6(a5) -; RV64I-NEXT: lbu s2, 7(a5) -; RV64I-NEXT: lbu s3, 24(a5) -; RV64I-NEXT: lbu s4, 25(a5) -; RV64I-NEXT: lbu s5, 26(a5) -; RV64I-NEXT: lbu s6, 27(a5) -; RV64I-NEXT: lbu s7, 28(a5) -; RV64I-NEXT: lbu s8, 29(a5) -; RV64I-NEXT: lbu s9, 30(a5) -; RV64I-NEXT: lbu s10, 31(a5) -; RV64I-NEXT: lbu s11, 16(a5) -; RV64I-NEXT: lbu ra, 17(a5) -; RV64I-NEXT: lbu a6, 18(a5) -; RV64I-NEXT: lbu a4, 19(a5) -; RV64I-NEXT: lbu a0, 23(a5) -; RV64I-NEXT: lbu a1, 22(a5) -; RV64I-NEXT: lbu a3, 21(a5) -; RV64I-NEXT: lbu a5, 20(a5) -; RV64I-NEXT: sb a0, 23(a2) -; RV64I-NEXT: sb a1, 22(a2) -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: sb a5, 20(a2) -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: sb a6, 18(a2) -; RV64I-NEXT: sb ra, 17(a2) -; RV64I-NEXT: sb s11, 16(a2) -; RV64I-NEXT: sb s10, 31(a2) -; RV64I-NEXT: sb s9, 30(a2) -; RV64I-NEXT: sb s8, 29(a2) -; RV64I-NEXT: sb s7, 28(a2) -; RV64I-NEXT: sb s6, 27(a2) -; 
RV64I-NEXT: sb s5, 26(a2) -; RV64I-NEXT: sb s4, 25(a2) -; RV64I-NEXT: sb s3, 24(a2) -; RV64I-NEXT: sb s2, 7(a2) -; RV64I-NEXT: sb s1, 6(a2) -; RV64I-NEXT: sb s0, 5(a2) -; RV64I-NEXT: sb t6, 4(a2) -; RV64I-NEXT: sb t5, 3(a2) -; RV64I-NEXT: sb t4, 2(a2) -; RV64I-NEXT: sb t3, 1(a2) -; RV64I-NEXT: sb t2, 0(a2) -; RV64I-NEXT: sb t1, 15(a2) -; RV64I-NEXT: sb t0, 14(a2) -; RV64I-NEXT: sb a7, 13(a2) +; RV64I-NEXT: lbu a7, 5(a5) +; RV64I-NEXT: lbu t0, 6(a5) +; RV64I-NEXT: lbu t1, 7(a5) +; RV64I-NEXT: lbu t2, 8(a5) +; RV64I-NEXT: lbu t3, 9(a5) +; RV64I-NEXT: lbu t4, 10(a5) +; RV64I-NEXT: lbu t5, 11(a5) +; RV64I-NEXT: lbu t6, 12(a5) +; RV64I-NEXT: lbu s0, 13(a5) +; RV64I-NEXT: lbu s1, 14(a5) +; RV64I-NEXT: lbu s2, 15(a5) +; RV64I-NEXT: lbu s3, 16(a5) +; RV64I-NEXT: lbu s4, 17(a5) +; RV64I-NEXT: lbu s5, 18(a5) +; RV64I-NEXT: lbu s6, 19(a5) +; RV64I-NEXT: lbu s7, 20(a5) +; RV64I-NEXT: lbu s8, 21(a5) +; RV64I-NEXT: lbu s9, 22(a5) +; RV64I-NEXT: lbu s10, 23(a5) +; RV64I-NEXT: lbu s11, 24(a5) +; RV64I-NEXT: lbu ra, 25(a5) +; RV64I-NEXT: lbu a6, 26(a5) +; RV64I-NEXT: lbu a4, 27(a5) +; RV64I-NEXT: lbu a0, 31(a5) +; RV64I-NEXT: lbu a1, 30(a5) +; RV64I-NEXT: lbu a3, 29(a5) +; RV64I-NEXT: lbu a5, 28(a5) +; RV64I-NEXT: sb a0, 31(a2) +; RV64I-NEXT: sb a1, 30(a2) +; RV64I-NEXT: sb a3, 29(a2) +; RV64I-NEXT: sb a5, 28(a2) +; RV64I-NEXT: sb a4, 27(a2) +; RV64I-NEXT: sb a6, 26(a2) +; RV64I-NEXT: sb ra, 25(a2) +; RV64I-NEXT: sb s11, 24(a2) +; RV64I-NEXT: sb s10, 23(a2) +; RV64I-NEXT: sb s9, 22(a2) +; RV64I-NEXT: sb s8, 21(a2) +; RV64I-NEXT: sb s7, 20(a2) +; RV64I-NEXT: sb s6, 19(a2) +; RV64I-NEXT: sb s5, 18(a2) +; RV64I-NEXT: sb s4, 17(a2) +; RV64I-NEXT: sb s3, 16(a2) +; RV64I-NEXT: sb s2, 15(a2) +; RV64I-NEXT: sb s1, 14(a2) +; RV64I-NEXT: sb s0, 13(a2) +; RV64I-NEXT: sb t6, 12(a2) +; RV64I-NEXT: sb t5, 11(a2) +; RV64I-NEXT: sb t4, 10(a2) +; RV64I-NEXT: sb t3, 9(a2) +; RV64I-NEXT: sb t2, 8(a2) +; RV64I-NEXT: sb t1, 7(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb a7, 5(a2) ; RV64I-NEXT: ld a0, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 12(a2) +; RV64I-NEXT: sb a0, 4(a2) ; RV64I-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 11(a2) +; RV64I-NEXT: sb a0, 3(a2) ; RV64I-NEXT: ld a0, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 10(a2) +; RV64I-NEXT: sb a0, 2(a2) ; RV64I-NEXT: ld a0, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 9(a2) +; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ld a0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload @@ -2518,80 +2518,80 @@ ; RV32I-NEXT: andi a0, t1, 31 ; RV32I-NEXT: addi a1, sp, 28 ; RV32I-NEXT: add a5, a1, a0 -; RV32I-NEXT: lbu a0, 6(a5) +; RV32I-NEXT: lbu a0, 0(a5) ; RV32I-NEXT: sw a0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 7(a5) +; RV32I-NEXT: lbu a0, 1(a5) ; RV32I-NEXT: sw a0, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 4(a5) +; RV32I-NEXT: lbu a0, 2(a5) ; RV32I-NEXT: sw a0, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 5(a5) +; RV32I-NEXT: lbu a0, 3(a5) ; RV32I-NEXT: sw a0, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a0, 0(a5) +; RV32I-NEXT: lbu a0, 4(a5) ; RV32I-NEXT: sw a0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lbu a7, 1(a5) -; RV32I-NEXT: lbu t0, 2(a5) -; RV32I-NEXT: lbu t1, 3(a5) -; RV32I-NEXT: lbu t2, 14(a5) -; RV32I-NEXT: lbu t3, 15(a5) -; RV32I-NEXT: lbu t4, 12(a5) -; RV32I-NEXT: lbu t5, 13(a5) -; RV32I-NEXT: lbu 
t6, 10(a5) -; RV32I-NEXT: lbu s0, 11(a5) -; RV32I-NEXT: lbu s1, 8(a5) -; RV32I-NEXT: lbu s2, 9(a5) -; RV32I-NEXT: lbu s3, 22(a5) -; RV32I-NEXT: lbu s4, 23(a5) -; RV32I-NEXT: lbu s5, 20(a5) -; RV32I-NEXT: lbu s6, 21(a5) -; RV32I-NEXT: lbu s7, 18(a5) -; RV32I-NEXT: lbu s8, 19(a5) -; RV32I-NEXT: lbu s9, 16(a5) -; RV32I-NEXT: lbu s10, 17(a5) -; RV32I-NEXT: lbu s11, 30(a5) -; RV32I-NEXT: lbu ra, 31(a5) -; RV32I-NEXT: lbu a6, 28(a5) -; RV32I-NEXT: lbu a4, 29(a5) -; RV32I-NEXT: lbu a0, 25(a5) -; RV32I-NEXT: lbu a1, 24(a5) -; RV32I-NEXT: lbu a3, 27(a5) -; RV32I-NEXT: lbu a5, 26(a5) -; RV32I-NEXT: sb a0, 25(a2) -; RV32I-NEXT: sb a1, 24(a2) -; RV32I-NEXT: sb a3, 27(a2) -; RV32I-NEXT: sb a5, 26(a2) -; RV32I-NEXT: sb a4, 29(a2) -; RV32I-NEXT: sb a6, 28(a2) -; RV32I-NEXT: sb ra, 31(a2) -; RV32I-NEXT: sb s11, 30(a2) -; RV32I-NEXT: sb s10, 17(a2) -; RV32I-NEXT: sb s9, 16(a2) -; RV32I-NEXT: sb s8, 19(a2) -; RV32I-NEXT: sb s7, 18(a2) -; RV32I-NEXT: sb s6, 21(a2) -; RV32I-NEXT: sb s5, 20(a2) -; RV32I-NEXT: sb s4, 23(a2) -; RV32I-NEXT: sb s3, 22(a2) -; RV32I-NEXT: sb s2, 9(a2) -; RV32I-NEXT: sb s1, 8(a2) -; RV32I-NEXT: sb s0, 11(a2) -; RV32I-NEXT: sb t6, 10(a2) -; RV32I-NEXT: sb t5, 13(a2) -; RV32I-NEXT: sb t4, 12(a2) -; RV32I-NEXT: sb t3, 15(a2) -; RV32I-NEXT: sb t2, 14(a2) -; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a7, 1(a2) +; RV32I-NEXT: lbu a7, 5(a5) +; RV32I-NEXT: lbu t0, 6(a5) +; RV32I-NEXT: lbu t1, 7(a5) +; RV32I-NEXT: lbu t2, 8(a5) +; RV32I-NEXT: lbu t3, 9(a5) +; RV32I-NEXT: lbu t4, 10(a5) +; RV32I-NEXT: lbu t5, 11(a5) +; RV32I-NEXT: lbu t6, 12(a5) +; RV32I-NEXT: lbu s0, 13(a5) +; RV32I-NEXT: lbu s1, 14(a5) +; RV32I-NEXT: lbu s2, 15(a5) +; RV32I-NEXT: lbu s3, 16(a5) +; RV32I-NEXT: lbu s4, 17(a5) +; RV32I-NEXT: lbu s5, 18(a5) +; RV32I-NEXT: lbu s6, 19(a5) +; RV32I-NEXT: lbu s7, 20(a5) +; RV32I-NEXT: lbu s8, 21(a5) +; RV32I-NEXT: lbu s9, 22(a5) +; RV32I-NEXT: lbu s10, 23(a5) +; RV32I-NEXT: lbu s11, 24(a5) +; RV32I-NEXT: lbu ra, 25(a5) +; RV32I-NEXT: lbu a6, 26(a5) +; RV32I-NEXT: lbu a4, 27(a5) +; RV32I-NEXT: lbu a0, 31(a5) +; RV32I-NEXT: lbu a1, 30(a5) +; RV32I-NEXT: lbu a3, 29(a5) +; RV32I-NEXT: lbu a5, 28(a5) +; RV32I-NEXT: sb a0, 31(a2) +; RV32I-NEXT: sb a1, 30(a2) +; RV32I-NEXT: sb a3, 29(a2) +; RV32I-NEXT: sb a5, 28(a2) +; RV32I-NEXT: sb a4, 27(a2) +; RV32I-NEXT: sb a6, 26(a2) +; RV32I-NEXT: sb ra, 25(a2) +; RV32I-NEXT: sb s11, 24(a2) +; RV32I-NEXT: sb s10, 23(a2) +; RV32I-NEXT: sb s9, 22(a2) +; RV32I-NEXT: sb s8, 21(a2) +; RV32I-NEXT: sb s7, 20(a2) +; RV32I-NEXT: sb s6, 19(a2) +; RV32I-NEXT: sb s5, 18(a2) +; RV32I-NEXT: sb s4, 17(a2) +; RV32I-NEXT: sb s3, 16(a2) +; RV32I-NEXT: sb s2, 15(a2) +; RV32I-NEXT: sb s1, 14(a2) +; RV32I-NEXT: sb s0, 13(a2) +; RV32I-NEXT: sb t6, 12(a2) +; RV32I-NEXT: sb t5, 11(a2) +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: sb t3, 9(a2) +; RV32I-NEXT: sb t2, 8(a2) +; RV32I-NEXT: sb t1, 7(a2) +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: sb a7, 5(a2) ; RV32I-NEXT: lw a0, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: lw a0, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 5(a2) +; RV32I-NEXT: sb a0, 3(a2) ; RV32I-NEXT: lw a0, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 4(a2) +; RV32I-NEXT: sb a0, 2(a2) ; RV32I-NEXT: lw a0, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: lw a0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: sb a0, 6(a2) +; RV32I-NEXT: sb a0, 0(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 
136(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -18,10 +18,10 @@ ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ret @@ -50,10 +50,10 @@ ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: ret @@ -79,10 +79,10 @@ ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: sllw a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ret @@ -111,10 +111,10 @@ ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: ret @@ -140,10 +140,10 @@ ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: sraw a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ret @@ -172,10 +172,10 @@ ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: ret @@ -235,18 +235,18 @@ ; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: srl a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 48 -; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 56 ; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 40 ; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 4(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ret @@ -299,17 +299,17 @@ ; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: and a1, a4, a1 ; RV32I-NEXT: sb a1, 4(a2) -; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a3, a1, 24 ; RV32I-NEXT: sb a3, 7(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a1, a1, 8 ; RV32I-NEXT: sb a1, 5(a2) ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: 
srli a1, a0, 16 -; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: ret @@ -368,18 +368,18 @@ ; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 48 -; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 56 ; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 40 ; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 4(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ret @@ -433,16 +433,16 @@ ; RV32I-NEXT: and a1, a4, a1 ; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: sb a3, 2(a2) ; RV32I-NEXT: srli a3, a1, 24 ; RV32I-NEXT: sb a3, 3(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 2(a2) ; RV32I-NEXT: srli a1, a1, 8 ; RV32I-NEXT: sb a1, 1(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: sb a1, 6(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 6(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: ret @@ -501,18 +501,18 @@ ; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: sra a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) -; RV64I-NEXT: srli a1, a0, 48 -; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 56 ; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a1, a0, 32 -; RV64I-NEXT: sb a1, 4(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a0, 40 ; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a0, 16 -; RV64I-NEXT: sb a1, 2(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 4(a2) ; RV64I-NEXT: srli a1, a0, 24 ; RV64I-NEXT: sb a1, 3(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: srli a0, a0, 8 ; RV64I-NEXT: sb a0, 1(a2) ; RV64I-NEXT: ret @@ -565,17 +565,17 @@ ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: .LBB5_3: ; RV32I-NEXT: sb a1, 4(a2) -; RV32I-NEXT: srli a3, a1, 16 -; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a3, a1, 24 ; RV32I-NEXT: sb a3, 7(a2) +; RV32I-NEXT: srli a3, a1, 16 +; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a1, a1, 8 ; RV32I-NEXT: sb a1, 5(a2) ; RV32I-NEXT: sb a0, 0(a2) -; RV32I-NEXT: srli a1, a0, 16 -; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a1, a0, 24 ; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: srli a1, a0, 16 +; RV32I-NEXT: sb a1, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 1(a2) ; RV32I-NEXT: ret @@ -772,48 +772,48 @@ ; RV32I-NEXT: srli a0, a0, 28 ; RV32I-NEXT: addi a3, sp, 12 ; RV32I-NEXT: add a3, a3, a0 -; RV32I-NEXT: lbu a0, 5(a3) -; RV32I-NEXT: lbu a4, 4(a3) -; RV32I-NEXT: lbu a5, 6(a3) -; RV32I-NEXT: lbu a6, 7(a3) +; RV32I-NEXT: lbu a0, 1(a3) +; RV32I-NEXT: lbu a4, 0(a3) +; RV32I-NEXT: lbu a5, 2(a3) +; RV32I-NEXT: lbu a6, 3(a3) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a4, a4, a0 -; RV32I-NEXT: andi a5, a1, 7 -; RV32I-NEXT: srl a0, a4, a5 -; RV32I-NEXT: lbu a1, 9(a3) -; RV32I-NEXT: lbu a6, 8(a3) -; RV32I-NEXT: lbu a7, 10(a3) -; RV32I-NEXT: lbu t0, 11(a3) +; RV32I-NEXT: or a0, 
a4, a0 +; RV32I-NEXT: andi a4, a1, 7 +; RV32I-NEXT: srl a0, a0, a4 +; RV32I-NEXT: lbu a1, 5(a3) +; RV32I-NEXT: lbu a5, 4(a3) +; RV32I-NEXT: lbu a6, 6(a3) +; RV32I-NEXT: lbu a7, 7(a3) ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a6, a6, a1 -; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a5 -; RV32I-NEXT: sll a1, a1, a7 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a5, a5, a1 +; RV32I-NEXT: slli a1, a5, 1 +; RV32I-NEXT: xori a6, a4, 31 +; RV32I-NEXT: sll a1, a1, a6 ; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: lbu a7, 1(a3) -; RV32I-NEXT: lbu t0, 0(a3) -; RV32I-NEXT: lbu t1, 2(a3) -; RV32I-NEXT: lbu t2, 3(a3) +; RV32I-NEXT: srl a5, a5, a4 +; RV32I-NEXT: lbu a7, 9(a3) +; RV32I-NEXT: lbu t0, 8(a3) +; RV32I-NEXT: lbu t1, 10(a3) +; RV32I-NEXT: lbu t2, 11(a3) ; RV32I-NEXT: slli a7, a7, 8 ; RV32I-NEXT: or a7, a7, t0 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a5 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: xori t0, a5, 31 -; RV32I-NEXT: sll a4, a4, t0 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: srl a6, a6, a5 +; RV32I-NEXT: slli t0, a7, 1 +; RV32I-NEXT: not t1, a4 +; RV32I-NEXT: sll t0, t0, t1 +; RV32I-NEXT: or t0, a5, t0 +; RV32I-NEXT: srl a7, a7, a4 ; RV32I-NEXT: lbu t1, 13(a3) ; RV32I-NEXT: lbu t2, 12(a3) ; RV32I-NEXT: lbu t3, 14(a3) @@ -825,37 +825,37 @@ ; RV32I-NEXT: or a3, a3, t3 ; RV32I-NEXT: or a3, a3, t1 ; RV32I-NEXT: slli t1, a3, 1 -; RV32I-NEXT: sll t0, t1, t0 -; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: srl a3, a3, a5 -; RV32I-NEXT: sb a6, 8(a2) +; RV32I-NEXT: sll a6, t1, a6 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: srl a3, a3, a4 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: srli a5, a6, 8 -; RV32I-NEXT: sb a5, 9(a2) -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: srli a5, a3, 24 -; RV32I-NEXT: sb a5, 15(a2) +; RV32I-NEXT: sb a7, 8(a2) +; RV32I-NEXT: sb a5, 4(a2) +; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 15(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 14(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 13(a2) ; RV32I-NEXT: srli a3, a7, 16 -; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: sb a3, 10(a2) ; RV32I-NEXT: srli a3, a7, 8 -; RV32I-NEXT: sb a3, 1(a2) -; RV32I-NEXT: srli a3, a0, 16 +; RV32I-NEXT: sb a3, 9(a2) +; RV32I-NEXT: srli a3, a5, 16 ; RV32I-NEXT: sb a3, 6(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 5(a2) +; RV32I-NEXT: srli a3, a0, 16 +; RV32I-NEXT: sb a3, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 -; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: srli a0, t0, 24 +; RV32I-NEXT: sb a0, 1(a2) +; RV32I-NEXT: srli a0, a6, 24 ; RV32I-NEXT: sb a0, 11(a2) -; RV32I-NEXT: srli a4, a4, 24 -; RV32I-NEXT: sb a4, 3(a2) +; RV32I-NEXT: srli a0, t0, 24 +; RV32I-NEXT: sb a0, 7(a2) ; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: sb a1, 3(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload @@ -1081,63 +1081,63 @@ ; RV32I-NEXT: xori a7, a5, 31 ; RV32I-NEXT: srl a1, a1, a7 ; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: lbu t0, 13(a3) -; RV32I-NEXT: lbu t1, 12(a3) -; 
RV32I-NEXT: lbu t2, 14(a3) -; RV32I-NEXT: lbu t3, 15(a3) +; RV32I-NEXT: lbu t0, 9(a3) +; RV32I-NEXT: lbu t1, 8(a3) +; RV32I-NEXT: lbu t2, 10(a3) +; RV32I-NEXT: lbu t3, 11(a3) ; RV32I-NEXT: slli t0, t0, 8 ; RV32I-NEXT: or t0, t0, t1 ; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli t3, t3, 24 ; RV32I-NEXT: or t1, t3, t2 ; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: sll t0, t0, a5 -; RV32I-NEXT: lbu t1, 9(a3) -; RV32I-NEXT: lbu t2, 8(a3) -; RV32I-NEXT: lbu t3, 10(a3) -; RV32I-NEXT: lbu a3, 11(a3) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t1, t1, t2 -; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: sll t1, t0, a5 +; RV32I-NEXT: srli a4, a4, 1 +; RV32I-NEXT: not t2, a5 +; RV32I-NEXT: srl a4, a4, t2 +; RV32I-NEXT: or a4, t1, a4 +; RV32I-NEXT: lbu t2, 13(a3) +; RV32I-NEXT: lbu t3, 12(a3) +; RV32I-NEXT: lbu t4, 14(a3) +; RV32I-NEXT: lbu a3, 15(a3) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or t2, t2, t3 +; RV32I-NEXT: slli t4, t4, 16 ; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a3, a3, t3 -; RV32I-NEXT: or a3, a3, t1 -; RV32I-NEXT: srli t1, a3, 1 -; RV32I-NEXT: srl a7, t1, a7 -; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or a3, a3, t4 +; RV32I-NEXT: or a3, a3, t2 ; RV32I-NEXT: sll a3, a3, a5 -; RV32I-NEXT: srli a4, a4, 1 -; RV32I-NEXT: not t1, a5 -; RV32I-NEXT: srl a4, a4, t1 -; RV32I-NEXT: or a4, a3, a4 +; RV32I-NEXT: srli t0, t0, 1 +; RV32I-NEXT: srl a7, t0, a7 +; RV32I-NEXT: or a7, a3, a7 ; RV32I-NEXT: sll a5, a6, a5 ; RV32I-NEXT: sb a5, 0(a2) -; RV32I-NEXT: srli a6, a3, 16 -; RV32I-NEXT: sb a6, 10(a2) ; RV32I-NEXT: srli a6, a3, 24 -; RV32I-NEXT: sb a6, 11(a2) +; RV32I-NEXT: sb a6, 15(a2) +; RV32I-NEXT: srli a6, a3, 16 +; RV32I-NEXT: sb a6, 14(a2) ; RV32I-NEXT: srli a3, a3, 8 -; RV32I-NEXT: sb a3, 9(a2) -; RV32I-NEXT: srli a3, t0, 16 -; RV32I-NEXT: sb a3, 14(a2) -; RV32I-NEXT: srli a3, t0, 24 -; RV32I-NEXT: sb a3, 15(a2) -; RV32I-NEXT: srli a3, t0, 8 ; RV32I-NEXT: sb a3, 13(a2) -; RV32I-NEXT: srli a3, a5, 16 -; RV32I-NEXT: sb a3, 2(a2) -; RV32I-NEXT: srli a3, a5, 24 -; RV32I-NEXT: sb a3, 3(a2) -; RV32I-NEXT: srli a5, a5, 8 -; RV32I-NEXT: sb a5, 1(a2) -; RV32I-NEXT: srli a3, a0, 16 -; RV32I-NEXT: sb a3, 6(a2) +; RV32I-NEXT: srli a3, t1, 24 +; RV32I-NEXT: sb a3, 11(a2) +; RV32I-NEXT: srli a3, t1, 16 +; RV32I-NEXT: sb a3, 10(a2) +; RV32I-NEXT: srli a3, t1, 8 +; RV32I-NEXT: sb a3, 9(a2) ; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: sb a3, 7(a2) +; RV32I-NEXT: srli a3, a0, 16 +; RV32I-NEXT: sb a3, 6(a2) ; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: sb a4, 8(a2) +; RV32I-NEXT: srli a0, a5, 24 +; RV32I-NEXT: sb a0, 3(a2) +; RV32I-NEXT: srli a0, a5, 16 +; RV32I-NEXT: sb a0, 2(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 1(a2) ; RV32I-NEXT: sb a7, 12(a2) +; RV32I-NEXT: sb a4, 8(a2) ; RV32I-NEXT: sb a1, 4(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload @@ -1344,48 +1344,48 @@ ; RV32I-NEXT: srli a0, a0, 28 ; RV32I-NEXT: addi a3, sp, 8 ; RV32I-NEXT: add a3, a3, a0 -; RV32I-NEXT: lbu a0, 5(a3) -; RV32I-NEXT: lbu a4, 4(a3) -; RV32I-NEXT: lbu a5, 6(a3) -; RV32I-NEXT: lbu a6, 7(a3) +; RV32I-NEXT: lbu a0, 1(a3) +; RV32I-NEXT: lbu a4, 0(a3) +; RV32I-NEXT: lbu a5, 2(a3) +; RV32I-NEXT: lbu a6, 3(a3) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: or a4, a4, a0 -; RV32I-NEXT: andi a5, a1, 7 -; RV32I-NEXT: srl a0, a4, a5 -; RV32I-NEXT: lbu a1, 9(a3) -; RV32I-NEXT: lbu a6, 8(a3) -; RV32I-NEXT: lbu a7, 10(a3) -; 
RV32I-NEXT: lbu t0, 11(a3) +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: andi a4, a1, 7 +; RV32I-NEXT: srl a0, a0, a4 +; RV32I-NEXT: lbu a1, 5(a3) +; RV32I-NEXT: lbu a5, 4(a3) +; RV32I-NEXT: lbu a6, 6(a3) +; RV32I-NEXT: lbu a7, 7(a3) ; RV32I-NEXT: slli a1, a1, 8 -; RV32I-NEXT: or a1, a1, a6 -; RV32I-NEXT: slli a7, a7, 16 -; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a6, a6, a1 -; RV32I-NEXT: slli a1, a6, 1 -; RV32I-NEXT: not a7, a5 -; RV32I-NEXT: sll a1, a1, a7 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a7, a7, 24 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a5, a5, a1 +; RV32I-NEXT: slli a1, a5, 1 +; RV32I-NEXT: xori a6, a4, 31 +; RV32I-NEXT: sll a1, a1, a6 ; RV32I-NEXT: or a1, a0, a1 -; RV32I-NEXT: lbu a7, 1(a3) -; RV32I-NEXT: lbu t0, 0(a3) -; RV32I-NEXT: lbu t1, 2(a3) -; RV32I-NEXT: lbu t2, 3(a3) +; RV32I-NEXT: srl a5, a5, a4 +; RV32I-NEXT: lbu a7, 9(a3) +; RV32I-NEXT: lbu t0, 8(a3) +; RV32I-NEXT: lbu t1, 10(a3) +; RV32I-NEXT: lbu t2, 11(a3) ; RV32I-NEXT: slli a7, a7, 8 ; RV32I-NEXT: or a7, a7, t0 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 ; RV32I-NEXT: or t0, t2, t1 ; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: srl a7, a7, a5 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: xori t0, a5, 31 -; RV32I-NEXT: sll a4, a4, t0 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: srl a6, a6, a5 +; RV32I-NEXT: slli t0, a7, 1 +; RV32I-NEXT: not t1, a4 +; RV32I-NEXT: sll t0, t0, t1 +; RV32I-NEXT: or t0, a5, t0 +; RV32I-NEXT: srl a7, a7, a4 ; RV32I-NEXT: lbu t1, 13(a3) ; RV32I-NEXT: lbu t2, 12(a3) ; RV32I-NEXT: lbu t3, 14(a3) @@ -1397,37 +1397,37 @@ ; RV32I-NEXT: or a3, a3, t3 ; RV32I-NEXT: or a3, a3, t1 ; RV32I-NEXT: slli t1, a3, 1 -; RV32I-NEXT: sll t0, t1, t0 -; RV32I-NEXT: or t0, a6, t0 -; RV32I-NEXT: sra a3, a3, a5 -; RV32I-NEXT: sb a6, 8(a2) +; RV32I-NEXT: sll a6, t1, a6 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: sra a3, a3, a4 ; RV32I-NEXT: sb a3, 12(a2) -; RV32I-NEXT: sb a7, 0(a2) -; RV32I-NEXT: sb a0, 4(a2) -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: srli a5, a6, 8 -; RV32I-NEXT: sb a5, 9(a2) -; RV32I-NEXT: srli a5, a3, 16 -; RV32I-NEXT: sb a5, 14(a2) -; RV32I-NEXT: srli a5, a3, 24 -; RV32I-NEXT: sb a5, 15(a2) +; RV32I-NEXT: sb a7, 8(a2) +; RV32I-NEXT: sb a5, 4(a2) +; RV32I-NEXT: sb a0, 0(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 15(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 14(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 13(a2) ; RV32I-NEXT: srli a3, a7, 16 -; RV32I-NEXT: sb a3, 2(a2) +; RV32I-NEXT: sb a3, 10(a2) ; RV32I-NEXT: srli a3, a7, 8 -; RV32I-NEXT: sb a3, 1(a2) -; RV32I-NEXT: srli a3, a0, 16 +; RV32I-NEXT: sb a3, 9(a2) +; RV32I-NEXT: srli a3, a5, 16 ; RV32I-NEXT: sb a3, 6(a2) +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 5(a2) +; RV32I-NEXT: srli a3, a0, 16 +; RV32I-NEXT: sb a3, 2(a2) ; RV32I-NEXT: srli a0, a0, 8 -; RV32I-NEXT: sb a0, 5(a2) -; RV32I-NEXT: srli a0, t0, 24 +; RV32I-NEXT: sb a0, 1(a2) +; RV32I-NEXT: srli a0, a6, 24 ; RV32I-NEXT: sb a0, 11(a2) -; RV32I-NEXT: srli a4, a4, 24 -; RV32I-NEXT: sb a4, 3(a2) +; RV32I-NEXT: srli a0, t0, 24 +; RV32I-NEXT: sb a0, 7(a2) ; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: sb a1, 7(a2) +; RV32I-NEXT: sb a1, 3(a2) ; RV32I-NEXT: lw s0, 60(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 56(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s2, 52(sp) # 4-byte Folded Reload @@ -1592,45 +1592,45 @@ ; RV64I-NEXT: sb a0, 56(sp) ; RV64I-NEXT: slli a0, t0, 56 ; RV64I-NEXT: srli a0, a0, 59 -; RV64I-NEXT: addi 
a3, sp, 56 -; RV64I-NEXT: add a3, a3, a0 -; RV64I-NEXT: lbu a0, 9(a3) -; RV64I-NEXT: lbu a1, 8(a3) -; RV64I-NEXT: lbu a4, 10(a3) -; RV64I-NEXT: lbu a5, 11(a3) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addi a1, sp, 56 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 2(a0) +; RV64I-NEXT: lbu a5, 3(a0) +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: slli a5, a5, 24 ; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: lbu a1, 13(a3) -; RV64I-NEXT: lbu a4, 12(a3) -; RV64I-NEXT: lbu a5, 14(a3) -; RV64I-NEXT: lbu a6, 15(a3) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: or a1, a4, a1 +; RV64I-NEXT: lbu a3, 5(a0) +; RV64I-NEXT: lbu a4, 4(a0) +; RV64I-NEXT: lbu a5, 6(a0) +; RV64I-NEXT: lbu a6, 7(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a4, a1, a0 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: or a3, a3, a1 ; RV64I-NEXT: andi a1, t0, 7 -; RV64I-NEXT: lbu a0, 17(a3) -; RV64I-NEXT: lbu a5, 16(a3) -; RV64I-NEXT: lbu a6, 18(a3) -; RV64I-NEXT: lbu a7, 19(a3) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a5 +; RV64I-NEXT: lbu a4, 9(a0) +; RV64I-NEXT: lbu a5, 8(a0) +; RV64I-NEXT: lbu a6, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 ; RV64I-NEXT: or a5, a7, a6 -; RV64I-NEXT: or a0, a5, a0 -; RV64I-NEXT: lbu a5, 21(a3) -; RV64I-NEXT: lbu a6, 20(a3) -; RV64I-NEXT: lbu a7, 22(a3) -; RV64I-NEXT: lbu t0, 23(a3) +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 13(a0) +; RV64I-NEXT: lbu a6, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) ; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 @@ -1638,126 +1638,126 @@ ; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: or a5, a5, a0 -; RV64I-NEXT: slli a0, a5, 1 -; RV64I-NEXT: not a6, a1 -; RV64I-NEXT: sll a0, a0, a6 -; RV64I-NEXT: lbu a6, 1(a3) -; RV64I-NEXT: lbu a7, 0(a3) -; RV64I-NEXT: lbu t0, 2(a3) -; RV64I-NEXT: lbu t1, 3(a3) +; RV64I-NEXT: or a5, a5, a4 +; RV64I-NEXT: lbu a4, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 ; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 5(a3) -; RV64I-NEXT: lbu t0, 4(a3) -; RV64I-NEXT: lbu t1, 6(a3) -; RV64I-NEXT: lbu t2, 7(a3) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 25(a3) -; RV64I-NEXT: lbu t0, 24(a3) -; RV64I-NEXT: lbu t1, 26(a3) -; RV64I-NEXT: lbu t2, 27(a3) +; RV64I-NEXT: slli a6, a6, 32 +; 
RV64I-NEXT: or a6, a6, a4 +; RV64I-NEXT: slli a4, a6, 1 +; RV64I-NEXT: not a7, a1 +; RV64I-NEXT: sll a4, a4, a7 +; RV64I-NEXT: lbu a7, 25(a0) +; RV64I-NEXT: lbu t0, 24(a0) +; RV64I-NEXT: lbu t1, 26(a0) +; RV64I-NEXT: lbu t2, 27(a0) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 29(a3) -; RV64I-NEXT: lbu t1, 28(a3) -; RV64I-NEXT: lbu t2, 30(a3) -; RV64I-NEXT: lbu a3, 31(a3) +; RV64I-NEXT: lbu t0, 29(a0) +; RV64I-NEXT: lbu t1, 28(a0) +; RV64I-NEXT: lbu t2, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli a3, a3, 24 -; RV64I-NEXT: or a3, a3, t2 -; RV64I-NEXT: slli t1, a4, 1 -; RV64I-NEXT: or a3, a3, t0 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t2 +; RV64I-NEXT: slli t1, a5, 1 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: xori t0, a1, 63 ; RV64I-NEXT: sll t1, t1, t0 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a3, a7 -; RV64I-NEXT: slli a7, a3, 1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a7, a0, 1 ; RV64I-NEXT: sll a7, a7, t0 -; RV64I-NEXT: srl a4, a4, a1 -; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: srl a3, a3, a1 ; RV64I-NEXT: srl a5, a5, a1 -; RV64I-NEXT: srl a1, a3, a1 -; RV64I-NEXT: srli a3, a5, 48 -; RV64I-NEXT: sb a3, 22(a2) -; RV64I-NEXT: srli a3, a5, 40 -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: srli a3, a5, 32 -; RV64I-NEXT: sb a3, 20(a2) -; RV64I-NEXT: srli a3, a5, 24 -; RV64I-NEXT: sb a3, 19(a2) -; RV64I-NEXT: srli a3, a5, 16 -; RV64I-NEXT: sb a3, 18(a2) -; RV64I-NEXT: or a3, a5, a7 -; RV64I-NEXT: sb a5, 16(a2) +; RV64I-NEXT: srl a6, a6, a1 +; RV64I-NEXT: srl a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 31(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 30(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 29(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 28(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 27(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 26(a2) +; RV64I-NEXT: sb a0, 24(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 25(a2) +; RV64I-NEXT: srli a0, a6, 48 +; RV64I-NEXT: sb a0, 22(a2) +; RV64I-NEXT: srli a0, a6, 40 +; RV64I-NEXT: sb a0, 21(a2) +; RV64I-NEXT: srli a0, a6, 32 +; RV64I-NEXT: sb a0, 20(a2) +; RV64I-NEXT: srli a0, a6, 24 +; RV64I-NEXT: sb a0, 19(a2) +; RV64I-NEXT: srli a0, a6, 16 +; RV64I-NEXT: sb a0, 18(a2) +; RV64I-NEXT: or a0, a6, a7 +; RV64I-NEXT: sb a6, 16(a2) +; RV64I-NEXT: srli a1, a6, 8 +; RV64I-NEXT: sb a1, 17(a2) +; RV64I-NEXT: srli a1, a5, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a5, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a5, 32 +; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: srli a1, a5, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a5, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: sb a5, 8(a2) ; RV64I-NEXT: srli a5, a5, 8 -; RV64I-NEXT: sb a5, 17(a2) -; RV64I-NEXT: srli a5, a1, 56 -; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a1, 48 -; RV64I-NEXT: sb a5, 30(a2) -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: sb a5, 29(a2) -; RV64I-NEXT: srli a5, a1, 32 -; RV64I-NEXT: sb a5, 28(a2) -; RV64I-NEXT: srli a5, a1, 24 -; RV64I-NEXT: sb a5, 27(a2) -; RV64I-NEXT: srli a5, a1, 16 -; RV64I-NEXT: sb a5, 26(a2) -; RV64I-NEXT: sb a1, 24(a2) -; RV64I-NEXT: srli a1, a1, 8 -; RV64I-NEXT: sb a1, 25(a2) -; RV64I-NEXT: srli 
a1, a6, 48 +; RV64I-NEXT: sb a5, 9(a2) +; RV64I-NEXT: srli a1, a3, 48 ; RV64I-NEXT: sb a1, 6(a2) -; RV64I-NEXT: srli a1, a6, 40 +; RV64I-NEXT: srli a1, a3, 40 ; RV64I-NEXT: sb a1, 5(a2) -; RV64I-NEXT: srli a1, a6, 32 +; RV64I-NEXT: srli a1, a3, 32 ; RV64I-NEXT: sb a1, 4(a2) -; RV64I-NEXT: srli a1, a6, 24 +; RV64I-NEXT: srli a1, a3, 24 ; RV64I-NEXT: sb a1, 3(a2) -; RV64I-NEXT: srli a1, a6, 16 +; RV64I-NEXT: srli a1, a3, 16 ; RV64I-NEXT: sb a1, 2(a2) -; RV64I-NEXT: or a1, a6, t1 -; RV64I-NEXT: sb a6, 0(a2) -; RV64I-NEXT: srli a5, a6, 8 -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: srli a5, a4, 48 -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: srli a5, a4, 40 -; RV64I-NEXT: sb a5, 13(a2) -; RV64I-NEXT: srli a5, a4, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: sb a5, 11(a2) -; RV64I-NEXT: srli a5, a4, 16 -; RV64I-NEXT: sb a5, 10(a2) -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: sb a4, 8(a2) -; RV64I-NEXT: srli a4, a4, 8 -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: srli a3, a3, 56 -; RV64I-NEXT: sb a3, 23(a2) +; RV64I-NEXT: or a1, a3, t1 +; RV64I-NEXT: sb a3, 0(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 1(a2) +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: sb a0, 23(a2) +; RV64I-NEXT: srli a4, a4, 56 +; RV64I-NEXT: sb a4, 15(a2) ; RV64I-NEXT: srli a1, a1, 56 ; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: sb a0, 15(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload @@ -1912,10 +1912,10 @@ ; RV32I-NEXT: srli a0, a0, 27 ; RV32I-NEXT: addi a4, sp, 28 ; RV32I-NEXT: add a4, a4, a0 -; RV32I-NEXT: lbu a0, 5(a4) -; RV32I-NEXT: lbu a1, 4(a4) -; RV32I-NEXT: lbu a3, 6(a4) -; RV32I-NEXT: lbu a5, 7(a4) +; RV32I-NEXT: lbu a0, 1(a4) +; RV32I-NEXT: lbu a1, 0(a4) +; RV32I-NEXT: lbu a3, 2(a4) +; RV32I-NEXT: lbu a5, 3(a4) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a3, a3, 16 @@ -1923,10 +1923,10 @@ ; RV32I-NEXT: or a3, a5, a3 ; RV32I-NEXT: or t4, a3, a0 ; RV32I-NEXT: andi a3, t0, 7 -; RV32I-NEXT: lbu a0, 9(a4) -; RV32I-NEXT: lbu a1, 8(a4) -; RV32I-NEXT: lbu a5, 10(a4) -; RV32I-NEXT: lbu a6, 11(a4) +; RV32I-NEXT: lbu a0, 5(a4) +; RV32I-NEXT: lbu a1, 4(a4) +; RV32I-NEXT: lbu a5, 6(a4) +; RV32I-NEXT: lbu a6, 7(a4) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a5, a5, 16 @@ -1934,43 +1934,43 @@ ; RV32I-NEXT: or a1, a6, a5 ; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: slli a0, a6, 1 -; RV32I-NEXT: not t0, a3 -; RV32I-NEXT: sll a0, a0, t0 -; RV32I-NEXT: lbu a1, 1(a4) -; RV32I-NEXT: lbu a5, 0(a4) -; RV32I-NEXT: lbu a7, 2(a4) -; RV32I-NEXT: lbu t1, 3(a4) +; RV32I-NEXT: xori a7, a3, 31 +; RV32I-NEXT: sll a0, a0, a7 +; RV32I-NEXT: lbu a1, 9(a4) +; RV32I-NEXT: lbu a5, 8(a4) +; RV32I-NEXT: lbu t0, 10(a4) +; RV32I-NEXT: lbu t1, 11(a4) ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a5, t1, a7 -; RV32I-NEXT: or t1, a5, a1 -; RV32I-NEXT: slli a1, t4, 1 -; RV32I-NEXT: xori t2, a3, 31 +; RV32I-NEXT: or a5, t1, t0 +; RV32I-NEXT: or t0, a5, a1 +; RV32I-NEXT: slli a1, t0, 1 +; RV32I-NEXT: not t2, a3 ; RV32I-NEXT: sll a1, a1, t2 ; RV32I-NEXT: lbu a5, 13(a4) -; RV32I-NEXT: lbu a7, 12(a4) +; RV32I-NEXT: lbu t1, 12(a4) ; RV32I-NEXT: lbu t3, 14(a4) ; RV32I-NEXT: lbu t5, 15(a4) ; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 +; RV32I-NEXT: or a5, a5, t1 ; RV32I-NEXT: slli t3, t3, 16 ; 
RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: or a7, t5, t3 -; RV32I-NEXT: or t3, a7, a5 +; RV32I-NEXT: or t1, t5, t3 +; RV32I-NEXT: or t3, t1, a5 ; RV32I-NEXT: lbu a5, 17(a4) -; RV32I-NEXT: lbu a7, 16(a4) +; RV32I-NEXT: lbu t1, 16(a4) ; RV32I-NEXT: lbu t5, 18(a4) ; RV32I-NEXT: lbu t6, 19(a4) ; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 +; RV32I-NEXT: or a5, a5, t1 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a7, t6, t5 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: slli a7, a5, 1 -; RV32I-NEXT: sll a7, a7, t0 +; RV32I-NEXT: or t1, t6, t5 +; RV32I-NEXT: or a5, t1, a5 +; RV32I-NEXT: slli t1, a5, 1 +; RV32I-NEXT: sll t1, t1, t2 ; RV32I-NEXT: lbu t5, 21(a4) ; RV32I-NEXT: lbu t6, 20(a4) ; RV32I-NEXT: lbu s0, 22(a4) @@ -1994,92 +1994,92 @@ ; RV32I-NEXT: lbu s0, 29(a4) ; RV32I-NEXT: lbu s1, 28(a4) ; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t0, s2, t0 +; RV32I-NEXT: sll t2, s2, t2 ; RV32I-NEXT: slli s0, s0, 8 ; RV32I-NEXT: or s0, s0, s1 ; RV32I-NEXT: lbu s1, 30(a4) ; RV32I-NEXT: lbu a4, 31(a4) ; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: sll s2, s2, t2 +; RV32I-NEXT: sll s2, s2, a7 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli a4, a4, 24 ; RV32I-NEXT: or a4, a4, s1 ; RV32I-NEXT: slli s1, t5, 1 -; RV32I-NEXT: sll s1, s1, t2 +; RV32I-NEXT: sll s1, s1, a7 ; RV32I-NEXT: or a4, a4, s0 ; RV32I-NEXT: slli s0, a4, 1 -; RV32I-NEXT: sll t2, s0, t2 +; RV32I-NEXT: sll a7, s0, a7 ; RV32I-NEXT: srl t4, t4, a3 -; RV32I-NEXT: srl t1, t1, a3 -; RV32I-NEXT: srl t3, t3, a3 ; RV32I-NEXT: srl a6, a6, a3 -; RV32I-NEXT: srl t5, t5, a3 +; RV32I-NEXT: srl t0, t0, a3 +; RV32I-NEXT: srl t3, t3, a3 ; RV32I-NEXT: srl a5, a5, a3 +; RV32I-NEXT: srl t5, t5, a3 ; RV32I-NEXT: srl t6, t6, a3 ; RV32I-NEXT: srl a3, a4, a3 -; RV32I-NEXT: srli a4, t6, 16 -; RV32I-NEXT: sb a4, 26(a2) -; RV32I-NEXT: or a4, t6, t2 -; RV32I-NEXT: sb t6, 24(a2) -; RV32I-NEXT: srli t2, t6, 8 -; RV32I-NEXT: sb t2, 25(a2) -; RV32I-NEXT: srli t2, a3, 24 -; RV32I-NEXT: sb t2, 31(a2) -; RV32I-NEXT: srli t2, a3, 16 -; RV32I-NEXT: sb t2, 30(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 31(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 30(a2) ; RV32I-NEXT: sb a3, 28(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 29(a2) -; RV32I-NEXT: srli a3, a5, 16 -; RV32I-NEXT: sb a3, 18(a2) +; RV32I-NEXT: srli a3, t6, 16 +; RV32I-NEXT: sb a3, 26(a2) +; RV32I-NEXT: or a3, t6, a7 +; RV32I-NEXT: sb t6, 24(a2) +; RV32I-NEXT: srli a4, t6, 8 +; RV32I-NEXT: sb a4, 25(a2) +; RV32I-NEXT: srli a4, t5, 16 +; RV32I-NEXT: sb a4, 22(a2) +; RV32I-NEXT: or a4, t5, t2 +; RV32I-NEXT: sb t5, 20(a2) +; RV32I-NEXT: srli a7, t5, 8 +; RV32I-NEXT: sb a7, 21(a2) +; RV32I-NEXT: srli a7, a5, 16 +; RV32I-NEXT: sb a7, 18(a2) ; RV32I-NEXT: or s1, a5, s1 ; RV32I-NEXT: sb a5, 16(a2) ; RV32I-NEXT: srli a5, a5, 8 ; RV32I-NEXT: sb a5, 17(a2) -; RV32I-NEXT: srli a3, t5, 16 -; RV32I-NEXT: sb a3, 22(a2) -; RV32I-NEXT: or a3, t5, t0 -; RV32I-NEXT: sb t5, 20(a2) -; RV32I-NEXT: srli a5, t5, 8 -; RV32I-NEXT: sb a5, 21(a2) -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: or a5, a6, s2 -; RV32I-NEXT: sb a6, 8(a2) -; RV32I-NEXT: srli a6, a6, 8 -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: srli a6, t3, 16 -; RV32I-NEXT: sb a6, 14(a2) -; RV32I-NEXT: or a6, t3, a7 +; RV32I-NEXT: srli a5, t3, 16 +; RV32I-NEXT: sb a5, 14(a2) +; RV32I-NEXT: or a5, t3, t1 ; RV32I-NEXT: sb t3, 12(a2) ; RV32I-NEXT: srli a7, t3, 8 ; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: srli a7, t1, 16 -; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: or a1, t1, a1 -; 
RV32I-NEXT: sb t1, 0(a2) -; RV32I-NEXT: srli a7, t1, 8 -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: srli a7, t4, 16 -; RV32I-NEXT: sb a7, 6(a2) +; RV32I-NEXT: srli a7, t0, 16 +; RV32I-NEXT: sb a7, 10(a2) +; RV32I-NEXT: or a7, t0, s2 +; RV32I-NEXT: sb t0, 8(a2) +; RV32I-NEXT: srli t0, t0, 8 +; RV32I-NEXT: sb t0, 9(a2) +; RV32I-NEXT: srli t0, a6, 16 +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: or a1, a6, a1 +; RV32I-NEXT: sb a6, 4(a2) +; RV32I-NEXT: srli a6, a6, 8 +; RV32I-NEXT: sb a6, 5(a2) +; RV32I-NEXT: srli a6, t4, 16 +; RV32I-NEXT: sb a6, 2(a2) ; RV32I-NEXT: or a0, t4, a0 -; RV32I-NEXT: sb t4, 4(a2) -; RV32I-NEXT: srli a7, t4, 8 -; RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb t4, 0(a2) +; RV32I-NEXT: srli a6, t4, 8 +; RV32I-NEXT: sb a6, 1(a2) +; RV32I-NEXT: srli a3, a3, 24 +; RV32I-NEXT: sb a3, 27(a2) ; RV32I-NEXT: srli a4, a4, 24 -; RV32I-NEXT: sb a4, 27(a2) +; RV32I-NEXT: sb a4, 23(a2) ; RV32I-NEXT: srli s1, s1, 24 ; RV32I-NEXT: sb s1, 19(a2) -; RV32I-NEXT: srli a3, a3, 24 -; RV32I-NEXT: sb a3, 23(a2) ; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: sb a5, 11(a2) -; RV32I-NEXT: srli a3, a6, 24 -; RV32I-NEXT: sb a3, 15(a2) +; RV32I-NEXT: sb a5, 15(a2) +; RV32I-NEXT: srli a3, a7, 24 +; RV32I-NEXT: sb a3, 11(a2) ; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: sb a1, 7(a2) ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: sb a0, 3(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload @@ -2201,7 +2201,6 @@ ; RV64I-NEXT: sb t6, 99(sp) ; RV64I-NEXT: sb t5, 98(sp) ; RV64I-NEXT: sb t4, 97(sp) -; RV64I-NEXT: sb t3, 96(sp) ; RV64I-NEXT: sb zero, 87(sp) ; RV64I-NEXT: sb zero, 86(sp) ; RV64I-NEXT: sb zero, 85(sp) @@ -2234,6 +2233,7 @@ ; RV64I-NEXT: sb zero, 58(sp) ; RV64I-NEXT: sb zero, 57(sp) ; RV64I-NEXT: sb zero, 56(sp) +; RV64I-NEXT: sb t3, 96(sp) ; RV64I-NEXT: sb t2, 95(sp) ; RV64I-NEXT: sb t1, 94(sp) ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload @@ -2297,20 +2297,20 @@ ; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: slli a5, a5, 32 ; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: lbu a5, 25(a0) -; RV64I-NEXT: lbu a6, 24(a0) -; RV64I-NEXT: lbu a7, 26(a0) -; RV64I-NEXT: lbu t0, 27(a0) +; RV64I-NEXT: lbu a5, 17(a0) +; RV64I-NEXT: lbu a6, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) ; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli t0, t0, 24 ; RV64I-NEXT: or a6, t0, a7 ; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 29(a0) -; RV64I-NEXT: lbu a7, 28(a0) -; RV64I-NEXT: lbu t0, 30(a0) -; RV64I-NEXT: lbu t1, 31(a0) +; RV64I-NEXT: lbu a6, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 @@ -2319,103 +2319,103 @@ ; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: slli a6, a6, 32 ; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: lbu a6, 17(a0) -; RV64I-NEXT: lbu a7, 16(a0) -; RV64I-NEXT: lbu t0, 18(a0) -; RV64I-NEXT: lbu t1, 19(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: lbu a7, 21(a0) -; RV64I-NEXT: or t0, t1, t0 -; RV64I-NEXT: or a6, t0, a6 -; RV64I-NEXT: lbu t0, 20(a0) +; RV64I-NEXT: srli a6, a3, 1 +; RV64I-NEXT: not a7, a1 +; RV64I-NEXT: srl a6, a6, a7 +; RV64I-NEXT: lbu a7, 25(a0) +; RV64I-NEXT: lbu t0, 24(a0) +; RV64I-NEXT: lbu t1, 26(a0) +; RV64I-NEXT: lbu t2, 27(a0) ; 
RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: lbu t1, 22(a0) -; RV64I-NEXT: lbu a0, 23(a0) ; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: srli t0, a4, 1 ; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: lbu t0, 29(a0) +; RV64I-NEXT: lbu t1, 28(a0) +; RV64I-NEXT: lbu t2, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) +; RV64I-NEXT: slli t0, t0, 8 +; RV64I-NEXT: or t0, t0, t1 +; RV64I-NEXT: slli t2, t2, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or t1, a0, t1 -; RV64I-NEXT: xori t2, a1, 63 -; RV64I-NEXT: srl a0, t0, t2 -; RV64I-NEXT: or a7, t1, a7 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: srli a7, a6, 1 -; RV64I-NEXT: srl a7, a7, t2 -; RV64I-NEXT: srli t0, a3, 1 -; RV64I-NEXT: not t1, a1 -; RV64I-NEXT: srl t0, t0, t1 +; RV64I-NEXT: or a0, a0, t2 +; RV64I-NEXT: srli t1, a4, 1 +; RV64I-NEXT: or a0, a0, t0 +; RV64I-NEXT: xori t0, a1, 63 +; RV64I-NEXT: srl t1, t1, t0 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: srli a7, a5, 1 +; RV64I-NEXT: srl a7, a7, t0 ; RV64I-NEXT: sll a3, a3, a1 ; RV64I-NEXT: sll a5, a5, a1 -; RV64I-NEXT: sll a6, a6, a1 +; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: sll a1, a4, a1 -; RV64I-NEXT: srli a4, a6, 56 -; RV64I-NEXT: sb a4, 23(a2) -; RV64I-NEXT: srli a4, a6, 48 -; RV64I-NEXT: sb a4, 22(a2) -; RV64I-NEXT: srli a4, a6, 40 -; RV64I-NEXT: sb a4, 21(a2) -; RV64I-NEXT: srli a4, a6, 32 -; RV64I-NEXT: sb a4, 20(a2) -; RV64I-NEXT: srli a4, a6, 24 -; RV64I-NEXT: sb a4, 19(a2) -; RV64I-NEXT: srli a4, a6, 16 -; RV64I-NEXT: sb a4, 18(a2) -; RV64I-NEXT: or a4, a6, t0 -; RV64I-NEXT: srli a6, a6, 8 -; RV64I-NEXT: sb a6, 17(a2) -; RV64I-NEXT: srli a6, a5, 56 -; RV64I-NEXT: sb a6, 31(a2) -; RV64I-NEXT: srli a6, a5, 48 -; RV64I-NEXT: sb a6, 30(a2) -; RV64I-NEXT: srli a6, a5, 40 -; RV64I-NEXT: sb a6, 29(a2) -; RV64I-NEXT: srli a6, a5, 32 -; RV64I-NEXT: sb a6, 28(a2) -; RV64I-NEXT: srli a6, a5, 24 -; RV64I-NEXT: sb a6, 27(a2) -; RV64I-NEXT: srli a6, a5, 16 -; RV64I-NEXT: sb a6, 26(a2) -; RV64I-NEXT: or a6, a5, a7 +; RV64I-NEXT: srli a4, a0, 56 +; RV64I-NEXT: sb a4, 31(a2) +; RV64I-NEXT: srli a4, a0, 48 +; RV64I-NEXT: sb a4, 30(a2) +; RV64I-NEXT: srli a4, a0, 40 +; RV64I-NEXT: sb a4, 29(a2) +; RV64I-NEXT: srli a4, a0, 32 +; RV64I-NEXT: sb a4, 28(a2) +; RV64I-NEXT: srli a4, a0, 24 +; RV64I-NEXT: sb a4, 27(a2) +; RV64I-NEXT: srli a4, a0, 16 +; RV64I-NEXT: sb a4, 26(a2) +; RV64I-NEXT: or a4, a0, a7 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 25(a2) +; RV64I-NEXT: srli a0, a5, 56 +; RV64I-NEXT: sb a0, 23(a2) +; RV64I-NEXT: srli a0, a5, 48 +; RV64I-NEXT: sb a0, 22(a2) +; RV64I-NEXT: srli a0, a5, 40 +; RV64I-NEXT: sb a0, 21(a2) +; RV64I-NEXT: srli a0, a5, 32 +; RV64I-NEXT: sb a0, 20(a2) +; RV64I-NEXT: srli a0, a5, 24 +; RV64I-NEXT: sb a0, 19(a2) +; RV64I-NEXT: srli a0, a5, 16 +; RV64I-NEXT: sb a0, 18(a2) +; RV64I-NEXT: or a0, a5, a6 ; RV64I-NEXT: srli a5, a5, 8 -; RV64I-NEXT: sb a5, 25(a2) -; RV64I-NEXT: srli a5, a1, 56 -; RV64I-NEXT: sb a5, 7(a2) -; RV64I-NEXT: srli a5, a1, 48 -; RV64I-NEXT: sb a5, 6(a2) -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: sb a5, 5(a2) -; RV64I-NEXT: srli a5, a1, 32 -; RV64I-NEXT: sb a5, 4(a2) -; RV64I-NEXT: srli a5, a1, 24 -; RV64I-NEXT: sb a5, 3(a2) -; RV64I-NEXT: srli a5, a1, 16 -; RV64I-NEXT: sb a5, 2(a2) +; RV64I-NEXT: sb a5, 17(a2) +; RV64I-NEXT: srli a5, a3, 56 +; RV64I-NEXT: sb a5, 15(a2) +; RV64I-NEXT: srli a5, a3, 48 +; RV64I-NEXT: sb a5, 14(a2) +; RV64I-NEXT: srli a5, a3, 40 +; RV64I-NEXT: sb a5, 13(a2) +; RV64I-NEXT: 
srli a5, a3, 32 +; RV64I-NEXT: sb a5, 12(a2) +; RV64I-NEXT: srli a5, a3, 24 +; RV64I-NEXT: sb a5, 11(a2) +; RV64I-NEXT: srli a5, a3, 16 +; RV64I-NEXT: sb a5, 10(a2) +; RV64I-NEXT: or a5, a3, t1 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 9(a2) +; RV64I-NEXT: srli a3, a1, 56 +; RV64I-NEXT: sb a3, 7(a2) +; RV64I-NEXT: srli a3, a1, 48 +; RV64I-NEXT: sb a3, 6(a2) +; RV64I-NEXT: srli a3, a1, 40 +; RV64I-NEXT: sb a3, 5(a2) +; RV64I-NEXT: srli a3, a1, 32 +; RV64I-NEXT: sb a3, 4(a2) +; RV64I-NEXT: srli a3, a1, 24 +; RV64I-NEXT: sb a3, 3(a2) +; RV64I-NEXT: srli a3, a1, 16 +; RV64I-NEXT: sb a3, 2(a2) ; RV64I-NEXT: sb a1, 0(a2) ; RV64I-NEXT: srli a1, a1, 8 ; RV64I-NEXT: sb a1, 1(a2) -; RV64I-NEXT: srli a1, a3, 56 -; RV64I-NEXT: sb a1, 15(a2) -; RV64I-NEXT: srli a1, a3, 48 -; RV64I-NEXT: sb a1, 14(a2) -; RV64I-NEXT: srli a1, a3, 40 -; RV64I-NEXT: sb a1, 13(a2) -; RV64I-NEXT: srli a1, a3, 32 -; RV64I-NEXT: sb a1, 12(a2) -; RV64I-NEXT: srli a1, a3, 24 -; RV64I-NEXT: sb a1, 11(a2) -; RV64I-NEXT: srli a1, a3, 16 -; RV64I-NEXT: sb a1, 10(a2) -; RV64I-NEXT: or a0, a3, a0 -; RV64I-NEXT: srli a3, a3, 8 -; RV64I-NEXT: sb a3, 9(a2) -; RV64I-NEXT: sb a4, 16(a2) -; RV64I-NEXT: sb a6, 24(a2) -; RV64I-NEXT: sb a0, 8(a2) +; RV64I-NEXT: sb a4, 24(a2) +; RV64I-NEXT: sb a0, 16(a2) +; RV64I-NEXT: sb a5, 8(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload @@ -2568,175 +2568,175 @@ ; RV32I-NEXT: sb a0, 60(sp) ; RV32I-NEXT: slli a0, t0, 24 ; RV32I-NEXT: srli a0, a0, 27 -; RV32I-NEXT: addi a5, sp, 60 -; RV32I-NEXT: sub a5, a5, a0 -; RV32I-NEXT: lbu a0, 5(a5) -; RV32I-NEXT: lbu a1, 4(a5) -; RV32I-NEXT: lbu a3, 6(a5) -; RV32I-NEXT: lbu a4, 7(a5) +; RV32I-NEXT: addi a4, sp, 60 +; RV32I-NEXT: sub a4, a4, a0 +; RV32I-NEXT: lbu a0, 5(a4) +; RV32I-NEXT: lbu a1, 4(a4) +; RV32I-NEXT: lbu a3, 6(a4) +; RV32I-NEXT: lbu a5, 7(a4) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a3, a3, 16 -; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or t3, a3, a0 -; RV32I-NEXT: andi a1, t0, 7 -; RV32I-NEXT: lbu a0, 1(a5) -; RV32I-NEXT: lbu a3, 0(a5) -; RV32I-NEXT: lbu a4, 2(a5) -; RV32I-NEXT: lbu a6, 3(a5) +; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or t4, a3, a0 +; RV32I-NEXT: andi a3, t0, 7 +; RV32I-NEXT: lbu a0, 1(a4) +; RV32I-NEXT: lbu a1, 0(a4) +; RV32I-NEXT: lbu a5, 2(a4) +; RV32I-NEXT: lbu a6, 3(a4) ; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: slli a4, a4, 16 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a6, a4 -; RV32I-NEXT: or a6, a3, a0 +; RV32I-NEXT: or a1, a6, a5 +; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: srli a0, a6, 1 -; RV32I-NEXT: xori a7, a1, 31 +; RV32I-NEXT: xori a7, a3, 31 ; RV32I-NEXT: srl a0, a0, a7 -; RV32I-NEXT: lbu a3, 13(a5) -; RV32I-NEXT: lbu a4, 12(a5) -; RV32I-NEXT: lbu t0, 14(a5) -; RV32I-NEXT: lbu t1, 15(a5) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 +; RV32I-NEXT: lbu a1, 9(a4) +; RV32I-NEXT: lbu a5, 8(a4) +; RV32I-NEXT: lbu t0, 10(a4) +; RV32I-NEXT: lbu t1, 11(a4) +; RV32I-NEXT: slli a1, a1, 8 +; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a4, t1, t0 -; RV32I-NEXT: or t0, a4, a3 -; RV32I-NEXT: lbu a3, 9(a5) -; RV32I-NEXT: lbu a4, 8(a5) -; RV32I-NEXT: lbu t1, 10(a5) -; RV32I-NEXT: lbu t2, 11(a5) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, a4 
-; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a4, t2, t1 -; RV32I-NEXT: or t1, a4, a3 -; RV32I-NEXT: srli a3, t1, 1 -; RV32I-NEXT: srl a3, a3, a7 -; RV32I-NEXT: srli a4, t3, 1 -; RV32I-NEXT: not t2, a1 -; RV32I-NEXT: lbu t4, 21(a5) -; RV32I-NEXT: lbu t5, 20(a5) -; RV32I-NEXT: lbu t6, 22(a5) -; RV32I-NEXT: lbu s0, 23(a5) -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: or t4, t4, t5 -; RV32I-NEXT: slli t6, t6, 16 -; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: lbu t5, 17(a5) -; RV32I-NEXT: lbu t6, 16(a5) -; RV32I-NEXT: lbu s0, 18(a5) -; RV32I-NEXT: lbu s1, 19(a5) +; RV32I-NEXT: or a5, t1, t0 +; RV32I-NEXT: or t0, a5, a1 +; RV32I-NEXT: srli a1, t4, 1 +; RV32I-NEXT: not t2, a3 +; RV32I-NEXT: srl a1, a1, t2 +; RV32I-NEXT: lbu a5, 13(a4) +; RV32I-NEXT: lbu t1, 12(a4) +; RV32I-NEXT: lbu t3, 14(a4) +; RV32I-NEXT: lbu t5, 15(a4) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, t1 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t5, t5, 24 +; RV32I-NEXT: or t1, t5, t3 +; RV32I-NEXT: or t3, t1, a5 +; RV32I-NEXT: lbu a5, 17(a4) +; RV32I-NEXT: lbu t1, 16(a4) +; RV32I-NEXT: lbu t5, 18(a4) +; RV32I-NEXT: lbu t6, 19(a4) +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, t1 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: or t1, t6, t5 +; RV32I-NEXT: or a5, t1, a5 +; RV32I-NEXT: srli t1, t3, 1 +; RV32I-NEXT: srl t1, t1, t2 +; RV32I-NEXT: lbu t5, 21(a4) +; RV32I-NEXT: lbu t6, 20(a4) +; RV32I-NEXT: lbu s0, 22(a4) +; RV32I-NEXT: lbu s1, 23(a4) ; RV32I-NEXT: slli t5, t5, 8 ; RV32I-NEXT: or t5, t5, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 ; RV32I-NEXT: or s0, s1, s0 ; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: lbu t6, 29(a5) -; RV32I-NEXT: lbu s0, 28(a5) -; RV32I-NEXT: lbu s1, 30(a5) -; RV32I-NEXT: lbu s2, 31(a5) +; RV32I-NEXT: lbu t6, 25(a4) +; RV32I-NEXT: lbu s0, 24(a4) +; RV32I-NEXT: lbu s1, 26(a4) +; RV32I-NEXT: lbu s2, 27(a4) ; RV32I-NEXT: slli t6, t6, 8 ; RV32I-NEXT: or t6, t6, s0 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 ; RV32I-NEXT: or s0, s2, s1 -; RV32I-NEXT: lbu s1, 25(a5) -; RV32I-NEXT: lbu s2, 24(a5) -; RV32I-NEXT: srl a4, a4, t2 ; RV32I-NEXT: or t6, s0, t6 -; RV32I-NEXT: slli s1, s1, 8 -; RV32I-NEXT: or s0, s1, s2 -; RV32I-NEXT: lbu s1, 26(a5) -; RV32I-NEXT: lbu a5, 27(a5) +; RV32I-NEXT: lbu s0, 29(a4) +; RV32I-NEXT: lbu s1, 28(a4) ; RV32I-NEXT: srli s2, t5, 1 +; RV32I-NEXT: srl t2, s2, t2 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: or s0, s0, s1 +; RV32I-NEXT: lbu s1, 30(a4) +; RV32I-NEXT: lbu a4, 31(a4) +; RV32I-NEXT: srli s2, t0, 1 ; RV32I-NEXT: srl s2, s2, a7 ; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a5, a5, s1 -; RV32I-NEXT: srli s1, t0, 1 -; RV32I-NEXT: srl s1, s1, t2 -; RV32I-NEXT: or a5, a5, s0 -; RV32I-NEXT: srli s0, a5, 1 +; RV32I-NEXT: slli a4, a4, 24 +; RV32I-NEXT: or a4, a4, s1 +; RV32I-NEXT: srli s1, a5, 1 +; RV32I-NEXT: srl s1, s1, a7 +; RV32I-NEXT: or a4, a4, s0 +; RV32I-NEXT: srli s0, t6, 1 ; RV32I-NEXT: srl a7, s0, a7 -; RV32I-NEXT: srli s0, t4, 1 -; RV32I-NEXT: srl t2, s0, t2 -; RV32I-NEXT: sll t3, t3, a1 -; RV32I-NEXT: sll t0, t0, a1 -; RV32I-NEXT: sll t1, t1, a1 -; RV32I-NEXT: sll t4, t4, a1 -; RV32I-NEXT: sll t5, t5, a1 -; RV32I-NEXT: sll t6, t6, a1 -; RV32I-NEXT: sll a5, a5, a1 -; RV32I-NEXT: sll a1, a6, a1 -; RV32I-NEXT: srli a6, a5, 24 -; RV32I-NEXT: sb a6, 27(a2) -; RV32I-NEXT: srli a6, a5, 16 -; RV32I-NEXT: sb a6, 26(a2) -; RV32I-NEXT: or a6, a5, t2 -; RV32I-NEXT: srli 
a5, a5, 8 -; RV32I-NEXT: sb a5, 25(a2) -; RV32I-NEXT: srli a5, t6, 24 -; RV32I-NEXT: sb a5, 31(a2) -; RV32I-NEXT: srli a5, t6, 16 -; RV32I-NEXT: sb a5, 30(a2) -; RV32I-NEXT: or a5, t6, a7 +; RV32I-NEXT: sll t4, t4, a3 +; RV32I-NEXT: sll t0, t0, a3 +; RV32I-NEXT: sll t3, t3, a3 +; RV32I-NEXT: sll a5, a5, a3 +; RV32I-NEXT: sll t5, t5, a3 +; RV32I-NEXT: sll t6, t6, a3 +; RV32I-NEXT: sll a4, a4, a3 +; RV32I-NEXT: sll a3, a6, a3 +; RV32I-NEXT: srli a6, a4, 24 +; RV32I-NEXT: sb a6, 31(a2) +; RV32I-NEXT: srli a6, a4, 16 +; RV32I-NEXT: sb a6, 30(a2) +; RV32I-NEXT: or a6, a4, a7 +; RV32I-NEXT: srli a4, a4, 8 +; RV32I-NEXT: sb a4, 29(a2) +; RV32I-NEXT: srli a4, t6, 24 +; RV32I-NEXT: sb a4, 27(a2) +; RV32I-NEXT: srli a4, t6, 16 +; RV32I-NEXT: sb a4, 26(a2) +; RV32I-NEXT: or a4, t6, t2 ; RV32I-NEXT: srli a7, t6, 8 -; RV32I-NEXT: sb a7, 29(a2) +; RV32I-NEXT: sb a7, 25(a2) ; RV32I-NEXT: srli a7, t5, 24 -; RV32I-NEXT: sb a7, 19(a2) +; RV32I-NEXT: sb a7, 23(a2) ; RV32I-NEXT: srli a7, t5, 16 -; RV32I-NEXT: sb a7, 18(a2) +; RV32I-NEXT: sb a7, 22(a2) ; RV32I-NEXT: or a7, t5, s1 ; RV32I-NEXT: srli t2, t5, 8 -; RV32I-NEXT: sb t2, 17(a2) -; RV32I-NEXT: srli t2, t4, 24 -; RV32I-NEXT: sb t2, 23(a2) -; RV32I-NEXT: srli t2, t4, 16 -; RV32I-NEXT: sb t2, 22(a2) -; RV32I-NEXT: or t2, t4, s2 -; RV32I-NEXT: srli t4, t4, 8 -; RV32I-NEXT: sb t4, 21(a2) -; RV32I-NEXT: srli t4, t1, 24 -; RV32I-NEXT: sb t4, 11(a2) -; RV32I-NEXT: srli t4, t1, 16 -; RV32I-NEXT: sb t4, 10(a2) -; RV32I-NEXT: or a4, t1, a4 -; RV32I-NEXT: srli t1, t1, 8 -; RV32I-NEXT: sb t1, 9(a2) -; RV32I-NEXT: srli t1, t0, 24 -; RV32I-NEXT: sb t1, 15(a2) -; RV32I-NEXT: srli t1, t0, 16 -; RV32I-NEXT: sb t1, 14(a2) -; RV32I-NEXT: or a3, t0, a3 +; RV32I-NEXT: sb t2, 21(a2) +; RV32I-NEXT: srli t2, a5, 24 +; RV32I-NEXT: sb t2, 19(a2) +; RV32I-NEXT: srli t2, a5, 16 +; RV32I-NEXT: sb t2, 18(a2) +; RV32I-NEXT: or t1, a5, t1 +; RV32I-NEXT: srli a5, a5, 8 +; RV32I-NEXT: sb a5, 17(a2) +; RV32I-NEXT: srli a5, t3, 24 +; RV32I-NEXT: sb a5, 15(a2) +; RV32I-NEXT: srli a5, t3, 16 +; RV32I-NEXT: sb a5, 14(a2) +; RV32I-NEXT: or a5, t3, s2 +; RV32I-NEXT: srli t2, t3, 8 +; RV32I-NEXT: sb t2, 13(a2) +; RV32I-NEXT: srli t2, t0, 24 +; RV32I-NEXT: sb t2, 11(a2) +; RV32I-NEXT: srli t2, t0, 16 +; RV32I-NEXT: sb t2, 10(a2) +; RV32I-NEXT: or a1, t0, a1 ; RV32I-NEXT: srli t0, t0, 8 -; RV32I-NEXT: sb t0, 13(a2) -; RV32I-NEXT: srli t0, a1, 24 +; RV32I-NEXT: sb t0, 9(a2) +; RV32I-NEXT: srli t0, t4, 24 +; RV32I-NEXT: sb t0, 7(a2) +; RV32I-NEXT: srli t0, t4, 16 +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: or a0, t4, a0 +; RV32I-NEXT: srli t0, t4, 8 +; RV32I-NEXT: sb t0, 5(a2) +; RV32I-NEXT: srli t0, a3, 24 ; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: srli t0, a1, 16 +; RV32I-NEXT: srli t0, a3, 16 ; RV32I-NEXT: sb t0, 2(a2) -; RV32I-NEXT: sb a1, 0(a2) -; RV32I-NEXT: srli a1, a1, 8 -; RV32I-NEXT: sb a1, 1(a2) -; RV32I-NEXT: srli a1, t3, 24 -; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: srli a1, t3, 16 -; RV32I-NEXT: sb a1, 6(a2) -; RV32I-NEXT: or a0, t3, a0 -; RV32I-NEXT: srli a1, t3, 8 -; RV32I-NEXT: sb a1, 5(a2) -; RV32I-NEXT: sb a6, 24(a2) -; RV32I-NEXT: sb a5, 28(a2) -; RV32I-NEXT: sb a7, 16(a2) -; RV32I-NEXT: sb t2, 20(a2) -; RV32I-NEXT: sb a4, 8(a2) -; RV32I-NEXT: sb a3, 12(a2) +; RV32I-NEXT: sb a3, 0(a2) +; RV32I-NEXT: srli a3, a3, 8 +; RV32I-NEXT: sb a3, 1(a2) +; RV32I-NEXT: sb a6, 28(a2) +; RV32I-NEXT: sb a4, 24(a2) +; RV32I-NEXT: sb a7, 20(a2) +; RV32I-NEXT: sb t1, 16(a2) +; RV32I-NEXT: sb a5, 12(a2) +; RV32I-NEXT: sb a1, 8(a2) ; RV32I-NEXT: sb a0, 4(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte 
Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload @@ -2917,138 +2917,152 @@ ; RV64I-NEXT: sb a0, 89(sp) ; RV64I-NEXT: slli a0, t1, 56 ; RV64I-NEXT: srli a0, a0, 59 -; RV64I-NEXT: addi a3, sp, 56 -; RV64I-NEXT: add a3, a3, a0 -; RV64I-NEXT: lbu a0, 9(a3) -; RV64I-NEXT: lbu a1, 8(a3) -; RV64I-NEXT: lbu a4, 10(a3) -; RV64I-NEXT: lbu a5, 11(a3) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: addi a1, sp, 56 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lbu a1, 1(a0) +; RV64I-NEXT: lbu a3, 0(a0) +; RV64I-NEXT: lbu a4, 2(a0) +; RV64I-NEXT: lbu a5, 3(a0) +; RV64I-NEXT: slli a1, a1, 8 +; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: slli a5, a5, 24 ; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: lbu a1, 13(a3) -; RV64I-NEXT: lbu a4, 12(a3) -; RV64I-NEXT: lbu a5, 14(a3) -; RV64I-NEXT: lbu a6, 15(a3) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a4 +; RV64I-NEXT: or a1, a4, a1 +; RV64I-NEXT: lbu a3, 5(a0) +; RV64I-NEXT: lbu a4, 4(a0) +; RV64I-NEXT: lbu a5, 6(a0) +; RV64I-NEXT: lbu a6, 7(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a4, a1, a0 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: or a6, a3, a1 ; RV64I-NEXT: andi a1, t1, 7 -; RV64I-NEXT: lbu a0, 17(a3) -; RV64I-NEXT: lbu a5, 16(a3) -; RV64I-NEXT: lbu a6, 18(a3) -; RV64I-NEXT: lbu a7, 19(a3) -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: or a0, a0, a5 -; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: lbu a3, 9(a0) +; RV64I-NEXT: lbu a4, 8(a0) +; RV64I-NEXT: lbu a5, 10(a0) +; RV64I-NEXT: lbu a7, 11(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a5, a7, a6 -; RV64I-NEXT: or a0, a5, a0 -; RV64I-NEXT: lbu a5, 21(a3) -; RV64I-NEXT: lbu a6, 20(a3) -; RV64I-NEXT: lbu a7, 22(a3) -; RV64I-NEXT: lbu t0, 23(a3) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: or a4, a7, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 13(a0) +; RV64I-NEXT: lbu a5, 12(a0) +; RV64I-NEXT: lbu a7, 14(a0) +; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a5 ; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a6, t0, a7 -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: or a5, a5, a0 -; RV64I-NEXT: slli a0, a5, 1 -; RV64I-NEXT: not a6, a1 -; RV64I-NEXT: sll a0, a0, a6 -; RV64I-NEXT: lbu a6, 1(a3) -; RV64I-NEXT: lbu a7, 0(a3) -; RV64I-NEXT: lbu t0, 2(a3) -; RV64I-NEXT: lbu t1, 3(a3) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a5, a4, a3 +; RV64I-NEXT: lbu a3, 17(a0) +; RV64I-NEXT: lbu a4, 16(a0) +; RV64I-NEXT: lbu a7, 18(a0) +; RV64I-NEXT: lbu t0, 19(a0) +; RV64I-NEXT: slli a3, a3, 8 +; RV64I-NEXT: or a3, a3, a4 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a4, t0, a7 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: lbu a4, 21(a0) +; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 22(a0) +; RV64I-NEXT: lbu t1, 23(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: or a4, a4, a7 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 ; RV64I-NEXT: or a7, t1, t0 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 5(a3) -; 
RV64I-NEXT: lbu t0, 4(a3) -; RV64I-NEXT: lbu t1, 6(a3) -; RV64I-NEXT: lbu t2, 7(a3) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or t0, t2, t1 -; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: or a6, a7, a6 -; RV64I-NEXT: lbu a7, 25(a3) -; RV64I-NEXT: lbu t0, 24(a3) -; RV64I-NEXT: lbu t1, 26(a3) -; RV64I-NEXT: lbu t2, 27(a3) +; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli a4, a3, 1 +; RV64I-NEXT: not a7, a1 +; RV64I-NEXT: sll a4, a4, a7 +; RV64I-NEXT: lbu a7, 25(a0) +; RV64I-NEXT: lbu t0, 24(a0) +; RV64I-NEXT: lbu t1, 26(a0) +; RV64I-NEXT: lbu t2, 27(a0) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 ; RV64I-NEXT: or t0, t2, t1 ; RV64I-NEXT: or a7, t0, a7 -; RV64I-NEXT: lbu t0, 29(a3) -; RV64I-NEXT: lbu t1, 28(a3) -; RV64I-NEXT: lbu t2, 30(a3) -; RV64I-NEXT: lbu a3, 31(a3) +; RV64I-NEXT: lbu t0, 29(a0) +; RV64I-NEXT: lbu t1, 28(a0) +; RV64I-NEXT: lbu t2, 30(a0) +; RV64I-NEXT: lbu a0, 31(a0) ; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli a3, a3, 24 -; RV64I-NEXT: or a3, a3, t2 -; RV64I-NEXT: slli t1, a4, 1 -; RV64I-NEXT: or a3, a3, t0 +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or a0, a0, t2 +; RV64I-NEXT: slli t1, a5, 1 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: xori t0, a1, 63 ; RV64I-NEXT: sll t1, t1, t0 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a3, a3, a7 -; RV64I-NEXT: slli a7, a3, 1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: or a0, a0, a7 +; RV64I-NEXT: slli a7, a0, 1 ; RV64I-NEXT: sll a7, a7, t0 -; RV64I-NEXT: srl a4, a4, a1 ; RV64I-NEXT: srl a6, a6, a1 ; RV64I-NEXT: srl a5, a5, a1 -; RV64I-NEXT: sra a1, a3, a1 -; RV64I-NEXT: srli a3, a5, 48 -; RV64I-NEXT: sb a3, 22(a2) -; RV64I-NEXT: srli a3, a5, 40 -; RV64I-NEXT: sb a3, 21(a2) -; RV64I-NEXT: srli a3, a5, 32 -; RV64I-NEXT: sb a3, 20(a2) -; RV64I-NEXT: srli a3, a5, 24 -; RV64I-NEXT: sb a3, 19(a2) -; RV64I-NEXT: srli a3, a5, 16 -; RV64I-NEXT: sb a3, 18(a2) -; RV64I-NEXT: or a3, a5, a7 -; RV64I-NEXT: sb a5, 16(a2) +; RV64I-NEXT: srl a3, a3, a1 +; RV64I-NEXT: sra a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: sb a1, 31(a2) +; RV64I-NEXT: srli a1, a0, 48 +; RV64I-NEXT: sb a1, 30(a2) +; RV64I-NEXT: srli a1, a0, 40 +; RV64I-NEXT: sb a1, 29(a2) +; RV64I-NEXT: srli a1, a0, 32 +; RV64I-NEXT: sb a1, 28(a2) +; RV64I-NEXT: srli a1, a0, 24 +; RV64I-NEXT: sb a1, 27(a2) +; RV64I-NEXT: srli a1, a0, 16 +; RV64I-NEXT: sb a1, 26(a2) +; RV64I-NEXT: sb a0, 24(a2) +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: sb a0, 25(a2) +; RV64I-NEXT: srli a0, a3, 48 +; RV64I-NEXT: sb a0, 22(a2) +; RV64I-NEXT: srli a0, a3, 40 +; RV64I-NEXT: sb a0, 21(a2) +; RV64I-NEXT: srli a0, a3, 32 +; RV64I-NEXT: sb a0, 20(a2) +; RV64I-NEXT: srli a0, a3, 24 +; RV64I-NEXT: sb a0, 19(a2) +; RV64I-NEXT: srli a0, a3, 16 +; RV64I-NEXT: sb a0, 18(a2) +; RV64I-NEXT: or a0, a3, a7 +; RV64I-NEXT: sb a3, 16(a2) +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 17(a2) +; RV64I-NEXT: srli a1, a5, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, a5, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a5, 32 +; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: srli a1, a5, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a5, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: sb a5, 8(a2) ; RV64I-NEXT: srli a5, a5, 8 -; RV64I-NEXT: sb a5, 
17(a2) -; RV64I-NEXT: srli a5, a1, 56 -; RV64I-NEXT: sb a5, 31(a2) -; RV64I-NEXT: srli a5, a1, 48 -; RV64I-NEXT: sb a5, 30(a2) -; RV64I-NEXT: srli a5, a1, 40 -; RV64I-NEXT: sb a5, 29(a2) -; RV64I-NEXT: srli a5, a1, 32 -; RV64I-NEXT: sb a5, 28(a2) -; RV64I-NEXT: srli a5, a1, 24 -; RV64I-NEXT: sb a5, 27(a2) -; RV64I-NEXT: srli a5, a1, 16 -; RV64I-NEXT: sb a5, 26(a2) -; RV64I-NEXT: sb a1, 24(a2) -; RV64I-NEXT: srli a1, a1, 8 -; RV64I-NEXT: sb a1, 25(a2) +; RV64I-NEXT: sb a5, 9(a2) ; RV64I-NEXT: srli a1, a6, 48 ; RV64I-NEXT: sb a1, 6(a2) ; RV64I-NEXT: srli a1, a6, 40 @@ -3061,28 +3075,14 @@ ; RV64I-NEXT: sb a1, 2(a2) ; RV64I-NEXT: or a1, a6, t1 ; RV64I-NEXT: sb a6, 0(a2) -; RV64I-NEXT: srli a5, a6, 8 -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: srli a5, a4, 48 -; RV64I-NEXT: sb a5, 14(a2) -; RV64I-NEXT: srli a5, a4, 40 -; RV64I-NEXT: sb a5, 13(a2) -; RV64I-NEXT: srli a5, a4, 32 -; RV64I-NEXT: sb a5, 12(a2) -; RV64I-NEXT: srli a5, a4, 24 -; RV64I-NEXT: sb a5, 11(a2) -; RV64I-NEXT: srli a5, a4, 16 -; RV64I-NEXT: sb a5, 10(a2) -; RV64I-NEXT: or a0, a4, a0 -; RV64I-NEXT: sb a4, 8(a2) -; RV64I-NEXT: srli a4, a4, 8 -; RV64I-NEXT: sb a4, 9(a2) -; RV64I-NEXT: srli a3, a3, 56 -; RV64I-NEXT: sb a3, 23(a2) +; RV64I-NEXT: srli a3, a6, 8 +; RV64I-NEXT: sb a3, 1(a2) +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: sb a0, 23(a2) +; RV64I-NEXT: srli a4, a4, 56 +; RV64I-NEXT: sb a4, 15(a2) ; RV64I-NEXT: srli a1, a1, 56 ; RV64I-NEXT: sb a1, 7(a2) -; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: sb a0, 15(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 200(sp) # 8-byte Folded Reload @@ -3242,10 +3242,10 @@ ; RV32I-NEXT: srli a0, a0, 27 ; RV32I-NEXT: addi a3, sp, 28 ; RV32I-NEXT: add a3, a3, a0 -; RV32I-NEXT: lbu a0, 5(a3) -; RV32I-NEXT: lbu a1, 4(a3) -; RV32I-NEXT: lbu a4, 6(a3) -; RV32I-NEXT: lbu a5, 7(a3) +; RV32I-NEXT: lbu a0, 1(a3) +; RV32I-NEXT: lbu a1, 0(a3) +; RV32I-NEXT: lbu a4, 2(a3) +; RV32I-NEXT: lbu a5, 3(a3) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a4, a4, 16 @@ -3253,10 +3253,10 @@ ; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: or t4, a4, a0 ; RV32I-NEXT: andi a4, t1, 7 -; RV32I-NEXT: lbu a0, 9(a3) -; RV32I-NEXT: lbu a1, 8(a3) -; RV32I-NEXT: lbu a5, 10(a3) -; RV32I-NEXT: lbu a6, 11(a3) +; RV32I-NEXT: lbu a0, 5(a3) +; RV32I-NEXT: lbu a1, 4(a3) +; RV32I-NEXT: lbu a5, 6(a3) +; RV32I-NEXT: lbu a6, 7(a3) ; RV32I-NEXT: slli a0, a0, 8 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a5, a5, 16 @@ -3264,43 +3264,43 @@ ; RV32I-NEXT: or a1, a6, a5 ; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: slli a0, a6, 1 -; RV32I-NEXT: not t0, a4 -; RV32I-NEXT: sll a0, a0, t0 -; RV32I-NEXT: lbu a1, 1(a3) -; RV32I-NEXT: lbu a5, 0(a3) -; RV32I-NEXT: lbu a7, 2(a3) -; RV32I-NEXT: lbu t1, 3(a3) +; RV32I-NEXT: xori a7, a4, 31 +; RV32I-NEXT: sll a0, a0, a7 +; RV32I-NEXT: lbu a1, 9(a3) +; RV32I-NEXT: lbu a5, 8(a3) +; RV32I-NEXT: lbu t0, 10(a3) +; RV32I-NEXT: lbu t1, 11(a3) ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, a5 -; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a5, t1, a7 -; RV32I-NEXT: or t1, a5, a1 -; RV32I-NEXT: slli a1, t4, 1 -; RV32I-NEXT: xori t2, a4, 31 +; RV32I-NEXT: or a5, t1, t0 +; RV32I-NEXT: or t0, a5, a1 +; RV32I-NEXT: slli a1, t0, 1 +; RV32I-NEXT: not t2, a4 ; RV32I-NEXT: sll a1, a1, t2 ; RV32I-NEXT: lbu a5, 13(a3) -; RV32I-NEXT: lbu a7, 12(a3) +; RV32I-NEXT: lbu t1, 12(a3) ; RV32I-NEXT: lbu t3, 14(a3) ; RV32I-NEXT: lbu t5, 15(a3) ; RV32I-NEXT: 
slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 +; RV32I-NEXT: or a5, a5, t1 ; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: or a7, t5, t3 -; RV32I-NEXT: or t3, a7, a5 +; RV32I-NEXT: or t1, t5, t3 +; RV32I-NEXT: or t3, t1, a5 ; RV32I-NEXT: lbu a5, 17(a3) -; RV32I-NEXT: lbu a7, 16(a3) +; RV32I-NEXT: lbu t1, 16(a3) ; RV32I-NEXT: lbu t5, 18(a3) ; RV32I-NEXT: lbu t6, 19(a3) ; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a7 +; RV32I-NEXT: or a5, a5, t1 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a7, t6, t5 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: slli a7, a5, 1 -; RV32I-NEXT: sll a7, a7, t0 +; RV32I-NEXT: or t1, t6, t5 +; RV32I-NEXT: or a5, t1, a5 +; RV32I-NEXT: slli t1, a5, 1 +; RV32I-NEXT: sll t1, t1, t2 ; RV32I-NEXT: lbu t5, 21(a3) ; RV32I-NEXT: lbu t6, 20(a3) ; RV32I-NEXT: lbu s0, 22(a3) @@ -3324,92 +3324,92 @@ ; RV32I-NEXT: lbu s0, 29(a3) ; RV32I-NEXT: lbu s1, 28(a3) ; RV32I-NEXT: slli s2, t6, 1 -; RV32I-NEXT: sll t0, s2, t0 +; RV32I-NEXT: sll t2, s2, t2 ; RV32I-NEXT: slli s0, s0, 8 ; RV32I-NEXT: or s0, s0, s1 ; RV32I-NEXT: lbu s1, 30(a3) ; RV32I-NEXT: lbu a3, 31(a3) ; RV32I-NEXT: slli s2, t3, 1 -; RV32I-NEXT: sll s2, s2, t2 +; RV32I-NEXT: sll s2, s2, a7 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli a3, a3, 24 ; RV32I-NEXT: or a3, a3, s1 ; RV32I-NEXT: slli s1, t5, 1 -; RV32I-NEXT: sll s1, s1, t2 +; RV32I-NEXT: sll s1, s1, a7 ; RV32I-NEXT: or a3, a3, s0 ; RV32I-NEXT: slli s0, a3, 1 -; RV32I-NEXT: sll t2, s0, t2 +; RV32I-NEXT: sll a7, s0, a7 ; RV32I-NEXT: srl t4, t4, a4 -; RV32I-NEXT: srl t1, t1, a4 -; RV32I-NEXT: srl t3, t3, a4 ; RV32I-NEXT: srl a6, a6, a4 -; RV32I-NEXT: srl t5, t5, a4 +; RV32I-NEXT: srl t0, t0, a4 +; RV32I-NEXT: srl t3, t3, a4 ; RV32I-NEXT: srl a5, a5, a4 +; RV32I-NEXT: srl t5, t5, a4 ; RV32I-NEXT: srl t6, t6, a4 ; RV32I-NEXT: sra a3, a3, a4 -; RV32I-NEXT: srli a4, t6, 16 -; RV32I-NEXT: sb a4, 26(a2) -; RV32I-NEXT: or a4, t6, t2 -; RV32I-NEXT: sb t6, 24(a2) -; RV32I-NEXT: srli t2, t6, 8 -; RV32I-NEXT: sb t2, 25(a2) -; RV32I-NEXT: srli t2, a3, 24 -; RV32I-NEXT: sb t2, 31(a2) -; RV32I-NEXT: srli t2, a3, 16 -; RV32I-NEXT: sb t2, 30(a2) +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: sb a4, 31(a2) +; RV32I-NEXT: srli a4, a3, 16 +; RV32I-NEXT: sb a4, 30(a2) ; RV32I-NEXT: sb a3, 28(a2) ; RV32I-NEXT: srli a3, a3, 8 ; RV32I-NEXT: sb a3, 29(a2) -; RV32I-NEXT: srli a3, a5, 16 -; RV32I-NEXT: sb a3, 18(a2) +; RV32I-NEXT: srli a3, t6, 16 +; RV32I-NEXT: sb a3, 26(a2) +; RV32I-NEXT: or a3, t6, a7 +; RV32I-NEXT: sb t6, 24(a2) +; RV32I-NEXT: srli a4, t6, 8 +; RV32I-NEXT: sb a4, 25(a2) +; RV32I-NEXT: srli a4, t5, 16 +; RV32I-NEXT: sb a4, 22(a2) +; RV32I-NEXT: or a4, t5, t2 +; RV32I-NEXT: sb t5, 20(a2) +; RV32I-NEXT: srli a7, t5, 8 +; RV32I-NEXT: sb a7, 21(a2) +; RV32I-NEXT: srli a7, a5, 16 +; RV32I-NEXT: sb a7, 18(a2) ; RV32I-NEXT: or s1, a5, s1 ; RV32I-NEXT: sb a5, 16(a2) ; RV32I-NEXT: srli a5, a5, 8 ; RV32I-NEXT: sb a5, 17(a2) -; RV32I-NEXT: srli a3, t5, 16 -; RV32I-NEXT: sb a3, 22(a2) -; RV32I-NEXT: or a3, t5, t0 -; RV32I-NEXT: sb t5, 20(a2) -; RV32I-NEXT: srli a5, t5, 8 -; RV32I-NEXT: sb a5, 21(a2) -; RV32I-NEXT: srli a5, a6, 16 -; RV32I-NEXT: sb a5, 10(a2) -; RV32I-NEXT: or a5, a6, s2 -; RV32I-NEXT: sb a6, 8(a2) -; RV32I-NEXT: srli a6, a6, 8 -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: srli a6, t3, 16 -; RV32I-NEXT: sb a6, 14(a2) -; RV32I-NEXT: or a6, t3, a7 +; RV32I-NEXT: srli a5, t3, 16 +; RV32I-NEXT: sb a5, 14(a2) +; RV32I-NEXT: or a5, t3, t1 ; RV32I-NEXT: sb t3, 12(a2) ; RV32I-NEXT: srli a7, t3, 8 ; RV32I-NEXT: sb 
a7, 13(a2) -; RV32I-NEXT: srli a7, t1, 16 -; RV32I-NEXT: sb a7, 2(a2) -; RV32I-NEXT: or a1, t1, a1 -; RV32I-NEXT: sb t1, 0(a2) -; RV32I-NEXT: srli a7, t1, 8 -; RV32I-NEXT: sb a7, 1(a2) -; RV32I-NEXT: srli a7, t4, 16 -; RV32I-NEXT: sb a7, 6(a2) +; RV32I-NEXT: srli a7, t0, 16 +; RV32I-NEXT: sb a7, 10(a2) +; RV32I-NEXT: or a7, t0, s2 +; RV32I-NEXT: sb t0, 8(a2) +; RV32I-NEXT: srli t0, t0, 8 +; RV32I-NEXT: sb t0, 9(a2) +; RV32I-NEXT: srli t0, a6, 16 +; RV32I-NEXT: sb t0, 6(a2) +; RV32I-NEXT: or a1, a6, a1 +; RV32I-NEXT: sb a6, 4(a2) +; RV32I-NEXT: srli a6, a6, 8 +; RV32I-NEXT: sb a6, 5(a2) +; RV32I-NEXT: srli a6, t4, 16 +; RV32I-NEXT: sb a6, 2(a2) ; RV32I-NEXT: or a0, t4, a0 -; RV32I-NEXT: sb t4, 4(a2) -; RV32I-NEXT: srli a7, t4, 8 -; RV32I-NEXT: sb a7, 5(a2) +; RV32I-NEXT: sb t4, 0(a2) +; RV32I-NEXT: srli a6, t4, 8 +; RV32I-NEXT: sb a6, 1(a2) +; RV32I-NEXT: srli a3, a3, 24 +; RV32I-NEXT: sb a3, 27(a2) ; RV32I-NEXT: srli a4, a4, 24 -; RV32I-NEXT: sb a4, 27(a2) +; RV32I-NEXT: sb a4, 23(a2) ; RV32I-NEXT: srli s1, s1, 24 ; RV32I-NEXT: sb s1, 19(a2) -; RV32I-NEXT: srli a3, a3, 24 -; RV32I-NEXT: sb a3, 23(a2) ; RV32I-NEXT: srli a5, a5, 24 -; RV32I-NEXT: sb a5, 11(a2) -; RV32I-NEXT: srli a3, a6, 24 -; RV32I-NEXT: sb a3, 15(a2) +; RV32I-NEXT: sb a5, 15(a2) +; RV32I-NEXT: srli a3, a7, 24 +; RV32I-NEXT: sb a3, 11(a2) ; RV32I-NEXT: srli a1, a1, 24 -; RV32I-NEXT: sb a1, 3(a2) +; RV32I-NEXT: sb a1, 7(a2) ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: sb a0, 7(a2) +; RV32I-NEXT: sb a0, 3(a2) ; RV32I-NEXT: lw ra, 140(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 136(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 132(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/SPARC/LeonInsertNOPLoadPassUT.ll b/llvm/test/CodeGen/SPARC/LeonInsertNOPLoadPassUT.ll --- a/llvm/test/CodeGen/SPARC/LeonInsertNOPLoadPassUT.ll +++ b/llvm/test/CodeGen/SPARC/LeonInsertNOPLoadPassUT.ll @@ -13,8 +13,7 @@ ; CHECK-NEXT: sethi 1042305, %o0 ; CHECK-NEXT: or %o0, 25, %o0 ; CHECK-NEXT: st %o0, [%sp+92] -; CHECK-NEXT: sethi %hi(.LCPI0_0), %o0 -; CHECK-NEXT: ld [%o0+%lo(.LCPI0_0)], %f0 +; CHECK-NEXT: ld [%sp+92], %f0 ; CHECK-NEXT: nop ; CHECK-NEXT: retl ; CHECK-NEXT: add %sp, 96, %sp diff --git a/llvm/test/CodeGen/SystemZ/fp-move-02.ll b/llvm/test/CodeGen/SystemZ/fp-move-02.ll --- a/llvm/test/CodeGen/SystemZ/fp-move-02.ll +++ b/llvm/test/CodeGen/SystemZ/fp-move-02.ll @@ -81,10 +81,10 @@ define void @f6(ptr %a, ptr %b) { ; CHECK-LABEL: f6: ; CHECK: # %bb.0: -; CHECK-NEXT: lg %r0, 8(%r3) -; CHECK-NEXT: lg %r1, 0(%r3) -; CHECK-NEXT: stg %r0, 8(%r2) -; CHECK-NEXT: stg %r1, 0(%r2) +; CHECK-NEXT: ld %f0, 0(%r3) +; CHECK-NEXT: ld %f2, 8(%r3) +; CHECK-NEXT: std %f0, 0(%r2) +; CHECK-NEXT: std %f2, 8(%r2) ; CHECK-NEXT: br %r14 %val = load i128, ptr %b %res = bitcast i128 %val to fp128 @@ -120,10 +120,10 @@ define void @f9(ptr %a, ptr %b) { ; CHECK-LABEL: f9: ; CHECK: # %bb.0: -; CHECK-NEXT: ld %f0, 0(%r2) -; CHECK-NEXT: ld %f2, 8(%r2) -; CHECK-NEXT: std %f0, 0(%r3) -; CHECK-NEXT: std %f2, 8(%r3) +; CHECK-NEXT: lg %r0, 8(%r2) +; CHECK-NEXT: lg %r1, 0(%r2) +; CHECK-NEXT: stg %r0, 8(%r3) +; CHECK-NEXT: stg %r1, 0(%r3) ; CHECK-NEXT: br %r14 %val = load fp128, ptr %a %res = bitcast fp128 %val to i128 diff --git a/llvm/test/CodeGen/SystemZ/frame-24.ll b/llvm/test/CodeGen/SystemZ/frame-24.ll --- a/llvm/test/CodeGen/SystemZ/frame-24.ll +++ b/llvm/test/CodeGen/SystemZ/frame-24.ll @@ -12,15 +12,15 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: stmg %r4, %r15, 32(%r15) ; CHECK-NEXT: aghi %r15, -192 -; CHECK-NEXT: std %f2, 328(%r15) -; CHECK-NEXT: std %f4, 
336(%r15) ; CHECK-NEXT: std %f6, 344(%r15) -; CHECK-NEXT: la %r0, 352(%r15) -; CHECK-NEXT: stg %r0, 176(%r15) +; CHECK-NEXT: std %f4, 336(%r15) +; CHECK-NEXT: std %f2, 328(%r15) ; CHECK-NEXT: la %r0, 192(%r15) ; CHECK-NEXT: stg %r0, 184(%r15) -; CHECK-NEXT: mvghi 160(%r15), 2 +; CHECK-NEXT: la %r0, 352(%r15) +; CHECK-NEXT: stg %r0, 176(%r15) ; CHECK-NEXT: mvghi 168(%r15), 1 +; CHECK-NEXT: mvghi 160(%r15), 2 ; CHECK-NEXT: lmg %r6, %r15, 240(%r15) ; CHECK-NEXT: br %r14 entry: diff --git a/llvm/test/CodeGen/SystemZ/fshl.ll b/llvm/test/CodeGen/SystemZ/fshl.ll --- a/llvm/test/CodeGen/SystemZ/fshl.ll +++ b/llvm/test/CodeGen/SystemZ/fshl.ll @@ -82,9 +82,10 @@ ; CHECK-NEXT: lg %r1, 8(%r5) ; CHECK-NEXT: lg %r0, 0(%r4) ; CHECK-NEXT: lg %r14, 8(%r3) -; CHECK-NEXT: tmll %r1, 64 +; CHECK-NEXT: risbg %r5, %r1, 63, 191, 58 +; CHECK-NEXT: chi %r5, 0 ; CHECK-NEXT: lgr %r13, %r0 -; CHECK-NEXT: jne .LBB4_2 +; CHECK-NEXT: jlh .LBB4_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: lgr %r13, %r14 ; CHECK-NEXT: .LBB4_2: diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-i128.ll b/llvm/test/CodeGen/SystemZ/inline-asm-i128.ll --- a/llvm/test/CodeGen/SystemZ/inline-asm-i128.ll +++ b/llvm/test/CodeGen/SystemZ/inline-asm-i128.ll @@ -147,8 +147,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: ltgr %r3,%r3 ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: stg %r2, 0(%r1) ; CHECK-NEXT: stg %r3, 8(%r1) +; CHECK-NEXT: stg %r2, 0(%r1) ; CHECK-NEXT: br %r14 entry: %0 = load i128, ptr @V128 diff --git a/llvm/test/CodeGen/SystemZ/int-cmp-47.ll b/llvm/test/CodeGen/SystemZ/int-cmp-47.ll --- a/llvm/test/CodeGen/SystemZ/int-cmp-47.ll +++ b/llvm/test/CodeGen/SystemZ/int-cmp-47.ll @@ -267,7 +267,8 @@ define void @f12(i64 %a) { ; CHECK-LABEL: f12: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: tmhh %r2, 256 +; CHECK-NEXT: srlg %r0, %r2, 56 +; CHECK-NEXT: tmll %r0, 1 ; CHECK-NEXT: bner %r14 ; CHECK-NEXT: .LBB11_1: # %store ; CHECK-NEXT: lgrl %r1, g@GOT diff --git a/llvm/test/CodeGen/SystemZ/int-move-10.ll b/llvm/test/CodeGen/SystemZ/int-move-10.ll --- a/llvm/test/CodeGen/SystemZ/int-move-10.ll +++ b/llvm/test/CodeGen/SystemZ/int-move-10.ll @@ -99,7 +99,7 @@ define dso_local void @f7(ptr %Src) { ; CHECK-LABEL: f7: ; CHECK: # %bb.0: -; CHECK-NEXT: lg %r0, 0(%r2) +; CHECK-NEXT: l %r0, 4(%r2) ; CHECK-NEXT: larl %r1, D_align4 ; CHECK-NEXT: st %r0, 2(%r1) ; CHECK-NEXT: br %r14 @@ -112,9 +112,8 @@ define dso_local void @f8(ptr %Src) { ; CHECK-LABEL: f8: ; CHECK: # %bb.0: -; CHECK-NEXT: lg %r0, 0(%r2) ; CHECK-NEXT: larl %r1, F_align2 -; CHECK-NEXT: sth %r0, 1(%r1) +; CHECK-NEXT: mvc 1(2,%r1), 6(%r2) ; CHECK-NEXT: br %r14 %L = load i64, ptr %Src %T = trunc i64 %L to i16 diff --git a/llvm/test/CodeGen/SystemZ/memset-08.ll b/llvm/test/CodeGen/SystemZ/memset-08.ll --- a/llvm/test/CodeGen/SystemZ/memset-08.ll +++ b/llvm/test/CodeGen/SystemZ/memset-08.ll @@ -18,10 +18,16 @@ define void @reg18(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg18: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llcr %r0, %r3 +; CHECK-NEXT: mhi %r0, 257 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: sth %r0, 16(%r2) +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 ; CHECK-NEXT: vst %v0, 0(%r2), 4 -; CHECK-NEXT: vsteh %v0, 16(%r2), 0 ; CHECK-NEXT: br %r14 call void @llvm.memset.p0.i64(ptr align 16 %Dst, i8 %val, i64 18, i1 false) ret void @@ -30,9 +36,15 @@ define void @reg19(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg19: ; CHECK: 
# %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 -; CHECK-NEXT: vstef %v0, 15(%r2), 0 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llcr %r0, %r3 +; CHECK-NEXT: msfi %r0, 16843009 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: st %r0, 15(%r2) +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 call void @llvm.memset.p0.i64(ptr align 16 %Dst, i8 %val, i64 19, i1 false) @@ -42,9 +54,15 @@ define void @reg20(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg20: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 -; CHECK-NEXT: vstef %v0, 16(%r2), 0 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llcr %r0, %r3 +; CHECK-NEXT: msfi %r0, 16843009 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: st %r0, 16(%r2) +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 call void @llvm.memset.p0.i64(ptr align 16 %Dst, i8 %val, i64 20, i1 false) @@ -54,9 +72,13 @@ define void @reg21(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg21: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 -; CHECK-NEXT: vsteg %v0, 13(%r2), 0 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 +; CHECK-NEXT: stg %r0, 13(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 call void @llvm.memset.p0.i64(ptr align 16 %Dst, i8 %val, i64 21, i1 false) @@ -66,9 +88,13 @@ define void @reg22(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg22: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 -; CHECK-NEXT: vsteg %v0, 14(%r2), 0 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 +; CHECK-NEXT: stg %r0, 14(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 call void @llvm.memset.p0.i64(ptr align 16 %Dst, i8 %val, i64 22, i1 false) @@ -78,9 +104,13 @@ define void @reg23(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg23: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 -; CHECK-NEXT: vsteg %v0, 15(%r2), 0 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 +; CHECK-NEXT: stg %r0, 15(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 call void @llvm.memset.p0.i64(ptr align 16 %Dst, i8 %val, i64 23, i1 false) @@ -90,9 +120,13 @@ define void @reg24(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg24: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 -; CHECK-NEXT: vsteg %v0, 16(%r2), 0 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 +; CHECK-NEXT: stg %r0, 16(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 call void @llvm.memset.p0.i64(ptr align 16 %Dst, i8 
%val, i64 24, i1 false) @@ -102,8 +136,12 @@ define void @reg25(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg25: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 ; CHECK-NEXT: vst %v0, 9(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 @@ -114,8 +152,12 @@ define void @reg26(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg26: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 ; CHECK-NEXT: vst %v0, 10(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 @@ -126,8 +168,12 @@ define void @reg27(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg27: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 ; CHECK-NEXT: vst %v0, 11(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 @@ -138,8 +184,12 @@ define void @reg28(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg28: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 ; CHECK-NEXT: vst %v0, 12(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 @@ -150,8 +200,12 @@ define void @reg29(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg29: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 ; CHECK-NEXT: vst %v0, 13(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 @@ -162,8 +216,12 @@ define void @reg30(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg30: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 ; CHECK-NEXT: vst %v0, 14(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 @@ -174,8 +232,12 @@ define void @reg31(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg31: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 ; CHECK-NEXT: vst %v0, 15(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 @@ -186,8 +248,12 @@ define void @reg32(ptr %Dst, i8 %val) { ; CHECK-LABEL: reg32: ; CHECK: # %bb.0: -; CHECK-NEXT: vlvgp %v0, %r3, %r3 -; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: 
llihf %r1, 16843009 +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: llgcr %r0, %r3 +; CHECK-NEXT: oilf %r1, 16843009 +; CHECK-NEXT: msgrkc %r0, %r0, %r1 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 ; CHECK-NEXT: vst %v0, 16(%r2), 4 ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 diff --git a/llvm/test/CodeGen/SystemZ/pr36164.ll b/llvm/test/CodeGen/SystemZ/pr36164.ll --- a/llvm/test/CodeGen/SystemZ/pr36164.ll +++ b/llvm/test/CodeGen/SystemZ/pr36164.ll @@ -15,39 +15,53 @@ define void @main() local_unnamed_addr #0 { ; CHECK-LABEL: main: ; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r12, %r15, 96(%r15) +; CHECK-NEXT: .cfi_offset %r12, -64 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 ; CHECK-NEXT: lhi %r0, 1 ; CHECK-NEXT: larl %r1, g_938 ; CHECK-NEXT: lhi %r2, 3 -; CHECK-NEXT: lhi %r3, 4 -; CHECK-NEXT: larl %r4, g_11 +; CHECK-NEXT: lhi %r3, 0 +; CHECK-NEXT: lhi %r4, 2 +; CHECK-NEXT: lhi %r5, 4 +; CHECK-NEXT: larl %r14, g_11 ; CHECK-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: strl %r0, g_73 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: strl %r0, g_69 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: lghi %r13, 24 +; CHECK-NEXT: ag %r13, 0(%r1) ; CHECK-NEXT: strl %r2, g_69 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: lrl %r5, g_832 -; CHECK-NEXT: agsi 0(%r1), 24 -; CHECK-NEXT: lrl %r5, g_832 +; CHECK-NEXT: lrl %r12, g_832 ; CHECK-NEXT: strl %r3, g_69 -; CHECK-NEXT: mvi 0(%r4), 1 +; CHECK-NEXT: lrl %r12, g_832 +; CHECK-NEXT: strl %r0, g_69 +; CHECK-NEXT: lrl %r12, g_832 +; CHECK-NEXT: strl %r4, g_69 +; CHECK-NEXT: lrl %r12, g_832 +; CHECK-NEXT: strl %r2, g_69 +; CHECK-NEXT: stgrl %r13, g_938 +; CHECK-NEXT: lrl %r13, g_832 +; CHECK-NEXT: strl %r5, g_69 +; CHECK-NEXT: mvi 0(%r14), 1 ; CHECK-NEXT: j .LBB0_1 br label %1 diff --git a/llvm/test/CodeGen/SystemZ/pr42606.ll b/llvm/test/CodeGen/SystemZ/pr42606.ll --- a/llvm/test/CodeGen/SystemZ/pr42606.ll +++ b/llvm/test/CodeGen/SystemZ/pr42606.ll @@ -4,16 +4,12 @@ define i64 @test(i64 %lo, i64 %hi) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: la %r0, 0(%r2,%r2) -; CHECK-NEXT: clgr %r0, %r2 -; CHECK-NEXT: ipm %r0 -; CHECK-NEXT: la %r1, 1(%r2,%r2) -; CHECK-NEXT: cghi %r1, 0 -; CHECK-NEXT: ipm %r1 -; CHECK-NEXT: afi %r1, -268435456 -; CHECK-NEXT: srl %r1, 31 -; CHECK-NEXT: rosbg %r1, %r0, 63, 63, 36 -; CHECK-NEXT: algfr %r3, %r1 +; CHECK-NEXT: lgr %r0, %r2 +; CHECK-NEXT: algfi %r0, 1 +; CHECK-NEXT: lghi %r1, 0 +; CHECK-NEXT: alcgr %r3, %r1 +; 
CHECK-NEXT: algr %r0, %r2 +; CHECK-NEXT: alcgr %r3, %r1 ; CHECK-NEXT: lgr %r2, %r3 ; CHECK-NEXT: br %r14 %tmp = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %lo, i64 1) diff --git a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll --- a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll +++ b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll @@ -15,17 +15,16 @@ ; CHECK-LABEL: main: ; CHECK: # %bb.0: ; CHECK-NEXT: lhr %r2, %r2 -; CHECK-NEXT: larl %r1, g_151 -; CHECK-NEXT: lghi %r3, 0 +; CHECK-NEXT: lghi %r1, 0 +; CHECK-NEXT: larl %r3, g_151 ; CHECK-NEXT: chi %r2, 0 ; CHECK-NEXT: lhi %r0, 1 -; CHECK-NEXT: locghile %r3, 1 -; CHECK-NEXT: o %r0, 0(%r1) +; CHECK-NEXT: locghile %r1, 1 +; CHECK-NEXT: o %r0, 0(%r3) +; CHECK-NEXT: dsgfr %r0, %r0 ; CHECK-NEXT: larl %r1, g_222 -; CHECK-NEXT: lghi %r5, 0 -; CHECK-NEXT: dsgfr %r2, %r0 -; CHECK-NEXT: stgrl %r2, g_39 -; CHECK-NEXT: stc %r5, 19(%r1) +; CHECK-NEXT: stgrl %r0, g_39 +; CHECK-NEXT: mvi 19(%r1), 0 ; CHECK-NEXT: br %r14 %tmp = load i32, ptr @g_151, align 4 %tmp3 = or i32 %tmp, 1 diff --git a/llvm/test/CodeGen/SystemZ/risbg-01.ll b/llvm/test/CodeGen/SystemZ/risbg-01.ll --- a/llvm/test/CodeGen/SystemZ/risbg-01.ll +++ b/llvm/test/CodeGen/SystemZ/risbg-01.ll @@ -342,8 +342,12 @@ define i32 @f26(i32 %foo) { ; CHECK-LABEL: f26: ; CHECK: # %bb.0: -; CHECK-NEXT: nill %r2, 65487 -; CHECK-NEXT: rll %r2, %r2, 5 +; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d +; CHECK-NEXT: lr %r0, %r2 +; CHECK-NEXT: nilf %r0, 134217679 +; CHECK-NEXT: sll %r0, 5 +; CHECK-NEXT: rosbg %r0, %r2, 59, 63, 37 +; CHECK-NEXT: lr %r2, %r0 ; CHECK-NEXT: br %r14 %and = and i32 %foo, -49 %parta = shl i32 %and, 5 @@ -356,7 +360,12 @@ define i64 @f27(i64 %foo) { ; CHECK-LABEL: f27: ; CHECK: # %bb.0: -; CHECK-NEXT: risbg %r2, %r2, 55, 180, 5 +; CHECK-NEXT: llihf %r0, 134217727 +; CHECK-NEXT: oilf %r0, 4294967247 +; CHECK-NEXT: ngr %r0, %r2 +; CHECK-NEXT: sllg %r0, %r0, 5 +; CHECK-NEXT: rosbg %r0, %r2, 59, 63, 5 +; CHECK-NEXT: lgr %r2, %r0 ; CHECK-NEXT: br %r14 %and = and i64 %foo, -49 %parta = shl i64 %and, 5 @@ -577,9 +586,9 @@ define signext i32 @f44(i64 %x) { ; CHECK-LABEL: f44: ; CHECK: # %bb.0: -; CHECK-NEXT: srlg %r0, %r2, 12 -; CHECK-NEXT: lghi %r2, 10 -; CHECK-NEXT: ngr %r2, %r0 +; CHECK-NEXT: srl %r2, 12 +; CHECK-NEXT: nilf %r2, 10 +; CHECK-NEXT: llgfr %r2, %r2 ; CHECK-NEXT: br %r14 %shr4 = lshr i64 %x, 12 %conv = trunc i64 %shr4 to i32 diff --git a/llvm/test/CodeGen/SystemZ/risbg-04.ll b/llvm/test/CodeGen/SystemZ/risbg-04.ll --- a/llvm/test/CodeGen/SystemZ/risbg-04.ll +++ b/llvm/test/CodeGen/SystemZ/risbg-04.ll @@ -326,8 +326,12 @@ define i32 @f26(i32 %foo) { ; CHECK-LABEL: f26: ; CHECK: # %bb.0: -; CHECK-NEXT: nill %r2, 65487 -; CHECK-NEXT: rll %r2, %r2, 5 +; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d +; CHECK-NEXT: lr %r0, %r2 +; CHECK-NEXT: nilf %r0, 134217679 +; CHECK-NEXT: sll %r0, 5 +; CHECK-NEXT: rosbg %r0, %r2, 59, 63, 37 +; CHECK-NEXT: lr %r2, %r0 ; CHECK-NEXT: br %r14 %and = and i32 %foo, -49 %parta = shl i32 %and, 5 @@ -340,7 +344,12 @@ define i64 @f27(i64 %foo) { ; CHECK-LABEL: f27: ; CHECK: # %bb.0: -; CHECK-NEXT: risbg %r2, %r2, 55, 180, 5 +; CHECK-NEXT: llihf %r0, 134217727 +; CHECK-NEXT: oilf %r0, 4294967247 +; CHECK-NEXT: ngr %r0, %r2 +; CHECK-NEXT: sllg %r0, %r0, 5 +; CHECK-NEXT: rosbg %r0, %r2, 59, 63, 5 +; CHECK-NEXT: lgr %r2, %r0 ; CHECK-NEXT: br %r14 %and = and i64 %foo, -49 %parta = shl i64 %and, 5 @@ -556,9 +565,9 @@ define signext i32 @f44(i64 %x) { 
; CHECK-LABEL: f44: ; CHECK: # %bb.0: -; CHECK-NEXT: srlg %r2, %r2, 12 -; CHECK-NEXT: lghi %r0, 10 -; CHECK-NEXT: ngr %r2, %r0 +; CHECK-NEXT: srl %r2, 12 +; CHECK-NEXT: nilf %r2, 10 +; CHECK-NEXT: llgfr %r2, %r2 ; CHECK-NEXT: br %r14 %shr4 = lshr i64 %x, 12 %conv = trunc i64 %shr4 to i32 diff --git a/llvm/test/CodeGen/SystemZ/shift-04.ll b/llvm/test/CodeGen/SystemZ/shift-04.ll --- a/llvm/test/CodeGen/SystemZ/shift-04.ll +++ b/llvm/test/CodeGen/SystemZ/shift-04.ll @@ -85,7 +85,12 @@ define i32 @f7(i32 %a, i64 %amt) { ; CHECK-LABEL: f7: ; CHECK: # %bb.0: -; CHECK-NEXT: rll %r2, %r2, 10(%r3) +; CHECK-NEXT: lhi %r1, 22 +; CHECK-NEXT: sr %r1, %r3 +; CHECK-NEXT: lr %r0, %r2 +; CHECK-NEXT: sll %r0, 10(%r3) +; CHECK-NEXT: srl %r2, 0(%r1) +; CHECK-NEXT: or %r2, %r0 ; CHECK-NEXT: br %r14 %add = add i64 %amt, 10 %sub = sub i64 32, %add diff --git a/llvm/test/CodeGen/SystemZ/shift-08.ll b/llvm/test/CodeGen/SystemZ/shift-08.ll --- a/llvm/test/CodeGen/SystemZ/shift-08.ll +++ b/llvm/test/CodeGen/SystemZ/shift-08.ll @@ -56,7 +56,11 @@ define i64 @f5(i64 %a, i64 %amt) { ; CHECK-LABEL: f5: ; CHECK: # %bb.0: -; CHECK-NEXT: rllg %r2, %r2, 10(%r3) +; CHECK-NEXT: sllg %r0, %r2, 10(%r3) +; CHECK-NEXT: lhi %r1, 54 +; CHECK-NEXT: sr %r1, %r3 +; CHECK-NEXT: srlg %r2, %r2, 0(%r1) +; CHECK-NEXT: ogr %r2, %r0 ; CHECK-NEXT: br %r14 %add = add i64 %amt, 10 %sub = sub i64 64, %add @@ -103,7 +107,11 @@ define i64 @f8(i64 %a, i64 %amt) { ; CHECK-LABEL: f8: ; CHECK: # %bb.0: -; CHECK-NEXT: rllg %r2, %r2, -1(%r3) +; CHECK-NEXT: sllg %r0, %r2, 524287(%r3) +; CHECK-NEXT: iilf %r1, 4294443073 +; CHECK-NEXT: sr %r1, %r3 +; CHECK-NEXT: srlg %r2, %r2, 0(%r1) +; CHECK-NEXT: ogr %r2, %r0 ; CHECK-NEXT: br %r14 %add = add i64 %amt, 524287 %sub = sub i64 64, %add @@ -118,8 +126,11 @@ define i64 @f9(i64 %a, i64 %amt) { ; CHECK-LABEL: f9: ; CHECK: # %bb.0: -; CHECK-NEXT: afi %r3, 524288 -; CHECK-NEXT: rllg %r2, %r2, 0(%r3) +; CHECK-NEXT: lcr %r1, %r3 +; CHECK-NEXT: agfi %r3, 524288 +; CHECK-NEXT: sllg %r0, %r2, 0(%r3) +; CHECK-NEXT: srlg %r2, %r2, 0(%r1) +; CHECK-NEXT: ogr %r2, %r0 ; CHECK-NEXT: br %r14 %add = add i64 %amt, 524288 %sub = sub i64 64, %add @@ -133,7 +144,11 @@ define i64 @f10(i64 %a, i64 %amt) { ; CHECK-LABEL: f10: ; CHECK: # %bb.0: -; CHECK-NEXT: rllg %r2, %r2, -1(%r3) +; CHECK-NEXT: sllg %r0, %r2, -1(%r3) +; CHECK-NEXT: lhi %r1, 65 +; CHECK-NEXT: sr %r1, %r3 +; CHECK-NEXT: srlg %r2, %r2, 0(%r1) +; CHECK-NEXT: ogr %r2, %r0 ; CHECK-NEXT: br %r14 %suba = sub i64 %amt, 1 %subb = sub i64 64, %suba @@ -148,7 +163,10 @@ define i64 @f11(i64 %a, i64 %amt) { ; CHECK-LABEL: f11: ; CHECK: # %bb.0: -; CHECK-NEXT: rllg %r2, %r2, -524288(%r3) +; CHECK-NEXT: sllg %r0, %r2, -524288(%r3) +; CHECK-NEXT: lcr %r1, %r3 +; CHECK-NEXT: srlg %r2, %r2, 0(%r1) +; CHECK-NEXT: ogr %r2, %r0 ; CHECK-NEXT: br %r14 %suba = sub i64 %amt, 524288 %subb = sub i64 64, %suba @@ -162,7 +180,12 @@ define i64 @f12(i64 %a, i64 %amt) { ; CHECK-LABEL: f12: ; CHECK: # %bb.0: -; CHECK-NEXT: rllg %r2, %r2, -1(%r3) +; CHECK-NEXT: iilf %r1, 524353 +; CHECK-NEXT: sr %r1, %r3 +; CHECK-NEXT: agfi %r3, -524289 +; CHECK-NEXT: sllg %r0, %r2, 0(%r3) +; CHECK-NEXT: srlg %r2, %r2, 0(%r1) +; CHECK-NEXT: ogr %r2, %r0 ; CHECK-NEXT: br %r14 %suba = sub i64 %amt, 524289 %subb = sub i64 64, %suba diff --git a/llvm/test/CodeGen/SystemZ/signbits-intrinsics-binop.ll b/llvm/test/CodeGen/SystemZ/signbits-intrinsics-binop.ll --- a/llvm/test/CodeGen/SystemZ/signbits-intrinsics-binop.ll +++ b/llvm/test/CodeGen/SystemZ/signbits-intrinsics-binop.ll @@ -258,9 +258,7 @@ define <16 x i8> @f14() { 
; CHECK-LABEL: f14: ; CHECK: # %bb.0: -; CHECK-NEXT: larl %r1, .LCPI14_0 -; CHECK-NEXT: vl %v0, 0(%r1), 3 -; CHECK-NEXT: vperm %v24, %v0, %v0, %v0 +; CHECK-NEXT: vgbm %v24, 0 ; CHECK-NEXT: br %r14 %perm = call <16 x i8> @llvm.s390.vperm( <16 x i8> , ptr %src diff --git a/llvm/test/CodeGen/SystemZ/vec-args-04.ll b/llvm/test/CodeGen/SystemZ/vec-args-04.ll --- a/llvm/test/CodeGen/SystemZ/vec-args-04.ll +++ b/llvm/test/CodeGen/SystemZ/vec-args-04.ll @@ -18,11 +18,11 @@ ; CHECK-VEC-NEXT: .cfi_offset %r15, -40 ; CHECK-VEC-NEXT: aghi %r15, -192 ; CHECK-VEC-NEXT: .cfi_def_cfa_offset 352 -; CHECK-VEC-NEXT: larl %r1, .LCPI0_0 -; CHECK-VEC-NEXT: vl %v0, 0(%r1), 3 -; CHECK-VEC-NEXT: larl %r1, .LCPI0_1 -; CHECK-VEC-NEXT: vst %v0, 176(%r15), 3 -; CHECK-VEC-NEXT: vl %v0, 0(%r1), 3 +; CHECK-VEC-NEXT: vrepib %v0, 12 +; CHECK-VEC-NEXT: llihf %r0, 185273099 +; CHECK-VEC-NEXT: vsteg %v0, 184(%r15), 0 +; CHECK-VEC-NEXT: stg %r0, 176(%r15) +; CHECK-VEC-NEXT: llihh %r0, 2570 ; CHECK-VEC-NEXT: vrepib %v24, 1 ; CHECK-VEC-NEXT: vrepib %v26, 2 ; CHECK-VEC-NEXT: vrepib %v28, 3 @@ -31,7 +31,9 @@ ; CHECK-VEC-NEXT: vrepib %v27, 6 ; CHECK-VEC-NEXT: vrepib %v29, 7 ; CHECK-VEC-NEXT: vrepib %v31, 8 -; CHECK-VEC-NEXT: vst %v0, 160(%r15), 3 +; CHECK-VEC-NEXT: stg %r0, 168(%r15) +; CHECK-VEC-NEXT: llihh %r0, 2304 +; CHECK-VEC-NEXT: stg %r0, 160(%r15) ; CHECK-VEC-NEXT: brasl %r14, bar@PLT ; CHECK-VEC-NEXT: lmg %r14, %r15, 304(%r15) ; CHECK-VEC-NEXT: br %r14 @@ -43,11 +45,11 @@ ; CHECK-STACK-NEXT: .cfi_offset %r15, -40 ; CHECK-STACK-NEXT: aghi %r15, -192 ; CHECK-STACK-NEXT: .cfi_def_cfa_offset 352 -; CHECK-STACK-NEXT: larl %r1, .LCPI0_0 -; CHECK-STACK-NEXT: vl %v0, 0(%r1), 3 -; CHECK-STACK-NEXT: larl %r1, .LCPI0_1 -; CHECK-STACK-NEXT: vst %v0, 176(%r15), 3 -; CHECK-STACK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-STACK-NEXT: vrepib %v0, 12 +; CHECK-STACK-NEXT: llihf %r0, 185273099 +; CHECK-STACK-NEXT: vsteg %v0, 184(%r15), 0 +; CHECK-STACK-NEXT: stg %r0, 176(%r15) +; CHECK-STACK-NEXT: llihh %r0, 2570 ; CHECK-STACK-NEXT: vrepib %v24, 1 ; CHECK-STACK-NEXT: vrepib %v26, 2 ; CHECK-STACK-NEXT: vrepib %v28, 3 @@ -56,7 +58,9 @@ ; CHECK-STACK-NEXT: vrepib %v27, 6 ; CHECK-STACK-NEXT: vrepib %v29, 7 ; CHECK-STACK-NEXT: vrepib %v31, 8 -; CHECK-STACK-NEXT: vst %v0, 160(%r15), 3 +; CHECK-STACK-NEXT: stg %r0, 168(%r15) +; CHECK-STACK-NEXT: llihh %r0, 2304 +; CHECK-STACK-NEXT: stg %r0, 160(%r15) ; CHECK-STACK-NEXT: brasl %r14, bar@PLT ; CHECK-STACK-NEXT: lmg %r14, %r15, 304(%r15) ; CHECK-STACK-NEXT: br %r14 diff --git a/llvm/test/CodeGen/SystemZ/vec-args-05.ll b/llvm/test/CodeGen/SystemZ/vec-args-05.ll --- a/llvm/test/CodeGen/SystemZ/vec-args-05.ll +++ b/llvm/test/CodeGen/SystemZ/vec-args-05.ll @@ -17,11 +17,12 @@ ; CHECK-VEC-NEXT: .cfi_offset %r15, -40 ; CHECK-VEC-NEXT: aghi %r15, -176 ; CHECK-VEC-NEXT: .cfi_def_cfa_offset 336 -; CHECK-VEC-NEXT: larl %r1, .LCPI0_0 -; CHECK-VEC-NEXT: vl %v0, 0(%r1), 3 +; CHECK-VEC-NEXT: llihf %r0, 67372036 ; CHECK-VEC-NEXT: vrepib %v24, 1 ; CHECK-VEC-NEXT: vrepib %v26, 2 -; CHECK-VEC-NEXT: vst %v0, 160(%r15), 3 +; CHECK-VEC-NEXT: stg %r0, 168(%r15) +; CHECK-VEC-NEXT: llihf %r0, 50529027 +; CHECK-VEC-NEXT: stg %r0, 160(%r15) ; CHECK-VEC-NEXT: brasl %r14, bar@PLT ; CHECK-VEC-NEXT: lmg %r14, %r15, 288(%r15) ; CHECK-VEC-NEXT: br %r14 @@ -33,11 +34,12 @@ ; CHECK-STACK-NEXT: .cfi_offset %r15, -40 ; CHECK-STACK-NEXT: aghi %r15, -176 ; CHECK-STACK-NEXT: .cfi_def_cfa_offset 336 -; CHECK-STACK-NEXT: larl %r1, .LCPI0_0 -; CHECK-STACK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-STACK-NEXT: llihf %r0, 67372036 ; CHECK-STACK-NEXT: vrepib %v24, 
1 ; CHECK-STACK-NEXT: vrepib %v26, 2 -; CHECK-STACK-NEXT: vst %v0, 160(%r15), 3 +; CHECK-STACK-NEXT: stg %r0, 168(%r15) +; CHECK-STACK-NEXT: llihf %r0, 50529027 +; CHECK-STACK-NEXT: stg %r0, 160(%r15) ; CHECK-STACK-NEXT: brasl %r14, bar@PLT ; CHECK-STACK-NEXT: lmg %r14, %r15, 288(%r15) ; CHECK-STACK-NEXT: br %r14 diff --git a/llvm/test/CodeGen/SystemZ/vec-bswap-05.ll b/llvm/test/CodeGen/SystemZ/vec-bswap-05.ll --- a/llvm/test/CodeGen/SystemZ/vec-bswap-05.ll +++ b/llvm/test/CodeGen/SystemZ/vec-bswap-05.ll @@ -26,7 +26,10 @@ define <8 x i16> @f2(ptr %ptr) { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: vllebrzh %v24, 0(%r2) +; CHECK-NEXT: larl %r1, .LCPI1_0 +; CHECK-NEXT: vllezh %v0, 0(%r2) +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vperm %v24, %v0, %v0, %v1 ; CHECK-NEXT: br %r14 %val = load i16, ptr %ptr %insert = insertelement <8 x i16> zeroinitializer, i16 %val, i32 3 @@ -50,7 +53,10 @@ define <4 x i32> @f4(ptr %ptr) { ; CHECK-LABEL: f4: ; CHECK: # %bb.0: -; CHECK-NEXT: vllebrzf %v24, 0(%r2) +; CHECK-NEXT: larl %r1, .LCPI3_0 +; CHECK-NEXT: vllezf %v0, 0(%r2) +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vperm %v24, %v0, %v0, %v1 ; CHECK-NEXT: br %r14 %val = load i32, ptr %ptr %insert = insertelement <4 x i32> zeroinitializer, i32 %val, i32 1 @@ -74,7 +80,10 @@ define <2 x i64> @f6(ptr %ptr) { ; CHECK-LABEL: f6: ; CHECK: # %bb.0: -; CHECK-NEXT: vllebrzg %v24, 0(%r2) +; CHECK-NEXT: larl %r1, .LCPI5_0 +; CHECK-NEXT: vllezg %v0, 0(%r2) +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vperm %v24, %v0, %v0, %v1 ; CHECK-NEXT: br %r14 %val = load i64, ptr %ptr %insert = insertelement <2 x i64> zeroinitializer, i64 %val, i32 0 @@ -98,7 +107,10 @@ define <4 x i32> @f8(ptr %ptr) { ; CHECK-LABEL: f8: ; CHECK: # %bb.0: -; CHECK-NEXT: vllebrze %v24, 0(%r2) +; CHECK-NEXT: larl %r1, .LCPI7_0 +; CHECK-NEXT: vllezlf %v0, 0(%r2) +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vperm %v24, %v0, %v0, %v1 ; CHECK-NEXT: br %r14 %val = load i32, ptr %ptr %insert = insertelement <4 x i32> zeroinitializer, i32 %val, i32 0 diff --git a/llvm/test/CodeGen/SystemZ/vec-bswap-06.ll b/llvm/test/CodeGen/SystemZ/vec-bswap-06.ll --- a/llvm/test/CodeGen/SystemZ/vec-bswap-06.ll +++ b/llvm/test/CodeGen/SystemZ/vec-bswap-06.ll @@ -26,7 +26,10 @@ define <8 x i16> @f2(ptr %ptr) { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: vlbrreph %v24, 0(%r2) +; CHECK-NEXT: larl %r1, .LCPI1_0 +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vperm %v24, %v0, %v0, %v1 ; CHECK-NEXT: br %r14 %val = load i16, ptr %ptr %insert = insertelement <8 x i16> undef, i16 %val, i32 5 @@ -50,7 +53,10 @@ define <4 x i32> @f4(ptr %ptr) { ; CHECK-LABEL: f4: ; CHECK: # %bb.0: -; CHECK-NEXT: vlbrrepf %v24, 0(%r2) +; CHECK-NEXT: larl %r1, .LCPI3_0 +; CHECK-NEXT: vlrepf %v0, 0(%r2) +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vperm %v24, %v0, %v0, %v1 ; CHECK-NEXT: br %r14 %val = load i32, ptr %ptr %insert = insertelement <4 x i32> undef, i32 %val, i32 2 @@ -74,7 +80,10 @@ define <2 x i64> @f6(ptr %ptr) { ; CHECK-LABEL: f6: ; CHECK: # %bb.0: -; CHECK-NEXT: vlbrrepg %v24, 0(%r2) +; CHECK-NEXT: larl %r1, .LCPI5_0 +; CHECK-NEXT: vlrepg %v0, 0(%r2) +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vperm %v24, %v0, %v0, %v1 ; CHECK-NEXT: br %r14 %val = load i64, ptr %ptr %insert = insertelement <2 x i64> undef, i64 %val, i32 1 diff --git a/llvm/test/CodeGen/SystemZ/vec-bswap-07.ll b/llvm/test/CodeGen/SystemZ/vec-bswap-07.ll --- a/llvm/test/CodeGen/SystemZ/vec-bswap-07.ll +++ b/llvm/test/CodeGen/SystemZ/vec-bswap-07.ll 
@@ -59,7 +59,10 @@ define <8 x i16> @f4(ptr %ptr) { ; CHECK-LABEL: f4: ; CHECK: # %bb.0: -; CHECK-NEXT: vlbrreph %v24, 0(%r2) +; CHECK-NEXT: larl %r1, .LCPI3_0 +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vperm %v24, %v0, %v0, %v1 ; CHECK-NEXT: br %r14 %scalar = load i16, ptr %ptr %val = insertelement <8 x i16> undef, i16 %scalar, i32 0 @@ -118,7 +121,10 @@ define <4 x i32> @f8(ptr %ptr) { ; CHECK-LABEL: f8: ; CHECK: # %bb.0: -; CHECK-NEXT: vlbrrepf %v24, 0(%r2) +; CHECK-NEXT: larl %r1, .LCPI7_0 +; CHECK-NEXT: vlrepf %v0, 0(%r2) +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vperm %v24, %v0, %v0, %v1 ; CHECK-NEXT: br %r14 %scalar = load i32, ptr %ptr %val = insertelement <4 x i32> undef, i32 %scalar, i32 0 @@ -177,7 +183,10 @@ define <2 x i64> @f12(ptr %ptr) { ; CHECK-LABEL: f12: ; CHECK: # %bb.0: -; CHECK-NEXT: vlbrrepg %v24, 0(%r2) +; CHECK-NEXT: larl %r1, .LCPI11_0 +; CHECK-NEXT: vlrepg %v0, 0(%r2) +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vperm %v24, %v0, %v0, %v1 ; CHECK-NEXT: br %r14 %scalar = load i64, ptr %ptr %val = insertelement <2 x i64> undef, i64 %scalar, i32 0 diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-12.ll b/llvm/test/CodeGen/SystemZ/vec-perm-12.ll --- a/llvm/test/CodeGen/SystemZ/vec-perm-12.ll +++ b/llvm/test/CodeGen/SystemZ/vec-perm-12.ll @@ -10,17 +10,17 @@ ; CHECK-CODE-LABEL: f1: ; CHECK-CODE: # %bb.0: ; CHECK-CODE-NEXT: larl %r1, .LCPI0_0 -; CHECK-CODE-NEXT: vl %v1, 0(%r1), 3 -; CHECK-CODE-NEXT: vlvgf %v0, %r2, 0 -; CHECK-CODE-NEXT: vperm %v24, %v24, %v0, %v1 +; CHECK-CODE-NEXT: vl %v0, 0(%r1), 3 +; CHECK-CODE-NEXT: vperm %v24, %v24, %v24, %v0 +; CHECK-CODE-NEXT: vlvgf %v24, %r2, 3 ; CHECK-CODE-NEXT: br %r14 ; ; CHECK-VECTOR-LABEL: f1: ; CHECK-VECTOR: # %bb.0: ; CHECK-VECTOR-NEXT: larl %r1, .LCPI0_0 -; CHECK-VECTOR-NEXT: vl %v1, 0(%r1), 3 -; CHECK-VECTOR-NEXT: vlvgf %v0, %r2, 0 -; CHECK-VECTOR-NEXT: vperm %v24, %v24, %v0, %v1 +; CHECK-VECTOR-NEXT: vl %v0, 0(%r1), 3 +; CHECK-VECTOR-NEXT: vperm %v24, %v24, %v24, %v0 +; CHECK-VECTOR-NEXT: vlvgf %v24, %r2, 3 ; CHECK-VECTOR-NEXT: br %r14 diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll @@ -106,24 +106,24 @@ ; S390X-NEXT: ld %f1, 0(%r1) ; S390X-NEXT: larl %r1, .LCPI3_0 ; S390X-NEXT: ld %f2, 0(%r1) -; S390X-NEXT: ddb %f1, 0(%r2) +; S390X-NEXT: ddb %f1, 16(%r2) ; S390X-NEXT: ddb %f0, 8(%r2) -; S390X-NEXT: ddb %f2, 16(%r2) -; S390X-NEXT: std %f1, 0(%r2) +; S390X-NEXT: ddb %f2, 0(%r2) +; S390X-NEXT: std %f1, 16(%r2) ; S390X-NEXT: std %f0, 8(%r2) -; S390X-NEXT: std %f2, 16(%r2) +; S390X-NEXT: std %f2, 0(%r2) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fdiv_v3f64: ; SZ13: # %bb.0: # %entry ; SZ13-NEXT: larl %r1, .LCPI3_0 +; SZ13-NEXT: vl %v0, 0(%r2), 4 +; SZ13-NEXT: vl %v1, 0(%r1), 3 +; SZ13-NEXT: vfddb %v0, %v1, %v0 +; SZ13-NEXT: larl %r1, .LCPI3_1 ; SZ13-NEXT: ld %f1, 0(%r1) ; SZ13-NEXT: ddb %f1, 16(%r2) -; SZ13-NEXT: larl %r1, .LCPI3_1 -; SZ13-NEXT: vl %v0, 0(%r2), 4 -; SZ13-NEXT: vl %v2, 0(%r1), 3 ; SZ13-NEXT: std %f1, 16(%r2) -; SZ13-NEXT: vfddb %v0, %v2, %v0 ; SZ13-NEXT: vst %v0, 0(%r2), 4 ; SZ13-NEXT: br %r14 entry: @@ -381,8 +381,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f2, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: 
ld %f2, 0(%r2) ; S390X-NEXT: larl %r1, .LCPI8_0 ; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: ld %f9, 8(%r2) @@ -399,9 +399,9 @@ ; S390X-NEXT: ldr %f0, %f1 ; S390X-NEXT: ldr %f2, %f8 ; S390X-NEXT: brasl %r14, fmod@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -657,25 +657,25 @@ ; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: ldr %f1, %f0 ; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: mdb %f0, 0(%r2) +; S390X-NEXT: mdb %f0, 16(%r2) ; S390X-NEXT: mdb %f2, 8(%r2) -; S390X-NEXT: mdb %f1, 16(%r2) -; S390X-NEXT: std %f0, 0(%r2) +; S390X-NEXT: mdb %f1, 0(%r2) +; S390X-NEXT: std %f0, 16(%r2) ; S390X-NEXT: std %f2, 8(%r2) -; S390X-NEXT: std %f1, 16(%r2) +; S390X-NEXT: std %f1, 0(%r2) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fmul_v3f64: ; SZ13: # %bb.0: # %entry ; SZ13-NEXT: larl %r1, .LCPI13_0 -; SZ13-NEXT: ld %f1, 0(%r1) -; SZ13-NEXT: larl %r1, .LCPI13_1 ; SZ13-NEXT: vl %v0, 0(%r2), 4 -; SZ13-NEXT: vl %v2, 0(%r1), 3 +; SZ13-NEXT: vl %v1, 0(%r1), 3 +; SZ13-NEXT: larl %r1, .LCPI13_1 +; SZ13-NEXT: vfmdb %v0, %v1, %v0 +; SZ13-NEXT: ld %f1, 0(%r1) ; SZ13-NEXT: mdb %f1, 16(%r2) -; SZ13-NEXT: vfmdb %v0, %v2, %v0 -; SZ13-NEXT: vst %v0, 0(%r2), 4 ; SZ13-NEXT: std %f1, 16(%r2) +; SZ13-NEXT: vst %v0, 0(%r2), 4 ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, ptr %a @@ -827,25 +827,25 @@ ; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: ldr %f1, %f0 ; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: adb %f0, 0(%r2) +; S390X-NEXT: adb %f0, 16(%r2) ; S390X-NEXT: adb %f2, 8(%r2) -; S390X-NEXT: adb %f1, 16(%r2) -; S390X-NEXT: std %f0, 0(%r2) +; S390X-NEXT: adb %f1, 0(%r2) +; S390X-NEXT: std %f0, 16(%r2) ; S390X-NEXT: std %f2, 8(%r2) -; S390X-NEXT: std %f1, 16(%r2) +; S390X-NEXT: std %f1, 0(%r2) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fadd_v3f64: ; SZ13: # %bb.0: # %entry ; SZ13-NEXT: larl %r1, .LCPI18_0 -; SZ13-NEXT: ld %f1, 0(%r1) -; SZ13-NEXT: larl %r1, .LCPI18_1 ; SZ13-NEXT: vl %v0, 0(%r2), 4 -; SZ13-NEXT: vl %v2, 0(%r1), 3 +; SZ13-NEXT: vl %v1, 0(%r1), 3 +; SZ13-NEXT: larl %r1, .LCPI18_1 +; SZ13-NEXT: vfadb %v0, %v1, %v0 +; SZ13-NEXT: ld %f1, 0(%r1) ; SZ13-NEXT: adb %f1, 16(%r2) -; SZ13-NEXT: vfadb %v0, %v2, %v0 -; SZ13-NEXT: vst %v0, 0(%r2), 4 ; SZ13-NEXT: std %f1, 16(%r2) +; SZ13-NEXT: vst %v0, 0(%r2), 4 ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, ptr %a @@ -999,23 +999,22 @@ ; S390X-NEXT: ld %f0, 0(%r1) ; S390X-NEXT: ldr %f1, %f0 ; S390X-NEXT: ldr %f2, %f0 -; S390X-NEXT: sdb %f0, 0(%r2) +; S390X-NEXT: sdb %f0, 16(%r2) ; S390X-NEXT: sdb %f2, 8(%r2) -; S390X-NEXT: sdb %f1, 16(%r2) -; S390X-NEXT: std %f0, 0(%r2) +; S390X-NEXT: sdb %f1, 0(%r2) +; S390X-NEXT: std %f0, 16(%r2) ; S390X-NEXT: std %f2, 8(%r2) -; S390X-NEXT: std %f1, 16(%r2) +; S390X-NEXT: std %f1, 0(%r2) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_fsub_v3f64: ; SZ13: # %bb.0: # %entry ; SZ13-NEXT: vl %v0, 0(%r2), 4 -; SZ13-NEXT: vgmg %v2, 12, 10 -; SZ13-NEXT: sdb %f2, 16(%r2) ; SZ13-NEXT: vgmg %v1, 12, 10 ; SZ13-NEXT: vfsdb %v0, %v1, %v0 +; SZ13-NEXT: sdb %f1, 16(%r2) +; SZ13-NEXT: std %f1, 16(%r2) ; SZ13-NEXT: vst %v0, 0(%r2), 4 -; SZ13-NEXT: std %f2, 16(%r2) ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, ptr %a @@ -1146,12 +1145,12 @@ define void @constrained_vector_sqrt_v3f64(ptr %a) #0 { ; S390X-LABEL: 
constrained_vector_sqrt_v3f64: ; S390X: # %bb.0: # %entry -; S390X-NEXT: sqdb %f0, 0(%r2) +; S390X-NEXT: sqdb %f0, 16(%r2) ; S390X-NEXT: sqdb %f1, 8(%r2) -; S390X-NEXT: sqdb %f2, 16(%r2) -; S390X-NEXT: std %f0, 0(%r2) +; S390X-NEXT: sqdb %f2, 0(%r2) +; S390X-NEXT: std %f0, 16(%r2) ; S390X-NEXT: std %f1, 8(%r2) -; S390X-NEXT: std %f2, 16(%r2) +; S390X-NEXT: std %f2, 0(%r2) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_sqrt_v3f64: @@ -1413,8 +1412,8 @@ ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: .cfi_offset %f11, -192 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: larl %r1, .LCPI33_0 ; S390X-NEXT: ld %f9, 0(%r1) ; S390X-NEXT: ld %f10, 8(%r2) @@ -1428,9 +1427,9 @@ ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ldr %f2, %f9 ; S390X-NEXT: brasl %r14, pow@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f10, 8(%r13) -; S390X-NEXT: std %f11, 16(%r13) +; S390X-NEXT: std %f11, 0(%r13) ; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload @@ -2108,8 +2107,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, sin@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -2118,9 +2117,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, sin@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2425,8 +2424,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, cos@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -2435,9 +2434,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, cos@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -2742,8 +2741,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, exp@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -2752,9 +2751,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, exp@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ 
-3059,8 +3058,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, exp2@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -3069,9 +3068,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, exp2@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -3376,8 +3375,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, log@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -3386,9 +3385,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, log@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -3693,8 +3692,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, log10@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -3703,9 +3702,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, log10@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4010,8 +4009,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, log2@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -4020,9 +4019,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, log2@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4244,25 +4243,25 @@ define void @constrained_vector_rint_v3f64(ptr %a) #0 { ; S390X-LABEL: constrained_vector_rint_v3f64: ; S390X: # %bb.0: # %entry -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f1, 8(%r2) -; S390X-NEXT: ld %f2, 0(%r2) +; S390X-NEXT: ld %f2, 16(%r2) ; S390X-NEXT: fidbr %f0, 0, %f0 ; S390X-NEXT: fidbr %f1, 0, %f1 ; S390X-NEXT: fidbr %f2, 0, %f2 -; S390X-NEXT: std %f2, 0(%r2) +; S390X-NEXT: std 
%f2, 16(%r2) ; S390X-NEXT: std %f1, 8(%r2) -; S390X-NEXT: std %f0, 16(%r2) +; S390X-NEXT: std %f0, 0(%r2) ; S390X-NEXT: br %r14 ; ; SZ13-LABEL: constrained_vector_rint_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl %v0, 0(%r2), 4 -; SZ13-NEXT: ld %f1, 16(%r2) -; SZ13-NEXT: vfidb %v0, %v0, 0, 0 -; SZ13-NEXT: fidbra %f1, 0, %f1, 0 -; SZ13-NEXT: vst %v0, 0(%r2), 4 -; SZ13-NEXT: std %f1, 16(%r2) +; SZ13-NEXT: vl %v1, 0(%r2), 4 +; SZ13-NEXT: ld %f0, 16(%r2) +; SZ13-NEXT: vfidb %v1, %v1, 0, 0 +; SZ13-NEXT: fidbra %f0, 0, %f0, 0 +; SZ13-NEXT: std %f0, 16(%r2) +; SZ13-NEXT: vst %v1, 0(%r2), 4 ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, ptr %a @@ -4446,8 +4445,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, nearbyint@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -4456,9 +4455,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, nearbyint@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -4467,12 +4466,12 @@ ; ; SZ13-LABEL: constrained_vector_nearbyint_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl %v0, 0(%r2), 4 -; SZ13-NEXT: ld %f1, 16(%r2) -; SZ13-NEXT: vfidb %v0, %v0, 4, 0 -; SZ13-NEXT: fidbra %f1, 0, %f1, 4 -; SZ13-NEXT: vst %v0, 0(%r2), 4 -; SZ13-NEXT: std %f1, 16(%r2) +; SZ13-NEXT: vl %v1, 0(%r2), 4 +; SZ13-NEXT: ld %f0, 16(%r2) +; SZ13-NEXT: vfidb %v1, %v1, 4, 0 +; SZ13-NEXT: fidbra %f0, 0, %f0, 4 +; SZ13-NEXT: std %f0, 16(%r2) +; SZ13-NEXT: vst %v1, 0(%r2), 4 ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, ptr %a @@ -4742,8 +4741,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: larl %r1, .LCPI88_0 ; S390X-NEXT: ld %f2, 0(%r1) ; S390X-NEXT: ld %f9, 8(%r2) @@ -4758,9 +4757,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, fmax@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -5118,8 +5117,8 @@ ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: .cfi_offset %f11, -192 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: larl %r1, .LCPI93_0 ; S390X-NEXT: ld %f9, 0(%r1) ; S390X-NEXT: ld %f10, 8(%r2) @@ -5133,9 +5132,9 @@ ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: ldr %f2, %f9 ; S390X-NEXT: brasl %r14, fmin@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f10, 8(%r13) -; S390X-NEXT: std %f11, 16(%r13) +; S390X-NEXT: std %f11, 0(%r13) ; S390X-NEXT: ld %f8, 184(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 168(%r15) # 8-byte Folded Reload 
@@ -5676,8 +5675,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, ceil@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -5686,9 +5685,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, ceil@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -5697,12 +5696,12 @@ ; ; SZ13-LABEL: constrained_vector_ceil_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl %v0, 0(%r2), 4 -; SZ13-NEXT: ld %f1, 16(%r2) -; SZ13-NEXT: vfidb %v0, %v0, 4, 6 -; SZ13-NEXT: fidbra %f1, 6, %f1, 4 -; SZ13-NEXT: vst %v0, 0(%r2), 4 -; SZ13-NEXT: std %f1, 16(%r2) +; SZ13-NEXT: vl %v1, 0(%r2), 4 +; SZ13-NEXT: ld %f0, 16(%r2) +; SZ13-NEXT: vfidb %v1, %v1, 4, 6 +; SZ13-NEXT: fidbra %f0, 6, %f0, 4 +; SZ13-NEXT: std %f0, 16(%r2) +; SZ13-NEXT: vst %v1, 0(%r2), 4 ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, ptr %a @@ -5846,8 +5845,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, floor@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -5856,9 +5855,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, floor@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -5867,12 +5866,12 @@ ; ; SZ13-LABEL: constrained_vector_floor_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl %v0, 0(%r2), 4 -; SZ13-NEXT: ld %f1, 16(%r2) -; SZ13-NEXT: vfidb %v0, %v0, 4, 7 -; SZ13-NEXT: fidbra %f1, 7, %f1, 4 -; SZ13-NEXT: vst %v0, 0(%r2), 4 -; SZ13-NEXT: std %f1, 16(%r2) +; SZ13-NEXT: vl %v1, 0(%r2), 4 +; SZ13-NEXT: ld %f0, 16(%r2) +; SZ13-NEXT: vfidb %v1, %v1, 4, 7 +; SZ13-NEXT: fidbra %f0, 7, %f0, 4 +; SZ13-NEXT: std %f0, 16(%r2) +; SZ13-NEXT: vst %v1, 0(%r2), 4 ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, ptr %a @@ -6016,8 +6015,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, round@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -6026,9 +6025,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, round@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -6037,12 +6036,12 @@ ; ; SZ13-LABEL: constrained_vector_round_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl 
%v0, 0(%r2), 4 -; SZ13-NEXT: ld %f1, 16(%r2) -; SZ13-NEXT: vfidb %v0, %v0, 4, 1 -; SZ13-NEXT: fidbra %f1, 1, %f1, 4 -; SZ13-NEXT: vst %v0, 0(%r2), 4 -; SZ13-NEXT: std %f1, 16(%r2) +; SZ13-NEXT: vl %v1, 0(%r2), 4 +; SZ13-NEXT: ld %f0, 16(%r2) +; SZ13-NEXT: vfidb %v1, %v1, 4, 1 +; SZ13-NEXT: fidbra %f0, 1, %f0, 4 +; SZ13-NEXT: std %f0, 16(%r2) +; SZ13-NEXT: vst %v1, 0(%r2), 4 ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, ptr %a @@ -6185,8 +6184,8 @@ ; S390X-NEXT: .cfi_offset %f9, -176 ; S390X-NEXT: .cfi_offset %f10, -184 ; S390X-NEXT: lgr %r13, %r2 -; S390X-NEXT: ld %f8, 0(%r2) -; S390X-NEXT: ld %f0, 16(%r2) +; S390X-NEXT: ld %f8, 16(%r2) +; S390X-NEXT: ld %f0, 0(%r2) ; S390X-NEXT: ld %f9, 8(%r2) ; S390X-NEXT: brasl %r14, trunc@PLT ; S390X-NEXT: ldr %f10, %f0 @@ -6195,9 +6194,9 @@ ; S390X-NEXT: ldr %f9, %f0 ; S390X-NEXT: ldr %f0, %f8 ; S390X-NEXT: brasl %r14, trunc@PLT -; S390X-NEXT: std %f0, 0(%r13) +; S390X-NEXT: std %f0, 16(%r13) ; S390X-NEXT: std %f9, 8(%r13) -; S390X-NEXT: std %f10, 16(%r13) +; S390X-NEXT: std %f10, 0(%r13) ; S390X-NEXT: ld %f8, 176(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f9, 168(%r15) # 8-byte Folded Reload ; S390X-NEXT: ld %f10, 160(%r15) # 8-byte Folded Reload @@ -6206,12 +6205,12 @@ ; ; SZ13-LABEL: constrained_vector_trunc_v3f64: ; SZ13: # %bb.0: # %entry -; SZ13-NEXT: vl %v0, 0(%r2), 4 -; SZ13-NEXT: ld %f1, 16(%r2) -; SZ13-NEXT: vfidb %v0, %v0, 4, 5 -; SZ13-NEXT: fidbra %f1, 5, %f1, 4 -; SZ13-NEXT: vst %v0, 0(%r2), 4 -; SZ13-NEXT: std %f1, 16(%r2) +; SZ13-NEXT: vl %v1, 0(%r2), 4 +; SZ13-NEXT: ld %f0, 16(%r2) +; SZ13-NEXT: vfidb %v1, %v1, 4, 5 +; SZ13-NEXT: fidbra %f0, 5, %f0, 4 +; SZ13-NEXT: std %f0, 16(%r2) +; SZ13-NEXT: vst %v1, 0(%r2), 4 ; SZ13-NEXT: br %r14 entry: %b = load <3 x double>, ptr %a diff --git a/llvm/test/CodeGen/Thumb/iabs-vector.ll b/llvm/test/CodeGen/Thumb/iabs-vector.ll --- a/llvm/test/CodeGen/Thumb/iabs-vector.ll +++ b/llvm/test/CodeGen/Thumb/iabs-vector.ll @@ -4,12 +4,13 @@ define void @PR41160(ptr %p) nounwind { ; CHECK-LABEL: PR41160: ; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.8 {d16, d17}, [r0] -; CHECK-NEXT: vabs.s32 q8, q8 -; CHECK-NEXT: vst1.8 {d16, d17}, [r0]! -; CHECK-NEXT: vld1.8 {d16, d17}, [r0] +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: vld1.8 {d16, d17}, [r1]! 
; CHECK-NEXT: vabs.s32 q8, q8 +; CHECK-NEXT: vld1.8 {d18, d19}, [r1] +; CHECK-NEXT: vabs.s32 q9, q9 ; CHECK-NEXT: vst1.8 {d16, d17}, [r0] +; CHECK-NEXT: vst1.8 {d18, d19}, [r1] ; CHECK-NEXT: bx lr %tmp1 = load <8 x i32>, ptr %p, align 1 %tmp2 = icmp slt <8 x i32> %tmp1, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/2013-03-02-vduplane-nonconstant-source-index.ll b/llvm/test/CodeGen/Thumb2/2013-03-02-vduplane-nonconstant-source-index.ll --- a/llvm/test/CodeGen/Thumb2/2013-03-02-vduplane-nonconstant-source-index.ll +++ b/llvm/test/CodeGen/Thumb2/2013-03-02-vduplane-nonconstant-source-index.ll @@ -10,14 +10,14 @@ ; CHECK-NEXT: mov r4, sp ; CHECK-NEXT: bfc r4, #0, #4 ; CHECK-NEXT: mov sp, r4 -; CHECK-NEXT: and r1, r1, #3 ; CHECK-NEXT: vldr d17, [r7, #8] +; CHECK-NEXT: and r1, r1, #3 ; CHECK-NEXT: vmov d16, r2, r3 ; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: lsls r1, r1, #2 +; CHECK-NEXT: orr.w r1, r2, r1, lsl #2 ; CHECK-NEXT: subs r4, r7, #4 -; CHECK-NEXT: vst1.64 {d16, d17}, [r2:128], r1 -; CHECK-NEXT: vld1.32 {d16[], d17[]}, [r2:32] +; CHECK-NEXT: vst1.64 {d16, d17}, [r2:128] +; CHECK-NEXT: vld1.32 {d16[], d17[]}, [r1:32] ; CHECK-NEXT: vst1.32 {d16, d17}, [r0] ; CHECK-NEXT: mov sp, r4 ; CHECK-NEXT: pop {r4, r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -281,16 +281,16 @@ ; CHECK-NEXT: beq.w .LBB2_20 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: vmov.i32 q5, #0x0 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r2, #1 +; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: adr r2, .LCPI2_1 -; CHECK-NEXT: mov lr, lr +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: add.w lr, r4, r3, lsr #2 ; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: mov lr, lr ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vdup.32 q2, r12 ; CHECK-NEXT: b .LBB2_3 @@ -315,7 +315,13 @@ ; CHECK-NEXT: vcmp.u32 cs, q1, q4 ; CHECK-NEXT: @ implicit-def: $q5 ; CHECK-NEXT: vmrs r4, p0 -; CHECK-NEXT: and r2, r4, #1 +; CHECK-NEXT: ands r2, r4, #1 +; CHECK-NEXT: beq .LBB2_5 +; CHECK-NEXT: @ %bb.4: @ %cond.load +; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 +; CHECK-NEXT: vldr.16 s20, [r0] +; CHECK-NEXT: .LBB2_5: @ %else +; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: rsbs r5, r2, #0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: bfi r2, r5, #0, #1 @@ -328,30 +334,32 @@ ; CHECK-NEXT: bfi r2, r5, #2, #1 ; CHECK-NEXT: rsbs r4, r4, #0 ; CHECK-NEXT: bfi r2, r4, #3, #1 -; CHECK-NEXT: lsls r4, r2, #31 -; CHECK-NEXT: bne .LBB2_12 -; CHECK-NEXT: @ %bb.4: @ %else -; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: lsls r4, r2, #30 -; CHECK-NEXT: bmi .LBB2_13 -; CHECK-NEXT: .LBB2_5: @ %else5 +; CHECK-NEXT: bmi .LBB2_14 +; CHECK-NEXT: @ %bb.6: @ %else5 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: lsls r4, r2, #29 -; CHECK-NEXT: bmi .LBB2_14 -; CHECK-NEXT: .LBB2_6: @ %else8 +; CHECK-NEXT: bmi .LBB2_15 +; CHECK-NEXT: .LBB2_7: @ %else8 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: bpl .LBB2_8 -; CHECK-NEXT: .LBB2_7: @ %cond.load10 +; CHECK-NEXT: bpl .LBB2_9 +; CHECK-NEXT: .LBB2_8: @ %cond.load10 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; 
CHECK-NEXT: vldr.16 s22, [r0, #6] ; CHECK-NEXT: vins.f16 s21, s22 -; CHECK-NEXT: .LBB2_8: @ %else11 +; CHECK-NEXT: .LBB2_9: @ %else11 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vcmp.u32 cs, q2, q4 ; CHECK-NEXT: @ implicit-def: $q6 ; CHECK-NEXT: vmrs r4, p0 -; CHECK-NEXT: and r2, r4, #1 +; CHECK-NEXT: ands r2, r4, #1 +; CHECK-NEXT: beq .LBB2_11 +; CHECK-NEXT: @ %bb.10: @ %cond.load14 +; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 +; CHECK-NEXT: vldr.16 s24, [r1] +; CHECK-NEXT: .LBB2_11: @ %else15 +; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: rsbs r5, r2, #0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: bfi r2, r5, #0, #1 @@ -364,51 +372,37 @@ ; CHECK-NEXT: bfi r2, r5, #2, #1 ; CHECK-NEXT: rsbs r4, r4, #0 ; CHECK-NEXT: bfi r2, r4, #3, #1 -; CHECK-NEXT: lsls r4, r2, #31 -; CHECK-NEXT: bne .LBB2_15 -; CHECK-NEXT: @ %bb.9: @ %else15 -; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: lsls r4, r2, #30 ; CHECK-NEXT: bmi .LBB2_16 -; CHECK-NEXT: .LBB2_10: @ %else18 +; CHECK-NEXT: @ %bb.12: @ %else18 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: lsls r4, r2, #29 ; CHECK-NEXT: bmi .LBB2_17 -; CHECK-NEXT: .LBB2_11: @ %else21 +; CHECK-NEXT: .LBB2_13: @ %else21 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: lsls r2, r2, #28 ; CHECK-NEXT: bpl .LBB2_2 ; CHECK-NEXT: b .LBB2_18 -; CHECK-NEXT: .LBB2_12: @ %cond.load -; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vldr.16 s20, [r0] -; CHECK-NEXT: lsls r4, r2, #30 -; CHECK-NEXT: bpl .LBB2_5 -; CHECK-NEXT: .LBB2_13: @ %cond.load4 +; CHECK-NEXT: .LBB2_14: @ %cond.load4 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vldr.16 s22, [r0, #2] ; CHECK-NEXT: vins.f16 s20, s22 ; CHECK-NEXT: lsls r4, r2, #29 -; CHECK-NEXT: bpl .LBB2_6 -; CHECK-NEXT: .LBB2_14: @ %cond.load7 +; CHECK-NEXT: bpl .LBB2_7 +; CHECK-NEXT: .LBB2_15: @ %cond.load7 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vldr.16 s21, [r0, #4] ; CHECK-NEXT: vmovx.f16 s22, s0 ; CHECK-NEXT: vins.f16 s21, s22 ; CHECK-NEXT: lsls r2, r2, #28 -; CHECK-NEXT: bmi .LBB2_7 -; CHECK-NEXT: b .LBB2_8 -; CHECK-NEXT: .LBB2_15: @ %cond.load14 -; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 -; CHECK-NEXT: vldr.16 s24, [r1] -; CHECK-NEXT: lsls r4, r2, #30 -; CHECK-NEXT: bpl .LBB2_10 +; CHECK-NEXT: bmi .LBB2_8 +; CHECK-NEXT: b .LBB2_9 ; CHECK-NEXT: .LBB2_16: @ %cond.load17 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vldr.16 s26, [r1, #2] ; CHECK-NEXT: vins.f16 s24, s26 ; CHECK-NEXT: lsls r4, r2, #29 -; CHECK-NEXT: bpl .LBB2_11 +; CHECK-NEXT: bpl .LBB2_13 ; CHECK-NEXT: .LBB2_17: @ %cond.load20 ; CHECK-NEXT: @ in Loop: Header=BB2_3 Depth=1 ; CHECK-NEXT: vldr.16 s25, [r1, #4] diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -630,32 +630,31 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) { ; CHECK-LABEL: wrongop: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: movw r12, #47184 -; CHECK-NEXT: movw r3, #23593 -; CHECK-NEXT: ldrd r2, lr, [r1, #4] -; CHECK-NEXT: movt r12, #1310 -; CHECK-NEXT: movt r3, #49807 -; CHECK-NEXT: mla r3, lr, r3, r12 -; CHECK-NEXT: movw r1, #55051 -; CHECK-NEXT: movw r4, #23593 -; CHECK-NEXT: movt r1, #163 +; CHECK-NEXT: movw r3, #34079 +; CHECK-NEXT: ldrd r2, r12, [r1, #4] +; CHECK-NEXT: movt r3, 
#20971 +; CHECK-NEXT: smmul r3, r12, r3 ; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: movt r4, #655 -; CHECK-NEXT: ror.w r12, r3, #4 -; CHECK-NEXT: cmp r12, r1 -; CHECK-NEXT: cset r1, lo -; CHECK-NEXT: ror.w r3, r3, #2 -; CHECK-NEXT: mov.w r12, #1 -; CHECK-NEXT: cmp r3, r4 -; CHECK-NEXT: csel r3, r1, r12, lo -; CHECK-NEXT: lsls.w r4, lr, #30 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: asrs r1, r3, #5 +; CHECK-NEXT: add.w lr, r1, r3, lsr #31 +; CHECK-NEXT: movs r1, #100 +; CHECK-NEXT: mls lr, lr, r1, r12 +; CHECK-NEXT: asrs r1, r3, #7 +; CHECK-NEXT: add.w r1, r1, r3, lsr #31 +; CHECK-NEXT: mov.w r3, #400 +; CHECK-NEXT: mls r1, r1, r3, r12 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: cset r1, eq +; CHECK-NEXT: cmp.w lr, #0 +; CHECK-NEXT: csel r3, r1, r5, eq +; CHECK-NEXT: lsls.w r4, r12, #30 ; CHECK-NEXT: csel r1, r1, r3, ne ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r4, pc} -; CHECK-NEXT: .LBB8_1: @ %vector.ph +; CHECK-NEXT: blt .LBB8_4 +; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: movw r3, :lower16:days ; CHECK-NEXT: movs r4, #52 ; CHECK-NEXT: movt r3, :upper16:days @@ -666,7 +665,7 @@ ; CHECK-NEXT: adds r0, r2, #3 ; CHECK-NEXT: bic r0, r0, #3 ; CHECK-NEXT: subs r0, #4 -; CHECK-NEXT: add.w r0, r12, r0, lsr #2 +; CHECK-NEXT: add.w r0, r5, r0, lsr #2 ; CHECK-NEXT: dls lr, r0 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -680,7 +679,8 @@ ; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: .LBB8_4: @ %for.end +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %day1 = getelementptr inbounds %struct.date, %struct.date* %pd, i32 0, i32 0 %0 = load i32, i32* %day1, align 4 diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -85,40 +85,25 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) { ; CHECK-LABEL: v7i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: ldr.w r12, [sp, #40] -; CHECK-NEXT: vdup.32 q3, r2 -; CHECK-NEXT: ldr r3, [sp, #32] -; CHECK-NEXT: adr r2, .LCPI2_1 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r12 -; CHECK-NEXT: ldr.w r12, [sp, #44] -; CHECK-NEXT: ldr r3, [sp, #36] -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 -; CHECK-NEXT: ldr.w r12, [sp, #8] -; CHECK-NEXT: ldr r3, [sp] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 -; CHECK-NEXT: ldr.w r12, [sp, #12] -; CHECK-NEXT: ldr r3, [sp, #4] -; CHECK-NEXT: vmov q1[3], q1[1], r3, r12 +; CHECK-NEXT: add r3, sp, #32 +; CHECK-NEXT: vldmia sp, {s8, s9, s10, s11} +; CHECK-NEXT: vldmia r3, {s0, s1, s2, s3} ; CHECK-NEXT: adr r3, .LCPI2_0 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: vqadd.u32 q2, q2, r1 -; CHECK-NEXT: vcmp.u32 hi, q3, q2 -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vqadd.u32 q3, q1, r1 +; CHECK-NEXT: vdup.32 q1, r2 +; CHECK-NEXT: vcmp.u32 hi, q1, q3 +; CHECK-NEXT: add r2, sp, #48 +; CHECK-NEXT: vpsel q0, q2, q0 ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: ldr r2, [sp, #48] -; CHECK-NEXT: vqadd.u32 q0, q0, r1 -; CHECK-NEXT: ldr r1, [sp, #52] -; CHECK-NEXT: vcmp.u32 hi, q3, q0 -; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: ldr r1, [sp, #56] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: ldr r1, [sp, #20] -; CHECK-NEXT: ldr r2, [sp, #16] -; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: ldr r1, [sp, #24] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; 
CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vldmia r2, {s0, s1, s2} +; CHECK-NEXT: add r2, sp, #16 +; CHECK-NEXT: vldmia r2, {s8, s9, s10} +; CHECK-NEXT: adr r2, .LCPI2_1 +; CHECK-NEXT: vldrw.u32 q3, [r2] +; CHECK-NEXT: vqadd.u32 q3, q3, r1 +; CHECK-NEXT: vcmp.u32 hi, q1, q3 +; CHECK-NEXT: vpsel q0, q2, q0 ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vmov r3, s0 @@ -326,7 +311,7 @@ define void @test_width2(ptr nocapture readnone %x, ptr nocapture %y, i8 zeroext %m) { ; CHECK-LABEL: test_width2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq .LBB5_3 @@ -340,38 +325,35 @@ ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.64 r2 -; CHECK-NEXT: @ implicit-def: $q0 -; CHECK-NEXT: subs r2, #2 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r0, r3, #1 -; CHECK-NEXT: ubfx r3, r3, #8, #1 -; CHECK-NEXT: rsb.w r12, r0, #0 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r0, r12, #0, #1 +; CHECK-NEXT: @ implicit-def: $q0 ; CHECK-NEXT: sub.w r12, r1, #8 -; CHECK-NEXT: bfi r0, r3, #1, #1 -; CHECK-NEXT: lsls r3, r0, #31 -; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrne.w r3, [r12] -; CHECK-NEXT: vmovne.32 q0[0], r3 -; CHECK-NEXT: lsls r0, r0, #30 +; CHECK-NEXT: subs r2, #2 +; CHECK-NEXT: ands r0, r3, #1 +; CHECK-NEXT: it ne +; CHECK-NEXT: vldrne s0, [r12] +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r4, r0, #0, #1 +; CHECK-NEXT: ubfx r0, r3, #8, #1 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r4, r0, #1, #1 +; CHECK-NEXT: lsls r0, r4, #30 ; CHECK-NEXT: itt mi ; CHECK-NEXT: ldrmi.w r0, [r12, #4] ; CHECK-NEXT: vmovmi.32 q0[2], r0 -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r0, r3, #1 -; CHECK-NEXT: ubfx r3, r3, #8, #1 -; CHECK-NEXT: rsb.w r12, r0, #0 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r0, r12, #0, #1 -; CHECK-NEXT: bfi r0, r3, #1, #1 -; CHECK-NEXT: lsls r3, r0, #31 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: ands r3, r0, #1 ; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne r3, s0 -; CHECK-NEXT: strne r3, [r1] -; CHECK-NEXT: lsls r0, r0, #30 +; CHECK-NEXT: vmovne r4, s0 +; CHECK-NEXT: strne r4, [r1] +; CHECK-NEXT: ubfx r0, r0, #8, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: rsbs r0, r0, #0 +; CHECK-NEXT: bfi r4, r3, #0, #1 +; CHECK-NEXT: bfi r4, r0, #1, #1 +; CHECK-NEXT: lsls r0, r4, #30 ; CHECK-NEXT: itt mi ; CHECK-NEXT: vmovmi r0, s2 ; CHECK-NEXT: strmi r0, [r1, #4] @@ -379,7 +361,7 @@ ; CHECK-NEXT: le lr, .LBB5_2 ; CHECK-NEXT: .LBB5_3: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp9.not = icmp eq i8 %m, 0 br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader diff --git a/llvm/test/CodeGen/Thumb2/csel.ll b/llvm/test/CodeGen/Thumb2/csel.ll --- a/llvm/test/CodeGen/Thumb2/csel.ll +++ b/llvm/test/CodeGen/Thumb2/csel.ll @@ -327,7 +327,8 @@ ; CHECK-LABEL: csinv_inplace: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cmp r1, #45 -; CHECK-NEXT: cinv r0, r0, gt +; CHECK-NEXT: csetm r1, gt +; CHECK-NEXT: eors r0, r1 ; CHECK-NEXT: bx lr entry: %cmp = icmp sgt i32 %b, 45 diff --git a/llvm/test/CodeGen/Thumb2/lsll0.ll b/llvm/test/CodeGen/Thumb2/lsll0.ll --- a/llvm/test/CodeGen/Thumb2/lsll0.ll +++ b/llvm/test/CodeGen/Thumb2/lsll0.ll @@ -5,16 +5,16 @@ ; CHECK-LABEL: _Z4loopPxS_iS_i: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r0] -; 
CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: sxth r1, r1 ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: asr.w r12, r1, #31 -; CHECK-NEXT: asrs r3, r2, #31 -; CHECK-NEXT: strd r2, r3, [r0] -; CHECK-NEXT: strd r1, r12, [r0, #8] +; CHECK-NEXT: asrs r3, r1, #31 +; CHECK-NEXT: asr.w r12, r2, #31 +; CHECK-NEXT: strd r1, r3, [r0] +; CHECK-NEXT: strd r2, r12, [r0, #8] ; CHECK-NEXT: bx lr entry: %wide.load = load <2 x i64>, ptr undef, align 8 diff --git a/llvm/test/CodeGen/Thumb2/mve-be.ll b/llvm/test/CodeGen/Thumb2/mve-be.ll --- a/llvm/test/CodeGen/Thumb2/mve-be.ll +++ b/llvm/test/CodeGen/Thumb2/mve-be.ll @@ -145,8 +145,10 @@ ; CHECK-BE-NEXT: .pad #16 ; CHECK-BE-NEXT: sub sp, #16 ; CHECK-BE-NEXT: add.w r12, sp, #24 -; CHECK-BE-NEXT: vldrw.u32 q0, [r12] -; CHECK-BE-NEXT: vstrw.32 q0, [sp] +; CHECK-BE-NEXT: vldrb.u8 q0, [r12] +; CHECK-BE-NEXT: vrev64.8 q1, q0 +; CHECK-BE-NEXT: vrev64.8 q0, q1 +; CHECK-BE-NEXT: vstrb.8 q0, [sp] ; CHECK-BE-NEXT: vmov d1, r3, r2 ; CHECK-BE-NEXT: vmov d0, r1, r0 ; CHECK-BE-NEXT: vrev64.32 q1, q0 diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll --- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll +++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll @@ -339,43 +339,46 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: subs r1, r0, #1 -; CHECK-NEXT: sbcs r1, r12, #0 -; CHECK-NEXT: blt.w .LBB1_28 +; CHECK-NEXT: mov r3, r0 +; CHECK-NEXT: subs r0, #1 +; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: bne.w .LBB1_28 ; CHECK-NEXT: @ %bb.1: @ %for.cond2.preheader.lr.ph -; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: csel lr, r2, r3, lt -; CHECK-NEXT: movw r4, #43691 +; CHECK-NEXT: csel lr, r2, r0, lt +; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: mov r1, lr ; CHECK-NEXT: cmp.w lr, #3 ; CHECK-NEXT: it ls ; CHECK-NEXT: movls r1, #3 -; CHECK-NEXT: movt r4, #43690 +; CHECK-NEXT: movw r4, #43691 ; CHECK-NEXT: sub.w r1, r1, lr -; CHECK-NEXT: ldr r6, [sp, #128] +; CHECK-NEXT: movt r4, #43690 ; CHECK-NEXT: adds r1, #2 +; CHECK-NEXT: ldr r7, [sp, #128] ; CHECK-NEXT: movw r8, :lower16:c -; CHECK-NEXT: movt r8, :upper16:c ; CHECK-NEXT: mov.w r9, #12 ; CHECK-NEXT: umull r1, r4, r1, r4 +; CHECK-NEXT: movt r8, :upper16:c +; CHECK-NEXT: movs r1, #4 ; CHECK-NEXT: @ implicit-def: $r10 ; CHECK-NEXT: @ implicit-def: $r5 ; CHECK-NEXT: @ implicit-def: $r11 -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: movs r1, #4 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: strd r2, r12, [sp, #4] @ 8-byte Folded Spill ; CHECK-NEXT: add.w r1, r1, r4, lsr #1 -; CHECK-NEXT: add.w r3, r3, r4, lsr #1 -; CHECK-NEXT: bic r7, r1, #3 +; CHECK-NEXT: add.w r0, r0, r4, lsr #1 +; CHECK-NEXT: bic r6, r1, #3 ; CHECK-NEXT: adr r1, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: adr r1, .LCPI1_1 ; CHECK-NEXT: vldrw.u32 q5, [r1] -; CHECK-NEXT: vdup.32 q6, r3 +; CHECK-NEXT: vdup.32 q6, r0 ; CHECK-NEXT: vadd.i32 q4, q0, lr -; CHECK-NEXT: vdup.32 q7, r3 +; CHECK-NEXT: vdup.32 q7, r0 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 @@ -383,28 +386,29 @@ ; CHECK-NEXT: cmn.w r11, #4 ; CHECK-NEXT: it le ; CHECK-NEXT: mvnle r0, #3 -; CHECK-NEXT: movw r2, #18725 +; 
CHECK-NEXT: movw r1, #18725 +; CHECK-NEXT: sub.w r0, r0, r11 +; CHECK-NEXT: movt r1, #9362 ; CHECK-NEXT: adds r0, #6 -; CHECK-NEXT: movt r2, #9362 -; CHECK-NEXT: sub.w r1, r0, r11 ; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: umull r2, r3, r1, r2 -; CHECK-NEXT: subs r2, r1, r3 -; CHECK-NEXT: add.w r2, r3, r2, lsr #1 -; CHECK-NEXT: lsrs r3, r2, #2 -; CHECK-NEXT: lsls r3, r3, #3 -; CHECK-NEXT: sub.w r2, r3, r2, lsr #2 -; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: ldrd r12, r3, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: umull r1, r2, r0, r1 +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: add.w r0, r2, r0, lsr #1 +; CHECK-NEXT: lsrs r1, r0, #2 +; CHECK-NEXT: lsls r1, r1, #3 +; CHECK-NEXT: sub.w r0, r1, r0, lsr #2 +; CHECK-NEXT: add r0, r11 ; CHECK-NEXT: add.w r11, r0, #7 -; CHECK-NEXT: ldrd r12, r0, [sp, #8] @ 8-byte Folded Reload ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 ; CHECK-NEXT: adds r5, #2 -; CHECK-NEXT: subs r1, r5, r0 -; CHECK-NEXT: asr.w r3, r5, #31 -; CHECK-NEXT: sbcs.w r1, r3, r12 -; CHECK-NEXT: bge.w .LBB1_28 +; CHECK-NEXT: subs r1, r5, r3 +; CHECK-NEXT: asr.w r0, r5, #31 +; CHECK-NEXT: sbcs.w r0, r0, r12 +; CHECK-NEXT: cset r0, lt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: beq.w .LBB1_28 ; CHECK-NEXT: .LBB1_4: @ %for.cond2.preheader ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_17 Depth 2 @@ -422,22 +426,22 @@ ; CHECK-NEXT: ldrd r2, r3, [sp, #120] ; CHECK-NEXT: movs r0, #32 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: mov r4, r7 -; CHECK-NEXT: mov r7, lr +; CHECK-NEXT: mov r4, r6 +; CHECK-NEXT: mov r6, lr ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: ldrd r2, r12, [sp, #4] @ 8-byte Folded Reload -; CHECK-NEXT: mov lr, r7 -; CHECK-NEXT: mov r7, r4 -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov lr, r6 +; CHECK-NEXT: mov r6, r4 +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: b .LBB1_8 ; CHECK-NEXT: .LBB1_7: @ %for.cond.cleanup17.us ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: add.w r11, r3, #7 -; CHECK-NEXT: cmn.w r3, #4 +; CHECK-NEXT: add.w r11, r0, #7 +; CHECK-NEXT: cmn.w r0, #4 ; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: bge .LBB1_3 ; CHECK-NEXT: .LBB1_8: @ %for.body6.us ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 @@ -455,7 +459,7 @@ ; CHECK-NEXT: movw r4, :lower16:b ; CHECK-NEXT: movt r4, :upper16:b ; CHECK-NEXT: str r1, [r4] -; CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: mov r4, r6 ; CHECK-NEXT: .LBB1_10: @ %vector.body111 ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_8 Depth=2 @@ -473,7 +477,7 @@ ; CHECK-NEXT: b .LBB1_13 ; CHECK-NEXT: .LBB1_11: @ %vector.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: mov r4, r7 +; CHECK-NEXT: mov r4, r6 ; CHECK-NEXT: vmov q1, q4 ; CHECK-NEXT: .LBB1_12: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 @@ -491,7 +495,7 @@ ; CHECK-NEXT: bne .LBB1_12 ; CHECK-NEXT: .LBB1_13: @ %for.cond9.for.cond15.preheader_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: beq .LBB1_7 ; CHECK-NEXT: @ %bb.14: @ %for.cond9.for.cond15.preheader_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_8 Depth=2 @@ -501,11 +505,11 @@ ; CHECK-NEXT: b .LBB1_26 ; CHECK-NEXT: .LBB1_15: @ %for.body6.lr.ph.split ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: 
cmp r6, #0 +; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: beq.w .LBB1_2 ; CHECK-NEXT: @ %bb.16: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: ldrd r12, r0, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: ldrd r12, r3, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: .LBB1_17: @ %for.body6.us60 ; CHECK-NEXT: @ Parent Loop BB1_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -513,35 +517,35 @@ ; CHECK-NEXT: bne .LBB1_27 ; CHECK-NEXT: @ %bb.18: @ %for.cond.cleanup17.us63 ; CHECK-NEXT: @ in Loop: Header=BB1_17 Depth=2 -; CHECK-NEXT: cmn.w r3, #4 +; CHECK-NEXT: cmn.w r0, #4 ; CHECK-NEXT: bge .LBB1_22 ; CHECK-NEXT: @ %bb.19: @ %for.cond.cleanup17.us63.1 ; CHECK-NEXT: @ in Loop: Header=BB1_17 Depth=2 -; CHECK-NEXT: cmn.w r3, #12 +; CHECK-NEXT: cmn.w r0, #12 ; CHECK-NEXT: bgt .LBB1_23 ; CHECK-NEXT: @ %bb.20: @ %for.cond.cleanup17.us63.2 ; CHECK-NEXT: @ in Loop: Header=BB1_17 Depth=2 -; CHECK-NEXT: cmn.w r3, #19 +; CHECK-NEXT: cmn.w r0, #19 ; CHECK-NEXT: bgt .LBB1_24 ; CHECK-NEXT: @ %bb.21: @ %for.cond.cleanup17.us63.3 ; CHECK-NEXT: @ in Loop: Header=BB1_17 Depth=2 -; CHECK-NEXT: add.w r11, r3, #28 -; CHECK-NEXT: cmn.w r3, #25 +; CHECK-NEXT: add.w r11, r0, #28 +; CHECK-NEXT: cmn.w r0, #25 ; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r0, r11 ; CHECK-NEXT: blt .LBB1_17 ; CHECK-NEXT: b .LBB1_3 ; CHECK-NEXT: .LBB1_22: @ %for.cond.cleanup5.loopexit134.split.loop.exit139 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r11, r3, #7 +; CHECK-NEXT: add.w r11, r0, #7 ; CHECK-NEXT: b .LBB1_25 ; CHECK-NEXT: .LBB1_23: @ %for.cond.cleanup5.loopexit134.split.loop.exit137 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r11, r3, #14 +; CHECK-NEXT: add.w r11, r0, #14 ; CHECK-NEXT: b .LBB1_25 ; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit135 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 -; CHECK-NEXT: add.w r11, r3, #21 +; CHECK-NEXT: add.w r11, r0, #21 ; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_4 Depth=1 ; CHECK-NEXT: mov.w r10, #0 diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll @@ -32,17 +32,18 @@ ; CHECK-NEXT: vmovx.f16 s12, s4 ; CHECK-NEXT: vmovx.f16 s2, s5 ; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vins.f16 s12, s2 -; CHECK-NEXT: vmovx.f16 s2, s1 ; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vins.f16 s12, s2 ; CHECK-NEXT: vins.f16 s4, s5 ; CHECK-NEXT: vadd.f16 q3, q3, q0 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vins.f16 s8, s0 ; CHECK-NEXT: vsub.f16 q0, q1, q2 -; CHECK-NEXT: vmovx.f16 s1, s0 -; CHECK-NEXT: vmovx.f16 s2, s12 +; CHECK-NEXT: vmovx.f16 s4, s12 +; CHECK-NEXT: vmovx.f16 s2, s0 ; CHECK-NEXT: vins.f16 s0, s12 -; CHECK-NEXT: vins.f16 s1, s2 +; CHECK-NEXT: vins.f16 s2, s4 +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll @@ -50,10 +50,11 @@ ; CHECK-NEXT: vmul.f16 q2, q2, q3 ; CHECK-NEXT: vneg.f16 
q2, q2 ; CHECK-NEXT: vfma.f16 q2, q1, q0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmovx.f16 s9, s8 +; CHECK-NEXT: vmovx.f16 s2, s16 +; CHECK-NEXT: vmovx.f16 s0, s8 ; CHECK-NEXT: vins.f16 s8, s16 -; CHECK-NEXT: vins.f16 s9, s0 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmov.f32 s9, s0 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll @@ -272,34 +272,34 @@ define arm_aapcs_vfpcc <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_triangle_addmul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmov.f32 s20, s5 -; CHECK-NEXT: vmov.f32 s17, s2 -; CHECK-NEXT: vmov.f32 s21, s7 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmul.f32 q3, q5, q4 -; CHECK-NEXT: vmul.f32 q4, q1, q4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmul.f32 q3, q2, q0 +; CHECK-NEXT: vmov.f32 s24, s0 +; CHECK-NEXT: vmov.f32 s16, s13 +; CHECK-NEXT: vmov.f32 s12, s4 +; CHECK-NEXT: vmov.f32 s25, s2 +; CHECK-NEXT: vmov.f32 s13, s6 ; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmul.f32 q5, q3, q6 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vsub.f32 q4, q5, q4 ; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov q6, q4 -; CHECK-NEXT: vfms.f32 q6, q5, q0 -; CHECK-NEXT: vmov q7, q3 -; CHECK-NEXT: vfma.f32 q3, q1, q0 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmov.f32 s21, s10 -; CHECK-NEXT: vmov.f32 s4, s9 -; CHECK-NEXT: vfma.f32 q7, q5, q0 -; CHECK-NEXT: vmov.f32 s5, s11 -; CHECK-NEXT: vadd.f32 q5, q7, q6 -; CHECK-NEXT: vfms.f32 q4, q1, q0 -; CHECK-NEXT: vmov.f32 s1, s20 -; CHECK-NEXT: vsub.f32 q1, q4, q3 -; CHECK-NEXT: vmov.f32 s3, s21 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vfms.f32 q5, q1, q0 +; CHECK-NEXT: vmul.f32 q1, q1, q6 +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: vfma.f32 q6, q2, q0 +; CHECK-NEXT: vfma.f32 q1, q3, q0 +; CHECK-NEXT: vadd.f32 q2, q6, q5 +; CHECK-NEXT: vsub.f32 q1, q4, q1 ; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmov.f32 s1, s8 ; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll --- a/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll @@ -94,8 +94,8 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: add r2, sp, #16 +; CHECK-NEXT: add r1, sp, #16 +; CHECK-NEXT: mov r2, sp ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vstrw.32 q1, [r2] ; CHECK-NEXT: vldrb.u16 q1, [r2] diff --git a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll --- a/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fp16convertloops.ll @@ -390,11 +390,20 @@ ; CHECK-NEXT: vldrh.u16 q0, 
[r0], #16 ; CHECK-NEXT: vcvtb.f32.f16 q1, q0 ; CHECK-NEXT: vcvtt.f32.f16 q0, q0 -; CHECK-NEXT: vmul.f32 q1, q1, r2 ; CHECK-NEXT: vmul.f32 q0, q0, r2 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vcvtt.f16.f32 q1, q0 -; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: vmul.f32 q1, q1, r2 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s9, s2 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vcvtb.f16.f32 q2, q2 +; CHECK-NEXT: vstrh.32 q2, [r1, #8] +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vmov.f32 s10, s5 +; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vcvtb.f16.f32 q0, q2 +; CHECK-NEXT: vstrh.32 q0, [r1], #16 ; CHECK-NEXT: le lr, .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -428,29 +437,52 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: mov.w lr, #128 ; CHECK-NEXT: movw r2, #26214 ; CHECK-NEXT: movt r2, #16390 +; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: vldrh.u16 q0, [r3, #16]! ; CHECK-NEXT: vcvtb.f32.f16 q1, q0 ; CHECK-NEXT: vcvtt.f32.f16 q0, q0 -; CHECK-NEXT: vmul.f32 q1, q1, r2 ; CHECK-NEXT: vmul.f32 q0, q0, r2 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vcvtt.f16.f32 q1, q0 -; CHECK-NEXT: vldrh.u16 q0, [r0, #16]! -; CHECK-NEXT: vstrh.16 q1, [r1] -; CHECK-NEXT: vcvtb.f32.f16 q1, q0 -; CHECK-NEXT: vcvtt.f32.f16 q0, q0 ; CHECK-NEXT: vmul.f32 q1, q1, r2 -; CHECK-NEXT: vmul.f32 q0, q0, r2 -; CHECK-NEXT: vcvtb.f16.f32 q1, q1 -; CHECK-NEXT: vcvtt.f16.f32 q1, q0 -; CHECK-NEXT: vstrb.8 q1, [r1, #16]! +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s9, s2 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vcvtb.f16.f32 q2, q2 +; CHECK-NEXT: vstrh.32 q2, [r1, #24] +; CHECK-NEXT: vldrh.u16 q2, [r0] +; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vcvtt.f32.f16 q3, q2 +; CHECK-NEXT: vcvtb.f32.f16 q4, q2 +; CHECK-NEXT: vmul.f32 q2, q3, r2 +; CHECK-NEXT: vmul.f32 q3, q4, r2 +; CHECK-NEXT: vmov.f32 s16, s14 +; CHECK-NEXT: vmov.f32 s17, s10 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov.f32 s19, s11 +; CHECK-NEXT: vcvtb.f16.f32 q4, q4 +; CHECK-NEXT: vstrh.32 q4, [r1, #8] +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vmov.f32 s17, s8 +; CHECK-NEXT: vmov.f32 s18, s13 +; CHECK-NEXT: vmov.f32 s19, s9 +; CHECK-NEXT: vcvtb.f16.f32 q2, q4 +; CHECK-NEXT: vstrh.32 q2, [r1] +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vmov.f32 s10, s5 +; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vcvtb.f16.f32 q0, q2 +; CHECK-NEXT: vstrh.32 q0, [r1, #16]! 
; CHECK-NEXT: le lr, .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: br label %vector.body diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll --- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll @@ -299,49 +299,58 @@ define arm_aapcs_vfpcc <1 x i32> @test_signed_v1f64_v1i32(<1 x double> %f) { ; CHECK-LABEL: test_signed_v1f64_v1i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vldr d1, .LCPI8_0 ; CHECK-NEXT: vmov r5, r4, d0 ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vldr d0, .LCPI8_1 -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_d2iz +; CHECK-NEXT: vldr d0, .LCPI8_1 ; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r6, #-2147483648 +; CHECK-NEXT: clz r0, r7 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r6, #-2147483648 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: mov r2, r5 ; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: mvnne r6, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne ; CHECK-NEXT: movne r6, #0 ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI8_0: -; CHECK-NEXT: .long 4290772992 @ double 2147483647 -; CHECK-NEXT: .long 1105199103 -; CHECK-NEXT: .LCPI8_1: ; CHECK-NEXT: .long 0 @ double -2147483648 ; CHECK-NEXT: .long 3252682752 +; CHECK-NEXT: .LCPI8_1: +; CHECK-NEXT: .long 4290772992 @ double 2147483647 +; CHECK-NEXT: .long 1105199103 %x = call <1 x i32> @llvm.fptosi.sat.v1f64.v1i32(<1 x double> %f) ret <1 x i32> %x } @@ -490,112 +499,132 @@ ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: .pad #24 ; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.f32 s18, s0 +; CHECK-NEXT: vmov.f32 s19, s1 ; CHECK-NEXT: vldr d0, .LCPI10_0 -; CHECK-NEXT: vmov r4, r6, d1 -; CHECK-NEXT: vmov r2, r11, d0 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vmov.f32 s19, s5 +; CHECK-NEXT: vmov r10, r7, d1 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vmov.f32 s17, s5 +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: str.w r11, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vldr d0, .LCPI10_1 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: 
str r0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: vmov r2, r8, d0 -; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: str.w r8, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r10, #-2147483648 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: vmov r1, r0, d9 +; CHECK-NEXT: vldr d0, .LCPI10_1 +; CHECK-NEXT: vmov r9, r8, d8 +; CHECK-NEXT: vmov r4, r6, d0 +; CHECK-NEXT: strd r1, r0, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: clz r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r11, #-2147483648 +; CHECK-NEXT: mov r0, r10 ; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: vmov r5, r7, d9 -; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r10, #-2147483648 -; CHECK-NEXT: bl __aeabi_dcmpun +; CHECK-NEXT: str r4, [sp] @ 4-byte Spill +; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r6, #-2147483648 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r10 ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: vmov r9, r8, d8 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r6, #-2147483648 +; CHECK-NEXT: mvnne r11, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r6, #0 -; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r11, #0 +; CHECK-NEXT: ldr.w r10, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: lsrs r5, r0, #5 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r7, #-2147483648 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r7, 
#-2147483648 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: mvnne r7, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun -; CHECK-NEXT: vmov.32 q0[1], r10 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r7, r6 +; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: ldr r4, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: lsr.w r8, r0, #5 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: bl __aeabi_d2lz +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r5, #-2147483648 +; CHECK-NEXT: ldrd r2, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: it ne +; CHECK-NEXT: mvnne r5, #-2147483648 +; CHECK-NEXT: bl __aeabi_dcmpun +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: vmov.32 q0[1], r11 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r7 ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 @@ -603,11 +632,11 @@ ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI10_0: -; CHECK-NEXT: .long 4290772992 @ double 2147483647 -; CHECK-NEXT: .long 1105199103 -; CHECK-NEXT: .LCPI10_1: ; CHECK-NEXT: .long 0 @ double -2147483648 ; CHECK-NEXT: .long 3252682752 +; CHECK-NEXT: .LCPI10_1: +; CHECK-NEXT: .long 4290772992 @ double 2147483647 +; CHECK-NEXT: .long 1105199103 %x = call <3 x i32> @llvm.fptosi.sat.v3f64.v3i32(<3 x double> %f) ret <3 x i32> %x } @@ -623,147 +652,177 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: vmov q5, q1 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI11_0 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov r5, r6, d10 -; CHECK-NEXT: vmov r9, r3, d0 -; CHECK-NEXT: str r3, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vldr d0, .LCPI11_1 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: vmov r4, r6, d10 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: vmov r2, r1, d11 +; CHECK-NEXT: vldr d0, .LCPI11_1 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: vmov r10, r8, d8 +; CHECK-NEXT: vmov r7, r11, d0 +; CHECK-NEXT: lsr.w r9, r0, #5 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: strd r2, r1, [sp, #12] @ 8-byte Folded Spill ; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: str r7, [sp] @ 4-byte 
Spill ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: vmov r11, r1, d11 -; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: vmov r7, r10, d8 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r0, #-2147483648 -; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r0, #-2147483648 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: movne.w r0, #-2147483648 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_dcmpun +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r7, r11 +; CHECK-NEXT: str.w r11, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: ldr.w r8, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r10 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: str r4, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: str.w r9, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: mov r1, r10 -; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r10 -; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r6, #-2147483648 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r1, r10 -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r6, #-2147483648 +; CHECK-NEXT: mvnne r5, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r6, #0 -; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r8, #-2147483648 -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: ldr.w r9, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: str r5, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: lsrs r4, r0, #5 +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: bl __aeabi_d2lz +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r0, #-2147483648 +; CHECK-NEXT: ldr.w r11, [sp] @ 4-byte Reload +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: 
mov r3, r7 ; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp.w r10, #0 -; CHECK-NEXT: vmov r7, r4, d9 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: it ne +; CHECK-NEXT: mvnne r4, #-2147483648 +; CHECK-NEXT: bl __aeabi_dcmpun +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: ldr r5, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r6, r9 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: lsrs r4, r0, #5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: bl __aeabi_d2lz +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r8, #-2147483648 +; CHECK-NEXT: ldr.w r9, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r4, r5 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: vmov r10, r5, d9 ; CHECK-NEXT: it ne ; CHECK-NEXT: mvnne r8, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne ; CHECK-NEXT: movne.w r8, #0 -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: ldr r3, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r3, r6 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: lsrs r6, r0, #5 +; CHECK-NEXT: mov r0, r10 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r5, #-2147483648 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r5, #-2147483648 +; CHECK-NEXT: movne.w r4, #-2147483648 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: it ne +; CHECK-NEXT: mvnne r4, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: vmov 
q0[2], q0[0], r6, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r8 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: ldrd r1, r0, [sp, #20] @ 8-byte Folded Reload +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r4, r8 ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 @@ -771,11 +830,11 @@ ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI11_0: -; CHECK-NEXT: .long 4290772992 @ double 2147483647 -; CHECK-NEXT: .long 1105199103 -; CHECK-NEXT: .LCPI11_1: ; CHECK-NEXT: .long 0 @ double -2147483648 ; CHECK-NEXT: .long 3252682752 +; CHECK-NEXT: .LCPI11_1: +; CHECK-NEXT: .long 4290772992 @ double 2147483647 +; CHECK-NEXT: .long 1105199103 %x = call <4 x i32> @llvm.fptosi.sat.v4f64.v4i32(<4 x double> %f) ret <4 x i32> %x } @@ -792,186 +851,220 @@ ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vldr d0, .LCPI12_0 -; CHECK-NEXT: vmov r5, r4, d4 -; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: vmov r7, r5, d4 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov.f32 s20, s6 -; CHECK-NEXT: vmov.f32 s18, s4 +; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vmov.f32 s20, s4 ; CHECK-NEXT: vmov.f32 s22, s2 -; CHECK-NEXT: vmov.f32 s21, s7 -; CHECK-NEXT: vmov.f32 s19, s5 +; CHECK-NEXT: vmov.f32 s19, s7 +; CHECK-NEXT: vmov.f32 s21, s5 ; CHECK-NEXT: vmov.f32 s23, s3 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: strd r2, r3, [sp, #20] @ 8-byte Folded Spill -; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl __aeabi_d2lz +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vmov r1, r0, d10 ; CHECK-NEXT: vldr d0, .LCPI12_1 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: vmov r6, r8, d9 +; CHECK-NEXT: vmov r10, r3, d0 +; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: vmov r9, r0, d11 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: clz r0, r11 +; CHECK-NEXT: mov r11, r3 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r4, #-2147483648 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: str.w r10, [sp] @ 4-byte Spill +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: it ne +; CHECK-NEXT: mvnne r4, #-2147483648 +; CHECK-NEXT: bl __aeabi_dcmpun +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: str r4, [r0, #16] +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r5, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: mov r0, r5 -; 
CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: lsrs r4, r0, #5 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: vmov r8, r0, d11 -; CHECK-NEXT: cmp.w r10, #0 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: vmov r9, r6, d10 -; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r11, #-2147483648 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r0, #-2147483648 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r10, r11 +; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r11, #-2147483648 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: it ne +; CHECK-NEXT: mvnne r4, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #0 -; CHECK-NEXT: str.w r11, [r7, #16] +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: ldr.w r10, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: str r4, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r4, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: ldr.w r11, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: lsrs r4, r0, #5 ; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: bl __aeabi_d2lz +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r8, #-2147483648 +; CHECK-NEXT: ldr.w r11, [sp] @ 4-byte Reload ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r0, #-2147483648 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r0, #-2147483648 -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: it ne +; CHECK-NEXT: mvnne r8, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r5, #0 -; CHECK-NEXT: str r5, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: mov r6, r11 +; 
CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r8, #0 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: lsrs r5, r0, #5 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r10, #-2147483648 -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r8 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: vmov r11, r4, d9 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r10, #-2147483648 -; CHECK-NEXT: bl __aeabi_dcmpun +; CHECK-NEXT: movne.w r4, #-2147483648 +; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: ldrd r2, r3, [sp, #20] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r7 ; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r7, #-2147483648 -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r7, #-2147483648 +; CHECK-NEXT: mvnne r4, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun -; CHECK-NEXT: vmov r5, r4, d8 +; CHECK-NEXT: vmov r7, r6, d8 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: ldrd r2, r3, [sp, #20] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: ldr r3, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: lsr.w r9, r0, #5 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r6, #-2147483648 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; 
CHECK-NEXT: mvnne r6, #-2147483648 +; CHECK-NEXT: movne.w r5, #-2147483648 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: it ne +; CHECK-NEXT: mvnne r5, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r6, #0 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r6, r7 -; CHECK-NEXT: vmov q0[3], q0[1], r10, r0 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r5, #0 ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r8, r0 +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11} @@ -980,11 +1073,11 @@ ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI12_0: -; CHECK-NEXT: .long 4290772992 @ double 2147483647 -; CHECK-NEXT: .long 1105199103 -; CHECK-NEXT: .LCPI12_1: ; CHECK-NEXT: .long 0 @ double -2147483648 ; CHECK-NEXT: .long 3252682752 +; CHECK-NEXT: .LCPI12_1: +; CHECK-NEXT: .long 4290772992 @ double 2147483647 +; CHECK-NEXT: .long 1105199103 %x = call <5 x i32> @llvm.fptosi.sat.v5f64.v5i32(<5 x double> %f) ret <5 x i32> %x } @@ -1001,221 +1094,263 @@ ; CHECK-NEXT: .pad #40 ; CHECK-NEXT: sub sp, #40 ; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vldr d0, .LCPI13_0 -; CHECK-NEXT: vmov r9, r4, d5 -; CHECK-NEXT: vmov r2, r6, d0 -; CHECK-NEXT: vmov.f32 s22, s8 -; CHECK-NEXT: vmov.f32 s20, s6 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vmov.f32 s24, s2 -; CHECK-NEXT: vmov.f32 s23, s9 -; CHECK-NEXT: vmov.f32 s21, s7 -; CHECK-NEXT: vmov.f32 s19, s5 -; CHECK-NEXT: vmov.f32 s25, s3 -; CHECK-NEXT: str r2, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: str r6, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vldr d0, .LCPI13_1 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: vmov r7, r6, d5 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: strd r2, r3, [sp, #32] @ 8-byte Folded Spill +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s24, s4 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vmov.f32 s21, s9 +; CHECK-NEXT: vmov.f32 s23, s7 +; CHECK-NEXT: vmov.f32 s25, s5 +; CHECK-NEXT: vmov.f32 s19, s3 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: strd r3, r2, [sp, #32] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: vmov r8, r0, d10 -; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: vmov r7, r5, d11 -; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmov r1, r0, d12 +; CHECK-NEXT: vldr d0, .LCPI13_1 +; CHECK-NEXT: vmov r4, r8, d10 +; CHECK-NEXT: vmov r10, r3, d0 +; CHECK-NEXT: strd r1, r0, [sp, #20] @ 8-byte Folded Spill +; CHECK-NEXT: vmov r1, r0, d11 +; CHECK-NEXT: 
mov r2, r10 ; CHECK-NEXT: strd r1, r0, [sp, #12] @ 8-byte Folded Spill -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r10, #-2147483648 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: clz r0, r9 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r9, r3 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r5, #-2147483648 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r10, #-2147483648 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: it ne +; CHECK-NEXT: mvnne r5, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: ldr.w r11, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: str.w r11, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: str.w r5, [r11, #20] +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: ldrd r7, r2, [sp, #32] @ 8-byte Folded Reload +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: lsrs r6, r0, #5 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: bl __aeabi_d2lz +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r5, #-2147483648 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: vmov r9, r1, d9 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: it ne +; CHECK-NEXT: mvnne r5, #-2147483648 +; CHECK-NEXT: bl __aeabi_dcmpun +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: str.w r5, [r11, #16] +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r11, r7 +; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr.w r8, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: lsrs r4, r0, #5 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: bl __aeabi_d2lz +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r0, #-2147483648 +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: str.w r10, [r11, #20] -; CHECK-NEXT: ldr.w r10, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldrd r2, r3, [sp, #32] @ 8-byte Folded Reload -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: it ne +; CHECK-NEXT: mvnne r4, #-2147483648 +; 
CHECK-NEXT: bl __aeabi_dcmpun +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: str r4, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: mov r6, r8 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: lsrs r4, r0, #5 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r6, #-2147483648 -; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r8, #-2147483648 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r6, #-2147483648 +; CHECK-NEXT: mvnne r8, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r6, #0 -; CHECK-NEXT: str.w r6, [r11, #16] -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: ldr.w r11, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r8, #0 +; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: ldr r5, [sp, #24] @ 4-byte Reload ; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r7, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: ldr r5, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: lsrs r4, r0, #5 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r10, #-2147483648 -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r8 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: mov r11, r10 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r10, #-2147483648 -; CHECK-NEXT: bl __aeabi_dcmpun +; CHECK-NEXT: movne.w r6, #-2147483648 +; CHECK-NEXT: mov r10, r7 +; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r10, #0 -; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r7 -; 
CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r5, r6 -; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r8, #-2147483648 -; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: vmov r7, r6, d9 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r8, #-2147483648 +; CHECK-NEXT: mvnne r6, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun +; CHECK-NEXT: vmov r7, r4, d8 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r8, #0 -; CHECK-NEXT: ldr.w r11, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldrd r2, r3, [sp, #32] @ 8-byte Folded Reload -; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r6, #0 +; CHECK-NEXT: ldrd r3, r2, [sp, #32] @ 8-byte Folded Reload ; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: lsr.w r9, r0, #5 ; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r4, #-2147483648 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r4, #-2147483648 -; CHECK-NEXT: bl __aeabi_dcmpun -; CHECK-NEXT: vmov r7, r6, d8 +; CHECK-NEXT: movne.w r5, #-2147483648 +; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r4, #0 -; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldrd r2, r3, [sp, #32] @ 8-byte Folded Reload -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r5, #-2147483648 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r6 -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: it ne ; CHECK-NEXT: mvnne r5, #-2147483648 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne ; CHECK-NEXT: movne r5, #0 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov q0[3], q0[1], r8, r10 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 +; CHECK-NEXT: vmov q0[3], q0[1], r8, r0 +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: 
vstrw.32 q0, [r0] ; CHECK-NEXT: add sp, #40 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} @@ -1224,11 +1359,11 @@ ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI13_0: -; CHECK-NEXT: .long 4290772992 @ double 2147483647 -; CHECK-NEXT: .long 1105199103 -; CHECK-NEXT: .LCPI13_1: ; CHECK-NEXT: .long 0 @ double -2147483648 ; CHECK-NEXT: .long 3252682752 +; CHECK-NEXT: .LCPI13_1: +; CHECK-NEXT: .long 4290772992 @ double 2147483647 +; CHECK-NEXT: .long 1105199103 %x = call <6 x i32> @llvm.fptosi.sat.v6f64.v6i32(<6 x double> %f) ret <6 x i32> %x } @@ -2646,93 +2781,109 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI32_0 ; CHECK-NEXT: vmov r8, r7, d8 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: strd r2, r3, [sp, #12] @ 8-byte Folded Spill -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vldr d0, .LCPI32_1 -; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vmov r4, r5, d0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: strd r2, r3, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_d2iz -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: cmp.w r10, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r11, #-1 +; CHECK-NEXT: vldr d0, .LCPI32_1 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: clz r0, r10 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: vmov r6, r11, d9 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: str r2, [sp] @ 4-byte Spill +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r9, #-1 +; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: mov r2, r8 ; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: vmov r6, r5, d9 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #0 +; CHECK-NEXT: movne.w r9, #0 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r11, #0 -; CHECK-NEXT: and r0, r11, #1 -; CHECK-NEXT: ldrd r2, r3, [sp, #12] @ 8-byte Folded Reload +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r9, #0 +; CHECK-NEXT: and r0, r9, #1 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: bfi r4, r0, #0, #1 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldrd r2, r3, [sp, #4] @ 8-byte Folded Reload -; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: bfi r7, r0, #0, #1 ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: lsrs r5, r0, #5 ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_d2iz -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: it eq -; CHECK-NEXT: moveq.w r7, #-1 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: cmp 
r5, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r4, #-1 +; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r5 -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: movne r4, #0 ; CHECK-NEXT: bl __aeabi_dcmpun ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne r7, #0 -; CHECK-NEXT: and r0, r7, #1 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: and r0, r4, #1 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: bfi r4, r0, #1, #1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: strb r4, [r0] -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: bfi r7, r0, #1, #1 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: strb r7, [r0] +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI32_0: -; CHECK-NEXT: .long 0 @ double 0 -; CHECK-NEXT: .long 0 -; CHECK-NEXT: .LCPI32_1: ; CHECK-NEXT: .long 0 @ double -1 ; CHECK-NEXT: .long 3220176896 +; CHECK-NEXT: .LCPI32_1: +; CHECK-NEXT: .long 0 @ double 0 +; CHECK-NEXT: .long 0 %x = call <2 x i1> @llvm.fptosi.sat.v2f64.v2i1(<2 x double> %f) ret <2 x i1> %x } diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll --- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll @@ -289,32 +289,38 @@ ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vldr d0, .LCPI8_1 +; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: bl __aeabi_d2uiz +; CHECK-NEXT: vldr d0, .LCPI8_1 ; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: clz r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: bl __aeabi_d2uiz -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r0, r0, r7, ne -; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: lsrs r0, r0, #5 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 +; CHECK-NEXT: movne r7, #0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r7, #-1 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI8_0: -; CHECK-NEXT: .long 4292870144 @ double 4294967295 -; CHECK-NEXT: .long 1106247679 -; CHECK-NEXT: .LCPI8_1: ; CHECK-NEXT: .long 0 @ double 0 ; CHECK-NEXT: .long 0 +; CHECK-NEXT: .LCPI8_1: +; CHECK-NEXT: .long 4292870144 @ double 4294967295 +; CHECK-NEXT: .long 1106247679 %x = call <1 x i32> @llvm.fptoui.sat.v1f64.v1i32(<1 x double> %f) ret <1 x i32> %x } @@ -430,99 +436,114 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad 
#16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vmov.f32 s18, s0 ; CHECK-NEXT: vmov.f32 s19, s1 ; CHECK-NEXT: vldr d0, .LCPI10_0 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vmov r9, r7, d0 +; CHECK-NEXT: vmov r5, r10, d1 +; CHECK-NEXT: vmov r4, r7, d0 ; CHECK-NEXT: vmov.f32 s16, s4 ; CHECK-NEXT: vmov.f32 s17, s5 -; CHECK-NEXT: str.w r9, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: str r4, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vldr d0, .LCPI10_1 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: vmov r11, r3, d0 -; CHECK-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: mov r2, r11 ; CHECK-NEXT: bl __aeabi_dcmpge ; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r10 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov r10, r8, d8 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: csel r0, r0, r6, ne -; CHECK-NEXT: mov r2, r9 -; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: vmov r1, r0, d9 +; CHECK-NEXT: vldr d0, .LCPI10_1 +; CHECK-NEXT: vmov r9, r8, d8 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: strd r1, r0, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: clz r0, r6 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: lsrs r0, r0, #5 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: vmov r5, r4, d9 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: movne.w r11, #0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r10, r2 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r11, #-1 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: lsrs r6, r0, #5 +; CHECK-NEXT: mov r0, r9 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: csel r6, r0, r9, ne -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r6, #-1 -; CHECK-NEXT: ldrd r2, r3, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: ldr.w r9, [sp] @ 4-byte Reload +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r5, #-1 +; CHECK-NEXT: ldr.w r8, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: 
ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r6 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r4, r6 +; CHECK-NEXT: lsrs r7, r0, #5 +; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: bl __aeabi_d2ulz +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r0, r0, r7, ne -; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r6 -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: movne r6, #0 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: vmov.32 q0[1], r11 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r6, #-1 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r5 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI10_0: -; CHECK-NEXT: .long 4292870144 @ double 4294967295 -; CHECK-NEXT: .long 1106247679 -; CHECK-NEXT: .LCPI10_1: ; CHECK-NEXT: .long 0 @ double 0 ; CHECK-NEXT: .long 0 +; CHECK-NEXT: .LCPI10_1: +; CHECK-NEXT: .long 4292870144 @ double 4294967295 +; CHECK-NEXT: .long 1106247679 %x = call <3 x i32> @llvm.fptoui.sat.v3f64.v3i32(<3 x double> %f) ret <3 x i32> %x } @@ -538,106 +559,129 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: .pad #24 ; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: vmov q5, q1 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI11_0 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov r7, r9, d0 -; CHECK-NEXT: vmov r4, r5, d10 -; CHECK-NEXT: str.w r9, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vldr d0, .LCPI11_1 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: vmov r6, r5, d10 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: strd r2, r3, [sp, #16] @ 8-byte Folded Spill +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r9, r3 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov r10, r8, d8 -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: csel r0, r0, r6, ne -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vldr d0, .LCPI11_1 +; CHECK-NEXT: vmov r11, r1, d11 +; CHECK-NEXT: vmov r7, r10, d8 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: clz r1, r8 +; CHECK-NEXT: lsrs r1, r1, #5 +; CHECK-NEXT: str r2, [sp] @ 4-byte Spill ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: vmov r11, r5, d11 -; CHECK-NEXT: mov r4, r7 -; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: movne r0, #0 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 +; 
CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: mov r1, r8 -; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r4, #-1 +; CHECK-NEXT: str r4, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r6, r9 +; CHECK-NEXT: str.w r9, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: lsrs r5, r0, #5 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp.w r9, #0 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: csel r8, r0, r9, ne -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r8, #-1 -; CHECK-NEXT: ldr.w r10, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: movne.w r9, #0 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: ldr.w r10, [sp] @ 4-byte Reload +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r9, #-1 +; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r6 -; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r6, r7 +; CHECK-NEXT: lsrs r5, r0, #5 ; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov r4, r5, d9 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r6, r0, r7, ne -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: cmp r5, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r6, #-1 -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: movne.w r8, #0 +; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: ldr.w r11, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: vmov r7, r4, d9 +; CHECK-NEXT: mov r3, r11 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldrd r2, r3, [sp, #16] @ 8-byte Folded Reload -; CHECK-NEXT: mov r9, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r8, #-1 +; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: lsrs r6, r0, #5 +; CHECK-NEXT: mov 
r0, r7 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r0, r0, r7, ne -; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[2], q0[0], r8, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r6 +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r5, #-1 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r9, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r8 ; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 @@ -645,11 +689,11 @@ ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI11_0: -; CHECK-NEXT: .long 4292870144 @ double 4294967295 -; CHECK-NEXT: .long 1106247679 -; CHECK-NEXT: .LCPI11_1: ; CHECK-NEXT: .long 0 @ double 0 ; CHECK-NEXT: .long 0 +; CHECK-NEXT: .LCPI11_1: +; CHECK-NEXT: .long 4292870144 @ double 4294967295 +; CHECK-NEXT: .long 1106247679 %x = call <4 x i32> @llvm.fptoui.sat.v4f64.v4i32(<4 x double> %f) ret <4 x i32> %x } @@ -666,146 +710,170 @@ ; CHECK-NEXT: .pad #40 ; CHECK-NEXT: sub sp, #40 ; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r10, r0 ; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vldr d0, .LCPI12_0 -; CHECK-NEXT: vmov r5, r6, d4 -; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: vmov r5, r4, d4 +; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: vmov.f32 s20, s6 -; CHECK-NEXT: vmov.f32 s18, s4 +; CHECK-NEXT: vmov.f32 s18, s6 +; CHECK-NEXT: vmov.f32 s20, s4 ; CHECK-NEXT: vmov.f32 s22, s2 -; CHECK-NEXT: vmov.f32 s21, s7 -; CHECK-NEXT: vmov.f32 s19, s5 +; CHECK-NEXT: vmov.f32 s19, s7 +; CHECK-NEXT: vmov.f32 s21, s5 ; CHECK-NEXT: vmov.f32 s23, s3 ; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: strd r2, r3, [sp, #32] @ 8-byte Folded Spill -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vldr d0, .LCPI12_1 -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: vmov r7, r3, d0 -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: strd r2, r3, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov r8, r1, d11 -; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: vmov r6, r9, d10 -; CHECK-NEXT: csel r0, r0, r11, ne -; CHECK-NEXT: cmp.w r10, #0 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: vmov r2, r1, d9 -; CHECK-NEXT: strd r2, r1, [sp, #16] @ 8-byte Folded Spill +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: vmov r1, r0, d10 +; CHECK-NEXT: vldr d0, .LCPI12_1 +; CHECK-NEXT: vmov r7, r6, d9 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: strd r1, r0, [sp, #24] @ 8-byte Folded Spill +; CHECK-NEXT: vmov r11, r0, d11 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: str r2, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: clz r0, r8 +; CHECK-NEXT: lsrs r0, r0, #5 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; 
CHECK-NEXT: str r0, [r4, #16] -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: ldr r5, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r1, r9 -; CHECK-NEXT: ldr.w r10, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: movne.w r9, #0 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r5, r3 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r2, r7 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r9 -; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r9 -; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: csel r0, r0, r4, ne -; CHECK-NEXT: cmp.w r11, #0 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r3, r10 -; CHECK-NEXT: mov r11, r10 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr.w r10, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r9, #-1 +; CHECK-NEXT: str.w r9, [r10, #16] +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: ldr.w r10, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: mov r5, r6 +; CHECK-NEXT: ldr.w r8, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r9, r7 +; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: lsrs r4, r0, #5 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp r6, #0 -; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: csel r0, r0, r6, ne ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: ldr.w r8, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov r1, r4 -; CHECK-NEXT: mov r2, r8 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: movne r0, #0 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: ldr r7, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r6, r5 +; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: mov r2, r7 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r4, #-1 +; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: ldr r4, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: mov r5, r11 ; CHECK-NEXT: mov r11, r10 +; CHECK-NEXT: mov r10, r8 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: lsr.w r9, r0, #5 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: mov r2, r8 -; CHECK-NEXT: csel r4, r0, r5, ne -; CHECK-NEXT: vmov r5, r6, d8 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: 
it ne -; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: ldr r3, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: mov r8, r0 ; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: cmp.w r9, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r8, #0 +; CHECK-NEXT: mov r9, r7 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r8, #-1 +; CHECK-NEXT: ldr r5, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r7, r6 +; CHECK-NEXT: lsrs r4, r0, #5 +; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r0, r0, r7, ne +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r6, #0 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: ldr.w r9, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: vmov r4, r7, d8 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r6, #-1 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: lsr.w r10, r0, #5 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: bl __aeabi_d2ulz +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: cmp.w r10, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: ldr r2, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r5, #-1 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 +; CHECK-NEXT: vmov q0[3], q0[1], r8, r0 +; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: add sp, #40 ; CHECK-NEXT: vpop {d8, d9, d10, d11} @@ -814,11 +882,11 @@ ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI12_0: -; CHECK-NEXT: .long 4292870144 @ double 4294967295 -; CHECK-NEXT: .long 1106247679 -; CHECK-NEXT: .LCPI12_1: ; CHECK-NEXT: .long 0 @ double 0 ; CHECK-NEXT: .long 0 +; CHECK-NEXT: .LCPI12_1: +; CHECK-NEXT: .long 4292870144 @ double 4294967295 +; CHECK-NEXT: .long 1106247679 %x = call <5 x i32> @llvm.fptoui.sat.v5f64.v5i32(<5 x double> %f) ret <5 x i32> %x } @@ -832,188 +900,223 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: vmov.f32 s16, s0 
-; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vldr d0, .LCPI13_0 -; CHECK-NEXT: vmov r5, r6, d5 -; CHECK-NEXT: vmov r11, r3, d0 -; CHECK-NEXT: vmov.f32 s22, s8 -; CHECK-NEXT: vmov.f32 s20, s6 +; CHECK-NEXT: vmov r6, r8, d5 +; CHECK-NEXT: vmov r2, r7, d0 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s22, s6 ; CHECK-NEXT: vmov.f32 s18, s4 ; CHECK-NEXT: vmov.f32 s24, s2 -; CHECK-NEXT: vmov.f32 s23, s9 -; CHECK-NEXT: vmov.f32 s21, s7 +; CHECK-NEXT: vmov.f32 s21, s9 +; CHECK-NEXT: vmov.f32 s23, s7 ; CHECK-NEXT: vmov.f32 s19, s5 ; CHECK-NEXT: vmov.f32 s25, s3 -; CHECK-NEXT: str r3, [sp, #36] @ 4-byte Spill -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: str.w r11, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: str r2, [sp, #44] @ 4-byte Spill +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: bl __aeabi_dcmpge +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: bl __aeabi_d2ulz +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: vmov r1, r0, d12 ; CHECK-NEXT: vldr d0, .LCPI13_1 -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: vmov r4, r9, d0 -; CHECK-NEXT: str r4, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: vmov r4, r11, d10 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: strd r1, r0, [sp, #28] @ 8-byte Folded Spill +; CHECK-NEXT: vmov r1, r0, d11 +; CHECK-NEXT: str r3, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: strd r1, r0, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: clz r0, r10 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: mov r10, r2 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r9, #0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r9, #-1 +; CHECK-NEXT: str r5, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: str.w r9, [r5, #20] +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: ldr.w r9, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: mov r8, r7 +; CHECK-NEXT: mov r2, r9 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: lsrs r7, r0, #5 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov r10, r1, d10 -; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: vmov r5, r6, d11 -; CHECK-NEXT: csel r0, r0, r8, ne +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: vmov r1, r0, d9 ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: vmov r2, r1, d12 -; CHECK-NEXT: strd r2, r1, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: strd r1, r0, [sp, #20] @ 8-byte Folded Spill ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r7, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: str r0, [r7, #20] -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: ldr.w r8, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: movne r6, #0 +; CHECK-NEXT: ldr r7, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: str.w r10, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: bl __aeabi_dcmpgt -; 
CHECK-NEXT: mov r11, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r6, #-1 +; CHECK-NEXT: str r6, [r5, #16] +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: mov r5, r8 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r8, r4 +; CHECK-NEXT: lsr.w r11, r0, #5 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov r2, r1, d9 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r0, r0, r4, ne ; CHECK-NEXT: cmp.w r11, #0 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: strd r2, r1, [sp, #4] @ 8-byte Folded Spill ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: str r0, [r7, #16] -; CHECK-NEXT: mov r0, r10 -; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: ldr.w r11, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r5, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: movne r0, #0 ; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r0, r8 ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r4, #-1 +; CHECK-NEXT: ldr r7, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: ldr r6, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: str r4, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: mov r8, r9 -; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r10, r5 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: lsrs r4, r0, #5 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: csel r0, r0, r7, ne +; CHECK-NEXT: mov r11, r0 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r3, r4 -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-NEXT: mov r0, r9 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: mov r2, r5 -; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: mov r6, r7 -; CHECK-NEXT: mov r10, r5 -; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: movne.w r11, #0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: mov r2, r11 -; CHECK-NEXT: csel r9, r0, r7, ne -; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: ldr.w r9, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: bl __aeabi_dcmpgt ; 
CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r9, #-1 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r11, #-1 +; CHECK-NEXT: ldr r6, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: ldr r5, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: clz r0, r0 ; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r10, r6 +; CHECK-NEXT: lsrs r4, r0, #5 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov r5, r6, d8 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r4, r0, r7, ne -; CHECK-NEXT: cmp.w r11, #0 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r4, #-1 -; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #36] @ 4-byte Reload -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: movne r6, #0 +; CHECK-NEXT: mov r10, r7 +; CHECK-NEXT: mov r8, r9 ; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: mov r10, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: vmov r4, r7, d8 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r6, #-1 +; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r7 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: lsr.w r9, r0, #5 +; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csel r0, r0, r7, ne -; CHECK-NEXT: cmp.w r10, #0 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r10 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: cmp.w r9, #0 ; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r0, #-1 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov q0[3], q0[1], r9, r0 -; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: movne r5, #0 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r5, #-1 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 +; CHECK-NEXT: vmov q0[3], q0[1], r11, r0 +; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI13_0: -; CHECK-NEXT: .long 4292870144 @ double 4294967295 -; CHECK-NEXT: .long 1106247679 -; CHECK-NEXT: .LCPI13_1: ; CHECK-NEXT: .long 0 @ double 0 ; CHECK-NEXT: .long 0 +; 
CHECK-NEXT: .LCPI13_1: +; CHECK-NEXT: .long 4292870144 @ double 4294967295 +; CHECK-NEXT: .long 1106247679 %x = call <6 x i32> @llvm.fptoui.sat.v6f64.v6i32(<6 x double> %f) ret <6 x i32> %x } @@ -2125,57 +2228,67 @@ ; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vldr d0, .LCPI32_0 -; CHECK-NEXT: vmov r5, r6, d8 +; CHECK-NEXT: vmov r7, r8, d8 ; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: vmov r10, r9, d0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: mov r2, r10 -; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: vldr d0, .LCPI32_1 -; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 -; CHECK-NEXT: vmov r4, r11, d0 +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r5 -; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r8 ; CHECK-NEXT: bl __aeabi_d2uiz -; CHECK-NEXT: vmov r6, r5, d9 -; CHECK-NEXT: cmp.w r8, #0 -; CHECK-NEXT: csel r0, r0, r8, ne -; CHECK-NEXT: cmp r7, #0 +; CHECK-NEXT: vldr d0, .LCPI32_1 +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: clz r0, r9 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: lsrs r0, r0, #5 +; CHECK-NEXT: vmov r6, r11, d9 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r10, #0 +; CHECK-NEXT: mov r9, r2 +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne.w r10, #1 +; CHECK-NEXT: and r0, r10, #1 ; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: and r0, r0, #1 -; CHECK-NEXT: mov r2, r10 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: mov r1, r11 ; CHECK-NEXT: bfi r7, r0, #0, #1 ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 -; CHECK-NEXT: bl __aeabi_dcmpgt -; CHECK-NEXT: mov r8, r0 -; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: mov r3, r5 ; CHECK-NEXT: bl __aeabi_dcmpge -; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: clz r0, r0 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: lsrs r5, r0, #5 ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_d2uiz -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csel r0, r0, r4, ne -; CHECK-NEXT: cmp.w r8, #0 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r4, #0 +; CHECK-NEXT: bl __aeabi_dcmpgt +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: it ne ; CHECK-NEXT: movne r0, #1 -; CHECK-NEXT: and r0, r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r4, #1 +; CHECK-NEXT: and r0, r4, #1 ; CHECK-NEXT: rsbs r0, r0, #0 ; CHECK-NEXT: bfi r7, r0, #1, #1 ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload @@ -2187,11 +2300,11 @@ ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI32_0: -; CHECK-NEXT: .long 0 @ double 1 -; CHECK-NEXT: .long 1072693248 -; CHECK-NEXT: .LCPI32_1: ; CHECK-NEXT: .long 0 @ double 0 ; CHECK-NEXT: .long 0 +; CHECK-NEXT: .LCPI32_1: +; CHECK-NEXT: .long 0 @ double 1 +; CHECK-NEXT: .long 1072693248 %x = call <2 x i1> @llvm.fptoui.sat.v2f64.v2i1(<2 x double> %f) ret <2 x i1> %x } diff --git 
a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -683,7 +683,7 @@ ; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: add r2, sp, #120 +; CHECK-NEXT: add r2, sp, #88 ; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill ; CHECK-NEXT: .LBB12_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -711,7 +711,7 @@ ; CHECK-NEXT: vmov r6, r2, d4 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh.w r12, [r4] -; CHECK-NEXT: add r4, sp, #88 +; CHECK-NEXT: add r4, sp, #120 ; CHECK-NEXT: ldrh.w r11, [r5] ; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh r5, [r6] diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -33,15 +33,14 @@ ; CHECK-LABEL: unscaled_v2i8_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrb r2, [r1] -; CHECK-NEXT: vmov.i32 q0, #0xff ; CHECK-NEXT: ldrb r1, [r1, #1] -; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrb r1, [r0, r1] -; CHECK-NEXT: ldrb r0, [r0, r2] -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 +; CHECK-NEXT: add r2, r0 +; CHECK-NEXT: adds r3, r0, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: ldrb r0, [r0, r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 ; CHECK-NEXT: bx lr entry: %offs = load <2 x i8>, ptr %offptr, align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -6,10 +6,12 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i32(ptr %offptr) { ; CHECK-LABEL: ptr_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r0, [r0] -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vldr s2, [r0, #4] +; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vldr s2, [r0] +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vldr s0, [r0] ; CHECK-NEXT: bx lr entry: %offs = load <2 x ptr>, ptr %offptr, align 4 @@ -32,27 +34,23 @@ define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(ptr %offptr) { ; CHECK-LABEL: ptr_v8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: vmov r3, r12, d0 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r0, lr, d1 -; CHECK-NEXT: ldr r7, [r2] -; CHECK-NEXT: vmov r2, r4, d0 -; CHECK-NEXT: ldr r6, [r1] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr.w r1, [r12] -; CHECK-NEXT: vmov q1[2], q1[0], r3, r6 -; CHECK-NEXT: ldr.w r5, [lr] -; CHECK-NEXT: vmov q1[3], q1[1], r1, r7 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: ldr r4, [r4] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vmov r12, r2, d1 +; CHECK-NEXT: vmov lr, r1, d0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r0, r3, d1 +; CHECK-NEXT: vmov 
r4, r5, d0 +; CHECK-NEXT: vldr s3, [r2] +; CHECK-NEXT: vldr s2, [r12] +; CHECK-NEXT: vldr s1, [r1] +; CHECK-NEXT: vldr s0, [lr] +; CHECK-NEXT: vldr s7, [r3] +; CHECK-NEXT: vldr s6, [r0] +; CHECK-NEXT: vldr s5, [r5] +; CHECK-NEXT: vldr s4, [r4] +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x ptr>, ptr %offptr, align 4 %gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %offs, i32 4, <8 x i1> , <8 x i32> undef) @@ -64,42 +62,34 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov r1, r2, d1 -; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: ldr r7, [r2] -; CHECK-NEXT: vmov r2, r6, d0 -; CHECK-NEXT: ldr.w r12, [r1] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: ldr r4, [r4] -; CHECK-NEXT: ldr r5, [r5] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r12 -; CHECK-NEXT: ldr.w r1, [lr] -; CHECK-NEXT: vmov q3[3], q3[1], r1, r7 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: ldr r6, [r6] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r4 -; CHECK-NEXT: vmov r2, r4, d3 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r5 -; CHECK-NEXT: vmov r6, r5, d2 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: ldr r6, [r6] -; CHECK-NEXT: ldr r5, [r5] -; CHECK-NEXT: vmov q1[2], q1[0], r6, r2 -; CHECK-NEXT: ldr r6, [r4] -; CHECK-NEXT: vmov r0, r2, d5 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r6 -; CHECK-NEXT: vmov r6, r5, d4 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r6, [r6] -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: ldr r5, [r5] -; CHECK-NEXT: vmov q2[2], q2[0], r6, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov r12, r3, d0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov lr, r4, d1 +; CHECK-NEXT: vmov r7, r5, d0 +; CHECK-NEXT: vldr s3, [r2] +; CHECK-NEXT: vldr s2, [r1] +; CHECK-NEXT: vmov r1, r2, d3 +; CHECK-NEXT: vldr s1, [r3] +; CHECK-NEXT: vmov r3, r0, d2 +; CHECK-NEXT: vldr s7, [r4] +; CHECK-NEXT: vmov r4, r6, d5 +; CHECK-NEXT: vldr s5, [r5] +; CHECK-NEXT: vldr s4, [r7] +; CHECK-NEXT: vmov r7, r5, d4 +; CHECK-NEXT: vldr s0, [r12] +; CHECK-NEXT: vldr s6, [lr] +; CHECK-NEXT: vldr s11, [r2] +; CHECK-NEXT: vldr s10, [r1] +; CHECK-NEXT: vldr s9, [r0] +; CHECK-NEXT: vldr s8, [r3] +; CHECK-NEXT: vldr s15, [r6] +; CHECK-NEXT: vldr s14, [r4] +; CHECK-NEXT: vldr s13, [r5] +; CHECK-NEXT: vldr s12, [r7] ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %offs = load <16 x ptr>, ptr %offptr, align 4 @@ -112,9 +102,12 @@ define arm_aapcs_vfpcc <2 x float> @ptr_v2f32(ptr %offptr) { ; CHECK-LABEL: ptr_v2f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r0, [r0] +; CHECK-NEXT: vldr s2, [r0, #4] +; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vldr s0, [r1] +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vldr s0, [r0] ; CHECK-NEXT: bx lr entry: %offs = load <2 x ptr>, ptr %offptr, align 4 @@ -199,12 +192,15 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_sext(ptr %offptr) { ; CHECK-LABEL: ptr_v2i16_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r0, [r0] -; CHECK-NEXT: ldrsh.w r0, [r0] +; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldr s2, [r0, #4] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: ldrsh.w r1, [r1] +; CHECK-NEXT: ldrsh.w r0, [r0] ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; 
CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: asrs r0, r0, #31 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-NEXT: bx lr entry: @@ -217,12 +213,15 @@ define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_zext(ptr %offptr) { ; CHECK-LABEL: ptr_v2i16_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r0, [r0] -; CHECK-NEXT: vmov.i64 q0, #0xffff -; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: vldr s2, [r0, #4] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: bx lr entry: %offs = load <2 x ptr>, ptr %offptr, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll @@ -9,12 +9,10 @@ ; NOGATSCAT-NEXT: vadd.i32 q0, q0, r0 ; NOGATSCAT-NEXT: vmov r0, r1, d1 ; NOGATSCAT-NEXT: vmov r2, r3, d0 -; NOGATSCAT-NEXT: ldr r0, [r0] -; NOGATSCAT-NEXT: ldr r2, [r2] -; NOGATSCAT-NEXT: ldr r1, [r1] -; NOGATSCAT-NEXT: ldr r3, [r3] -; NOGATSCAT-NEXT: vmov q0[2], q0[0], r2, r0 -; NOGATSCAT-NEXT: vmov q0[3], q0[1], r3, r1 +; NOGATSCAT-NEXT: vldr s3, [r1] +; NOGATSCAT-NEXT: vldr s2, [r0] +; NOGATSCAT-NEXT: vldr s1, [r3] +; NOGATSCAT-NEXT: vldr s0, [r2] ; NOGATSCAT-NEXT: bx lr ; ; NOMVE-LABEL: unscaled_i32_i32_gather: diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -387,31 +387,31 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r4, .LCPI8_0 ; CHECK-NEXT: movs r5, #18 -; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: mov.w r12, #9 -; CHECK-NEXT: mov.w lr, #12 -; CHECK-NEXT: movs r4, #8 -; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vdup.32 q1, r5 +; CHECK-NEXT: mov.w lr, #8 +; CHECK-NEXT: movs r4, #3 +; CHECK-NEXT: vdup.32 q0, r5 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vadd.i32 q3, q2, r4 -; CHECK-NEXT: vmla.i32 q4, q2, lr +; CHECK-NEXT: vmul.i32 q3, q1, r4 +; CHECK-NEXT: vadd.i32 q2, q1, lr +; CHECK-NEXT: vshl.i32 q3, q3, #2 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vldrw.u32 q5, [q4, #24] -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmla.i32 q4, q2, r12 -; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vstrb.8 q5, [r1], #16 -; CHECK-NEXT: vstrw.32 q4, [r3] +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vldrw.u32 q4, [q3, #24] +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmla.i32 q3, q1, r12 +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vstrb.8 q4, [r1], #16 +; CHECK-NEXT: vstrw.32 q3, [r3] ; CHECK-NEXT: bne .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: diff --git a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll 
b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll --- a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll +++ b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll @@ -44,13 +44,37 @@ define <8 x i16> @inserti8_first_sext(ptr %p) { ; CHECKLE-LABEL: inserti8_first_sext: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrb.s16 q0, [r0] +; CHECKLE-NEXT: vldrb.s16 q1, [r0, #1] +; CHECKLE-NEXT: ldrsb.w r1, [r0] +; CHECKLE-NEXT: vmovx.f16 s10, s5 +; CHECKLE-NEXT: vmovx.f16 s8, s4 +; CHECKLE-NEXT: vins.f16 s10, s6 +; CHECKLE-NEXT: vmovx.f16 s6, s6 +; CHECKLE-NEXT: vmov.16 q0[0], r1 +; CHECKLE-NEXT: vins.f16 s8, s5 +; CHECKLE-NEXT: vins.f16 s6, s7 +; CHECKLE-NEXT: vmov.f32 s1, s8 +; CHECKLE-NEXT: vmov.f32 s2, s10 +; CHECKLE-NEXT: vins.f16 s0, s4 +; CHECKLE-NEXT: vmov.f32 s3, s6 ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti8_first_sext: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrb.s16 q1, [r0] -; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: vldrb.s16 q0, [r0, #1] +; CHECKBE-NEXT: ldrsb.w r1, [r0] +; CHECKBE-NEXT: vmovx.f16 s6, s1 +; CHECKBE-NEXT: vmovx.f16 s4, s0 +; CHECKBE-NEXT: vins.f16 s6, s2 +; CHECKBE-NEXT: vmovx.f16 s2, s2 +; CHECKBE-NEXT: vmov.16 q2[0], r1 +; CHECKBE-NEXT: vins.f16 s4, s1 +; CHECKBE-NEXT: vins.f16 s2, s3 +; CHECKBE-NEXT: vins.f16 s8, s0 +; CHECKBE-NEXT: vmov.f32 s9, s4 +; CHECKBE-NEXT: vmov.f32 s10, s6 +; CHECKBE-NEXT: vmov.f32 s11, s2 +; CHECKBE-NEXT: vrev64.16 q0, q2 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q @@ -65,12 +89,32 @@ define <8 x i16> @inserti8_last_sext(ptr %p) { ; CHECKLE-LABEL: inserti8_last_sext: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrb.s16 q0, [r0, #1] +; CHECKLE-NEXT: vldrb.s16 q1, [r0] +; CHECKLE-NEXT: ldrsb.w r1, [r0, #8] +; CHECKLE-NEXT: vmovx.f16 s0, s4 +; CHECKLE-NEXT: vmovx.f16 s1, s5 +; CHECKLE-NEXT: vmovx.f16 s2, s6 +; CHECKLE-NEXT: vins.f16 s0, s5 +; CHECKLE-NEXT: vins.f16 s1, s6 +; CHECKLE-NEXT: vins.f16 s2, s7 +; CHECKLE-NEXT: vmov.u16 r0, q1[7] +; CHECKLE-NEXT: vmov.16 q0[6], r0 +; CHECKLE-NEXT: vmov.16 q0[7], r1 ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti8_last_sext: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrb.s16 q1, [r0, #1] +; CHECKBE-NEXT: vldrb.s16 q0, [r0] +; CHECKBE-NEXT: ldrsb.w r1, [r0, #8] +; CHECKBE-NEXT: vmovx.f16 s4, s0 +; CHECKBE-NEXT: vmovx.f16 s5, s1 +; CHECKBE-NEXT: vmovx.f16 s6, s2 +; CHECKBE-NEXT: vins.f16 s4, s1 +; CHECKBE-NEXT: vins.f16 s5, s2 +; CHECKBE-NEXT: vins.f16 s6, s3 +; CHECKBE-NEXT: vmov.u16 r0, q0[7] +; CHECKBE-NEXT: vmov.16 q1[6], r0 +; CHECKBE-NEXT: vmov.16 q1[7], r1 ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 8 @@ -86,13 +130,37 @@ define <8 x i16> @inserti8_first_zext(ptr %p) { ; CHECKLE-LABEL: inserti8_first_zext: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrb.u16 q0, [r0] +; CHECKLE-NEXT: vldrb.u16 q1, [r0, #1] +; CHECKLE-NEXT: ldrb r1, [r0] +; CHECKLE-NEXT: vmovx.f16 s10, s5 +; CHECKLE-NEXT: vmovx.f16 s8, s4 +; CHECKLE-NEXT: vins.f16 s10, s6 +; CHECKLE-NEXT: vmovx.f16 s6, s6 +; CHECKLE-NEXT: vmov.16 q0[0], r1 +; CHECKLE-NEXT: vins.f16 s8, s5 +; CHECKLE-NEXT: vins.f16 s6, s7 +; CHECKLE-NEXT: vmov.f32 s1, s8 +; CHECKLE-NEXT: vmov.f32 s2, s10 +; CHECKLE-NEXT: vins.f16 s0, s4 +; CHECKLE-NEXT: vmov.f32 s3, s6 ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti8_first_zext: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrb.u16 q1, [r0] -; CHECKBE-NEXT: vrev64.16 q0, q1 +; CHECKBE-NEXT: vldrb.u16 q0, [r0, #1] +; CHECKBE-NEXT: ldrb r1, [r0] +; CHECKBE-NEXT: vmovx.f16 s6, s1 +; CHECKBE-NEXT: vmovx.f16 s4, s0 +; 
CHECKBE-NEXT: vins.f16 s6, s2 +; CHECKBE-NEXT: vmovx.f16 s2, s2 +; CHECKBE-NEXT: vmov.16 q2[0], r1 +; CHECKBE-NEXT: vins.f16 s4, s1 +; CHECKBE-NEXT: vins.f16 s2, s3 +; CHECKBE-NEXT: vins.f16 s8, s0 +; CHECKBE-NEXT: vmov.f32 s9, s4 +; CHECKBE-NEXT: vmov.f32 s10, s6 +; CHECKBE-NEXT: vmov.f32 s11, s2 +; CHECKBE-NEXT: vrev64.16 q0, q2 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 1 %l1 = load <8 x i8>, ptr %q @@ -107,12 +175,32 @@ define <8 x i16> @inserti8_last_zext(ptr %p) { ; CHECKLE-LABEL: inserti8_last_zext: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrb.u16 q0, [r0, #1] +; CHECKLE-NEXT: vldrb.u16 q1, [r0] +; CHECKLE-NEXT: ldrb r1, [r0, #8] +; CHECKLE-NEXT: vmovx.f16 s0, s4 +; CHECKLE-NEXT: vmovx.f16 s1, s5 +; CHECKLE-NEXT: vmovx.f16 s2, s6 +; CHECKLE-NEXT: vins.f16 s0, s5 +; CHECKLE-NEXT: vins.f16 s1, s6 +; CHECKLE-NEXT: vins.f16 s2, s7 +; CHECKLE-NEXT: vmov.u16 r0, q1[7] +; CHECKLE-NEXT: vmov.16 q0[6], r0 +; CHECKLE-NEXT: vmov.16 q0[7], r1 ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti8_last_zext: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrb.u16 q1, [r0, #1] +; CHECKBE-NEXT: vldrb.u16 q0, [r0] +; CHECKBE-NEXT: ldrb r1, [r0, #8] +; CHECKBE-NEXT: vmovx.f16 s4, s0 +; CHECKBE-NEXT: vmovx.f16 s5, s1 +; CHECKBE-NEXT: vmovx.f16 s6, s2 +; CHECKBE-NEXT: vins.f16 s4, s1 +; CHECKBE-NEXT: vins.f16 s5, s2 +; CHECKBE-NEXT: vins.f16 s6, s3 +; CHECKBE-NEXT: vmov.u16 r0, q0[7] +; CHECKBE-NEXT: vmov.16 q1[6], r0 +; CHECKBE-NEXT: vmov.16 q1[7], r1 ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 8 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -272,50 +272,36 @@ define arm_aapcs_vfpcc <16 x i8> @ext_add_ashr_trunc_i8i32(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: ext_add_ashr_trunc_i8i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .pad #112 -; CHECK-NEXT: sub sp, #112 +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: add r1, sp, #16 -; CHECK-NEXT: mov r4, sp +; CHECK-NEXT: add r2, sp, #32 ; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vstrw.32 q0, [r4] -; CHECK-NEXT: vldrb.u16 q0, [r1, #8] -; CHECK-NEXT: add r3, sp, #64 -; CHECK-NEXT: add r5, sp, #32 -; CHECK-NEXT: add r0, sp, #80 -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: add r2, sp, #48 -; CHECK-NEXT: vldrb.s16 q0, [r4, #8] -; CHECK-NEXT: vstrw.32 q0, [r5] -; CHECK-NEXT: vldrb.u16 q0, [r1] -; CHECK-NEXT: add r1, sp, #96 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrb.s16 q0, [r4] ; CHECK-NEXT: vstrw.32 q0, [r2] -; CHECK-NEXT: vldrh.u32 q0, [r3, #8] -; CHECK-NEXT: vldrh.s32 q1, [r5, #8] +; CHECK-NEXT: vldrb.u32 q0, [r1, #12] +; CHECK-NEXT: vldrb.s32 q1, [r2, #12] +; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vadd.i32 q0, q1, q0 ; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrb.32 q0, [r1, #12] -; CHECK-NEXT: vldrh.u32 q0, [r3] -; CHECK-NEXT: vldrh.s32 q1, [r5] +; CHECK-NEXT: vstrb.32 q0, [r0, #12] +; CHECK-NEXT: vldrb.u32 q0, [r1, #8] +; CHECK-NEXT: vldrb.s32 q1, [r2, #8] ; CHECK-NEXT: vadd.i32 q0, q1, q0 ; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrb.32 q0, [r1, #8] -; CHECK-NEXT: vldrh.u32 q0, [r0, #8] -; CHECK-NEXT: vldrh.s32 q1, [r2, #8] +; CHECK-NEXT: vstrb.32 q0, [r0, #8] +; CHECK-NEXT: vldrb.u32 q0, [r1, #4] +; CHECK-NEXT: vldrb.s32 q1, [r2, #4] ; CHECK-NEXT: vadd.i32 q0, q1, q0 ; CHECK-NEXT: vshr.u32 
q0, q0, #1 -; CHECK-NEXT: vstrb.32 q0, [r1, #4] -; CHECK-NEXT: vldrh.u32 q0, [r0] -; CHECK-NEXT: vldrh.s32 q1, [r2] +; CHECK-NEXT: vstrb.32 q0, [r0, #4] +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrb.s32 q1, [r2] ; CHECK-NEXT: vadd.i32 q0, q1, q0 ; CHECK-NEXT: vshr.u32 q0, q0, #1 -; CHECK-NEXT: vstrb.32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: add sp, #112 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vstrb.32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: bx lr entry: %sa = sext <16 x i8> %a to <16 x i32> %sb = zext <16 x i8> %b to <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -94,48 +94,53 @@ ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: ldrd r12, lr, [r1] +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: @ implicit-def: $q0 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 +; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-LE-NEXT: csetm r4, lt ; CHECK-LE-NEXT: movs r3, #0 -; CHECK-LE-NEXT: @ implicit-def: $q1 -; CHECK-LE-NEXT: rsbs.w r1, r12, #0 -; CHECK-LE-NEXT: vmov q0[2], q0[0], r12, lr -; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 -; CHECK-LE-NEXT: csetm r1, lt -; CHECK-LE-NEXT: rsbs.w r4, lr, #0 -; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 -; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: csetm r1, lt -; CHECK-LE-NEXT: bfi r3, r1, #1, #1 -; CHECK-LE-NEXT: lsls r1, r3, #31 -; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: ldrne r1, [r2] -; CHECK-LE-NEXT: vmovne.32 q1[0], r1 +; CHECK-LE-NEXT: rsbs.w r5, lr, #0 +; CHECK-LE-NEXT: bfi r3, r4, #0, #1 +; CHECK-LE-NEXT: sbcs.w r5, r1, lr, asr #31 +; CHECK-LE-NEXT: bfi r1, r4, #0, #8 +; CHECK-LE-NEXT: vmov r4, s4 +; CHECK-LE-NEXT: csetm r5, lt +; CHECK-LE-NEXT: bfi r1, r5, #8, #8 +; CHECK-LE-NEXT: bfi r3, r5, #1, #1 +; CHECK-LE-NEXT: lsls r1, r1, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vldrne s0, [r2] ; CHECK-LE-NEXT: lsls r1, r3, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] -; CHECK-LE-NEXT: vmovmi.32 q1[2], r1 -; CHECK-LE-NEXT: vmov r2, s6 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 +; CHECK-LE-NEXT: vmov r1, s2 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vmov r3, s0 -; CHECK-LE-NEXT: vmov r4, s4 -; CHECK-LE-NEXT: vmov q1[2], q1[0], r4, r2 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: asr.w r12, r2, #31 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3, asr #31 -; CHECK-LE-NEXT: vmov r3, s2 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: asr.w lr, r4, #31 -; CHECK-LE-NEXT: vmov q1[3], q1[1], lr, r12 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: vmov q0[2], q0[0], r3, r1 +; CHECK-LE-NEXT: rsbs r5, r4, #0 +; CHECK-LE-NEXT: vmov r5, s6 +; CHECK-LE-NEXT: asr.w r12, r1, #31 +; CHECK-LE-NEXT: sbcs.w r1, r2, r4, asr #31 +; CHECK-LE-NEXT: asr.w lr, r3, #31 +; CHECK-LE-NEXT: csetm r3, lt +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: rsbs r4, r5, #0 +; CHECK-LE-NEXT: sbcs.w r5, r2, r5, asr #31 +; CHECK-LE-NEXT: bfi r2, r3, #0, #8 +; CHECK-LE-NEXT: csetm r5, lt +; CHECK-LE-NEXT: bfi r2, r5, #8, #8 +; CHECK-LE-NEXT: bfi r1, r5, #1, #1 +; 
CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: it ne -; CHECK-LE-NEXT: vstrne d2, [r0] +; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi -; CHECK-LE-NEXT: vstrmi d3, [r0, #8] +; CHECK-LE-NEXT: vstrmi d1, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: pop {r4, r5, r7, pc} ; @@ -160,8 +165,7 @@ ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB5_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load -; CHECK-BE-NEXT: ldr r3, [r2] -; CHECK-BE-NEXT: vmov.32 q1[1], r3 +; CHECK-BE-NEXT: vldr s5, [r2] ; CHECK-BE-NEXT: vrev64.32 q2, q1 ; CHECK-BE-NEXT: .LBB5_2: @ %else ; CHECK-BE-NEXT: vrev64.32 q1, q0 @@ -218,18 +222,22 @@ ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: ldrd r12, lr, [r1] -; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: rsbs.w r1, r12, #0 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 ; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr -; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 -; CHECK-LE-NEXT: csetm r1, lt -; CHECK-LE-NEXT: rsbs.w r4, lr, #0 -; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 -; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: csetm r1, lt -; CHECK-LE-NEXT: bfi r3, r1, #1, #1 -; CHECK-LE-NEXT: lsls r1, r3, #31 +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-LE-NEXT: csetm r4, lt +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: rsbs.w r5, lr, #0 +; CHECK-LE-NEXT: bfi r3, r4, #0, #1 +; CHECK-LE-NEXT: sbcs.w r5, r1, lr, asr #31 +; CHECK-LE-NEXT: bfi r1, r4, #0, #8 +; CHECK-LE-NEXT: vmov r4, s4 +; CHECK-LE-NEXT: csetm r5, lt +; CHECK-LE-NEXT: bfi r1, r5, #8, #8 +; CHECK-LE-NEXT: bfi r3, r5, #1, #1 +; CHECK-LE-NEXT: lsls r1, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r1, [r2] ; CHECK-LE-NEXT: vmovne.32 q0[0], r1 @@ -237,24 +245,26 @@ ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] ; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r2, s2 +; CHECK-LE-NEXT: vmov r1, s2 +; CHECK-LE-NEXT: movs r2, #0 +; CHECK-LE-NEXT: vmov r3, s0 +; CHECK-LE-NEXT: vmov q0[2], q0[0], r3, r1 +; CHECK-LE-NEXT: rsbs r5, r4, #0 +; CHECK-LE-NEXT: vmov r5, s6 +; CHECK-LE-NEXT: asr.w r12, r1, #31 +; CHECK-LE-NEXT: sbcs.w r1, r2, r4, asr #31 +; CHECK-LE-NEXT: asr.w lr, r3, #31 +; CHECK-LE-NEXT: csetm r3, lt ; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmov r3, s4 -; CHECK-LE-NEXT: vmov r4, s0 -; CHECK-LE-NEXT: vmov q0[2], q0[0], r4, r2 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: asr.w r12, r2, #31 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3, asr #31 -; CHECK-LE-NEXT: vmov r3, s6 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: asr.w lr, r4, #31 ; CHECK-LE-NEXT: vmov q0[3], q0[1], lr, r12 -; CHECK-LE-NEXT: rsbs r5, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: rsbs r4, r5, #0 +; CHECK-LE-NEXT: sbcs.w r5, r2, r5, asr #31 +; CHECK-LE-NEXT: bfi r2, r3, #0, #8 +; CHECK-LE-NEXT: csetm r5, lt +; CHECK-LE-NEXT: bfi r2, r5, #8, #8 +; CHECK-LE-NEXT: bfi r1, r5, #1, #1 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, r3, d0 ; CHECK-LE-NEXT: strdne r2, r3, [r0] @@ -341,51 +351,56 @@ define void @foo_zext_v2i64_v2i32(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-LABEL: foo_zext_v2i64_v2i32: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r4, lr} -; CHECK-LE-NEXT: push {r4, lr} +; CHECK-LE-NEXT: .save {r4, r5, r7, 
lr} +; CHECK-LE-NEXT: push {r4, r5, r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: ldrd r12, lr, [r1] -; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-LE-NEXT: rsbs.w r1, r12, #0 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 ; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr -; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 -; CHECK-LE-NEXT: csetm r1, lt -; CHECK-LE-NEXT: rsbs.w r4, lr, #0 -; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 -; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: csetm r1, lt -; CHECK-LE-NEXT: bfi r3, r1, #1, #1 -; CHECK-LE-NEXT: lsls r1, r3, #31 -; CHECK-LE-NEXT: itt ne -; CHECK-LE-NEXT: ldrne r1, [r2] -; CHECK-LE-NEXT: vmovne.32 q0[0], r1 +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-LE-NEXT: csetm r4, lt +; CHECK-LE-NEXT: rsbs.w r5, lr, #0 +; CHECK-LE-NEXT: sbcs.w r5, r1, lr, asr #31 +; CHECK-LE-NEXT: mov.w r3, #0 +; CHECK-LE-NEXT: csetm r5, lt +; CHECK-LE-NEXT: bfi r1, r4, #0, #8 +; CHECK-LE-NEXT: bfi r3, r4, #0, #1 +; CHECK-LE-NEXT: bfi r1, r5, #8, #8 +; CHECK-LE-NEXT: bfi r3, r5, #1, #1 +; CHECK-LE-NEXT: vmov r5, s6 +; CHECK-LE-NEXT: lsls r1, r1, #31 +; CHECK-LE-NEXT: it ne +; CHECK-LE-NEXT: vldrne s0, [r2] ; CHECK-LE-NEXT: lsls r1, r3, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] ; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r2, s4 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmov r1, s4 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vand q0, q0, q2 -; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: vmov r3, s6 -; CHECK-LE-NEXT: sbcs.w r2, r1, r2, asr #31 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: rsbs r4, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: rsbs r3, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, r1, asr #31 +; CHECK-LE-NEXT: csetm r3, lt +; CHECK-LE-NEXT: rsbs r4, r5, #0 +; CHECK-LE-NEXT: sbcs.w r5, r2, r5, asr #31 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: csetm r5, lt +; CHECK-LE-NEXT: bfi r2, r3, #0, #8 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: bfi r2, r5, #8, #8 +; CHECK-LE-NEXT: bfi r1, r5, #1, #1 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vstrmi d1, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: pop {r4, pc} +; CHECK-LE-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-BE-LABEL: foo_zext_v2i64_v2i32: ; CHECK-BE: @ %bb.0: @ %entry @@ -408,8 +423,7 @@ ; CHECK-BE-NEXT: lsls r3, r1, #30 ; CHECK-BE-NEXT: bpl .LBB7_2 ; CHECK-BE-NEXT: @ %bb.1: @ %cond.load -; CHECK-BE-NEXT: ldr r3, [r2] -; CHECK-BE-NEXT: vmov.32 q2[1], r3 +; CHECK-BE-NEXT: vldr s9, [r2] ; CHECK-BE-NEXT: vrev64.32 q0, q2 ; CHECK-BE-NEXT: .LBB7_2: @ %else ; CHECK-BE-NEXT: vrev64.32 q2, q1 @@ -455,24 +469,28 @@ define void @foo_zext_v2i64_v2i32_unaligned(ptr %dest, ptr %mask, ptr %src) { ; CHECK-LE-LABEL: foo_zext_v2i64_v2i32_unaligned: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r4, lr} -; CHECK-LE-NEXT: push {r4, lr} +; CHECK-LE-NEXT: .save {r4, r5, r7, lr} +; CHECK-LE-NEXT: push {r4, r5, r7, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: ldrd r12, lr, [r1] -; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmov.i64 q2, #0xffffffff -; 
CHECK-LE-NEXT: rsbs.w r1, r12, #0 +; CHECK-LE-NEXT: rsbs.w r3, r12, #0 ; CHECK-LE-NEXT: vmov q1[2], q1[0], r12, lr -; CHECK-LE-NEXT: sbcs.w r1, r3, r12, asr #31 -; CHECK-LE-NEXT: csetm r1, lt -; CHECK-LE-NEXT: rsbs.w r4, lr, #0 -; CHECK-LE-NEXT: sbcs.w r4, r3, lr, asr #31 -; CHECK-LE-NEXT: bfi r3, r1, #0, #1 -; CHECK-LE-NEXT: csetm r1, lt -; CHECK-LE-NEXT: bfi r3, r1, #1, #1 -; CHECK-LE-NEXT: lsls r1, r3, #31 +; CHECK-LE-NEXT: sbcs.w r3, r1, r12, asr #31 +; CHECK-LE-NEXT: csetm r4, lt +; CHECK-LE-NEXT: rsbs.w r5, lr, #0 +; CHECK-LE-NEXT: sbcs.w r5, r1, lr, asr #31 +; CHECK-LE-NEXT: mov.w r3, #0 +; CHECK-LE-NEXT: csetm r5, lt +; CHECK-LE-NEXT: bfi r1, r4, #0, #8 +; CHECK-LE-NEXT: bfi r3, r4, #0, #1 +; CHECK-LE-NEXT: bfi r1, r5, #8, #8 +; CHECK-LE-NEXT: bfi r3, r5, #1, #1 +; CHECK-LE-NEXT: vmov r5, s6 +; CHECK-LE-NEXT: lsls r1, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r1, [r2] ; CHECK-LE-NEXT: vmovne.32 q0[0], r1 @@ -480,19 +498,21 @@ ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrmi r1, [r2, #4] ; CHECK-LE-NEXT: vmovmi.32 q0[2], r1 -; CHECK-LE-NEXT: vmov r2, s4 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmov r1, s4 +; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vand q0, q0, q2 -; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: vmov r3, s6 -; CHECK-LE-NEXT: sbcs.w r2, r1, r2, asr #31 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: rsbs r4, r3, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, r3, asr #31 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: rsbs r3, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r2, r1, asr #31 +; CHECK-LE-NEXT: csetm r3, lt +; CHECK-LE-NEXT: rsbs r4, r5, #0 +; CHECK-LE-NEXT: sbcs.w r5, r2, r5, asr #31 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: csetm r5, lt +; CHECK-LE-NEXT: bfi r2, r3, #0, #8 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: bfi r2, r5, #8, #8 +; CHECK-LE-NEXT: bfi r1, r5, #1, #1 +; CHECK-LE-NEXT: lsls r2, r2, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, r3, d0 ; CHECK-LE-NEXT: strdne r2, r3, [r0] @@ -501,7 +521,7 @@ ; CHECK-LE-NEXT: vmovmi r1, r2, d1 ; CHECK-LE-NEXT: strdmi r1, r2, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: pop {r4, pc} +; CHECK-LE-NEXT: pop {r4, r5, r7, pc} ; ; CHECK-BE-LABEL: foo_zext_v2i64_v2i32_unaligned: ; CHECK-BE: @ %bb.0: @ %entry @@ -730,20 +750,19 @@ ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmrs lr, p0 -; CHECK-LE-NEXT: and r1, lr, #1 +; CHECK-LE-NEXT: ands r1, lr, #1 ; CHECK-LE-NEXT: ubfx r3, lr, #4, #1 ; CHECK-LE-NEXT: rsb.w r12, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r12, #0, #1 ; CHECK-LE-NEXT: bfi r1, r3, #1, #1 ; CHECK-LE-NEXT: ubfx r3, lr, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 ; CHECK-LE-NEXT: ubfx r3, lr, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #3, #1 -; CHECK-LE-NEXT: lsls r3, r1, #31 ; CHECK-LE-NEXT: bne .LBB18_6 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r3, r1, #30 @@ -764,19 +783,18 @@ ; CHECK-LE-NEXT: vcvtb.f32.f16 s2, s1 ; CHECK-LE-NEXT: vcvtt.f32.f16 s1, s0 ; CHECK-LE-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-LE-NEXT: and r3, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: ands r3, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi 
r1, r3, #0, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #1, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 ; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 ; CHECK-LE-NEXT: strne r2, [r0] @@ -921,20 +939,19 @@ ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmrs lr, p0 -; CHECK-LE-NEXT: and r1, lr, #1 +; CHECK-LE-NEXT: ands r1, lr, #1 ; CHECK-LE-NEXT: ubfx r3, lr, #4, #1 ; CHECK-LE-NEXT: rsb.w r12, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r12, #0, #1 ; CHECK-LE-NEXT: bfi r1, r3, #1, #1 ; CHECK-LE-NEXT: ubfx r3, lr, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 ; CHECK-LE-NEXT: ubfx r3, lr, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #3, #1 -; CHECK-LE-NEXT: lsls r3, r1, #31 ; CHECK-LE-NEXT: bne .LBB19_6 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r3, r1, #30 @@ -955,19 +972,18 @@ ; CHECK-LE-NEXT: vcvtb.f32.f16 s2, s1 ; CHECK-LE-NEXT: vcvtt.f32.f16 s1, s0 ; CHECK-LE-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-LE-NEXT: and r3, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: ands r3, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #1, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 ; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 ; CHECK-LE-NEXT: strne r2, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -50,20 +50,19 @@ ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: ands r1, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #1, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 ; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r2, [r0] ; CHECK-LE-NEXT: vmovne.32 q0[0], r2 @@ -200,20 +199,19 @@ ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmrs r2, p0 -; 
CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: ands r1, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #1, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 ; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r2, [r0] ; CHECK-LE-NEXT: vmovne.32 q0[0], r2 @@ -358,20 +356,19 @@ ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: ands r1, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #1, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 ; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r2, [r0] ; CHECK-LE-NEXT: vmovne.32 q0[0], r2 @@ -557,37 +554,36 @@ ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vmrs r1, p0 -; CHECK-LE-NEXT: and r2, r1, #1 -; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: bfi r2, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #2, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #3, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #4, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #5, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #6, #1 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r2, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r1, r2 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: ands r1, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #2, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #6, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #10, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #5, #1 +; CHECK-LE-NEXT: 
ubfx r3, r2, #12, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #14, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #6, #1 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #7, #1 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrhne r2, [r0] ; CHECK-LE-NEXT: vmovne.16 q0[0], r2 +; CHECK-LE-NEXT: uxtb r1, r1 ; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrhmi r2, [r0, #2] @@ -1226,20 +1222,19 @@ ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: ands r1, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #1, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 ; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: ldrne r2, [r0] ; CHECK-LE-NEXT: vmovne s0, r2 @@ -1421,34 +1416,33 @@ ; CHECK-LE-NEXT: sub sp, #36 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 -; CHECK-LE-NEXT: vmrs r1, p0 -; CHECK-LE-NEXT: and r2, r1, #1 -; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: bfi r2, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #2, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #3, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #4, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #5, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #6, #1 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r2, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r1, r2 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: ands r1, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #2, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #6, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #10, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #12, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #14, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #6, #1 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r1 ; CHECK-LE-NEXT: bne .LBB45_9 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -1730,22 +1724,25 @@ define 
arm_aapcs_vfpcc <2 x i64> @masked_v2i64_align4_zero(ptr %dest, <2 x i64> %a) { ; CHECK-LE-LABEL: masked_v2i64_align4_zero: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r7, lr} -; CHECK-LE-NEXT: push {r7, lr} +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r2, r3, d0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmov r12, lr, d1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: rsbs.w r3, r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, lr -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: vmov r1, r2, d0 +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: vmov lr, r12, d1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r3, r2 +; CHECK-LE-NEXT: csetm r4, lt +; CHECK-LE-NEXT: rsbs.w r2, lr, #0 +; CHECK-LE-NEXT: sbcs.w r2, r3, r12 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: csetm r2, lt +; CHECK-LE-NEXT: bfi r3, r4, #0, #8 +; CHECK-LE-NEXT: bfi r1, r4, #0, #1 +; CHECK-LE-NEXT: bfi r3, r2, #8, #8 ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: beq .LBB49_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load ; CHECK-LE-NEXT: vldr d1, .LCPI49_0 @@ -1758,7 +1755,7 @@ ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vldrmi d1, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: pop {r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; CHECK-LE-NEXT: .p2align 3 ; CHECK-LE-NEXT: @ %bb.4: ; CHECK-LE-NEXT: .LCPI49_0: @@ -1811,22 +1808,25 @@ define arm_aapcs_vfpcc <2 x double> @masked_v2f64_align4_zero(ptr %dest, <2 x double> %a, <2 x i64> %b) { ; CHECK-LE-LABEL: masked_v2f64_align4_zero: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r7, lr} -; CHECK-LE-NEXT: push {r7, lr} +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r2, r3, d2 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmov r12, lr, d3 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: rsbs.w r3, r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, lr -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: vmov r1, r2, d2 +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: vmov lr, r12, d3 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r3, r2 +; CHECK-LE-NEXT: csetm r4, lt +; CHECK-LE-NEXT: rsbs.w r2, lr, #0 +; CHECK-LE-NEXT: sbcs.w r2, r3, r12 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: csetm r2, lt +; CHECK-LE-NEXT: bfi r3, r4, #0, #8 +; CHECK-LE-NEXT: bfi r1, r4, #0, #1 +; CHECK-LE-NEXT: bfi r3, r2, #8, #8 ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: beq .LBB50_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load ; CHECK-LE-NEXT: vldr d1, .LCPI50_0 @@ -1839,7 +1839,7 @@ ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vldrmi d1, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: pop {r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; CHECK-LE-NEXT: .p2align 3 ; CHECK-LE-NEXT: @ %bb.4: ; CHECK-LE-NEXT: .LCPI50_0: @@ -1917,20 +1917,19 @@ ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: mov.w r12, #0 ; CHECK-LE-NEXT: vmrs r3, p0 -; CHECK-LE-NEXT: and r1, r3, #1 -; CHECK-LE-NEXT: rsbs r2, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: ands r1, r3, #1 +; CHECK-LE-NEXT: rsb.w r2, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: bfi r1, r2, #0, #1 ; CHECK-LE-NEXT: ubfx r2, r3, #4, #1 
-; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 ; CHECK-LE-NEXT: ubfx r2, r3, #8, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #2, #1 ; CHECK-LE-NEXT: ubfx r2, r3, #12, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: beq .LBB52_2 ; CHECK-LE-NEXT: @ %bb.1: @ %cond.load ; CHECK-LE-NEXT: ldrh r2, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -28,20 +28,19 @@ ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: ands r1, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #1, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 ; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 ; CHECK-LE-NEXT: strne r2, [r0] @@ -184,37 +183,36 @@ ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr -; CHECK-LE-NEXT: vmrs r1, p0 -; CHECK-LE-NEXT: and r2, r1, #1 -; CHECK-LE-NEXT: rsbs r3, r2, #0 -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: bfi r2, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #2, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #3, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #4, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #5, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #6, #1 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r2, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r1, r2 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: ands r1, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #2, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #6, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #10, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #12, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #14, #1 +; CHECK-LE-NEXT: rsb.w 
r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #6, #1 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #7, #1 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne.u16 r2, q0[0] ; CHECK-LE-NEXT: strhne r2, [r0] +; CHECK-LE-NEXT: uxtb r1, r1 ; CHECK-LE-NEXT: lsls r2, r1, #30 ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi.u16 r2, q0[1] @@ -476,19 +474,18 @@ ; CHECK-LE-NEXT: vcmp.i32 ne, q1, zr ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r3, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: ands r3, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #1, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 ; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 ; CHECK-LE-NEXT: strne r2, [r0] @@ -633,34 +630,33 @@ ; CHECK-LE-NEXT: .pad #36 ; CHECK-LE-NEXT: sub sp, #36 ; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr -; CHECK-LE-NEXT: movs r2, #0 -; CHECK-LE-NEXT: vmrs r1, p0 -; CHECK-LE-NEXT: and r3, r1, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #0, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #2, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #1, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #2, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #6, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #3, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #8, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #4, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #10, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #5, #1 -; CHECK-LE-NEXT: ubfx r3, r1, #12, #1 -; CHECK-LE-NEXT: ubfx r1, r1, #14, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 -; CHECK-LE-NEXT: bfi r2, r3, #6, #1 -; CHECK-LE-NEXT: rsbs r1, r1, #0 -; CHECK-LE-NEXT: bfi r2, r1, #7, #1 -; CHECK-LE-NEXT: uxtb r1, r2 -; CHECK-LE-NEXT: lsls r2, r2, #31 +; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: vmrs r2, p0 +; CHECK-LE-NEXT: ands r3, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #0, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #2, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #1, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #2, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #6, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #3, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #4, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #10, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #5, #1 +; CHECK-LE-NEXT: ubfx r3, r2, #12, #1 +; CHECK-LE-NEXT: ubfx r2, r2, #14, #1 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 +; CHECK-LE-NEXT: bfi r1, r3, #6, #1 +; CHECK-LE-NEXT: rsb.w r2, r2, #0 +; CHECK-LE-NEXT: bfi r1, r2, #7, #1 +; CHECK-LE-NEXT: uxtb r1, r1 ; CHECK-LE-NEXT: bne .LBB16_9 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -915,29 +911,32 @@ define arm_aapcs_vfpcc void @masked_v2i64(ptr %dest, <2 x i64> %a) { ; CHECK-LE-LABEL: masked_v2i64: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r7, lr} -; 
CHECK-LE-NEXT: push {r7, lr} +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r2, r3, d0 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmov r12, lr, d1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: rsbs.w r3, r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, lr -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: vmov r1, r2, d0 +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: vmov lr, r12, d1 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r3, r2 +; CHECK-LE-NEXT: csetm r4, lt +; CHECK-LE-NEXT: rsbs.w r2, lr, #0 +; CHECK-LE-NEXT: sbcs.w r2, r3, r12 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: csetm r2, lt +; CHECK-LE-NEXT: bfi r3, r4, #0, #8 +; CHECK-LE-NEXT: bfi r1, r4, #0, #1 +; CHECK-LE-NEXT: bfi r3, r2, #8, #8 ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vstrmi d1, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: pop {r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; ; CHECK-BE-LABEL: masked_v2i64: ; CHECK-BE: @ %bb.0: @ %entry @@ -974,29 +973,32 @@ define arm_aapcs_vfpcc void @masked_v2f64(ptr %dest, <2 x double> %a, <2 x i64> %b) { ; CHECK-LE-LABEL: masked_v2f64: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r7, lr} -; CHECK-LE-NEXT: push {r7, lr} +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 -; CHECK-LE-NEXT: vmov r2, r3, d2 -; CHECK-LE-NEXT: movs r1, #0 -; CHECK-LE-NEXT: vmov r12, lr, d3 -; CHECK-LE-NEXT: rsbs r2, r2, #0 -; CHECK-LE-NEXT: sbcs.w r2, r1, r3 -; CHECK-LE-NEXT: csetm r2, lt -; CHECK-LE-NEXT: rsbs.w r3, r12, #0 -; CHECK-LE-NEXT: sbcs.w r3, r1, lr -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 +; CHECK-LE-NEXT: vmov r1, r2, d2 +; CHECK-LE-NEXT: movs r3, #0 +; CHECK-LE-NEXT: vmov lr, r12, d3 +; CHECK-LE-NEXT: rsbs r1, r1, #0 +; CHECK-LE-NEXT: sbcs.w r1, r3, r2 +; CHECK-LE-NEXT: csetm r4, lt +; CHECK-LE-NEXT: rsbs.w r2, lr, #0 +; CHECK-LE-NEXT: sbcs.w r2, r3, r12 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: csetm r2, lt +; CHECK-LE-NEXT: bfi r3, r4, #0, #8 +; CHECK-LE-NEXT: bfi r1, r4, #0, #1 +; CHECK-LE-NEXT: bfi r3, r2, #8, #8 ; CHECK-LE-NEXT: bfi r1, r2, #1, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: it ne ; CHECK-LE-NEXT: vstrne d0, [r0] ; CHECK-LE-NEXT: lsls r1, r1, #30 ; CHECK-LE-NEXT: it mi ; CHECK-LE-NEXT: vstrmi d1, [r0, #8] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: pop {r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; ; CHECK-BE-LABEL: masked_v2f64: ; CHECK-BE: @ %bb.0: @ %entry @@ -1097,20 +1099,19 @@ ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s32 gt, q0, zr ; CHECK-LE-NEXT: vmrs r2, p0 -; CHECK-LE-NEXT: and r1, r2, #1 -; CHECK-LE-NEXT: rsbs r3, r1, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: ands r1, r2, #1 +; CHECK-LE-NEXT: rsb.w r3, r1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: bfi r1, r3, #0, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #4, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #1, #1 ; CHECK-LE-NEXT: ubfx r3, r2, #8, #1 ; CHECK-LE-NEXT: ubfx r2, r2, #12, #1 -; CHECK-LE-NEXT: rsbs r3, r3, #0 +; CHECK-LE-NEXT: rsb.w r3, r3, #0 ; CHECK-LE-NEXT: bfi r1, r3, #2, #1 -; CHECK-LE-NEXT: rsbs r2, r2, #0 +; CHECK-LE-NEXT: 
rsb.w r2, r2, #0 ; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 ; CHECK-LE-NEXT: itt ne ; CHECK-LE-NEXT: vmovne r2, s0 ; CHECK-LE-NEXT: strhne r2, [r0] @@ -1177,30 +1178,37 @@ define arm_aapcs_vfpcc void @masked_v4f16_align4(ptr %dest, <4 x float> %a) { ; CHECK-LE-LABEL: masked_v4f16_align4: ; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 ; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: csetm r12, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: bfi r3, r12, #0, #4 +; CHECK-LE-NEXT: bfi r1, r12, #0, #1 +; CHECK-LE-NEXT: csetm lr, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s3, #0 -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: bfi r3, lr, #4, #4 +; CHECK-LE-NEXT: bfi r1, lr, #1, #1 ; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: bfi r3, r2, #8, #4 ; CHECK-LE-NEXT: bfi r1, r2, #2, #1 -; CHECK-LE-NEXT: csetm r2, gt -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: csetm r4, gt +; CHECK-LE-NEXT: bfi r3, r4, #12, #4 +; CHECK-LE-NEXT: bfi r1, r4, #3, #1 +; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: bne .LBB25_5 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -1213,7 +1221,7 @@ ; CHECK-LE-NEXT: bmi .LBB25_8 ; CHECK-LE-NEXT: .LBB25_4: @ %else6 ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: pop {r4, pc} ; CHECK-LE-NEXT: .LBB25_5: @ %cond.store ; CHECK-LE-NEXT: vstr.16 s4, [r0] ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -1231,7 +1239,7 @@ ; CHECK-LE-NEXT: vmovx.f16 s0, s5 ; CHECK-LE-NEXT: vstr.16 s0, [r0, #6] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: pop {r4, pc} ; ; CHECK-BE-LABEL: masked_v4f16_align4: ; CHECK-BE: @ %bb.0: @ %entry @@ -1301,30 +1309,37 @@ define arm_aapcs_vfpcc void @masked_v4f16_align2(ptr %dest, <4 x float> %a) { ; CHECK-LE-LABEL: masked_v4f16_align2: ; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: .pad #4 ; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 ; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: csetm r12, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: bfi r3, r12, #0, #4 +; CHECK-LE-NEXT: bfi r1, r12, #0, #1 +; CHECK-LE-NEXT: csetm lr, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s3, #0 -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: bfi r3, lr, #4, #4 +; CHECK-LE-NEXT: bfi r1, lr, #1, #1 ; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: bfi r3, r2, #8, #4 
; CHECK-LE-NEXT: bfi r1, r2, #2, #1 -; CHECK-LE-NEXT: csetm r2, gt -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: csetm r4, gt +; CHECK-LE-NEXT: bfi r3, r4, #12, #4 +; CHECK-LE-NEXT: bfi r1, r4, #3, #1 +; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: bne .LBB26_5 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -1337,7 +1352,7 @@ ; CHECK-LE-NEXT: bmi .LBB26_8 ; CHECK-LE-NEXT: .LBB26_4: @ %else6 ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: pop {r4, pc} ; CHECK-LE-NEXT: .LBB26_5: @ %cond.store ; CHECK-LE-NEXT: vstr.16 s4, [r0] ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -1355,7 +1370,7 @@ ; CHECK-LE-NEXT: vmovx.f16 s0, s5 ; CHECK-LE-NEXT: vstr.16 s0, [r0, #6] ; CHECK-LE-NEXT: add sp, #4 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: pop {r4, pc} ; ; CHECK-BE-LABEL: masked_v4f16_align2: ; CHECK-BE: @ %bb.0: @ %entry @@ -1425,30 +1440,37 @@ define arm_aapcs_vfpcc void @masked_v4f16_align1(ptr %dest, <4 x float> %a) { ; CHECK-LE-LABEL: masked_v4f16_align1: ; CHECK-LE: @ %bb.0: @ %entry +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: .pad #20 ; CHECK-LE-NEXT: sub sp, #20 ; CHECK-LE-NEXT: vcmp.f32 s0, #0 -; CHECK-LE-NEXT: movs r1, #0 +; CHECK-LE-NEXT: movs r3, #0 ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s1, #0 +; CHECK-LE-NEXT: mov.w r1, #0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s4, s0 ; CHECK-LE-NEXT: vcvtb.f16.f32 s5, s2 ; CHECK-LE-NEXT: vcvtt.f16.f32 s4, s1 ; CHECK-LE-NEXT: vcvtt.f16.f32 s5, s3 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: csetm r12, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s2, #0 -; CHECK-LE-NEXT: bfi r1, r2, #0, #1 -; CHECK-LE-NEXT: csetm r2, gt +; CHECK-LE-NEXT: bfi r3, r12, #0, #4 +; CHECK-LE-NEXT: bfi r1, r12, #0, #1 +; CHECK-LE-NEXT: csetm lr, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr ; CHECK-LE-NEXT: vcmp.f32 s3, #0 -; CHECK-LE-NEXT: bfi r1, r2, #1, #1 +; CHECK-LE-NEXT: bfi r3, lr, #4, #4 +; CHECK-LE-NEXT: bfi r1, lr, #1, #1 ; CHECK-LE-NEXT: csetm r2, gt ; CHECK-LE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-LE-NEXT: bfi r3, r2, #8, #4 ; CHECK-LE-NEXT: bfi r1, r2, #2, #1 -; CHECK-LE-NEXT: csetm r2, gt -; CHECK-LE-NEXT: bfi r1, r2, #3, #1 -; CHECK-LE-NEXT: lsls r2, r1, #31 +; CHECK-LE-NEXT: csetm r4, gt +; CHECK-LE-NEXT: bfi r3, r4, #12, #4 +; CHECK-LE-NEXT: bfi r1, r4, #3, #1 +; CHECK-LE-NEXT: lsls r2, r3, #31 ; CHECK-LE-NEXT: bne .LBB27_5 ; CHECK-LE-NEXT: @ %bb.1: @ %else ; CHECK-LE-NEXT: lsls r2, r1, #30 @@ -1461,7 +1483,7 @@ ; CHECK-LE-NEXT: bmi .LBB27_8 ; CHECK-LE-NEXT: .LBB27_4: @ %else6 ; CHECK-LE-NEXT: add sp, #20 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: pop {r4, pc} ; CHECK-LE-NEXT: .LBB27_5: @ %cond.store ; CHECK-LE-NEXT: vstr.16 s4, [sp, #12] ; CHECK-LE-NEXT: ldrh.w r2, [sp, #12] @@ -1487,7 +1509,7 @@ ; CHECK-LE-NEXT: ldrh.w r1, [sp] ; CHECK-LE-NEXT: strh r1, [r0, #6] ; CHECK-LE-NEXT: add sp, #20 -; CHECK-LE-NEXT: bx lr +; CHECK-LE-NEXT: pop {r4, pc} ; ; CHECK-BE-LABEL: masked_v4f16_align1: ; CHECK-BE: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll @@ -313,12 +313,18 @@ ; CHECK-NEXT: vmov r12, r1, d9 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: bfi r4, r0, #0, #8 ; CHECK-NEXT: 
mov r0, r12 ; CHECK-NEXT: bl __aeabi_dcmpgt ; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 +; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r4, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r4 diff --git a/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll b/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll --- a/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll +++ b/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll @@ -13,12 +13,15 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #96 ; CHECK-NEXT: sub sp, #96 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: add.w r0, r0, #256 ; CHECK-NEXT: vld20.32 {q0, q1}, [r0] -; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: add.w lr, sp, #64 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: vld21.32 {q0, q1}, [r5]! -; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vld20.32 {q4, q5}, [r4] +; CHECK-NEXT: mov r5, r4 +; CHECK-NEXT: vld21.32 {q0, q1}, [r0] +; CHECK-NEXT: add.w r0, r4, #192 +; CHECK-NEXT: vld21.32 {q4, q5}, [r5]! ; CHECK-NEXT: vstmia lr, {d0, d1, d2, d3} @ 32-byte Spill ; CHECK-NEXT: vld20.32 {q0, q1}, [r0] ; CHECK-NEXT: add.w lr, sp, #32 @@ -27,29 +30,26 @@ ; CHECK-NEXT: vstmia lr, {d0, d1, d2, d3} @ 32-byte Spill ; CHECK-NEXT: vld20.32 {q0, q1}, [r0] ; CHECK-NEXT: vld21.32 {q0, q1}, [r0] -; CHECK-NEXT: add.w r0, r4, #192 +; CHECK-NEXT: add.w r0, r4, #64 ; CHECK-NEXT: vld20.32 {q6, q7}, [r0] ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3} @ 32-byte Spill ; CHECK-NEXT: vld21.32 {q6, q7}, [r0] -; CHECK-NEXT: add.w r0, r4, #256 -; CHECK-NEXT: vld20.32 {q4, q5}, [r0] -; CHECK-NEXT: vld21.32 {q4, q5}, [r0] ; CHECK-NEXT: bl external_function -; CHECK-NEXT: vldmia sp, {d2, d3, d4, d5} @ 32-byte Reload +; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3} @ 32-byte Reload ; CHECK-NEXT: add.w lr, sp, #32 -; CHECK-NEXT: vstrw.32 q2, [r4, #80] -; CHECK-NEXT: vstrw.32 q5, [r4, #144] -; CHECK-NEXT: vstrw.32 q4, [r4, #128] -; CHECK-NEXT: vstrw.32 q7, [r4, #112] -; CHECK-NEXT: vstrw.32 q1, [r4, #64] -; CHECK-NEXT: vldmia lr, {d2, d3, d4, d5} @ 32-byte Reload +; CHECK-NEXT: vstrw.32 q1, [r4, #80] +; CHECK-NEXT: vstrw.32 q5, [r4, #16] +; CHECK-NEXT: vstrw.32 q4, [r4] +; CHECK-NEXT: vstrw.32 q6, [r5] +; CHECK-NEXT: vstrw.32 q0, [r4, #64] +; CHECK-NEXT: vldmia lr, {d0, d1, d2, d3} @ 32-byte Reload ; CHECK-NEXT: add.w lr, sp, #64 -; CHECK-NEXT: vstrw.32 q2, [r4, #48] -; CHECK-NEXT: vstrw.32 q6, [r4, #96] -; CHECK-NEXT: vstrw.32 q1, [r5] -; CHECK-NEXT: vldmia lr, {d2, d3, d4, d5} @ 32-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r4, #16] -; CHECK-NEXT: vstrw.32 q1, [r4] +; CHECK-NEXT: vstrw.32 q1, [r4, #112] +; CHECK-NEXT: vstrw.32 q7, [r4, #48] +; CHECK-NEXT: vstrw.32 q0, [r4, #96] +; CHECK-NEXT: vldmia lr, {d0, d1, d2, d3} @ 32-byte Reload +; CHECK-NEXT: vstrw.32 q1, [r4, #144] +; CHECK-NEXT: vstrw.32 q0, [r4, #128] ; CHECK-NEXT: add sp, #96 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -151,62 +151,61 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #408 ; CHECK-NEXT: sub sp, #408 -; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals +; CHECK-NEXT: movw r8, :lower16:.L_MergedGlobals ; CHECK-NEXT: vldr s15, .LCPI1_1 -; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals -; CHECK-NEXT: movw r2, :lower16:e -; CHECK-NEXT: mov r4, r7 -; CHECK-NEXT: mov r3, r7 -; CHECK-NEXT: ldr r6, [r4, #8]! 
+; CHECK-NEXT: movt r8, :upper16:.L_MergedGlobals +; CHECK-NEXT: movw r4, :lower16:e +; CHECK-NEXT: mov r5, r8 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: ldr r6, [r5, #8]! ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: ldr r0, [r3, #4]! +; CHECK-NEXT: ldr r0, [r1, #4]! ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: movt r2, :upper16:e -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r4 -; CHECK-NEXT: vmov s13, r3 +; CHECK-NEXT: movt r4, :upper16:e +; CHECK-NEXT: vmov r7, s15 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r5 +; CHECK-NEXT: vmov s13, r1 ; CHECK-NEXT: vldr s12, .LCPI1_0 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r2 -; CHECK-NEXT: vdup.32 q7, r3 -; CHECK-NEXT: vmov q6[2], q6[0], r3, r5 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r4 +; CHECK-NEXT: vdup.32 q7, r1 +; CHECK-NEXT: vmov q6[2], q6[0], r1, r7 ; CHECK-NEXT: vstrw.32 q0, [sp, #92] ; CHECK-NEXT: vmov q0, q7 -; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vmov q4, q7 -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q7[1], r2 -; CHECK-NEXT: vmov s21, r2 -; CHECK-NEXT: movs r1, #64 +; CHECK-NEXT: vmov q6[3], q6[1], r1, r4 +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.32 q7[1], r4 +; CHECK-NEXT: vmov s21, r4 ; CHECK-NEXT: vmov.f32 s20, s12 -; CHECK-NEXT: str r0, [sp, #40] +; CHECK-NEXT: strd r0, r2, [sp, #40] ; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: str r6, [r0] +; CHECK-NEXT: movs r3, #64 ; CHECK-NEXT: vmov.f32 s23, s15 +; CHECK-NEXT: str r6, [r0] ; CHECK-NEXT: str r0, [r0] ; CHECK-NEXT: vstrw.32 q5, [r0] ; CHECK-NEXT: vstrw.32 q7, [r0] ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q6, [r0] -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r3 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r3 +; CHECK-NEXT: vmov q1[2], q1[0], r5, r1 +; CHECK-NEXT: vmov q2[2], q2[0], r1, r1 ; CHECK-NEXT: mov.w r12, #4 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r4 ; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 -; CHECK-NEXT: vmov.32 q4[0], r8 -; CHECK-NEXT: @ implicit-def: $r2 -; CHECK-NEXT: str.w r8, [sp, #44] -; CHECK-NEXT: vstrw.32 q3, [sp, #60] +; CHECK-NEXT: vmov q1[3], q1[1], r4, r5 +; CHECK-NEXT: vmov q2[3], q2[1], r5, r7 +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: @ implicit-def: $r0 ; CHECK-NEXT: strh.w r12, [sp, #406] -; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2 +; CHECK-NEXT: vstrw.32 q3, [sp, #60] +; CHECK-NEXT: wlstp.8 lr, r3, .LBB1_2 ; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: .LBB1_2: @ %entry ; CHECK-NEXT: vstrw.32 q1, [r0] -; CHECK-NEXT: str.w r8, [r7] +; CHECK-NEXT: str.w r2, [r8] ; CHECK-NEXT: vstrw.32 q4, [r0] ; CHECK-NEXT: vstrw.32 q2, [r0] ; CHECK-NEXT: str.w r12, [sp, #324] diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll --- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll @@ -287,16 +287,16 @@ ; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: .LBB2_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r4, #16] -; CHECK-NEXT: vldrw.u32 q1, [r5, #16] -; CHECK-NEXT: vldrw.u32 q2, [r6, #16] -; CHECK-NEXT: vldrw.u32 q3, [r6] -; CHECK-NEXT: vfma.f32 q2, q1, q0 ; CHECK-NEXT: vldrw.u32 q0, [r4], #32 ; CHECK-NEXT: vldrw.u32 q1, [r5], #32 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: 
vldrw.u32 q3, [r6, #16] +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r4, #-16] +; CHECK-NEXT: vldrw.u32 q1, [r5, #-16] ; CHECK-NEXT: vfma.f32 q3, q1, q0 -; CHECK-NEXT: vstrw.32 q3, [r6], #32 -; CHECK-NEXT: vstrw.32 q2, [r6, #-16] +; CHECK-NEXT: vstrw.32 q3, [r6, #16] +; CHECK-NEXT: vstrw.32 q2, [r6], #32 ; CHECK-NEXT: le lr, .LBB2_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll @@ -89,9 +89,8 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: uxtb r0, r0 -; CHECK-BE-NEXT: vmov.i8 q1, #0x0 ; CHECK-BE-NEXT: rbit r0, r0 +; CHECK-BE-NEXT: vmov.i8 q1, #0x0 ; CHECK-BE-NEXT: vmov.i8 q2, #0xff ; CHECK-BE-NEXT: lsrs r0, r0, #24 ; CHECK-BE-NEXT: vmsr p0, r0 @@ -141,12 +140,11 @@ ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: .pad #4 ; CHECK-BE-NEXT: sub sp, #4 -; CHECK-BE-NEXT: uxth r0, r0 -; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: rbit r0, r0 +; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 -; CHECK-BE-NEXT: vrev32.8 q0, q0 ; CHECK-BE-NEXT: lsrs r0, r0, #16 +; CHECK-BE-NEXT: vrev32.8 q0, q0 ; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.8 q0, q1 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll @@ -172,14 +172,16 @@ ; CHECK-MVE-NEXT: mov r3, r5 ; CHECK-MVE-NEXT: bl __aeabi_dcmpeq ; CHECK-MVE-NEXT: vmov r2, r1, d8 -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: clz r0, r0 ; CHECK-MVE-NEXT: mov r3, r5 -; CHECK-MVE-NEXT: csetm r6, eq +; CHECK-MVE-NEXT: lsrs r0, r0, #5 +; CHECK-MVE-NEXT: csetm r6, ne ; CHECK-MVE-NEXT: mov r0, r2 ; CHECK-MVE-NEXT: mov r2, r4 ; CHECK-MVE-NEXT: bl __aeabi_dcmpeq -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: csetm r0, eq +; CHECK-MVE-NEXT: clz r0, r0 +; CHECK-MVE-NEXT: lsrs r0, r0, #5 +; CHECK-MVE-NEXT: csetm r0, ne ; CHECK-MVE-NEXT: vmov q0[2], q0[0], r0, r6 ; CHECK-MVE-NEXT: vmov q0[3], q0[1], r0, r6 ; CHECK-MVE-NEXT: vpop {d8, d9} @@ -198,22 +200,24 @@ ; CHECK-MVEFP-NEXT: vpush {d8, d9} ; CHECK-MVEFP-NEXT: vmov q4, q0 ; CHECK-MVEFP-NEXT: vldr d0, .LCPI6_0 -; CHECK-MVEFP-NEXT: vmov r0, r1, d9 +; CHECK-MVEFP-NEXT: vmov r0, r1, d8 ; CHECK-MVEFP-NEXT: vmov r4, r5, d0 ; CHECK-MVEFP-NEXT: mov r2, r4 ; CHECK-MVEFP-NEXT: mov r3, r5 ; CHECK-MVEFP-NEXT: bl __aeabi_dcmpeq ; CHECK-MVEFP-NEXT: mov r6, r0 -; CHECK-MVEFP-NEXT: vmov r0, r1, d8 +; CHECK-MVEFP-NEXT: vmov r0, r1, d9 ; CHECK-MVEFP-NEXT: mov r2, r4 ; CHECK-MVEFP-NEXT: mov r3, r5 ; CHECK-MVEFP-NEXT: bl __aeabi_dcmpeq -; CHECK-MVEFP-NEXT: cmp r6, #0 -; CHECK-MVEFP-NEXT: csetm r1, eq -; CHECK-MVEFP-NEXT: cmp r0, #0 -; CHECK-MVEFP-NEXT: csetm r0, eq -; CHECK-MVEFP-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-MVEFP-NEXT: vmov q0[3], q0[1], r0, r1 +; CHECK-MVEFP-NEXT: clz r0, r0 +; CHECK-MVEFP-NEXT: clz r1, r6 +; CHECK-MVEFP-NEXT: lsrs r0, r0, #5 +; CHECK-MVEFP-NEXT: csetm r0, ne +; CHECK-MVEFP-NEXT: lsrs r1, r1, #5 +; CHECK-MVEFP-NEXT: csetm r1, ne +; CHECK-MVEFP-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-MVEFP-NEXT: vmov q0[3], q0[1], r1, r0 ; CHECK-MVEFP-NEXT: vpop {d8, d9} ; CHECK-MVEFP-NEXT: pop {r4, r5, r6, pc} ; CHECK-MVEFP-NEXT: .p2align 3 @@ -408,16 +412,18 @@ ; CHECK-MVE-NEXT: mov r3, r5 ; CHECK-MVE-NEXT: bl __aeabi_dcmpeq 
; CHECK-MVE-NEXT: vmov r2, r1, d8 +; CHECK-MVE-NEXT: clz r0, r0 ; CHECK-MVE-NEXT: adr r3, .LCPI13_1 -; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: lsrs r0, r0, #5 ; CHECK-MVE-NEXT: vldrw.u32 q4, [r3] ; CHECK-MVE-NEXT: mov r3, r5 -; CHECK-MVE-NEXT: csetm r6, eq +; CHECK-MVE-NEXT: csetm r6, ne ; CHECK-MVE-NEXT: mov r0, r2 ; CHECK-MVE-NEXT: mov r2, r4 ; CHECK-MVE-NEXT: bl __aeabi_dcmpeq -; CHECK-MVE-NEXT: cmp r0, #0 -; CHECK-MVE-NEXT: csetm r0, eq +; CHECK-MVE-NEXT: clz r0, r0 +; CHECK-MVE-NEXT: lsrs r0, r0, #5 +; CHECK-MVE-NEXT: csetm r0, ne ; CHECK-MVE-NEXT: vmov q0[2], q0[0], r0, r6 ; CHECK-MVE-NEXT: vand q0, q0, q4 ; CHECK-MVE-NEXT: vpop {d8, d9} @@ -441,24 +447,27 @@ ; CHECK-MVEFP-NEXT: vpush {d8, d9} ; CHECK-MVEFP-NEXT: vmov q4, q0 ; CHECK-MVEFP-NEXT: vldr d0, .LCPI13_0 -; CHECK-MVEFP-NEXT: vmov r0, r1, d8 +; CHECK-MVEFP-NEXT: vmov r0, r1, d9 ; CHECK-MVEFP-NEXT: vmov r4, r5, d0 ; CHECK-MVEFP-NEXT: mov r2, r4 ; CHECK-MVEFP-NEXT: mov r3, r5 ; CHECK-MVEFP-NEXT: bl __aeabi_dcmpeq -; CHECK-MVEFP-NEXT: mov r6, r0 -; CHECK-MVEFP-NEXT: vmov r0, r1, d9 -; CHECK-MVEFP-NEXT: mov r2, r4 +; CHECK-MVEFP-NEXT: vmov r2, r1, d8 +; CHECK-MVEFP-NEXT: clz r0, r0 ; CHECK-MVEFP-NEXT: mov r3, r5 +; CHECK-MVEFP-NEXT: vldr s17, .LCPI13_1 +; CHECK-MVEFP-NEXT: lsrs r0, r0, #5 +; CHECK-MVEFP-NEXT: cset r6, ne +; CHECK-MVEFP-NEXT: mov r0, r2 +; CHECK-MVEFP-NEXT: mov r2, r4 ; CHECK-MVEFP-NEXT: bl __aeabi_dcmpeq -; CHECK-MVEFP-NEXT: cmp r0, #0 -; CHECK-MVEFP-NEXT: vldr s1, .LCPI13_1 -; CHECK-MVEFP-NEXT: cset r0, eq -; CHECK-MVEFP-NEXT: cmp r6, #0 -; CHECK-MVEFP-NEXT: vmov s2, r0 -; CHECK-MVEFP-NEXT: cset r0, eq -; CHECK-MVEFP-NEXT: vmov s0, r0 -; CHECK-MVEFP-NEXT: vmov.f32 s3, s1 +; CHECK-MVEFP-NEXT: clz r0, r0 +; CHECK-MVEFP-NEXT: vmov s18, r6 +; CHECK-MVEFP-NEXT: vmov.f32 s19, s17 +; CHECK-MVEFP-NEXT: lsrs r0, r0, #5 +; CHECK-MVEFP-NEXT: cset r0, ne +; CHECK-MVEFP-NEXT: vmov s16, r0 +; CHECK-MVEFP-NEXT: vmov q0, q4 ; CHECK-MVEFP-NEXT: vpop {d8, d9} ; CHECK-MVEFP-NEXT: pop {r4, r5, r6, pc} ; CHECK-MVEFP-NEXT: .p2align 3 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-not.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-not.ll @@ -4,8 +4,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpeqz_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpeqz_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -17,8 +17,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpnez_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpnez_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -30,8 +30,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpsltz_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpsltz_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -43,8 +43,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpsgtz_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpsgtz_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: 
vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -56,8 +56,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpslez_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpslez_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -69,8 +69,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpsgez_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpsgez_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -82,8 +82,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpultz_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpultz_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -95,8 +95,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpugtz_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpugtz_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -108,8 +108,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpulez_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpulez_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -121,8 +121,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpugez_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpugez_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -136,8 +136,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpeq_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpeq_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -149,8 +149,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpne_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpne_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -162,8 +162,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpslt_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpslt_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -175,8 +175,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpsgt_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpsgt_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel 
q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -188,8 +188,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpsle_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpsle_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -201,8 +201,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpsge_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpsge_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -214,8 +214,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpult_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpult_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -227,8 +227,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpugt_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpugt_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -240,8 +240,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpule_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpule_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -253,8 +253,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpuge_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpuge_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -269,8 +269,8 @@ define arm_aapcs_vfpcc <8 x i16> @cmpeqz_v8i1(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: cmpeqz_v8i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i16 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <8 x i16> %a, zeroinitializer @@ -282,8 +282,8 @@ define arm_aapcs_vfpcc <8 x i16> @cmpeq_v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ; CHECK-LABEL: cmpeq_v8i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i16 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <8 x i16> %a, zeroinitializer @@ -296,8 +296,8 @@ define arm_aapcs_vfpcc <16 x i8> @cmpeqz_v16i1(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: cmpeqz_v16i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i8 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i8 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <16 x i8> %a, zeroinitializer @@ -309,8 +309,8 @@ define arm_aapcs_vfpcc <16 x i8> @cmpeq_v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i8> 
%c) { ; CHECK-LABEL: cmpeq_v16i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i8 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i8 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <16 x i8> %a, zeroinitializer @@ -326,14 +326,14 @@ ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, r2, d1 ; CHECK-NEXT: orrs r0, r2 -; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer @@ -348,14 +348,14 @@ ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: orrs r0, r1 ; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #0, #8 ; CHECK-NEXT: vmov r0, r2, d1 ; CHECK-NEXT: orrs r0, r2 -; CHECK-NEXT: csetm r0, eq +; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: bfi r1, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <2 x i64> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-xor.ll @@ -171,8 +171,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpugez_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpugez_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q1, q0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll --- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll +++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll @@ -34,45 +34,44 @@ define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) { ; CHECK-LABEL: sadd_int64_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w r12, r2, r0 -; CHECK-NEXT: vmov r0, r4, d1 -; CHECK-NEXT: adc.w lr, r3, r1 -; CHECK-NEXT: subs.w r2, r12, r2 -; CHECK-NEXT: sbcs.w r2, lr, r3 -; CHECK-NEXT: cset r2, lt -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: it mi -; CHECK-NEXT: eormi r2, r2, #1 -; CHECK-NEXT: rsbs r1, r2, #0 +; CHECK-NEXT: mov.w r12, #1 +; CHECK-NEXT: vmov r3, r2, d0 +; CHECK-NEXT: adds.w lr, r3, r0 +; CHECK-NEXT: lsr.w r4, r1, #31 +; CHECK-NEXT: adc.w r0, r2, r1 +; CHECK-NEXT: subs.w r3, lr, r3 +; CHECK-NEXT: sbcs.w r2, r0, r2 +; CHECK-NEXT: it lt +; CHECK-NEXT: eorlt.w r4, r12, r1, lsr #31 +; CHECK-NEXT: rsbs r1, r4, #0 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: bfi r2, r1, #0, #8 ; CHECK-NEXT: vmov r1, r3, d3 -; CHECK-NEXT: adds r1, r1, r0 -; CHECK-NEXT: adc.w r5, r4, r3 -; CHECK-NEXT: subs r0, r1, r0 -; CHECK-NEXT: sbcs.w r0, r5, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r12, r1 -; CHECK-NEXT: cset r0, lt -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: it mi -; CHECK-NEXT: eormi r0, r0, #1 -; CHECK-NEXT: asr.w r1, lr, #31 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov q0[3], q0[1], lr, r5 -; CHECK-NEXT: bfi r2, r0, #8, #8 -; CHECK-NEXT: 
asrs r0, r5, #31 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: vmsr p0, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov r4, r5, d1 +; CHECK-NEXT: adds r1, r1, r4 +; CHECK-NEXT: lsr.w r7, r3, #31 +; CHECK-NEXT: adc.w r6, r5, r3 +; CHECK-NEXT: vmov q0[2], q0[0], lr, r1 +; CHECK-NEXT: subs r4, r1, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r6 +; CHECK-NEXT: asr.w r1, r6, #31 +; CHECK-NEXT: asr.w r0, r0, #31 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 +; CHECK-NEXT: sbcs.w r4, r6, r5 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 ; CHECK-NEXT: adr r0, .LCPI3_0 +; CHECK-NEXT: it lt +; CHECK-NEXT: eorlt.w r7, r12, r3, lsr #31 +; CHECK-NEXT: rsbs r3, r7, #0 ; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: bfi r2, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r2 ; CHECK-NEXT: veor q1, q1, q2 ; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI3_0: diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll @@ -247,36 +247,52 @@ define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(ptr %base, ptr %offptr, <8 x i64> %input) { ; CHECK-LABEL: trunc_signed_unscaled_i64_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrb.s32 q4, [r1] +; CHECK-NEXT: vldrb.s32 q5, [r1, #4] ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vmov r2, r3, d8 -; CHECK-NEXT: vmov r12, lr, d9 -; CHECK-NEXT: vldrb.s32 q4, [r1, #4] -; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vmov r0, r1, d8 -; CHECK-NEXT: strh r4, [r2] -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r4, r5, d9 -; CHECK-NEXT: strh r2, [r3] -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: strh.w r2, [r12] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: strh.w r2, [lr] -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: strh r2, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: strh r0, [r1] -; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vadd.i32 q5, q5, r0 +; CHECK-NEXT: vmov r3, r2, d8 +; CHECK-NEXT: vmov r1, r0, d9 +; CHECK-NEXT: vmov.16 q4[0], r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.16 q4[1], r4 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: vmov.16 q4[2], r4 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov.16 q4[3], r4 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov.16 q4[4], r4 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov.16 q4[5], r4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov lr, r12, d10 +; CHECK-NEXT: vmov.16 q4[6], r4 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov.16 q4[7], r4 +; CHECK-NEXT: vmov r4, r5, d11 +; CHECK-NEXT: vmov.u16 r6, q4[0] +; CHECK-NEXT: strh r6, [r3] +; CHECK-NEXT: vmov.u16 r3, q4[1] +; CHECK-NEXT: strh r3, [r2] +; CHECK-NEXT: vmov.u16 r2, q4[2] +; CHECK-NEXT: strh r2, [r1] +; CHECK-NEXT: vmov.u16 r1, q4[3] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov.u16 r0, q4[4] +; CHECK-NEXT: strh.w r0, [lr] +; CHECK-NEXT: vmov.u16 r0, q4[5] +; CHECK-NEXT: strh.w r0, [r12] +; CHECK-NEXT: vmov.u16 r0, q4[6] ; CHECK-NEXT: strh r0, [r4] -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.u16 r0, q4[7] ; CHECK-NEXT: strh r0, [r5] -; CHECK-NEXT: 
vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %offs = load <8 x i8>, ptr %offptr, align 1 %offs.sext = sext <8 x i8> %offs to <8 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -36,14 +36,13 @@ ; CHECK-LABEL: unscaled_v2i8_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrb r2, [r1] -; CHECK-NEXT: vmov.i32 q1, #0xff ; CHECK-NEXT: ldrb r1, [r1, #1] -; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: strb r2, [r0, r1] -; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: add r2, r0 +; CHECK-NEXT: adds r3, r0, r1 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: strb r3, [r2] ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: strb r2, [r0, r1] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll @@ -8,9 +8,12 @@ define arm_aapcs_vfpcc void @ptr_v2i32(<2 x i32> %v, ptr %offptr) { ; CHECK-LABEL: ptr_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrd r1, r0, [r0] -; CHECK-NEXT: str r2, [r1] +; CHECK-NEXT: vldr s4, [r0] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vldr s6, [r0, #4] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: str r1, [r0] ; CHECK-NEXT: bx lr @@ -125,8 +128,11 @@ define arm_aapcs_vfpcc void @ptr_v2f32(<2 x float> %v, ptr %offptr) { ; CHECK-LABEL: ptr_v2f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r1, r0, [r0] -; CHECK-NEXT: vstr s0, [r1] +; CHECK-NEXT: vldr s4, [r0] +; CHECK-NEXT: vldr s2, [r0, #4] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vstr s0, [r0] +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vstr s1, [r0] ; CHECK-NEXT: bx lr entry: @@ -216,9 +222,12 @@ define arm_aapcs_vfpcc void @ptr_v2i16_trunc(<2 x i32> %v, ptr %offptr) { ; CHECK-LABEL: ptr_v2i16_trunc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrd r1, r0, [r0] -; CHECK-NEXT: strh r2, [r1] +; CHECK-NEXT: vldr s4, [r0] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vldr s6, [r0, #4] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: strh r1, [r0] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-sext.ll b/llvm/test/CodeGen/Thumb2/mve-sext.ll --- a/llvm/test/CodeGen/Thumb2/mve-sext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-sext.ll @@ -157,21 +157,15 @@ define arm_aapcs_vfpcc <16 x i32> @sext_v16i8_v16i32(<16 x i8> %src) { ; CHECK-LABEL: sext_v16i8_v16i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: add r1, sp, #32 ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrb.s16 q0, [r0] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrb.s16 q0, [r0, #8] -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vldrh.s32 q2, [r0] -; CHECK-NEXT: vldrh.s32 q3, [r0, #8] -; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vldrb.s32 q0, [r0] +; 
CHECK-NEXT: vldrb.s32 q1, [r0, #4] +; CHECK-NEXT: vldrb.s32 q2, [r0, #8] +; CHECK-NEXT: vldrb.s32 q3, [r0, #12] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %0 = sext <16 x i8> %src to <16 x i32> @@ -260,21 +254,15 @@ define arm_aapcs_vfpcc <16 x i32> @zext_v16i8_v16i32(<16 x i8> %src) { ; CHECK-LABEL: zext_v16i8_v16i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #48 -; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: add r1, sp, #32 ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrb.u16 q0, [r0] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrb.u16 q0, [r0, #8] -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vldrh.u32 q2, [r0] -; CHECK-NEXT: vldrh.u32 q3, [r0, #8] -; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vldrb.u32 q0, [r0] +; CHECK-NEXT: vldrb.u32 q1, [r0, #4] +; CHECK-NEXT: vldrb.u32 q2, [r0, #8] +; CHECK-NEXT: vldrb.u32 q3, [r0, #12] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %0 = zext <16 x i8> %src to <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll --- a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll @@ -83,8 +83,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: add r1, sp, #16 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: mov r1, sp ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vldrh.s32 q0, [r0] @@ -122,8 +122,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: add r1, sp, #16 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: mov r1, sp ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vldrh.s32 q0, [r0] @@ -226,8 +226,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: add r1, sp, #16 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: mov r1, sp ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vldrh.u32 q0, [r0] @@ -265,8 +265,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: add r1, sp, #16 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: mov r1, sp ; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vldrh.u32 q0, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -21,7 +21,8 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vabd.s16 q0, q0, q1 +; CHECK-NEXT: vsub.i16 q0, q0, q1 +; CHECK-NEXT: vabs.s16 q0, q0 ; CHECK-NEXT: bx lr %sextsrc1 = sext <8 x i8> %src1 to <8 x i16> %sextsrc2 = sext <8 x i8> %src2 to <8 x i16> @@ -73,7 +74,8 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmovlb.s16 q1, q1 ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vabd.s32 q0, q0, q1 +; CHECK-NEXT: vsub.i32 q0, q0, q1 +; CHECK-NEXT: vabs.s32 q0, q0 ; CHECK-NEXT: bx lr %sextsrc1 = sext <4 x i16> %src1 to <4 x i32> %sextsrc2 = sext <4 x i16> %src2 to <4 x i32> @@ -154,7 +156,8 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmovlb.u8 q1, q1 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vabd.u16 q0, q0, q1 +; CHECK-NEXT: vsub.i16 q0, q0, q1 +; CHECK-NEXT: vabs.s16 
q0, q0 ; CHECK-NEXT: bx lr %zextsrc1 = zext <8 x i8> %src1 to <8 x i16> %zextsrc2 = zext <8 x i8> %src2 to <8 x i16> @@ -205,7 +208,8 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vmovlb.u16 q1, q1 ; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vabd.u32 q0, q0, q1 +; CHECK-NEXT: vsub.i32 q0, q0, q1 +; CHECK-NEXT: vabs.s32 q0, q0 ; CHECK-NEXT: bx lr %zextsrc1 = zext <4 x i16> %src1 to <4 x i32> %zextsrc2 = zext <4 x i16> %src2 to <4 x i32> @@ -292,10 +296,22 @@ ; CHECK-NEXT: mov.w lr, #64 ; CHECK-NEXT: .LBB15_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q0, [r1], #16 -; CHECK-NEXT: vldrb.u8 q1, [r0], #16 -; CHECK-NEXT: vabd.s8 q0, q1, q0 -; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: vldrb.s32 q0, [r1, #12] +; CHECK-NEXT: vldrb.s32 q1, [r0, #12] +; CHECK-NEXT: vabd.s32 q0, q1, q0 +; CHECK-NEXT: vldrb.s32 q1, [r0, #8] +; CHECK-NEXT: vstrb.32 q0, [r2, #12] +; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vabd.s32 q0, q1, q0 +; CHECK-NEXT: vldrb.s32 q1, [r0, #4] +; CHECK-NEXT: vstrb.32 q0, [r2, #8] +; CHECK-NEXT: vldrb.s32 q0, [r1, #4] +; CHECK-NEXT: vabd.s32 q0, q1, q0 +; CHECK-NEXT: vldrb.s32 q1, [r0], #16 +; CHECK-NEXT: vstrb.32 q0, [r2, #4] +; CHECK-NEXT: vldrb.s32 q0, [r1], #16 +; CHECK-NEXT: vabd.s32 q0, q1, q0 +; CHECK-NEXT: vstrb.32 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB15_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -455,10 +471,22 @@ ; CHECK-NEXT: mov.w lr, #64 ; CHECK-NEXT: .LBB18_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q0, [r1], #16 -; CHECK-NEXT: vldrb.u8 q1, [r0], #16 -; CHECK-NEXT: vabd.u8 q0, q1, q0 -; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: vldrb.u32 q0, [r1, #12] +; CHECK-NEXT: vldrb.u32 q1, [r0, #12] +; CHECK-NEXT: vabd.s32 q0, q1, q0 +; CHECK-NEXT: vldrb.u32 q1, [r0, #8] +; CHECK-NEXT: vstrb.32 q0, [r2, #12] +; CHECK-NEXT: vldrb.u32 q0, [r1, #8] +; CHECK-NEXT: vabd.s32 q0, q1, q0 +; CHECK-NEXT: vldrb.u32 q1, [r0, #4] +; CHECK-NEXT: vstrb.32 q0, [r2, #8] +; CHECK-NEXT: vldrb.u32 q0, [r1, #4] +; CHECK-NEXT: vabd.s32 q0, q1, q0 +; CHECK-NEXT: vldrb.u32 q1, [r0], #16 +; CHECK-NEXT: vstrb.32 q0, [r2, #4] +; CHECK-NEXT: vldrb.u32 q0, [r1], #16 +; CHECK-NEXT: vabd.s32 q0, q1, q0 +; CHECK-NEXT: vstrb.32 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB18_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll @@ -467,9 +467,7 @@ ; ; CHECK-MVEFP-LABEL: vcmp_ord_v4f32: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vpt.f32 ge, q0, zr -; CHECK-MVEFP-NEXT: vcmpt.f32 lt, q0, zr -; CHECK-MVEFP-NEXT: vpsel q0, q2, q1 +; CHECK-MVEFP-NEXT: vmov q0, q1 ; CHECK-MVEFP-NEXT: bx lr entry: %c = fcmp ord <4 x float> %src, zeroinitializer @@ -504,9 +502,7 @@ ; ; CHECK-MVEFP-LABEL: vcmp_uno_v4f32: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vpt.f32 ge, q0, zr -; CHECK-MVEFP-NEXT: vcmpt.f32 lt, q0, zr -; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: vmov q0, q2 ; CHECK-MVEFP-NEXT: bx lr entry: %c = fcmp uno <4 x float> %src, zeroinitializer @@ -1445,9 +1441,7 @@ ; ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vpt.f16 ge, q0, zr -; CHECK-MVEFP-NEXT: vcmpt.f16 lt, q0, zr -; CHECK-MVEFP-NEXT: vpsel q0, q2, q1 +; CHECK-MVEFP-NEXT: vmov q0, q1 ; CHECK-MVEFP-NEXT: bx lr entry: %c = fcmp ord <8 x half> %src, 
zeroinitializer @@ -1518,9 +1512,7 @@ ; ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vpt.f16 ge, q0, zr -; CHECK-MVEFP-NEXT: vcmpt.f16 lt, q0, zr -; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: vmov q0, q2 ; CHECK-MVEFP-NEXT: bx lr entry: %c = fcmp uno <8 x half> %src, zeroinitializer @@ -1996,9 +1988,7 @@ ; ; CHECK-MVEFP-LABEL: vcmp_r_ord_v4f32: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vpt.f32 le, q0, zr -; CHECK-MVEFP-NEXT: vcmpt.f32 gt, q0, zr -; CHECK-MVEFP-NEXT: vpsel q0, q2, q1 +; CHECK-MVEFP-NEXT: vmov q0, q1 ; CHECK-MVEFP-NEXT: bx lr entry: %c = fcmp ord <4 x float> zeroinitializer, %src @@ -2033,9 +2023,7 @@ ; ; CHECK-MVEFP-LABEL: vcmp_r_uno_v4f32: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vpt.f32 le, q0, zr -; CHECK-MVEFP-NEXT: vcmpt.f32 gt, q0, zr -; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: vmov q0, q2 ; CHECK-MVEFP-NEXT: bx lr entry: %c = fcmp uno <4 x float> zeroinitializer, %src @@ -2974,9 +2962,7 @@ ; ; CHECK-MVEFP-LABEL: vcmp_r_ord_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vpt.f16 le, q0, zr -; CHECK-MVEFP-NEXT: vcmpt.f16 gt, q0, zr -; CHECK-MVEFP-NEXT: vpsel q0, q2, q1 +; CHECK-MVEFP-NEXT: vmov q0, q1 ; CHECK-MVEFP-NEXT: bx lr entry: %c = fcmp ord <8 x half> zeroinitializer, %src @@ -3047,9 +3033,7 @@ ; ; CHECK-MVEFP-LABEL: vcmp_r_uno_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vpt.f16 le, q0, zr -; CHECK-MVEFP-NEXT: vcmpt.f16 gt, q0, zr -; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: vmov q0, q2 ; CHECK-MVEFP-NEXT: bx lr entry: %c = fcmp uno <8 x half> zeroinitializer, %src diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll @@ -309,9 +309,18 @@ define arm_aapcs_vfpcc void @store_shuffletrunc_8(ptr %src, <4 x float> %val1, <4 x float> %val2) { ; CHECK-LABEL: store_shuffletrunc_8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcvtb.f16.f32 q0, q0 -; CHECK-NEXT: vcvtt.f16.f32 q0, q1 -; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vcvtb.f16.f32 q2, q2 +; CHECK-NEXT: vstrh.32 q2, [r0, #8] +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vcvtb.f16.f32 q0, q2 +; CHECK-NEXT: vstrh.32 q0, [r0] ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %val1, <4 x float> %val2, <8 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll @@ -129,27 +129,27 @@ ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov.i64 q1, #0xffff ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: add r0, 
r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r1, r2, d4 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov r1, r3, d5 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q0[6] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 @@ -214,8 +214,24 @@ define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x) { ; CHECK-LABEL: add_v4i16_v4i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vaddlv.s32 r0, r1, q0 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: asrs r2, r1, #31 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: adds r1, r1, r0 +; CHECK-NEXT: adc.w r0, r2, r0, asr #31 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: adc.w r2, r0, r2, asr #31 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: sxth r3, r0 +; CHECK-NEXT: adds r0, r1, r3 +; CHECK-NEXT: adc.w r1, r2, r3, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i16> %x to <4 x i64> @@ -292,8 +308,24 @@ define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x) { ; CHECK-LABEL: add_v8i8_v8i32_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vaddv.s16 r0, q0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmovlb.s8 q0, q2 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vaddv.u32 r0, q0 +; CHECK-NEXT: vaddva.u32 r0, q1 ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i8> %x to <8 x i32> @@ -396,27 +428,27 @@ ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 ; CHECK-NEXT: vmov.i64 q1, #0xff ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: vmov.u8 r2, q0[2] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov.u8 r2, q0[4] ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, r3, d5 -; 
CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r1, r2, d4 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov r1, r3, d5 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[6] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 @@ -538,28 +570,28 @@ ; CHECK-NEXT: vmov.u16 r0, q0[1] ; CHECK-NEXT: vmov.u16 r1, q0[0] ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 -; CHECK-NEXT: vmov.u16 r3, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r0, r1, d5 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r1, r2, d4 +; CHECK-NEXT: add r0, r1 +; CHECK-NEXT: vmov r1, r3, d5 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc.w r1, r2, r3 ; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u16 r3, q0[6] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 @@ -633,9 +665,24 @@ define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x) { ; CHECK-LABEL: add_v4i8_v4i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vaddlv.s32 r0, r1, q0 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: asrs r2, r1, #31 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: adds r1, r1, r0 +; CHECK-NEXT: adc.w r0, r2, r0, asr #31 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: adc.w r2, r0, r2, asr #31 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: sxtb r3, r0 +; CHECK-NEXT: adds r0, r1, r3 +; CHECK-NEXT: adc.w r1, r2, r3, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i8> %x to <4 x i64> @@ -827,46 +874,46 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) { ; CHECK-LABEL: add_v8i16_v8i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.u16 r2, q0[1] ; CHECK-NEXT: vmov.u16 r3, q0[0] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vmov.i64 q1, #0xffff ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, r12, d5 +; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: add.w lr, r3, r2 +; CHECK-NEXT: add.w r12, r3, r2 ; CHECK-NEXT: vmov.u16 r3, q0[3] ; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: add r2, lr -; CHECK-NEXT: add.w lr, r2, r3 +; CHECK-NEXT: add r2, r12 +; 
CHECK-NEXT: add.w r12, r2, r3 ; CHECK-NEXT: vmov.u16 r3, q0[5] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add lr, r2 +; CHECK-NEXT: vmov r2, lr, d4 +; CHECK-NEXT: add r12, r2 ; CHECK-NEXT: vmov r3, r2, d5 -; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adds.w r4, r12, r3 ; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: adc.w r12, lr, r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d1 ; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <8 x i16> %x to <8 x i64> %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) @@ -928,8 +975,26 @@ define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) { ; CHECK-LABEL: add_v4i16_v4i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vaddlva.s32 r0, r1, q0 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: asr.w r12, r3, #31 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: adds r3, r3, r2 +; CHECK-NEXT: adc.w r12, r12, r2, asr #31 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: adds r3, r3, r2 +; CHECK-NEXT: adc.w r12, r12, r2, asr #31 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: adds r3, r3, r2 +; CHECK-NEXT: adc.w r2, r12, r2, asr #31 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i16> %x to <4 x i64> @@ -1016,8 +1081,24 @@ define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) { ; CHECK-LABEL: add_v8i8_v8i32_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vaddva.s16 r0, q0 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmovlb.s8 q0, q2 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vaddva.u32 r0, q0 +; CHECK-NEXT: vaddva.u32 r0, q1 ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i8> %x to <8 x i32> @@ -1123,86 +1204,86 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.u8 r2, q0[1] ; CHECK-NEXT: vmov.u8 r3, q0[0] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vmov.i64 q1, #0xff ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, r12, d5 +; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vmov 
r3, s8 -; CHECK-NEXT: add.w lr, r3, r2 +; CHECK-NEXT: add.w r12, r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[3] ; CHECK-NEXT: vmov.u8 r2, q0[2] ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: add r2, lr -; CHECK-NEXT: add.w lr, r2, r3 +; CHECK-NEXT: add r2, r12 +; CHECK-NEXT: add.w r12, r2, r3 ; CHECK-NEXT: vmov.u8 r3, q0[5] ; CHECK-NEXT: vmov.u8 r2, q0[4] ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add lr, r2 +; CHECK-NEXT: vmov r2, lr, d4 +; CHECK-NEXT: add r12, r2 ; CHECK-NEXT: vmov r3, r2, d5 -; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adds.w r4, r12, r3 ; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: adc.w r12, lr, r2 ; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d5 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[9] +; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d5 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[11] +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d5 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[13] +; CHECK-NEXT: vmov.u8 r2, q0[12] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, r3, d4 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d5 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d5 +; CHECK-NEXT: adc.w r3, r3, r12 +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adc.w r12, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[15] +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov 
r4, r2, d1 ; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx) @@ -1275,47 +1356,47 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v8i8_v8i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: vmov.i64 q1, #0xffff ; CHECK-NEXT: vmov.u16 r2, q0[1] ; CHECK-NEXT: vmov.u16 r3, q0[0] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, r12, d5 +; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: add.w lr, r3, r2 +; CHECK-NEXT: add.w r12, r3, r2 ; CHECK-NEXT: vmov.u16 r3, q0[3] ; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: add r2, lr -; CHECK-NEXT: add.w lr, r2, r3 +; CHECK-NEXT: add r2, r12 +; CHECK-NEXT: add.w r12, r2, r3 ; CHECK-NEXT: vmov.u16 r3, q0[5] ; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: add lr, r2 +; CHECK-NEXT: vmov r2, lr, d4 +; CHECK-NEXT: add r12, r2 ; CHECK-NEXT: vmov r3, r2, d5 -; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adds.w r4, r12, r3 ; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: adc.w r12, lr, r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adds.w lr, r4, r2 +; CHECK-NEXT: vmov r4, r2, d1 ; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adds.w r4, r4, lr +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <8 x i8> %x to <8 x i64> %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx) @@ -1386,9 +1467,26 @@ define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v4i8_v4i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vaddlva.s32 r0, r1, q0 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: asr.w r12, r3, #31 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds r3, r3, r2 +; CHECK-NEXT: adc.w r12, r12, r2, asr #31 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds r3, r3, r2 +; CHECK-NEXT: adc.w r12, r12, r2, asr #31 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds r3, r3, r2 +; CHECK-NEXT: adc.w r2, r12, r2, asr #31 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adcs r1, r2 ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i8> %x to <4 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll --- 
a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -396,10 +396,51 @@ define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %b) { ; CHECK-LABEL: add_v4i16_v4i64_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpt.i32 eq, q1, zr -; CHECK-NEXT: vaddlvt.s32 r0, r1, q0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vcmp.i32 eq, q1, zr +; CHECK-NEXT: vmov.i8 q1, #0x0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vpsel q1, q2, q1 +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 +; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vpsel q3, q3, q2 +; CHECK-NEXT: vmov r0, r1, d7 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q0, q0, q2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <4 x i16> %b, zeroinitializer @@ -526,9 +567,44 @@ ; CHECK-LABEL: add_v8i8_v8i32_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vpt.i16 eq, q1, zr -; CHECK-NEXT: vaddvt.s16 r0, q0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vcmp.i16 eq, q1, zr +; CHECK-NEXT: vmov.i8 q1, #0x0 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vpsel q1, q3, q1 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.u16 r1, q1[0] +; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.u16 r1, q1[1] +; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vpsel q2, q2, q3 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmovlb.s8 q0, q3 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vpt.i32 ne, q3, zr +; CHECK-NEXT: vaddt.i32 q2, q2, q0 +; CHECK-NEXT: vaddv.u32 r0, q2 ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i8> %b, zeroinitializer @@ -1314,12 +1390,52 @@ 
define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %b) { ; CHECK-LABEL: add_v4i8_v4i64_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpt.i32 eq, q1, zr -; CHECK-NEXT: vaddlvt.s32 r0, r1, q0 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vcmp.i32 eq, q1, zr +; CHECK-NEXT: vmov.i8 q1, #0x0 +; CHECK-NEXT: vpsel q1, q2, q1 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: asrs r0, r0, #31 +; CHECK-NEXT: asrs r1, r1, #31 +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 +; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vpsel q3, q3, q2 +; CHECK-NEXT: vmov r0, r1, d7 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asrs r2, r2, #31 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 +; CHECK-NEXT: vmov r2, r3, d3 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q0, q0, q2 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: bx lr entry: %c = icmp eq <4 x i8> %b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -183,8 +183,38 @@ define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_sext(<8 x i16> %x, <8 x i8> %y) { ; CHECK-LABEL: add_v8i8i16_v8i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1 +; CHECK-NEXT: vmov.u16 r1, q1[1] +; CHECK-NEXT: vmov.s16 r0, q0[1] +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vmov.u16 r3, q1[0] +; CHECK-NEXT: smull r0, r1, r0, r1 +; CHECK-NEXT: vmov.s16 r2, q0[0] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r2, r3 +; CHECK-NEXT: vmov.u16 r3, q1[2] +; CHECK-NEXT: vmov.s16 r2, q0[2] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r2, r3 +; CHECK-NEXT: vmov.u16 r3, q1[3] +; CHECK-NEXT: vmov.s16 r2, q0[3] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r2, r3 +; CHECK-NEXT: vmov.u16 r3, q1[4] +; CHECK-NEXT: vmov.s16 r2, q0[4] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r2, r3 +; CHECK-NEXT: vmov.u16 r3, q1[5] +; CHECK-NEXT: vmov.s16 r2, q0[5] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r2, r3 +; CHECK-NEXT: vmov.u16 r3, q1[6] +; CHECK-NEXT: vmov.s16 r2, q0[6] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r2, r3 +; CHECK-NEXT: vmov.u16 r3, q1[7] +; CHECK-NEXT: vmov.s16 r2, q0[7] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r2, r3 ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i16> %x to <8 x i64> @@ -212,9 +242,32 @@ define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) { ; CHECK-LABEL: add_v4i16_v4i64_sext: ; CHECK: @ %bb.0: @ 
%entry -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1 +; CHECK-NEXT: vmov.f32 s10, s5 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i16> %x to <4 x i64> @@ -352,9 +405,40 @@ define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) { ; CHECK-LABEL: add_v8i8_v8i32_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmlav.s16 r0, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: vmovlb.s8 q3, q3 +; CHECK-NEXT: vmovlb.s16 q3, q3 +; CHECK-NEXT: vmlav.u32 r0, q3, q2 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[3] +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmovlb.s8 q1, q2 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmovlb.s8 q0, q2 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmlava.u32 r0, q0, q1 ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i8> %x to <8 x i32> @@ -381,8 +465,31 @@ define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_sext(<8 x i8> %x, <8 x i16> %y) { ; CHECK-LABEL: add_v8i8i16_v8i32_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmlav.s16 r0, q0, q1 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov q1[2], q1[0], r2, r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 +; CHECK-NEXT: vmovlb.s8 q0, q2 +; CHECK-NEXT: vldrh.s32 q2, [r1, #8] +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmlav.u32 r0, q0, q2 +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: add sp, #16 
; CHECK-NEXT: bx lr entry: %xx = sext <8 x i8> %x to <8 x i32> @@ -516,15 +623,15 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: add r1, sp, #16 ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrb.u16 q0, [r0, #8] -; CHECK-NEXT: vldrb.s16 q1, [r1, #8] -; CHECK-NEXT: vmlav.u16 r2, q1, q0 ; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: vldrb.s16 q1, [r1] +; CHECK-NEXT: vmlav.u16 r2, q1, q0 +; CHECK-NEXT: vldrb.u16 q0, [r0, #8] +; CHECK-NEXT: vldrb.s16 q1, [r1, #8] ; CHECK-NEXT: vmlava.u16 r2, q1, q0 ; CHECK-NEXT: sxth r0, r2 ; CHECK-NEXT: add sp, #32 @@ -586,8 +693,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: add r2, sp, #16 -; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: add r3, sp, #16 ; CHECK-NEXT: vstrw.32 q1, [r2] ; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: vldrb.u16 q0, [r2] @@ -611,8 +718,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: add r2, sp, #16 -; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: add r3, sp, #16 ; CHECK-NEXT: vstrw.32 q1, [r2] ; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: vldrb.s16 q0, [r2] @@ -693,9 +800,46 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) { ; CHECK-LABEL: add_v8i8_v8i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.u16 r3, q0[2] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u16 r2, q1[3] +; CHECK-NEXT: vmov.u16 r3, q0[3] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u16 r2, q1[4] +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u16 r2, q1[5] +; CHECK-NEXT: vmov.u16 r3, q0[5] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov.u16 r3, q0[6] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.u16 r3, q0[7] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i8> %x to <8 x i64> @@ -724,11 +868,32 @@ define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) { ; CHECK-LABEL: add_v4i8_v4i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1 +; CHECK-NEXT: vmov.f32 s10, s5 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: sxtb r2, r2 +; 
CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i8> %x to <4 x i64> @@ -757,10 +922,32 @@ define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_sext(<4 x i8> %x, <4 x i16> %y) { ; CHECK-LABEL: add_v4i8i16_v4i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1 +; CHECK-NEXT: vmov.f32 s10, s5 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i8> %x to <4 x i64> @@ -776,7 +963,8 @@ ; CHECK-NEXT: vmov.i32 q2, #0xff ; CHECK-NEXT: vmovlb.u16 q1, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1 +; CHECK-NEXT: vmul.i32 q0, q0, q1 +; CHECK-NEXT: vaddlv.u32 r0, r1, q0 ; CHECK-NEXT: bx lr entry: %xx = zext <4 x i8> %x to <4 x i32> @@ -793,7 +981,8 @@ ; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: vmovlb.s16 q1, q1 ; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1 +; CHECK-NEXT: vmul.i32 q0, q0, q1 +; CHECK-NEXT: vaddlv.s32 r0, r1, q0 ; CHECK-NEXT: bx lr entry: %xx = sext <4 x i8> %x to <4 x i32> @@ -1344,8 +1533,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: add r2, sp, #16 -; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: add r3, sp, #16 ; CHECK-NEXT: vstrw.32 q1, [r2] ; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: vldrb.u16 q0, [r2] @@ -1370,8 +1559,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: add r2, sp, #16 -; CHECK-NEXT: mov r3, sp +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: add r3, sp, #16 ; CHECK-NEXT: vstrw.32 q1, [r2] ; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: vldrb.s16 q0, [r2] diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -246,10 +246,115 @@ define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_sext(<8 x i16> %x, <8 x i8> %y, <8 x i16> %b) { ; CHECK-LABEL: add_v8i8i16_v8i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vpt.i16 eq, q2, zr -; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush 
{d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.i8 q3, #0x0 +; CHECK-NEXT: vmov.i8 q4, #0xff +; CHECK-NEXT: vcmp.i16 eq, q2, zr +; CHECK-NEXT: vmov.u16 r3, q1[0] +; CHECK-NEXT: vpsel q5, q4, q3 +; CHECK-NEXT: vmov.s16 r2, q0[0] +; CHECK-NEXT: vmov.u16 r0, q5[2] +; CHECK-NEXT: vmov.u16 r1, q5[0] +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q5[3] +; CHECK-NEXT: vmov.u16 r1, q5[1] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: smull r2, r3, r2, r3 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vpsel q6, q4, q3 +; CHECK-NEXT: vmov r0, r1, d12 +; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 +; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: vmov.u16 r1, q1[1] +; CHECK-NEXT: vmov.s16 r0, q0[1] +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r0, r1, r0, r1 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov q7[2], q7[0], r2, r0 +; CHECK-NEXT: vmov q7[3], q7[1], r3, r1 +; CHECK-NEXT: vpsel q7, q7, q2 +; CHECK-NEXT: vmov r0, r1, d15 +; CHECK-NEXT: vmov r2, r3, d14 +; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov.s16 r0, q0[2] +; CHECK-NEXT: adc.w lr, r3, r1 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.u16 r3, q1[3] +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vmov.s16 r2, q0[3] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r0, r1, r0, r1 +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: smull r2, r3, r2, r3 +; CHECK-NEXT: vmov q6[2], q6[0], r0, r2 +; CHECK-NEXT: vmov q6[3], q6[1], r1, r3 +; CHECK-NEXT: vpsel q6, q6, q2 +; CHECK-NEXT: vmov r0, r1, d12 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: vmov.u16 r3, q5[4] +; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.u16 r3, q5[5] +; CHECK-NEXT: vmov.s16 r0, q0[4] +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: smull r0, r1, r0, r1 +; CHECK-NEXT: vpsel q3, q4, q3 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 +; CHECK-NEXT: vmov.u16 r3, q1[5] +; CHECK-NEXT: vmov.s16 r2, q0[5] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r2, r3 +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; CHECK-NEXT: vmov q4[2], q4[0], r0, r2 +; CHECK-NEXT: vmov q4[3], q4[1], r1, r3 +; CHECK-NEXT: vpsel q4, q4, q2 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 +; CHECK-NEXT: vmov.u16 r3, q1[7] +; CHECK-NEXT: vmov.s16 r2, q0[7] +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.s16 r0, q0[6] +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r2, r3, r2, r3 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: smull r0, r1, r0, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vpsel q0, q0, q2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds r0, r0, r2 +; 
CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <8 x i16> %b, zeroinitializer %xx = sext <8 x i16> %x to <8 x i64> @@ -332,12 +437,68 @@ define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) { ; CHECK-LABEL: add_v4i16_v4i64_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.f32 s14, s5 ; CHECK-NEXT: vmovlb.u16 q2, q2 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpt.i32 eq, q2, zr -; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vcmp.i32 eq, q2, zr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.i8 q2, #0x0 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 +; CHECK-NEXT: vmov q4[3], q4[1], r3, r1 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov q3[2], q3[0], r0, r1 +; CHECK-NEXT: vmov q3[3], q3[1], r0, r1 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vpsel q4, q4, q3 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adc.w lr, r3, r1 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q0, q0, q3 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <4 x i16> %b, zeroinitializer %xx = sext <4 x i16> %x to <4 x i64> @@ -538,11 +699,66 @@ define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) { ; CHECK-LABEL: add_v8i8_v8i32_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.u16 r1, q1[0] +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.u16 r1, q1[1] ; CHECK-NEXT: vmovlb.u8 q2, q2 -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vpt.i16 eq, q2, zr -; CHECK-NEXT: vmlavt.s16 r0, q0, q1 +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmovlb.s8 q3, q3 +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmovlb.s16 q4, q3 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vcmp.i16 eq, 
q2, zr +; CHECK-NEXT: vmovlb.s8 q3, q3 +; CHECK-NEXT: vmov.i8 q2, #0x0 +; CHECK-NEXT: vmovlb.s16 q5, q3 +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.u16 r1, q2[0] +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vmov.u16 r1, q2[1] +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q3, q5, q4 +; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmovlb.s8 q1, q4 +; CHECK-NEXT: vmov q4[2], q4[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmov q4[3], q4[1], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q2[6] +; CHECK-NEXT: vmovlb.s8 q0, q4 +; CHECK-NEXT: vmov.u16 r1, q2[4] +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmul.i32 q0, q0, q1 +; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.u16 r1, q2[5] +; CHECK-NEXT: vmov q1[3], q1[1], r1, r0 +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vaddt.i32 q3, q3, q0 +; CHECK-NEXT: vaddv.u32 r0, q3 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i8> %b, zeroinitializer @@ -575,10 +791,57 @@ define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_sext(<8 x i8> %x, <8 x i16> %y, <8 x i8> %b) { ; CHECK-LABEL: add_v8i8i16_v8i32_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.u8 q2, q2 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vpt.i16 eq, q2, zr -; CHECK-NEXT: vmlavt.s16 r0, q0, q1 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vldrh.s32 q4, [r0] +; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 +; CHECK-NEXT: vmovlb.s8 q1, q1 +; CHECK-NEXT: vmovlb.s16 q3, q1 +; CHECK-NEXT: vmovlb.u8 q1, q2 +; CHECK-NEXT: vcmp.i16 eq, q1, zr +; CHECK-NEXT: vmov.i8 q1, #0x0 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vpsel q1, q2, q1 +; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[3] +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmov q2[3], q2[1], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i32 q2, q3, q4 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmovlb.s8 q0, q3 +; CHECK-NEXT: vldrh.s32 q3, [r0, #8] +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmul.i32 q0, q0, q3 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: vpt.i32 ne, q3, zr +; CHECK-NEXT: vaddt.i32 q2, q2, q0 +; CHECK-NEXT: vaddv.u32 r0, q2 +; CHECK-NEXT: add sp, #16 
+; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %c = icmp eq <8 x i8> %b, zeroinitializer @@ -692,8 +955,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: add r1, sp, #16 ; CHECK-NEXT: vstrw.32 q1, [r0] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vcmp.i8 eq, q2, zr @@ -1400,12 +1663,124 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) { ; CHECK-LABEL: add_v8i8_v8i64_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vmovlb.u8 q2, q2 -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vpt.i16 eq, q2, zr -; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.i8 q3, #0x0 +; CHECK-NEXT: vcmp.i16 eq, q2, zr +; CHECK-NEXT: vmov.i8 q4, #0xff +; CHECK-NEXT: vpsel q5, q4, q3 +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov.u16 r0, q5[2] +; CHECK-NEXT: vmov.u16 r1, q5[0] +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q5[3] +; CHECK-NEXT: vmov.u16 r1, q5[1] +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vpsel q6, q4, q3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov r0, r1, d12 +; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 +; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: vmov.u16 r0, q1[1] +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov q7[2], q7[0], r2, r0 +; CHECK-NEXT: vmov q7[3], q7[1], r3, r1 +; CHECK-NEXT: vpsel q7, q7, q2 +; CHECK-NEXT: vmov r0, r1, d15 +; CHECK-NEXT: vmov r2, r3, d14 +; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: adc.w lr, r3, r1 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: vmov q6[2], q6[0], r2, r3 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov q6[3], q6[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q1[3] +; CHECK-NEXT: vmov.u16 r3, q0[3] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q6[2], q6[0], r0, r2 +; CHECK-NEXT: vmov q6[3], q6[1], r1, r3 +; CHECK-NEXT: vpsel q6, q6, q2 +; CHECK-NEXT: vmov r0, r1, d12 +; CHECK-NEXT: vmov r2, r3, d13 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov.u16 r2, q5[6] +; CHECK-NEXT: vmov.u16 r3, q5[4] +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov q6[2], q6[0], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q5[7] +; CHECK-NEXT: vmov.u16 r3, q5[5] +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: vcmp.i32 ne, q6, zr +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vpsel q3, q4, q3 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q1[5] +; CHECK-NEXT: vmov.u16 r3, q0[5] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: 
sxtb r3, r3 +; CHECK-NEXT: vcmp.i32 ne, q4, zr +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov q4[2], q4[0], r0, r2 +; CHECK-NEXT: vmov q4[3], q4[1], r1, r3 +; CHECK-NEXT: vpsel q4, q4, q2 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: adc.w lr, r1, r3 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r3 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.u16 r3, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vpsel q0, q0, q2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <8 x i8> %b, zeroinitializer %xx = sext <8 x i8> %x to <8 x i64> @@ -1439,15 +1814,69 @@ define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) { ; CHECK-LABEL: add_v4i8_v4i64_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: vmov.i32 q3, #0xff -; CHECK-NEXT: vmovlb.s8 q1, q1 -; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpt.i32 eq, q2, zr -; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.i32 eq, q2, zr +; CHECK-NEXT: vmov.i8 q2, #0x0 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 +; CHECK-NEXT: vmov q4[3], q4[1], r3, r1 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov q3[2], q3[0], r0, r1 +; CHECK-NEXT: vmov q3[3], q3[1], r0, r1 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vpsel q4, q4, q3 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adc.w lr, r3, r1 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q0, q0, q3 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, 
d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <4 x i8> %b, zeroinitializer %xx = sext <4 x i8> %x to <4 x i64> @@ -1481,14 +1910,69 @@ define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) { ; CHECK-LABEL: add_v4i8i16_v4i64_sext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: vmov.i32 q3, #0xff -; CHECK-NEXT: vmovlb.s8 q0, q0 ; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmovlb.s16 q0, q0 -; CHECK-NEXT: vpt.i32 eq, q2, zr -; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vmov.i8 q3, #0xff +; CHECK-NEXT: vcmp.i32 eq, q2, zr +; CHECK-NEXT: vmov.i8 q2, #0x0 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 +; CHECK-NEXT: vmov q4[3], q4[1], r3, r1 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov q3[2], q3[0], r0, r1 +; CHECK-NEXT: vmov q3[3], q3[1], r0, r1 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vpsel q4, q4, q3 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adc.w lr, r3, r1 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 +; CHECK-NEXT: vmov r0, r1, d5 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r1 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r1 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vpsel q0, q0, q3 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc.w r1, r1, lr +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} entry: %c = icmp eq <4 x i8> %b, zeroinitializer %xx = sext <4 x i8> %x to <4 x i64> @@ -1504,10 +1988,11 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i32 q3, #0xff ; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vpt.i32 eq, q2, zr -; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1 +; CHECK-NEXT: vmul.i32 q0, q0, q1 +; CHECK-NEXT: vand q1, q2, q3 +; CHECK-NEXT: vpt.i32 eq, q1, zr +; CHECK-NEXT: vaddlvt.u32 r0, r1, q0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <4 x i8> %b, zeroinitializer @@ -1523,13 +2008,14 @@ define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) { ; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q3, #0xff ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vmovlb.s16 q1, q1 ; CHECK-NEXT: vmovlb.s16 q0, 
q0 -; CHECK-NEXT: vpt.i32 eq, q2, zr -; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1 +; CHECK-NEXT: vmul.i32 q0, q0, q1 +; CHECK-NEXT: vmov.i32 q1, #0xff +; CHECK-NEXT: vand q1, q2, q1 +; CHECK-NEXT: vpt.i32 eq, q1, zr +; CHECK-NEXT: vaddlvt.s32 r0, r1, q0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <4 x i8> %b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-slp.ll @@ -707,8 +707,10 @@ define i32 @addv8i32i8(ptr %x) { ; CHECK-LABEL: addv8i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u16 q0, [r0] -; CHECK-NEXT: vaddv.u16 r0, q0 +; CHECK-NEXT: vldrb.u32 q1, [r0] +; CHECK-NEXT: vldrb.u32 q0, [r0, #4] +; CHECK-NEXT: vaddv.u32 r0, q1 +; CHECK-NEXT: vaddva.u32 r0, q0 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i8>, ptr %x, align 1 @@ -720,8 +722,15 @@ define i32 @addv16i32i8(ptr %x) { ; CHECK-LABEL: addv16i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q0, [r0] -; CHECK-NEXT: vaddv.u8 r0, q0 +; CHECK-NEXT: vldrb.u32 q1, [r0] +; CHECK-NEXT: vldrb.u32 q0, [r0, #4] +; CHECK-NEXT: vaddv.u32 r2, q1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #8] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #12] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <16 x i8>, ptr %x, align 1 @@ -733,10 +742,19 @@ define i32 @addv24i32i8(ptr %x) { ; CHECK-LABEL: addv24i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q1, [r0] -; CHECK-NEXT: vldrb.u16 q0, [r0, #16] -; CHECK-NEXT: vaddv.u8 r0, q1 -; CHECK-NEXT: vaddva.u16 r0, q0 +; CHECK-NEXT: vldrb.u32 q1, [r0] +; CHECK-NEXT: vldrb.u32 q0, [r0, #4] +; CHECK-NEXT: vaddv.u32 r2, q1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #8] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #12] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #16] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #20] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <16 x i8>, ptr %x, align 1 @@ -800,10 +818,18 @@ ; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrb.u32 q0, [r0, #28] ; CHECK-NEXT: vaddva.u32 r2, q0 -; CHECK-NEXT: vldrb.u8 q0, [r0, #32] -; CHECK-NEXT: vaddva.u8 r2, q0 -; CHECK-NEXT: vldrb.u16 q0, [r0, #48] -; CHECK-NEXT: vaddva.u16 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #32] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #36] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #40] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #44] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #48] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #52] +; CHECK-NEXT: vaddva.u32 r2, q0 ; CHECK-NEXT: vldrb.u32 q0, [r0, #56] ; CHECK-NEXT: ldrb.w r0, [r0, #63] ; CHECK-NEXT: vaddva.u32 r2, q0 @@ -853,23 +879,71 @@ define i32 @addv128i32i8(ptr %x) { ; CHECK-LABEL: addv128i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q1, [r0] -; CHECK-NEXT: vldrb.u8 q0, [r0, #16] -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: vaddv.u8 r0, q1 -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #32] -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #48] -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #64] -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #80] -; CHECK-NEXT: 
vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #96] -; CHECK-NEXT: vaddva.u8 r0, q0 -; CHECK-NEXT: vldrb.u8 q0, [r1, #112] -; CHECK-NEXT: vaddva.u8 r0, q0 +; CHECK-NEXT: vldrb.u32 q1, [r0] +; CHECK-NEXT: vldrb.u32 q0, [r0, #4] +; CHECK-NEXT: vaddv.u32 r2, q1 +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #8] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #12] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #16] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #20] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #24] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #28] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #32] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #36] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #40] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #44] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #48] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #52] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #56] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #60] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #64] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #68] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #72] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #76] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #80] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #84] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #88] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #92] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #96] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #100] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #104] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #108] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #112] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #116] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #120] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #124] +; CHECK-NEXT: vaddva.u32 r2, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %wide.load = load <16 x i8>, ptr %x, align 1 @@ -2465,9 +2539,13 @@ define i32 @mlav8i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav8i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u16 q0, [r0] -; CHECK-NEXT: vldrb.u16 q1, [r1] -; CHECK-NEXT: vmlav.u16 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vmlav.u32 r2, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #4] +; CHECK-NEXT: vldrb.u32 q1, [r1, #4] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i8>, ptr %x, align 1 @@ -2482,9 +2560,19 @@ define i32 @mlav16i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav16i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q0, [r0] -; CHECK-NEXT: vldrb.u8 q1, [r1] -; CHECK-NEXT: vmlav.u8 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vmlav.u32 r2, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #4] +; CHECK-NEXT: vldrb.u32 q1, [r1, #4] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, 
[r0, #8] +; CHECK-NEXT: vldrb.u32 q1, [r1, #8] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r0, #12] +; CHECK-NEXT: vldrb.u32 q1, [r1, #12] +; CHECK-NEXT: vmlava.u32 r2, q1, q0 +; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bx lr entry: %0 = load <16 x i8>, ptr %x, align 1 @@ -2499,13 +2587,25 @@ define i32 @mlav24i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav24i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u16 q0, [r0] -; CHECK-NEXT: vldrb.u16 q1, [r1] -; CHECK-NEXT: vmlav.u16 r2, q1, q0 -; CHECK-NEXT: vldrb.u8 q0, [r0, #8] -; CHECK-NEXT: vldrb.u8 q1, [r1, #8] -; CHECK-NEXT: vmlava.u8 r2, q1, q0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vldrb.u32 q0, [r0] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: vmlav.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #4] +; CHECK-NEXT: vldrb.u32 q1, [r1, #4] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #8] +; CHECK-NEXT: vldrb.u32 q1, [r1, #8] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #12] +; CHECK-NEXT: vldrb.u32 q1, [r1, #12] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #16] +; CHECK-NEXT: vldrb.u32 q1, [r1, #16] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #20] +; CHECK-NEXT: vldrb.u32 q1, [r1, #20] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = load <8 x i8>, ptr %x, align 1 @@ -2568,19 +2668,55 @@ define i32 @mlav64i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav64i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q0, [r0] -; CHECK-NEXT: vldrb.u8 q1, [r1] -; CHECK-NEXT: vmlav.u8 r2, q1, q0 -; CHECK-NEXT: vldrb.u8 q0, [r0, #16] -; CHECK-NEXT: vldrb.u8 q1, [r1, #16] -; CHECK-NEXT: vmlava.u8 r2, q1, q0 -; CHECK-NEXT: vldrb.u8 q0, [r0, #32] -; CHECK-NEXT: vldrb.u8 q1, [r1, #32] -; CHECK-NEXT: vmlava.u8 r2, q1, q0 -; CHECK-NEXT: vldrb.u8 q0, [r0, #48] -; CHECK-NEXT: vldrb.u8 q1, [r1, #48] -; CHECK-NEXT: vmlava.u8 r2, q1, q0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vldrb.u32 q0, [r0] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: vmlav.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #4] +; CHECK-NEXT: vldrb.u32 q1, [r1, #4] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #8] +; CHECK-NEXT: vldrb.u32 q1, [r1, #8] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #12] +; CHECK-NEXT: vldrb.u32 q1, [r1, #12] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #16] +; CHECK-NEXT: vldrb.u32 q1, [r1, #16] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #20] +; CHECK-NEXT: vldrb.u32 q1, [r1, #20] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #24] +; CHECK-NEXT: vldrb.u32 q1, [r1, #24] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #28] +; CHECK-NEXT: vldrb.u32 q1, [r1, #28] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #32] +; CHECK-NEXT: vldrb.u32 q1, [r1, #32] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #36] +; CHECK-NEXT: vldrb.u32 q1, [r1, #36] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #40] +; CHECK-NEXT: vldrb.u32 q1, [r1, #40] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #44] +; CHECK-NEXT: vldrb.u32 q1, [r1, #44] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #48] +; CHECK-NEXT: vldrb.u32 q1, [r1, #48] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #52] +; 
CHECK-NEXT: vldrb.u32 q1, [r1, #52] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #56] +; CHECK-NEXT: vldrb.u32 q1, [r1, #56] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #60] +; CHECK-NEXT: vldrb.u32 q1, [r1, #60] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %wide.load = load <16 x i8>, ptr %x, align 1 @@ -2622,31 +2758,103 @@ define i32 @mlav128i32i8(ptr %x, ptr %y) { ; CHECK-LABEL: mlav128i32i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u8 q0, [r0] -; CHECK-NEXT: vldrb.u8 q1, [r1] +; CHECK-NEXT: vldrb.u32 q0, [r0] +; CHECK-NEXT: vldrb.u32 q1, [r1] ; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: vmlav.u8 r0, q1, q0 -; CHECK-NEXT: vldrb.u8 q0, [r2, #16] -; CHECK-NEXT: vldrb.u8 q1, [r1, #16] -; CHECK-NEXT: vmlava.u8 r0, q1, q0 -; CHECK-NEXT: vldrb.u8 q0, [r2, #32] -; CHECK-NEXT: vldrb.u8 q1, [r1, #32] -; CHECK-NEXT: vmlava.u8 r0, q1, q0 -; CHECK-NEXT: vldrb.u8 q0, [r2, #48] -; CHECK-NEXT: vldrb.u8 q1, [r1, #48] -; CHECK-NEXT: vmlava.u8 r0, q1, q0 -; CHECK-NEXT: vldrb.u8 q0, [r2, #64] -; CHECK-NEXT: vldrb.u8 q1, [r1, #64] -; CHECK-NEXT: vmlava.u8 r0, q1, q0 -; CHECK-NEXT: vldrb.u8 q0, [r2, #80] -; CHECK-NEXT: vldrb.u8 q1, [r1, #80] -; CHECK-NEXT: vmlava.u8 r0, q1, q0 -; CHECK-NEXT: vldrb.u8 q0, [r2, #96] -; CHECK-NEXT: vldrb.u8 q1, [r1, #96] -; CHECK-NEXT: vmlava.u8 r0, q1, q0 -; CHECK-NEXT: vldrb.u8 q0, [r2, #112] -; CHECK-NEXT: vldrb.u8 q1, [r1, #112] -; CHECK-NEXT: vmlava.u8 r0, q1, q0 +; CHECK-NEXT: vmlav.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #4] +; CHECK-NEXT: vldrb.u32 q1, [r1, #4] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #8] +; CHECK-NEXT: vldrb.u32 q1, [r1, #8] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #12] +; CHECK-NEXT: vldrb.u32 q1, [r1, #12] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #16] +; CHECK-NEXT: vldrb.u32 q1, [r1, #16] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #20] +; CHECK-NEXT: vldrb.u32 q1, [r1, #20] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #24] +; CHECK-NEXT: vldrb.u32 q1, [r1, #24] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #28] +; CHECK-NEXT: vldrb.u32 q1, [r1, #28] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #32] +; CHECK-NEXT: vldrb.u32 q1, [r1, #32] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #36] +; CHECK-NEXT: vldrb.u32 q1, [r1, #36] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #40] +; CHECK-NEXT: vldrb.u32 q1, [r1, #40] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #44] +; CHECK-NEXT: vldrb.u32 q1, [r1, #44] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #48] +; CHECK-NEXT: vldrb.u32 q1, [r1, #48] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #52] +; CHECK-NEXT: vldrb.u32 q1, [r1, #52] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #56] +; CHECK-NEXT: vldrb.u32 q1, [r1, #56] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #60] +; CHECK-NEXT: vldrb.u32 q1, [r1, #60] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #64] +; CHECK-NEXT: vldrb.u32 q1, [r1, #64] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #68] +; CHECK-NEXT: vldrb.u32 q1, [r1, #68] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #72] +; CHECK-NEXT: vldrb.u32 q1, [r1, #72] +; 
CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #76] +; CHECK-NEXT: vldrb.u32 q1, [r1, #76] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #80] +; CHECK-NEXT: vldrb.u32 q1, [r1, #80] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #84] +; CHECK-NEXT: vldrb.u32 q1, [r1, #84] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #88] +; CHECK-NEXT: vldrb.u32 q1, [r1, #88] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #92] +; CHECK-NEXT: vldrb.u32 q1, [r1, #92] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #96] +; CHECK-NEXT: vldrb.u32 q1, [r1, #96] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #100] +; CHECK-NEXT: vldrb.u32 q1, [r1, #100] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #104] +; CHECK-NEXT: vldrb.u32 q1, [r1, #104] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #108] +; CHECK-NEXT: vldrb.u32 q1, [r1, #108] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #112] +; CHECK-NEXT: vldrb.u32 q1, [r1, #112] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #116] +; CHECK-NEXT: vldrb.u32 q1, [r1, #116] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #120] +; CHECK-NEXT: vldrb.u32 q1, [r1, #120] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 +; CHECK-NEXT: vldrb.u32 q0, [r2, #124] +; CHECK-NEXT: vldrb.u32 q1, [r1, #124] +; CHECK-NEXT: vmlava.u32 r0, q1, q0 ; CHECK-NEXT: bx lr entry: %wide.load = load <16 x i8>, ptr %x, align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll @@ -68,18 +68,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov r2, r12, d1 -; CHECK-NEXT: vmov r3, lr, d0 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0], #32 -; CHECK-NEXT: vmov r4, r7, d1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, r6, d0 -; CHECK-NEXT: adc.w r5, lr, r12 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adcs r7, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r5 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r2, r5, d0 +; CHECK-NEXT: vmov r4, r7, d4 +; CHECK-NEXT: vmov r3, r6, d1 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r6, r6, r12 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: adcs r7, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r6 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -207,11 +207,11 @@ ; CHECK-NEXT: .pad #32 ; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vldrb.u8 q0, [r0, #16] -; CHECK-NEXT: add r2, sp, #16 +; CHECK-NEXT: mov r2, sp ; CHECK-NEXT: vshr.u32 q1, q0, #16 ; CHECK-NEXT: vstrh.32 q1, [r2, #8] ; CHECK-NEXT: vldrb.u8 q1, [r0] -; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: add r0, sp, #16 ; CHECK-NEXT: vshr.u32 q2, q1, #16 ; CHECK-NEXT: vstrh.32 q2, [r2] ; CHECK-NEXT: vstrh.32 q0, [r0, #8] @@ -314,12 +314,16 @@ ; CHECK: @ 
%bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r0, r4, d1 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov lr, r12, d3 ; CHECK-NEXT: vmov r5, r6, d0 +; CHECK-NEXT: vmov r0, r4, d4 +; CHECK-NEXT: vmov r3, r2, d1 ; CHECK-NEXT: adds.w r3, r3, lr ; CHECK-NEXT: adc.w r2, r2, r12 ; CHECK-NEXT: adds r0, r0, r5 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -1023,12 +1023,9 @@ define void @vld3_v2f16(ptr %src, ptr %dst) { ; CHECK-LABEL: vld3_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r2, r3, [r0] -; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vldmia r0, {s0, s1, s2} ; CHECK-NEXT: vmovx.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vins.f16 s8, s2 ; CHECK-NEXT: vmovx.f16 s2, s1 ; CHECK-NEXT: vins.f16 s1, s4 @@ -1053,26 +1050,25 @@ define void @vld3_v4f16(ptr %src, ptr %dst) { ; CHECK-LABEL: vld3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r2, r3, [r0, #16] +; CHECK-NEXT: vldr s0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov.32 q2[1], r3 -; CHECK-NEXT: vmovx.f16 s13, s7 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vins.f16 s1, s0 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vldr s13, [r0, #20] +; CHECK-NEXT: vmov.f32 s1, s0 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s2, s13 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vins.f16 s1, s2 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s2, s6 +; CHECK-NEXT: vmovx.f16 s9, s7 +; CHECK-NEXT: vins.f16 s5, s2 ; CHECK-NEXT: vins.f16 s7, s0 ; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vins.f16 s12, s6 -; CHECK-NEXT: vins.f16 s13, s9 +; CHECK-NEXT: vins.f16 s8, s6 +; CHECK-NEXT: vins.f16 s9, s13 ; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vadd.f16 q1, q1, q3 +; CHECK-NEXT: vadd.f16 q1, q1, q2 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -6,119 +6,119 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: mul r12, r3, r2 ; CHECK-NEXT: lsrs.w r2, r12, #2 ; CHECK-NEXT: beq.w .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 -; CHECK-NEXT: ldr r2, [sp, #56] +; CHECK-NEXT: ldr r2, [sp, #72] ; CHECK-NEXT: and.w r3, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; 
CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0, #32] -; CHECK-NEXT: vldrh.u16 q4, [r0, #48] +; CHECK-NEXT: vldrh.u16 q4, [r0, #32] +; CHECK-NEXT: vldrh.u16 q5, [r0, #48] ; CHECK-NEXT: vldrh.u16 q3, [r0], #64 -; CHECK-NEXT: vmovx.f16 s26, s4 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s6 -; CHECK-NEXT: vldrh.u16 q5, [r0, #-48] -; CHECK-NEXT: vmovx.f16 s27, s16 -; CHECK-NEXT: vins.f16 s26, s6 -; CHECK-NEXT: vmovx.f16 s6, s18 -; CHECK-NEXT: vmovx.f16 s8, s7 -; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vmovx.f16 s24, s12 -; CHECK-NEXT: vins.f16 s10, s8 -; CHECK-NEXT: vins.f16 s27, s6 -; CHECK-NEXT: vmovx.f16 s6, s14 -; CHECK-NEXT: vmovx.f16 s8, s19 -; CHECK-NEXT: vmovx.f16 s11, s17 +; CHECK-NEXT: vmovx.f16 s4, s19 +; CHECK-NEXT: vmovx.f16 s6, s17 +; CHECK-NEXT: vldrh.u16 q6, [r0, #-48] +; CHECK-NEXT: vins.f16 s6, s4 +; CHECK-NEXT: vmovx.f16 s4, s23 +; CHECK-NEXT: vmovx.f16 s7, s21 +; CHECK-NEXT: vins.f16 s7, s4 +; CHECK-NEXT: vmovx.f16 s8, s15 +; CHECK-NEXT: vmovx.f16 s4, s13 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s3, s21 +; CHECK-NEXT: vmovx.f16 s5, s25 ; CHECK-NEXT: vmov.f32 s0, s13 -; CHECK-NEXT: vins.f16 s11, s8 -; CHECK-NEXT: vmovx.f16 s25, s20 -; CHECK-NEXT: vins.f16 s24, s6 -; CHECK-NEXT: vmovx.f16 s6, s22 -; CHECK-NEXT: vmovx.f16 s1, s15 -; CHECK-NEXT: vmovx.f16 s8, s13 +; CHECK-NEXT: vins.f16 s25, s27 +; CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vmovx.f16 s8, s27 +; CHECK-NEXT: vins.f16 s5, s8 +; CHECK-NEXT: vins.f16 s2, s19 +; CHECK-NEXT: vins.f16 s3, s23 +; CHECK-NEXT: vins.f16 s0, s15 +; CHECK-NEXT: vmov.f32 s1, s25 +; CHECK-NEXT: vmul.f16 q1, q1, r2 +; CHECK-NEXT: vmul.f16 q0, q0, r2 +; CHECK-NEXT: vmovx.f16 s10, s5 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s30, s16 +; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vmovx.f16 s10, s18 +; CHECK-NEXT: vmovx.f16 s31, s20 +; CHECK-NEXT: vins.f16 s30, s10 +; CHECK-NEXT: vmovx.f16 s10, s22 +; CHECK-NEXT: vmovx.f16 s28, s12 +; CHECK-NEXT: vins.f16 s31, s10 +; CHECK-NEXT: vmovx.f16 s10, s14 ; CHECK-NEXT: vins.f16 s20, s22 +; CHECK-NEXT: vmovx.f16 s29, s24 +; CHECK-NEXT: vins.f16 s24, s26 +; CHECK-NEXT: vins.f16 s28, s10 +; CHECK-NEXT: vmovx.f16 s10, s26 ; CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vins.f16 s25, s6 -; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vins.f16 s0, s15 -; CHECK-NEXT: vmovx.f16 s9, s21 -; CHECK-NEXT: vins.f16 s8, s1 -; CHECK-NEXT: vmovx.f16 s1, s23 +; CHECK-NEXT: vins.f16 s29, s10 ; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vins.f16 s21, s23 -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vmov.f32 s15, s16 -; CHECK-NEXT: vins.f16 s9, s1 -; CHECK-NEXT: vmov.f32 s13, s20 -; CHECK-NEXT: vmul.f16 q6, q6, r2 +; CHECK-NEXT: vmov.f32 s14, s16 +; CHECK-NEXT: vmul.f16 q7, q7, r2 +; CHECK-NEXT: vmov.f32 s15, s20 +; CHECK-NEXT: vins.f16 s1, s5 +; CHECK-NEXT: vmov.f32 s13, s24 +; CHECK-NEXT: vmovx.f16 s23, s0 ; CHECK-NEXT: vmul.f16 q3, q3, r2 -; CHECK-NEXT: vins.f16 s2, s7 -; CHECK-NEXT: vins.f16 s3, s19 -; CHECK-NEXT: vmov.f32 s1, s21 -; CHECK-NEXT: vmul.f16 q0, q0, r2 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s10, s13 +; CHECK-NEXT: vmovx.f16 s5, s29 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vins.f16 s10, s5 +; CHECK-NEXT: vins.f16 s23, s4 ; CHECK-NEXT: vmovx.f16 s4, s12 -; CHECK-NEXT: vmovx.f16 s6, s24 -; CHECK-NEXT: vmul.f16 q2, q2, r2 -; CHECK-NEXT: vmovx.f16 s7, s0 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s8, s8 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s5, 
s1 -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vins.f16 s7, s8 -; CHECK-NEXT: vins.f16 s5, s6 -; CHECK-NEXT: vmovx.f16 s6, s13 -; CHECK-NEXT: vmovx.f16 s8, s25 -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmovx.f16 s19, s2 -; CHECK-NEXT: vmovx.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s18, s14 -; CHECK-NEXT: vins.f16 s19, s8 -; CHECK-NEXT: vmovx.f16 s8, s26 -; CHECK-NEXT: vins.f16 s18, s8 -; CHECK-NEXT: vmovx.f16 s23, s3 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vins.f16 s14, s26 -; CHECK-NEXT: vins.f16 s23, s8 -; CHECK-NEXT: vmovx.f16 s22, s15 -; CHECK-NEXT: vins.f16 s15, s27 -; CHECK-NEXT: vmovx.f16 s8, s27 -; CHECK-NEXT: vins.f16 s12, s24 -; CHECK-NEXT: vins.f16 s13, s25 -; CHECK-NEXT: vins.f16 s3, s11 -; CHECK-NEXT: vins.f16 s1, s9 -; CHECK-NEXT: vins.f16 s2, s10 -; CHECK-NEXT: vins.f16 s22, s8 -; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vmov.f32 s17, s0 -; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmovx.f16 s5, s28 +; CHECK-NEXT: vmovx.f16 s19, s3 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vins.f16 s3, s7 +; CHECK-NEXT: vmovx.f16 s5, s7 +; CHECK-NEXT: vmovx.f16 s7, s2 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vmovx.f16 s6, s6 ; CHECK-NEXT: vmov q6, q0 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vmov.f32 s17, s2 -; CHECK-NEXT: vmov.f32 s16, s14 -; CHECK-NEXT: vmov.f32 s21, s3 -; CHECK-NEXT: vstrh.16 q4, [r1, #32] -; CHECK-NEXT: vmov.f32 s20, s15 -; CHECK-NEXT: vmov.f32 s7, s5 -; CHECK-NEXT: vstrh.16 q5, [r1, #48] -; CHECK-NEXT: vstrh.16 q2, [r1], #64 -; CHECK-NEXT: vmov.f32 s4, s13 -; CHECK-NEXT: vmov.f32 s5, s25 -; CHECK-NEXT: vstrh.16 q1, [r1, #-48] +; CHECK-NEXT: vins.f16 s19, s5 +; CHECK-NEXT: vmovx.f16 s18, s15 +; CHECK-NEXT: vins.f16 s15, s31 +; CHECK-NEXT: vmovx.f16 s5, s31 +; CHECK-NEXT: vins.f16 s7, s6 +; CHECK-NEXT: vmovx.f16 s6, s14 +; CHECK-NEXT: vins.f16 s13, s29 +; CHECK-NEXT: vins.f16 s12, s28 +; CHECK-NEXT: vins.f16 s14, s30 +; CHECK-NEXT: vins.f16 s18, s5 +; CHECK-NEXT: vmovx.f16 s5, s30 +; CHECK-NEXT: vmov.f32 s9, s25 +; CHECK-NEXT: vmov q6, q3 +; CHECK-NEXT: vins.f16 s6, s5 +; CHECK-NEXT: vmov.f32 s11, s8 +; CHECK-NEXT: vmov.f32 s8, s13 +; CHECK-NEXT: vmov.f32 s26, s4 +; CHECK-NEXT: vstrh.16 q2, [r1, #16] +; CHECK-NEXT: vmov.f32 s25, s0 +; CHECK-NEXT: vmov.f32 s27, s23 +; CHECK-NEXT: vmov.f32 s17, s3 +; CHECK-NEXT: vmov.f32 s16, s15 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vstrh.16 q4, [r1, #48] +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vstrh.16 q1, [r1, #32] +; CHECK-NEXT: vstrh.16 q6, [r1], #64 +; CHECK-NEXT: vmov.f32 s1, s0 ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll @@ -191,10 +191,9 @@ ; CHECK-LABEL: sext32_0213_0ext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 +; CHECK-NEXT: vrev64.32 q2, q0 +; CHECK-NEXT: vmullb.s32 q1, q2, q3 ; CHECK-NEXT: vmullb.s32 q2, q0, q3 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: vmullb.s32 q1, q0, q3 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: @@ -210,11 +209,10 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0ext_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0ext_0213: ; CHECK: @ %bb.0: @ %entry 
+; CHECK-NEXT: vrev64.32 q2, q0 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 +; CHECK-NEXT: vmullb.s32 q1, q3, q2 ; CHECK-NEXT: vmullb.s32 q2, q3, q0 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: vmullb.s32 q1, q3, q0 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: @@ -232,12 +230,11 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vrev64.32 q1, q0 ; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: umull r2, r5, r3, r0 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r1, r0 +; CHECK-NEXT: umull r2, r5, r3, r0 ; CHECK-NEXT: vmov q1[2], q1[0], r2, lr ; CHECK-NEXT: asrs r2, r0, #31 ; CHECK-NEXT: mla r4, r1, r2, r12 @@ -245,20 +242,20 @@ ; CHECK-NEXT: mla r5, r3, r2, r5 ; CHECK-NEXT: asrs r3, r3, #31 ; CHECK-NEXT: mla r1, r1, r0, r4 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: mla r3, r3, r0, r5 ; CHECK-NEXT: vmov q1[3], q1[1], r3, r1 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: umull r3, r5, r1, r0 -; CHECK-NEXT: mla r5, r1, r2, r5 +; CHECK-NEXT: umull r5, lr, r4, r0 +; CHECK-NEXT: umull r3, r12, r1, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r3 +; CHECK-NEXT: mla r3, r1, r2, r12 ; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: mla r12, r1, r0, r5 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: umull r4, r1, r5, r0 -; CHECK-NEXT: mla r1, r5, r2, r1 -; CHECK-NEXT: asrs r2, r5, #31 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: mla r0, r2, r0, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: mla r2, r4, r2, lr +; CHECK-NEXT: mla r1, r1, r0, r3 +; CHECK-NEXT: asrs r3, r4, #31 +; CHECK-NEXT: mla r0, r3, r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -275,33 +272,32 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r7, lr} ; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vrev64.32 q1, q0 ; CHECK-NEXT: asrs r4, r0, #31 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: umull r2, r5, r0, r3 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r0, r1 +; CHECK-NEXT: umull r2, r5, r0, r3 ; CHECK-NEXT: vmov q1[2], q1[0], r2, lr ; CHECK-NEXT: asrs r2, r1, #31 ; CHECK-NEXT: mla r2, r0, r2, r12 ; CHECK-NEXT: mla r1, r4, r1, r2 ; CHECK-NEXT: asrs r2, r3, #31 ; CHECK-NEXT: mla r2, r0, r2, r5 +; CHECK-NEXT: vmov r5, s0 ; CHECK-NEXT: mla r2, r4, r3, r2 ; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: umull r2, r3, r0, r1 -; CHECK-NEXT: asrs r5, r1, #31 -; CHECK-NEXT: mla r3, r0, r5, r3 -; CHECK-NEXT: mla r12, r4, r1, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r5, r1, r0, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 -; CHECK-NEXT: asrs r2, r3, #31 -; CHECK-NEXT: mla r0, r0, r2, r1 -; CHECK-NEXT: mla r0, r4, r3, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: umull r3, lr, r0, r5 +; CHECK-NEXT: umull r2, r12, r0, r1 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: asrs r2, r1, #31 +; CHECK-NEXT: mla r2, r0, r2, r12 +; CHECK-NEXT: mla r1, r4, r1, r2 +; CHECK-NEXT: asrs r2, r5, #31 +; CHECK-NEXT: mla r0, r0, r2, lr +; CHECK-NEXT: mla r0, r4, r5, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -467,10 +463,9 @@ ; CHECK-LABEL: zext32_0213_0ext: ; CHECK: @ %bb.0: @ 
%entry ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 +; CHECK-NEXT: vrev64.32 q2, q0 +; CHECK-NEXT: vmullb.u32 q1, q2, q3 ; CHECK-NEXT: vmullb.u32 q2, q0, q3 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: vmullb.u32 q1, q0, q3 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: @@ -486,11 +481,10 @@ define arm_aapcs_vfpcc <4 x i64> @zext32_0ext_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: zext32_0ext_0213: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev64.32 q2, q0 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 +; CHECK-NEXT: vmullb.u32 q1, q3, q2 ; CHECK-NEXT: vmullb.u32 q2, q3, q0 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: vmullb.u32 q1, q3, q0 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: @@ -507,9 +501,10 @@ ; CHECK-LABEL: zext32_0213_ext0: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vrev64.32 q1, q0 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: vand q0, q1, q0 ; CHECK-NEXT: umull r1, r12, r1, r0 ; CHECK-NEXT: umull r3, r2, r3, r0 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r1 @@ -536,9 +531,10 @@ ; CHECK-LABEL: zext32_ext0_0213: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vrev64.32 q1, q0 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: vand q0, q1, q0 ; CHECK-NEXT: umull r1, r12, r0, r1 ; CHECK-NEXT: umull r3, r2, r0, r3 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh-minmax.ll @@ -35,8 +35,52 @@ define arm_aapcs_vfpcc <8 x i8> @vqdmulh_v8i8_b(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-LABEL: vqdmulh_v8i8_b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqdmulh.s8 q0, q1, q0 -; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmovlb.s8 q3, q3 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmovlb.s16 q3, q3 +; CHECK-NEXT: vmul.i32 q2, q3, q2 +; CHECK-NEXT: vshr.s32 q3, q2, #7 +; CHECK-NEXT: vmov.i32 q2, #0x7f +; CHECK-NEXT: vmin.s32 q3, q3, q2 +; CHECK-NEXT: vstrh.32 q3, [r0, #8] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmovlb.s8 q0, q3 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[3] +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmovlb.s8 q1, q3 +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vshr.s32 q0, q0, #7 +; CHECK-NEXT: vmin.s32 q0, q0, q2 +; CHECK-NEXT: vstrh.32 q0, [r0] +; 
CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %l2 = sext <8 x i8> %s0 to <8 x i32> @@ -51,9 +95,14 @@ define arm_aapcs_vfpcc <4 x i8> @vqdmulh_v4i8_b(<4 x i8> %s0, <4 x i8> %s1) { ; CHECK-LABEL: vqdmulh_v4i8_b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqdmulh.s8 q0, q1, q0 ; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vmov.i32 q1, #0x7f +; CHECK-NEXT: vshr.s32 q0, q0, #7 +; CHECK-NEXT: vmin.s32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %l2 = sext <4 x i8> %s0 to <4 x i32> @@ -115,8 +164,10 @@ define arm_aapcs_vfpcc <4 x i16> @vqdmulh_v4i16_b(<4 x i16> %s0, <4 x i16> %s1) { ; CHECK-LABEL: vqdmulh_v4i16_b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqdmulh.s16 q0, q1, q0 -; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmullb.s16 q0, q1, q0 +; CHECK-NEXT: vmov.i32 q1, #0x7fff +; CHECK-NEXT: vshr.s32 q0, q0, #15 +; CHECK-NEXT: vmin.s32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %l2 = sext <4 x i16> %s0 to <4 x i32> @@ -220,9 +271,14 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved2(<4 x i32> %s0a, <8 x i16> %s1) { ; CHECK-LABEL: vqdmulh_v8i16_interleaved2: ; CHECK: @ %bb.0: -; CHECK-NEXT: vqdmulh.s16 q2, q1, q0 +; CHECK-NEXT: vmullb.s16 q2, q1, q0 ; CHECK-NEXT: vrev32.16 q1, q1 -; CHECK-NEXT: vqdmulh.s16 q0, q1, q0 +; CHECK-NEXT: vmullb.s16 q0, q1, q0 +; CHECK-NEXT: vshr.s32 q2, q2, #15 +; CHECK-NEXT: vmov.i32 q3, #0x7fff +; CHECK-NEXT: vshr.s32 q0, q0, #15 +; CHECK-NEXT: vmin.s32 q2, q2, q3 +; CHECK-NEXT: vmin.s32 q0, q0, q3 ; CHECK-NEXT: vmovnt.i32 q2, q0 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr @@ -278,14 +334,37 @@ define arm_aapcs_vfpcc <2 x i32> @vqdmulh_v2i32_b(<2 x i32> %s0, <2 x i32> %s1) { ; CHECK-LABEL: vqdmulh_v2i32_b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqdmulh.s32 q0, q1, q0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmullb.s32 q2, q1, q0 +; CHECK-NEXT: mvn r12, #-2147483648 +; CHECK-NEXT: vmov r0, r5, d4 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: asrl r0, r5, #31 +; CHECK-NEXT: subs.w r3, r0, r12 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: csetm r3, lt +; CHECK-NEXT: bfi r2, r3, #0, #8 +; CHECK-NEXT: vmov r4, r3, d5 +; CHECK-NEXT: asrl r4, r3, #31 +; CHECK-NEXT: subs.w r1, r4, r12 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 +; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: adr r0, .LCPI14_0 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 0 @ 0x0 entry: %l2 = sext <2 x i32> %s0 to <2 x i64> %l5 = sext <2 x i32> %s1 to <2 x i64> @@ -349,12 +428,33 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 +; CHECK-NEXT: vmov.i32 q0, #0x7f ; CHECK-NEXT: .LBB17_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q0, [r0], #16 -; CHECK-NEXT: vldrb.u8 q1, [r1], #16 -; CHECK-NEXT: 
vqdmulh.s8 q0, q1, q0 -; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: vldrb.s32 q1, [r0, #12] +; CHECK-NEXT: vldrb.s32 q2, [r1, #12] +; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: vldrb.s32 q2, [r1, #8] +; CHECK-NEXT: vshr.s32 q1, q1, #7 +; CHECK-NEXT: vmin.s32 q1, q1, q0 +; CHECK-NEXT: vstrb.32 q1, [r2, #12] +; CHECK-NEXT: vldrb.s32 q1, [r0, #8] +; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: vshr.s32 q1, q1, #7 +; CHECK-NEXT: vmin.s32 q1, q1, q0 +; CHECK-NEXT: vstrb.32 q1, [r2, #8] +; CHECK-NEXT: vldrb.s32 q1, [r0, #4] +; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: vldrb.s32 q2, [r1], #16 +; CHECK-NEXT: vshr.s32 q1, q1, #7 +; CHECK-NEXT: vmin.s32 q1, q1, q0 +; CHECK-NEXT: vstrb.32 q1, [r2, #4] +; CHECK-NEXT: vldrb.s32 q1, [r0], #16 +; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: vshr.s32 q1, q1, #7 +; CHECK-NEXT: vmin.s32 q1, q1, q0 +; CHECK-NEXT: vstrb.32 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB17_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll @@ -37,8 +37,52 @@ define arm_aapcs_vfpcc <8 x i8> @vqdmulh_v8i8_b(<8 x i8> %s0, <8 x i8> %s1) { ; CHECK-LABEL: vqdmulh_v8i8_b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqdmulh.s8 q0, q1, q0 -; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmov q2[3], q2[1], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmovlb.s8 q2, q2 +; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmovlb.s16 q2, q2 +; CHECK-NEXT: vmov q3[3], q3[1], r1, r0 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmovlb.s8 q3, q3 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmovlb.s16 q3, q3 +; CHECK-NEXT: vmul.i32 q2, q3, q2 +; CHECK-NEXT: vshr.s32 q3, q2, #7 +; CHECK-NEXT: vmov.i32 q2, #0x7f +; CHECK-NEXT: vmin.s32 q3, q3, q2 +; CHECK-NEXT: vstrh.32 q3, [r0, #8] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmovlb.s8 q0, q3 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[3] +; CHECK-NEXT: vmov.u16 r2, q1[1] +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r1 +; CHECK-NEXT: vmovlb.s8 q1, q3 +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vshr.s32 q0, q0, #7 +; CHECK-NEXT: vmin.s32 q0, q0, q2 +; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: bx lr entry: %l2 = sext <8 x i8> %s0 to <8 x i32> @@ -54,9 +98,14 @@ define arm_aapcs_vfpcc <4 x i8> @vqdmulh_v4i8_b(<4 x i8> %s0, <4 x i8> %s1) { ; CHECK-LABEL: vqdmulh_v4i8_b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqdmulh.s8 q0, q1, q0 ; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vmov.i32 q1, #0x7f +; CHECK-NEXT: vshr.s32 q0, q0, #7 +; CHECK-NEXT: vmin.s32 q0, q0, q1 
; CHECK-NEXT: bx lr entry: %l2 = sext <4 x i8> %s0 to <4 x i32> @@ -122,8 +171,10 @@ define arm_aapcs_vfpcc <4 x i16> @vqdmulh_v4i16_b(<4 x i16> %s0, <4 x i16> %s1) { ; CHECK-LABEL: vqdmulh_v4i16_b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqdmulh.s16 q0, q1, q0 -; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmullb.s16 q0, q1, q0 +; CHECK-NEXT: vmov.i32 q1, #0x7fff +; CHECK-NEXT: vshr.s32 q0, q0, #15 +; CHECK-NEXT: vmin.s32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %l2 = sext <4 x i16> %s0 to <4 x i32> @@ -231,9 +282,14 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved2(<4 x i32> %s0a, <8 x i16> %s1) { ; CHECK-LABEL: vqdmulh_v8i16_interleaved2: ; CHECK: @ %bb.0: -; CHECK-NEXT: vqdmulh.s16 q2, q1, q0 +; CHECK-NEXT: vmullb.s16 q2, q1, q0 ; CHECK-NEXT: vrev32.16 q1, q1 -; CHECK-NEXT: vqdmulh.s16 q0, q1, q0 +; CHECK-NEXT: vmullb.s16 q0, q1, q0 +; CHECK-NEXT: vshr.s32 q2, q2, #15 +; CHECK-NEXT: vmov.i32 q3, #0x7fff +; CHECK-NEXT: vshr.s32 q0, q0, #15 +; CHECK-NEXT: vmin.s32 q2, q2, q3 +; CHECK-NEXT: vmin.s32 q0, q0, q3 ; CHECK-NEXT: vmovnt.i32 q2, q0 ; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr @@ -293,14 +349,37 @@ define arm_aapcs_vfpcc <2 x i32> @vqdmulh_v2i32_b(<2 x i32> %s0, <2 x i32> %s1) { ; CHECK-LABEL: vqdmulh_v2i32_b: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqdmulh.s32 q0, q1, q0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: asrs r0, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmullb.s32 q2, q1, q0 +; CHECK-NEXT: mvn r12, #-2147483648 +; CHECK-NEXT: vmov r0, r5, d4 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: asrl r0, r5, #31 +; CHECK-NEXT: subs.w r3, r0, r12 +; CHECK-NEXT: sbcs r3, r5, #0 +; CHECK-NEXT: csetm r3, lt +; CHECK-NEXT: bfi r2, r3, #0, #8 +; CHECK-NEXT: vmov r4, r3, d5 +; CHECK-NEXT: asrl r4, r3, #31 +; CHECK-NEXT: subs.w r1, r4, r12 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 +; CHECK-NEXT: sbcs r1, r3, #0 +; CHECK-NEXT: adr r0, .LCPI14_0 +; CHECK-NEXT: csetm r1, lt +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: bfi r2, r1, #8, #8 +; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 2147483647 @ 0x7fffffff +; CHECK-NEXT: .long 0 @ 0x0 entry: %l2 = sext <2 x i32> %s0 to <2 x i64> %l5 = sext <2 x i32> %s1 to <2 x i64> @@ -367,12 +446,33 @@ ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #64 +; CHECK-NEXT: vmov.i32 q0, #0x7f ; CHECK-NEXT: .LBB17_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q0, [r0], #16 -; CHECK-NEXT: vldrb.u8 q1, [r1], #16 -; CHECK-NEXT: vqdmulh.s8 q0, q1, q0 -; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: vldrb.s32 q1, [r0, #12] +; CHECK-NEXT: vldrb.s32 q2, [r1, #12] +; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: vldrb.s32 q2, [r1, #8] +; CHECK-NEXT: vshr.s32 q1, q1, #7 +; CHECK-NEXT: vmin.s32 q1, q1, q0 +; CHECK-NEXT: vstrb.32 q1, [r2, #12] +; CHECK-NEXT: vldrb.s32 q1, [r0, #8] +; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: vshr.s32 q1, q1, #7 +; CHECK-NEXT: vmin.s32 q1, q1, q0 +; CHECK-NEXT: vstrb.32 q1, [r2, #8] +; CHECK-NEXT: vldrb.s32 q1, [r0, #4] +; 
CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: vldrb.s32 q2, [r1], #16 +; CHECK-NEXT: vshr.s32 q1, q1, #7 +; CHECK-NEXT: vmin.s32 q1, q1, q0 +; CHECK-NEXT: vstrb.32 q1, [r2, #4] +; CHECK-NEXT: vldrb.s32 q1, [r0], #16 +; CHECK-NEXT: vmul.i32 q1, q2, q1 +; CHECK-NEXT: vshr.s32 q1, q1, #7 +; CHECK-NEXT: vmin.s32 q1, q1, q0 +; CHECK-NEXT: vstrb.32 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB17_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll @@ -70,9 +70,7 @@ define arm_aapcs_vfpcc <8 x i16> @vqmovni32_uminmax_t1(<4 x i32> %s0, <8 x i16> %src1) { ; CHECK-LABEL: vqmovni32_uminmax_t1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqmovnb.u32 q0, q0 -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vmovnt.i32 q1, q0 +; CHECK-NEXT: vqmovnt.u32 q1, q0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -87,7 +85,6 @@ ; CHECK-LABEL: vqmovni32_uminmax_t2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vqmovnb.u32 q0, q0 -; CHECK-NEXT: vmovlb.u16 q0, q0 ; CHECK-NEXT: vmovnt.i32 q0, q1 ; CHECK-NEXT: bx lr entry: @@ -101,9 +98,7 @@ define arm_aapcs_vfpcc <8 x i16> @vqmovni32_uminmax_b1(<4 x i32> %s0, <8 x i16> %src1) { ; CHECK-LABEL: vqmovni32_uminmax_b1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vqmovnb.u32 q0, q0 -; CHECK-NEXT: vmovlb.u16 q0, q0 -; CHECK-NEXT: vmovnb.i32 q1, q0 +; CHECK-NEXT: vqmovnb.u32 q1, q0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -240,7 +235,8 @@ define arm_aapcs_vfpcc <16 x i8> @vqmovni16_uminmax_b2(<8 x i16> %s0, <16 x i8> %src1) { ; CHECK-LABEL: vqmovni16_uminmax_b2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vqmovnb.u16 q0, q0 +; CHECK-NEXT: vmovlb.u8 q0, q0 ; CHECK-NEXT: vmovnb.i16 q0, q1 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -6,10 +6,10 @@ define void @vst2_v2i32(ptr %src, ptr %dst) { ; CHECK-LABEL: vst2_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldm.w r0, {r2, r3, r12} -; CHECK-NEXT: ldr r0, [r0, #12] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r12, r0 +; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldr s2, [r0, #4] +; CHECK-NEXT: vldr s1, [r0, #8] +; CHECK-NEXT: vldr s3, [r0, #12] ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -333,20 +333,20 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vmov.f64 d9, d0 -; CHECK-NEXT: vmov.f64 d0, d5 -; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.f64 d5, d2 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f64 d4, d6 -; CHECK-NEXT: vmov.f64 d2, d7 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f64 d8, d1 +; CHECK-NEXT: vmov.f64 d9, d7 +; CHECK-NEXT: vmov.f64 d1, d6 +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f64 d6, d3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vmov.f64 d7, d5 +; CHECK-NEXT: 
vmov.f64 d3, d4 +; CHECK-NEXT: vstrw.32 q3, [r1, #48] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -503,12 +503,9 @@ define void @vst2_v4f16(ptr %src, ptr %dst) { ; CHECK-LABEL: vst2_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r2, r12, [r0] -; CHECK-NEXT: ldrd r3, r0, [r0, #8] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.32 q0[1], r12 -; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vldmia r0, {s0, s1} +; CHECK-NEXT: vldr s4, [r0, #8] +; CHECK-NEXT: vldr s5, [r0, #12] ; CHECK-NEXT: vmovx.f16 s2, s0 ; CHECK-NEXT: vins.f16 s0, s4 ; CHECK-NEXT: vmovx.f16 s4, s4 @@ -636,20 +633,20 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vmov.f64 d9, d0 -; CHECK-NEXT: vmov.f64 d0, d5 -; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vmov.f64 d5, d2 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f64 d4, d6 -; CHECK-NEXT: vmov.f64 d2, d7 -; CHECK-NEXT: vstrw.32 q2, [r1, #32] -; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f64 d8, d1 +; CHECK-NEXT: vmov.f64 d9, d7 +; CHECK-NEXT: vmov.f64 d1, d6 +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: vmov.f64 d6, d3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vmov.f64 d7, d5 +; CHECK-NEXT: vmov.f64 d3, d4 +; CHECK-NEXT: vstrw.32 q3, [r1, #48] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -6,28 +6,17 @@ define void @vst4_v2i32(ptr %src, ptr %dst) { ; CHECK-LABEL: vst4_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: add.w r6, r0, #16 -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldm r6, {r4, r5, r6} -; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 -; CHECK-NEXT: ldr r0, [r0, #28] -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r6 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r0 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vmov.f32 s6, s1 -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldr s4, [r0, #4] +; CHECK-NEXT: vldr s1, [r0, #8] +; CHECK-NEXT: vldr s5, [r0, #12] +; CHECK-NEXT: vldr s2, [r0, #16] +; CHECK-NEXT: vldr s6, [r0, #20] +; CHECK-NEXT: vldr s3, [r0, #24] +; CHECK-NEXT: vldr s7, [r0, #28] +; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: bx lr entry: %l1 = load <2 x i32>, ptr %src, align 4 %s2 = getelementptr <2 x i32>, ptr %src, i32 1 @@ -732,16 +721,16 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #64 ; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: vldrw.u32 q7, [r0, #80] -; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] 
+; CHECK-NEXT: vldrw.u32 q7, [r0, #16] +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vldrw.u32 q6, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov.f64 d15, d10 -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #112] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov.f64 d14, d12 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill @@ -753,21 +742,21 @@ ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.f64 d10, d13 ; CHECK-NEXT: vmov.f64 d2, d5 -; CHECK-NEXT: vstrw.32 q5, [r1, #32] +; CHECK-NEXT: vstrw.32 q5, [r1, #48] ; CHECK-NEXT: vmov.f64 d5, d6 -; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vmov.f64 d13, d8 -; CHECK-NEXT: vstrw.32 q2, [r1, #64] +; CHECK-NEXT: vstrw.32 q2, [r1, #80] ; CHECK-NEXT: vmov.f64 d12, d0 ; CHECK-NEXT: vmov.f64 d8, d1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q6, [r1, #80] -; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q6, [r1, #64] +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov.f64 d6, d15 -; CHECK-NEXT: vstrw.32 q4, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vstrw.32 q3, [r1, #96] +; CHECK-NEXT: vstrw.32 q4, [r1, #96] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q3, [r1, #112] ; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -1010,19 +999,17 @@ define void @vst4_v2f16(ptr %src, ptr %dst) { ; CHECK-LABEL: vst4_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldm.w r0, {r2, r3, r12} -; CHECK-NEXT: vmov.32 q1[0], r12 -; CHECK-NEXT: ldr r0, [r0, #12] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldr s5, [r0, #4] +; CHECK-NEXT: vldr s4, [r0, #8] ; CHECK-NEXT: vmovx.f16 s2, s0 -; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vldr s1, [r0, #12] +; CHECK-NEXT: vmovx.f16 s6, s5 ; CHECK-NEXT: vmovx.f16 s3, s4 ; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s6, s5 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vins.f16 s4, s1 +; CHECK-NEXT: vins.f16 s0, s5 ; CHECK-NEXT: vins.f16 s3, s6 ; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vstrh.16 q0, [r1] @@ -1045,41 +1032,31 @@ define void @vst4_v4f16(ptr %src, ptr %dst) { ; CHECK-LABEL: vst4_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: add.w r6, r0, #16 -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldm r6, {r4, r5, r6} -; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 -; CHECK-NEXT: ldr r0, [r0, #28] -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r6 -; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r0 +; CHECK-NEXT: vldmia r0, {s0, s1, s2, s3, s4, s5, s6, s7} +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vins.f16 s12, s6 +; CHECK-NEXT: vmovx.f16 s11, s5 +; CHECK-NEXT: vmovx.f16 s6, s7 ; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: 
vmovx.f16 s12, s0 +; CHECK-NEXT: vins.f16 s11, s6 +; CHECK-NEXT: vmovx.f16 s6, s0 ; CHECK-NEXT: vins.f16 s0, s2 ; CHECK-NEXT: vmovx.f16 s2, s2 -; CHECK-NEXT: vmovx.f16 s11, s1 -; CHECK-NEXT: vins.f16 s12, s2 -; CHECK-NEXT: vmovx.f16 s2, s3 -; CHECK-NEXT: vins.f16 s11, s2 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vmovx.f16 s10, s1 ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s6, s7 -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vins.f16 s10, s6 -; CHECK-NEXT: vmov.f32 s9, s1 -; CHECK-NEXT: vmov.f32 s5, s0 +; CHECK-NEXT: vmovx.f16 s2, s3 +; CHECK-NEXT: vins.f16 s10, s2 +; CHECK-NEXT: vmov.f32 s8, s1 +; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vstrh.16 q2, [r1, #16] -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s12 -; CHECK-NEXT: vstrh.16 q1, [r1] -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s12 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr entry: %l1 = load <4 x half>, ptr %src, align 4 %s2 = getelementptr <4 x half>, ptr %src, i32 1 @@ -1279,16 +1256,16 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #64 ; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: vldrw.u32 q7, [r0, #80] -; CHECK-NEXT: vldrw.u32 q5, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #96] +; CHECK-NEXT: vldrw.u32 q7, [r0, #16] +; CHECK-NEXT: vldrw.u32 q5, [r0, #96] +; CHECK-NEXT: vldrw.u32 q6, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov.f64 d15, d10 -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #112] +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov.f64 d14, d12 ; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill @@ -1300,21 +1277,21 @@ ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.f64 d10, d13 ; CHECK-NEXT: vmov.f64 d2, d5 -; CHECK-NEXT: vstrw.32 q5, [r1, #32] +; CHECK-NEXT: vstrw.32 q5, [r1, #48] ; CHECK-NEXT: vmov.f64 d5, d6 -; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vstrw.32 q1, [r1, #32] ; CHECK-NEXT: vmov.f64 d13, d8 -; CHECK-NEXT: vstrw.32 q2, [r1, #64] +; CHECK-NEXT: vstrw.32 q2, [r1, #80] ; CHECK-NEXT: vmov.f64 d12, d0 ; CHECK-NEXT: vmov.f64 d8, d1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q6, [r1, #80] -; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q6, [r1, #64] +; CHECK-NEXT: vstrw.32 q0, [r1, #16] ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov.f64 d6, d15 -; CHECK-NEXT: vstrw.32 q4, [r1, #112] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vstrw.32 q3, [r1, #96] +; CHECK-NEXT: vstrw.32 q4, [r1, #96] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q3, [r1, #112] ; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll --- a/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll +++ b/llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll @@ -344,10 +344,10 @@ ; CHECK-LE-NEXT: .pad #32 ; CHECK-LE-NEXT: sub sp, #32 ; 
CHECK-LE-NEXT: vldrb.s16 q0, [r1, #8] -; CHECK-LE-NEXT: add r2, sp, #16 +; CHECK-LE-NEXT: mov r2, sp ; CHECK-LE-NEXT: vstrw.32 q0, [r2] ; CHECK-LE-NEXT: vldrb.s16 q0, [r1] -; CHECK-LE-NEXT: mov r1, sp +; CHECK-LE-NEXT: add r1, sp, #16 ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: vldrh.u32 q0, [r2, #8] ; CHECK-LE-NEXT: vstrw.32 q0, [r0, #48] @@ -365,10 +365,10 @@ ; CHECK-BE-NEXT: .pad #32 ; CHECK-BE-NEXT: sub sp, #32 ; CHECK-BE-NEXT: vldrb.s16 q0, [r1, #8] -; CHECK-BE-NEXT: add r2, sp, #16 +; CHECK-BE-NEXT: mov r2, sp ; CHECK-BE-NEXT: vstrh.16 q0, [r2] ; CHECK-BE-NEXT: vldrb.s16 q0, [r1] -; CHECK-BE-NEXT: mov r1, sp +; CHECK-BE-NEXT: add r1, sp, #16 ; CHECK-BE-NEXT: vstrh.16 q0, [r1] ; CHECK-BE-NEXT: vldrh.u32 q0, [r2, #8] ; CHECK-BE-NEXT: vstrw.32 q0, [r0, #48] @@ -410,12 +410,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q0, [r1, #32]! ; CHECK-NEXT: vldrh.s32 q1, [r1, #8] -; CHECK-NEXT: vldrh.s32 q2, [r1, #24] -; CHECK-NEXT: vldrh.s32 q3, [r1, #16] +; CHECK-NEXT: vldrh.s32 q2, [r1, #16] +; CHECK-NEXT: vldrh.s32 q3, [r1, #24] ; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: vstrw.32 q2, [r0, #48] +; CHECK-NEXT: vstrw.32 q2, [r0, #32] ; CHECK-NEXT: vstrw.32 q1, [r0, #16] -; CHECK-NEXT: vstrw.32 q3, [r0, #32] +; CHECK-NEXT: vstrw.32 q3, [r0, #48] ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll @@ -61,45 +61,40 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; CHECK-LABEL: test_srem_vec: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: and r0, r3, #1 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: rsbs r1, r0, #0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: and r1, r1, #1 +; CHECK-NEXT: mov r5, r3 +; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: movs r2, #9 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: and r0, r5, #1 -; CHECK-NEXT: mov r7, r2 -; CHECK-NEXT: rsbs r1, r0, #0 ; CHECK-NEXT: mov r4, r3 +; CHECK-NEXT: rsbs r1, r0, #0 +; CHECK-NEXT: vmov.32 d8[0], r2 ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: movs r2, #9 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: ldr r1, [sp, #44] -; CHECK-NEXT: vmov.32 d8[0], r2 -; CHECK-NEXT: ldr r0, [sp, #40] +; CHECK-NEXT: ldrd r0, r1, [sp, #32] ; CHECK-NEXT: mov r5, r3 ; CHECK-NEXT: and r1, r1, #1 -; CHECK-NEXT: mvn r2, #8 +; CHECK-NEXT: vmov.32 d9[0], r2 ; CHECK-NEXT: rsbs r1, r1, #0 +; CHECK-NEXT: mvn r2, #8 ; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: vmov.32 d9[0], r7 ; CHECK-NEXT: bl __aeabi_ldivmod ; CHECK-NEXT: vmov.32 d16[0], r2 ; CHECK-NEXT: adr r0, .LCPI3_0 -; CHECK-NEXT: vmov.32 d9[1], r4 +; CHECK-NEXT: vmov.32 d9[1], r5 ; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128] ; CHECK-NEXT: adr r0, .LCPI3_1 ; CHECK-NEXT: vmov.32 d16[1], r3 -; CHECK-NEXT: vmov.32 d8[1], r5 +; CHECK-NEXT: vmov.32 d8[1], r4 ; CHECK-NEXT: vand q8, q8, q9 ; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128] ; CHECK-NEXT: adr r0, .LCPI3_2 @@ -119,8 +114,7 @@ ; CHECK-NEXT: vmov.32 r1, d18[1] ; CHECK-NEXT: vmov.32 r2, d16[0] ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, 
r5, r6, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI3_0: diff --git a/llvm/test/CodeGen/Thumb2/thumb2-select_xform.ll b/llvm/test/CodeGen/Thumb2/thumb2-select_xform.ll --- a/llvm/test/CodeGen/Thumb2/thumb2-select_xform.ll +++ b/llvm/test/CodeGen/Thumb2/thumb2-select_xform.ll @@ -4,11 +4,11 @@ define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-LABEL: t1: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: mvn r1, #-2147483648 +; CHECK-NEXT: mvn r0, #-2147483648 ; CHECK-NEXT: cmp r2, #10 -; CHECK-NEXT: it le -; CHECK-NEXT: addle r0, r1 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt r0, #0 +; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: bx lr %tmp1 = icmp sgt i32 %c, 10 %tmp2 = select i1 %tmp1, i32 0, i32 2147483647 @@ -19,10 +19,11 @@ define i32 @t2(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-LABEL: t2: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: cmp r2, #10 -; CHECK-NEXT: it le -; CHECK-NEXT: addle.w r0, r0, #-2147483648 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: cmp r2, #11 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r0, #1 +; CHECK-NEXT: add.w r0, r1, r0, lsl #31 ; CHECK-NEXT: bx lr %tmp1 = icmp sgt i32 %c, 10 @@ -34,10 +35,11 @@ define i32 @t3(i32 %a, i32 %b, i32 %c, i32 %d) nounwind { ; CHECK-LABEL: t3: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: movs r0, #10 ; CHECK-NEXT: cmp r2, #10 -; CHECK-NEXT: it le -; CHECK-NEXT: suble r0, #10 +; CHECK-NEXT: it gt +; CHECK-NEXT: movgt r0, #0 +; CHECK-NEXT: subs r0, r1, r0 ; CHECK-NEXT: bx lr %tmp1 = icmp sgt i32 %c, 10 %tmp2 = select i1 %tmp1, i32 0, i32 10 diff --git a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll @@ -8,121 +8,121 @@ ; THUMBV7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; THUMBV7-NEXT: .pad #44 ; THUMBV7-NEXT: sub sp, #44 -; THUMBV7-NEXT: ldr.w lr, [sp, #88] -; THUMBV7-NEXT: mov r11, r0 -; THUMBV7-NEXT: ldr r4, [sp, #96] -; THUMBV7-NEXT: ldr.w r12, [sp, #80] -; THUMBV7-NEXT: umull r1, r5, r2, lr -; THUMBV7-NEXT: umull r7, r6, r3, r4 -; THUMBV7-NEXT: str r1, [sp, #40] @ 4-byte Spill -; THUMBV7-NEXT: ldr r1, [sp, #100] -; THUMBV7-NEXT: umull r4, r0, r4, r2 -; THUMBV7-NEXT: str r7, [sp, #32] @ 4-byte Spill -; THUMBV7-NEXT: umull r7, r1, r1, r2 -; THUMBV7-NEXT: str r4, [sp, #24] @ 4-byte Spill -; THUMBV7-NEXT: str r0, [sp, #12] @ 4-byte Spill -; THUMBV7-NEXT: ldr r0, [sp, #84] -; THUMBV7-NEXT: str r7, [sp, #20] @ 4-byte Spill -; THUMBV7-NEXT: ldr r7, [sp, #92] -; THUMBV7-NEXT: umull r10, r8, r0, lr -; THUMBV7-NEXT: umull r4, r9, r7, r12 -; THUMBV7-NEXT: str r4, [sp, #8] @ 4-byte Spill -; THUMBV7-NEXT: umull r4, r0, r12, lr +; THUMBV7-NEXT: ldrd lr, r5, [sp, #88] +; THUMBV7-NEXT: mov r10, r0 +; THUMBV7-NEXT: ldrd r4, r12, [sp, #96] +; THUMBV7-NEXT: umull r7, r6, r2, lr +; THUMBV7-NEXT: str r7, [sp, #40] @ 4-byte Spill +; THUMBV7-NEXT: umull r1, r7, r3, r4 +; THUMBV7-NEXT: umull r4, r11, r4, r2 +; THUMBV7-NEXT: str r1, [sp, #28] @ 4-byte Spill +; THUMBV7-NEXT: umull r0, r1, r12, r2 +; THUMBV7-NEXT: str r4, [sp, #36] @ 4-byte Spill +; THUMBV7-NEXT: str r0, [sp, #20] @ 4-byte Spill +; THUMBV7-NEXT: ldrd r12, r0, [sp, #80] +; THUMBV7-NEXT: umull r0, r8, r0, lr +; THUMBV7-NEXT: umull r4, r9, r5, r12 +; THUMBV7-NEXT: str r0, [sp, #8] @ 4-byte Spill +; THUMBV7-NEXT: str r4, [sp, #12] @ 4-byte Spill +; THUMBV7-NEXT: umull r0, r4, r12, lr 
; THUMBV7-NEXT: mov.w r12, #0 -; THUMBV7-NEXT: umlal r5, r12, r3, lr -; THUMBV7-NEXT: str r4, [sp, #16] @ 4-byte Spill -; THUMBV7-NEXT: str r0, [sp, #4] @ 4-byte Spill -; THUMBV7-NEXT: umull r4, r2, r2, r7 +; THUMBV7-NEXT: umlal r6, r12, r3, lr +; THUMBV7-NEXT: ldr.w lr, [sp, #100] +; THUMBV7-NEXT: str r4, [sp, #4] @ 4-byte Spill +; THUMBV7-NEXT: str r0, [sp, #16] @ 4-byte Spill +; THUMBV7-NEXT: umull r4, r2, r2, r5 ; THUMBV7-NEXT: ldr r0, [sp, #40] @ 4-byte Reload -; THUMBV7-NEXT: str r4, [sp, #28] @ 4-byte Spill -; THUMBV7-NEXT: str r2, [sp, #36] @ 4-byte Spill -; THUMBV7-NEXT: str.w r0, [r11] -; THUMBV7-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; THUMBV7-NEXT: str r4, [sp, #24] @ 4-byte Spill +; THUMBV7-NEXT: movs r4, #0 +; THUMBV7-NEXT: str r2, [sp, #32] @ 4-byte Spill +; THUMBV7-NEXT: str.w r0, [r10] +; THUMBV7-NEXT: ldr r0, [sp, #28] @ 4-byte Reload ; THUMBV7-NEXT: ldr r2, [sp, #20] @ 4-byte Reload ; THUMBV7-NEXT: add r2, r0 -; THUMBV7-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; THUMBV7-NEXT: adds.w lr, r0, r2 -; THUMBV7-NEXT: mov.w r2, #0 -; THUMBV7-NEXT: adc r0, r2, #0 -; THUMBV7-NEXT: str r0, [sp, #32] @ 4-byte Spill -; THUMBV7-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; THUMBV7-NEXT: add.w r4, r10, r0 +; THUMBV7-NEXT: adds.w r11, r11, r2 +; THUMBV7-NEXT: adc r0, r4, #0 +; THUMBV7-NEXT: str r0, [sp, #28] @ 4-byte Spill +; THUMBV7-NEXT: ldrd r2, r0, [sp, #8] @ 8-byte Folded Reload +; THUMBV7-NEXT: adds r4, r2, r0 ; THUMBV7-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; THUMBV7-NEXT: adds r4, r4, r0 -; THUMBV7-NEXT: adc r0, r2, #0 +; THUMBV7-NEXT: adds r2, r0, r4 +; THUMBV7-NEXT: mov.w r0, #0 +; THUMBV7-NEXT: adc r0, r0, #0 ; THUMBV7-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV7-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; THUMBV7-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; THUMBV7-NEXT: adds.w r10, r2, r0 -; THUMBV7-NEXT: mov r2, r3 -; THUMBV7-NEXT: adc.w r0, r4, lr -; THUMBV7-NEXT: ldr.w lr, [sp, #100] +; THUMBV7-NEXT: ldr r0, [sp, #36] @ 4-byte Reload +; THUMBV7-NEXT: ldr r4, [sp, #16] @ 4-byte Reload +; THUMBV7-NEXT: adds r0, r0, r4 +; THUMBV7-NEXT: str r0, [sp, #36] @ 4-byte Spill +; THUMBV7-NEXT: adc.w r0, r2, r11 ; THUMBV7-NEXT: cmp r1, #0 -; THUMBV7-NEXT: str r0, [sp, #24] @ 4-byte Spill +; THUMBV7-NEXT: str r0, [sp, #20] @ 4-byte Spill ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r1, #1 ; THUMBV7-NEXT: cmp r3, #0 +; THUMBV7-NEXT: mov r2, r3 ; THUMBV7-NEXT: mov r0, lr ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r2, #1 ; THUMBV7-NEXT: cmp.w lr, #0 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r0, #1 -; THUMBV7-NEXT: ldr r4, [sp, #28] @ 4-byte Reload +; THUMBV7-NEXT: ldr.w r11, [sp, #24] @ 4-byte Reload ; THUMBV7-NEXT: ands r0, r2 ; THUMBV7-NEXT: orrs r1, r0 -; THUMBV7-NEXT: adds r5, r5, r4 -; THUMBV7-NEXT: str.w r5, [r11, #4] -; THUMBV7-NEXT: ldr r0, [sp, #36] @ 4-byte Reload -; THUMBV7-NEXT: mov.w r5, #0 +; THUMBV7-NEXT: movs r2, #0 +; THUMBV7-NEXT: adds.w r6, r6, r11 +; THUMBV7-NEXT: str.w r6, [r10, #4] +; THUMBV7-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; THUMBV7-NEXT: ldr r4, [sp, #84] ; THUMBV7-NEXT: adcs.w r0, r0, r12 -; THUMBV7-NEXT: adc r2, r5, #0 -; THUMBV7-NEXT: cmp r6, #0 -; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r6, #1 -; THUMBV7-NEXT: orrs r1, r6 -; THUMBV7-NEXT: ldr r6, [sp, #84] -; THUMBV7-NEXT: umlal r0, r2, r3, r7 -; THUMBV7-NEXT: ldr r3, [sp, #32] @ 4-byte Reload +; THUMBV7-NEXT: mov.w r12, #0 +; THUMBV7-NEXT: adc r2, r2, #0 ; THUMBV7-NEXT: cmp r7, #0 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r7, #1 +; THUMBV7-NEXT: umlal r0, r2, r3, r5 +; THUMBV7-NEXT: ldr r3, 
[sp, #28] @ 4-byte Reload +; THUMBV7-NEXT: orrs r1, r7 ; THUMBV7-NEXT: orrs r1, r3 -; THUMBV7-NEXT: mov r3, r6 -; THUMBV7-NEXT: cmp r6, #0 +; THUMBV7-NEXT: cmp r4, #0 +; THUMBV7-NEXT: mov r3, r4 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r3, #1 +; THUMBV7-NEXT: cmp r5, #0 +; THUMBV7-NEXT: it ne +; THUMBV7-NEXT: movne r5, #1 ; THUMBV7-NEXT: cmp.w r8, #0 -; THUMBV7-NEXT: and.w r3, r3, r7 -; THUMBV7-NEXT: ldr r7, [sp, #80] +; THUMBV7-NEXT: and.w r3, r3, r5 +; THUMBV7-NEXT: ldr r5, [sp, #80] ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne.w r8, #1 ; THUMBV7-NEXT: cmp.w r9, #0 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne.w r9, #1 -; THUMBV7-NEXT: orrs r7, r6 -; THUMBV7-NEXT: ldr r6, [sp, #96] +; THUMBV7-NEXT: orrs.w r7, r5, r4 +; THUMBV7-NEXT: ldr r4, [sp, #96] ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r7, #1 ; THUMBV7-NEXT: orr.w r3, r3, r8 -; THUMBV7-NEXT: orrs.w r6, r6, lr +; THUMBV7-NEXT: orrs.w r6, r4, lr ; THUMBV7-NEXT: orr.w r3, r3, r9 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r6, #1 -; THUMBV7-NEXT: adds.w r0, r0, r10 -; THUMBV7-NEXT: str.w r0, [r11, #8] -; THUMBV7-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; THUMBV7-NEXT: ldr r5, [sp, #36] @ 4-byte Reload +; THUMBV7-NEXT: adds r0, r0, r5 +; THUMBV7-NEXT: str.w r0, [r10, #8] +; THUMBV7-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; THUMBV7-NEXT: adcs r0, r2 -; THUMBV7-NEXT: str.w r0, [r11, #12] +; THUMBV7-NEXT: str.w r0, [r10, #12] ; THUMBV7-NEXT: ldr r0, [sp, #40] @ 4-byte Reload ; THUMBV7-NEXT: and.w r2, r7, r6 ; THUMBV7-NEXT: orr.w r0, r0, r3 ; THUMBV7-NEXT: orr.w r0, r0, r2 ; THUMBV7-NEXT: orr.w r0, r0, r1 -; THUMBV7-NEXT: adc r1, r5, #0 +; THUMBV7-NEXT: adc r1, r12, #0 ; THUMBV7-NEXT: orrs r0, r1 ; THUMBV7-NEXT: and r0, r0, #1 -; THUMBV7-NEXT: strb.w r0, [r11, #16] +; THUMBV7-NEXT: strb.w r0, [r10, #16] ; THUMBV7-NEXT: add sp, #44 ; THUMBV7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} start: diff --git a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll @@ -42,13 +42,15 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; CHECK-LABEL: test_urem_odd_setne: ; CHECK: @ %bb.0: -; CHECK-NEXT: movs r1, #13 -; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: and r1, r0, #15 -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: cmp r1, #3 -; CHECK-NEXT: it hi -; CHECK-NEXT: movhi r0, #1 +; CHECK-NEXT: movs r2, #13 +; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: lsrs r1, r1, #6 +; CHECK-NEXT: orr.w r1, r1, r1, lsl #2 +; CHECK-NEXT: subs r0, r0, r1 +; CHECK-NEXT: ands r0, r0, #15 +; CHECK-NEXT: it ne +; CHECK-NEXT: movne r0, #1 ; CHECK-NEXT: bx lr %urem = urem i4 %X, 5 %cmp = icmp ne i4 %urem, 0 diff --git a/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll b/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll --- a/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll +++ b/llvm/test/CodeGen/VE/Scalar/atomic_cmp_swap.ll @@ -1440,10 +1440,10 @@ ; CHECK-NEXT: monc ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB33_4: # %bb -; CHECK-NEXT: and %s1, %s1, (32)0 ; CHECK-NEXT: ld1b.zx %s3, (, %s0) -; CHECK-NEXT: ldl.zx %s4, 8(, %s11) ; CHECK-NEXT: lea %s2, 8(, %s11) +; CHECK-NEXT: ldl.zx %s4, 8(, %s11) +; CHECK-NEXT: and %s1, %s1, (32)0 ; CHECK-NEXT: lea %s5, -256 ; CHECK-NEXT: and %s5, %s5, (32)0 ; CHECK-NEXT: and %s4, %s4, %s5 @@ -1559,10 +1559,10 @@ ; CHECK-NEXT: monc ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB35_4: # %bb -; CHECK-NEXT: and %s1, %s1, (32)0 ; CHECK-NEXT: ld1b.zx %s3, (, %s0) -; 
CHECK-NEXT: ldl.zx %s4, 8(, %s11) ; CHECK-NEXT: lea %s2, 8(, %s11) +; CHECK-NEXT: ldl.zx %s4, 8(, %s11) +; CHECK-NEXT: and %s1, %s1, (32)0 ; CHECK-NEXT: lea %s5, -256 ; CHECK-NEXT: and %s5, %s5, (32)0 ; CHECK-NEXT: and %s4, %s4, %s5 @@ -1672,10 +1672,10 @@ ; CHECK-NEXT: monc ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB37_4: # %bb -; CHECK-NEXT: and %s1, %s1, (32)0 ; CHECK-NEXT: ld2b.zx %s3, (, %s0) -; CHECK-NEXT: ldl.zx %s4, 8(, %s11) ; CHECK-NEXT: lea %s2, 8(, %s11) +; CHECK-NEXT: ldl.zx %s4, 8(, %s11) +; CHECK-NEXT: and %s1, %s1, (32)0 ; CHECK-NEXT: lea %s5, -65536 ; CHECK-NEXT: and %s5, %s5, (32)0 ; CHECK-NEXT: and %s4, %s4, %s5 @@ -2004,28 +2004,28 @@ define zeroext i1 @_Z29atomic_cmp_swap_relaxed_gv_i1Rbb(ptr nocapture nonnull align 1 dereferenceable(1) %arg, i1 zeroext %arg1) { ; CHECK-LABEL: _Z29atomic_cmp_swap_relaxed_gv_i1Rbb: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: and %s2, %s1, (32)0 -; CHECK-NEXT: lea %s1, gv_i1@lo +; CHECK-NEXT: ld1b.zx %s2, (, %s0) +; CHECK-NEXT: lea %s3, gv_i1@lo +; CHECK-NEXT: and %s3, %s3, (32)0 +; CHECK-NEXT: lea.sl %s3, gv_i1@hi(, %s3) +; CHECK-NEXT: and %s3, -4, %s3 +; CHECK-NEXT: ldl.zx %s4, (, %s3) ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s1, gv_i1@hi(, %s1) -; CHECK-NEXT: and %s1, -4, %s1 -; CHECK-NEXT: ldl.zx %s4, (, %s1) -; CHECK-NEXT: ld1b.zx %s3, (, %s0) ; CHECK-NEXT: lea %s5, -256 ; CHECK-NEXT: and %s5, %s5, (32)0 ; CHECK-NEXT: and %s4, %s4, %s5 ; CHECK-NEXT: and %s4, %s4, (32)0 +; CHECK-NEXT: or %s1, %s4, %s1 ; CHECK-NEXT: or %s2, %s4, %s2 -; CHECK-NEXT: or %s3, %s4, %s3 -; CHECK-NEXT: cas.w %s2, (%s1), %s3 -; CHECK-NEXT: cmps.w.sx %s3, %s2, %s3 -; CHECK-NEXT: or %s1, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s1, (63)0, %s3 -; CHECK-NEXT: brne.w 0, %s1, .LBB44_2 +; CHECK-NEXT: cas.w %s1, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s1, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 +; CHECK-NEXT: brne.w 0, %s2, .LBB44_2 ; CHECK-NEXT: # %bb.1: # %bb5 -; CHECK-NEXT: st1b %s2, (, %s0) +; CHECK-NEXT: st1b %s1, (, %s0) ; CHECK-NEXT: .LBB44_2: # %bb7 -; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i = zext i1 %arg1 to i8 @@ -2091,28 +2091,28 @@ define zeroext i8 @_Z29atomic_cmp_swap_relaxed_gv_u8Rhh(ptr nocapture nonnull align 1 dereferenceable(1) %arg, i8 zeroext %arg1) { ; CHECK-LABEL: _Z29atomic_cmp_swap_relaxed_gv_u8Rhh: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: and %s2, %s1, (32)0 -; CHECK-NEXT: lea %s1, gv_u8@lo +; CHECK-NEXT: ld1b.zx %s2, (, %s0) +; CHECK-NEXT: lea %s3, gv_u8@lo +; CHECK-NEXT: and %s3, %s3, (32)0 +; CHECK-NEXT: lea.sl %s3, gv_u8@hi(, %s3) +; CHECK-NEXT: and %s3, -4, %s3 +; CHECK-NEXT: ldl.zx %s4, (, %s3) ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s1, gv_u8@hi(, %s1) -; CHECK-NEXT: and %s1, -4, %s1 -; CHECK-NEXT: ldl.zx %s4, (, %s1) -; CHECK-NEXT: ld1b.zx %s3, (, %s0) ; CHECK-NEXT: lea %s5, -256 ; CHECK-NEXT: and %s5, %s5, (32)0 ; CHECK-NEXT: and %s4, %s4, %s5 ; CHECK-NEXT: and %s4, %s4, (32)0 +; CHECK-NEXT: or %s1, %s4, %s1 ; CHECK-NEXT: or %s2, %s4, %s2 -; CHECK-NEXT: or %s3, %s4, %s3 -; CHECK-NEXT: cas.w %s2, (%s1), %s3 -; CHECK-NEXT: cmps.w.sx %s3, %s2, %s3 -; CHECK-NEXT: or %s1, 0, (0)1 -; CHECK-NEXT: cmov.w.eq %s1, (63)0, %s3 -; CHECK-NEXT: brne.w 0, %s1, .LBB46_2 +; CHECK-NEXT: cas.w %s1, (%s3), %s2 +; CHECK-NEXT: cmps.w.sx %s3, %s1, %s2 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s3 +; CHECK-NEXT: brne.w 0, %s2, .LBB46_2 ; CHECK-NEXT: # %bb.1: # %bb4 -; CHECK-NEXT: st1b %s2, (, %s0) 
+; CHECK-NEXT: st1b %s1, (, %s0) ; CHECK-NEXT: .LBB46_2: # %bb6 -; CHECK-NEXT: adds.w.zx %s0, %s1, (0)1 +; CHECK-NEXT: adds.w.zx %s0, %s2, (0)1 ; CHECK-NEXT: b.l.t (, %s10) bb: %i = load i8, ptr %arg, align 1 diff --git a/llvm/test/CodeGen/VE/Scalar/atomic_swap.ll b/llvm/test/CodeGen/VE/Scalar/atomic_swap.ll --- a/llvm/test/CodeGen/VE/Scalar/atomic_swap.ll +++ b/llvm/test/CodeGen/VE/Scalar/atomic_swap.ll @@ -837,7 +837,6 @@ ; CHECK-NEXT: monc ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB33_2: -; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: or %s1, 1, (0)1 ; CHECK-NEXT: lea %s2, 8(, %s11) ; CHECK-NEXT: ts1am.w %s0, (%s2), %s1 @@ -876,7 +875,6 @@ ; CHECK-NEXT: monc ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB34_2: -; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: or %s1, 1, (0)1 ; CHECK-NEXT: lea %s2, 8(, %s11) ; CHECK-NEXT: ts1am.w %s0, (%s2), %s1 @@ -907,7 +905,6 @@ ; CHECK-NEXT: monc ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB35_2: -; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: or %s1, 1, (0)1 ; CHECK-NEXT: lea %s2, 8(, %s11) ; CHECK-NEXT: ts1am.w %s0, (%s2), %s1 @@ -937,7 +934,6 @@ ; CHECK-NEXT: monc ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB36_2: -; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: or %s1, 3, (0)1 ; CHECK-NEXT: lea %s2, 8(, %s11) ; CHECK-NEXT: ts1am.w %s0, (%s2), %s1 @@ -968,7 +964,6 @@ ; CHECK-NEXT: monc ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB37_2: -; CHECK-NEXT: and %s0, %s0, (32)0 ; CHECK-NEXT: or %s1, 3, (0)1 ; CHECK-NEXT: lea %s2, 8(, %s11) ; CHECK-NEXT: ts1am.w %s0, (%s2), %s1 diff --git a/llvm/test/CodeGen/VE/Scalar/br_cc.ll b/llvm/test/CodeGen/VE/Scalar/br_cc.ll --- a/llvm/test/CodeGen/VE/Scalar/br_cc.ll +++ b/llvm/test/CodeGen/VE/Scalar/br_cc.ll @@ -529,7 +529,8 @@ ; CHECK-NEXT: cmpu.l %s0, %s0, (58)0 ; CHECK-NEXT: cmov.l.gt %s2, (63)0, %s0 ; CHECK-NEXT: cmov.l.eq %s4, %s2, %s1 -; CHECK-NEXT: brne.w 0, %s4, .LBB23_2 +; CHECK-NEXT: and %s0, 1, %s4 +; CHECK-NEXT: brne.w 0, %s0, .LBB23_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -558,7 +559,8 @@ ; CHECK-NEXT: cmpu.l %s0, %s0, (58)0 ; CHECK-NEXT: cmov.l.gt %s2, (63)0, %s0 ; CHECK-NEXT: cmov.l.eq %s4, %s2, %s1 -; CHECK-NEXT: brne.w 0, %s4, .LBB24_2 +; CHECK-NEXT: and %s0, 1, %s4 +; CHECK-NEXT: brne.w 0, %s0, .LBB24_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -652,7 +654,7 @@ define void @br_cc_imm_i1(i1 zeroext %0) { ; CHECK-LABEL: br_cc_imm_i1: ; CHECK: # %bb.0: -; CHECK-NEXT: breq.w 0, %s0, .LBB28_2 +; CHECK-NEXT: breq.l 0, %s0, .LBB28_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -862,7 +864,8 @@ ; CHECK-NEXT: cmov.l.lt %s3, (63)0, %s0 ; CHECK-NEXT: cmpu.l %s0, %s1, (0)0 ; CHECK-NEXT: cmov.l.eq %s4, %s3, %s0 -; CHECK-NEXT: brne.w 0, %s4, .LBB37_2 +; CHECK-NEXT: and %s0, 1, %s4 +; CHECK-NEXT: brne.w 0, %s0, .LBB37_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: #APP ; CHECK-NEXT: nop @@ -892,7 +895,8 @@ ; CHECK-NEXT: cmov.l.lt %s3, (63)0, %s0 ; CHECK-NEXT: cmpu.l %s0, %s1, (0)0 ; CHECK-NEXT: cmov.l.eq %s4, %s3, %s0 -; CHECK-NEXT: brne.w 0, %s4, .LBB38_2 +; CHECK-NEXT: and %s0, 1, %s4 +; CHECK-NEXT: brne.w 0, %s0, .LBB38_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: #APP ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/VE/Scalar/brcond.ll b/llvm/test/CodeGen/VE/Scalar/brcond.ll --- a/llvm/test/CodeGen/VE/Scalar/brcond.ll +++ b/llvm/test/CodeGen/VE/Scalar/brcond.ll @@ -5,7 +5,7 @@ define void @brcond_then(i1 zeroext %0) { ; CHECK-LABEL: brcond_then: ; CHECK: # %bb.0: -; CHECK-NEXT: breq.w 0, %s0, .LBB0_2 +; CHECK-NEXT: breq.l 0, %s0, .LBB0_2 ; 
CHECK-NEXT: # %bb.1: ; CHECK-NEXT: #APP ; CHECK-NEXT: nop diff --git a/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll b/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll --- a/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll +++ b/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll @@ -205,7 +205,8 @@ ; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s0) ; CHECK-NEXT: or %s0, 0, %s19 ; CHECK-NEXT: bsic %s10, (, %s12) -; CHECK-NEXT: st2b %s19, (, %s18) +; CHECK-NEXT: and %s1, %s19, (32)0 +; CHECK-NEXT: st2b %s1, (, %s18) ; CHECK-NEXT: ld %s19, 296(, %s11) # 8-byte Folded Reload ; CHECK-NEXT: ld %s18, 288(, %s11) # 8-byte Folded Reload ; CHECK-NEXT: or %s11, 0, %s9 diff --git a/llvm/test/CodeGen/VE/Scalar/function_prologue_epilogue.ll b/llvm/test/CodeGen/VE/Scalar/function_prologue_epilogue.ll --- a/llvm/test/CodeGen/VE/Scalar/function_prologue_epilogue.ll +++ b/llvm/test/CodeGen/VE/Scalar/function_prologue_epilogue.ll @@ -62,7 +62,9 @@ ; CHECK-NEXT: monc ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: and %s1, %s0, (32)0 ; CHECK-NEXT: stl %s0, 12(, %s11) +; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: adds.l %s11, 16, %s11 ; CHECK-NEXT: b.l.t (, %s10) ; @@ -80,7 +82,9 @@ ; PIC-NEXT: monc ; PIC-NEXT: or %s0, 0, %s62 ; PIC-NEXT: .LBB2_2: +; PIC-NEXT: and %s1, %s0, (32)0 ; PIC-NEXT: stl %s0, 12(, %s11) +; PIC-NEXT: or %s0, 0, %s1 ; PIC-NEXT: adds.l %s11, 16, %s11 ; PIC-NEXT: b.l.t (, %s10) %2 = alloca i32, align 4 diff --git a/llvm/test/CodeGen/WebAssembly/pr59626.ll b/llvm/test/CodeGen/WebAssembly/pr59626.ll --- a/llvm/test/CodeGen/WebAssembly/pr59626.ll +++ b/llvm/test/CodeGen/WebAssembly/pr59626.ll @@ -12,20 +12,11 @@ ; CHECK-32-NEXT: local.get 0 ; CHECK-32-NEXT: i32.const 0 ; CHECK-32-NEXT: i32.store16 0 -; CHECK-32-NEXT: local.get 1 ; CHECK-32-NEXT: i32.const 0 -; CHECK-32-NEXT: i32.store8 2 -; CHECK-32-NEXT: local.get 1 -; CHECK-32-NEXT: local.get 0 -; CHECK-32-NEXT: i8x16.splat -; CHECK-32-NEXT: v128.store16_lane 0, 0 -; CHECK-32-NEXT: v128.const 0, 0 -; CHECK-32-NEXT: i32x4.extract_lane 0 ; CHECK-32-NEXT: # fallthrough-return ; ; CHECK-64-LABEL: f: ; CHECK-64: .functype f (i64, i64) -> (i32) -; CHECK-64-NEXT: .local i32 ; CHECK-64-NEXT: # %bb.0: # %BB ; CHECK-64-NEXT: local.get 0 ; CHECK-64-NEXT: i32.const 0 @@ -33,12 +24,7 @@ ; CHECK-64-NEXT: local.get 0 ; CHECK-64-NEXT: i32.const 0 ; CHECK-64-NEXT: i32.store16 0 -; CHECK-64-NEXT: local.get 1 -; CHECK-64-NEXT: local.get 2 -; CHECK-64-NEXT: i8x16.splat -; CHECK-64-NEXT: v128.store16_lane 0, 0 -; CHECK-64-NEXT: v128.const 0, 0 -; CHECK-64-NEXT: i32x4.extract_lane 0 +; CHECK-64-NEXT: i32.const 0 ; CHECK-64-NEXT: # fallthrough-return BB: store <3 x i8> zeroinitializer, ptr %0 diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll --- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll @@ -11,7 +11,7 @@ ; CHECK-LABEL: same_const_one_replaced_i16x8: ; CHECK: .functype same_const_one_replaced_i16x8 (i32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const $push0=, 42, 42, 42, 42, 42, 0, 42, 42 +; CHECK-NEXT: v128.const $push0=, 42, 42, 42, 42, 42, 42, 42, 42 ; CHECK-NEXT: i16x8.replace_lane $push1=, $pop0, 5, $0 ; CHECK-NEXT: return $pop1 %v = insertelement @@ -39,7 +39,7 @@ ; CHECK-LABEL: same_const_one_replaced_f32x4: ; CHECK: .functype same_const_one_replaced_f32x4 (f32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: v128.const $push0=, 0x1.5p5, 0x1.5p5, 0x0p0, 0x1.5p5 +; CHECK-NEXT: 
v128.const $push0=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5 ; CHECK-NEXT: f32x4.replace_lane $push1=, $pop0, 2, $0 ; CHECK-NEXT: return $pop1 %v = insertelement @@ -76,12 +76,15 @@ ; CHECK-LABEL: splat_common_arg_i16x8: ; CHECK: .functype splat_common_arg_i16x8 (i32, i32, i32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i16x8.splat $push0=, $2 -; CHECK-NEXT: i16x8.replace_lane $push1=, $pop0, 0, $1 +; CHECK-NEXT: i16x8.splat $push0=, $1 +; CHECK-NEXT: i16x8.replace_lane $push1=, $pop0, 1, $2 ; CHECK-NEXT: i16x8.replace_lane $push2=, $pop1, 2, $0 -; CHECK-NEXT: i16x8.replace_lane $push3=, $pop2, 4, $1 -; CHECK-NEXT: i16x8.replace_lane $push4=, $pop3, 7, $1 -; CHECK-NEXT: return $pop4 +; CHECK-NEXT: i16x8.replace_lane $push3=, $pop2, 3, $2 +; CHECK-NEXT: i16x8.replace_lane $push4=, $pop3, 4, $1 +; CHECK-NEXT: i16x8.replace_lane $push5=, $pop4, 5, $2 +; CHECK-NEXT: i16x8.replace_lane $push6=, $pop5, 6, $2 +; CHECK-NEXT: i16x8.replace_lane $push7=, $pop6, 7, $1 +; CHECK-NEXT: return $pop7 %v0 = insertelement <8 x i16> undef, i16 %b, i32 0 %v1 = insertelement <8 x i16> %v0, i16 %c, i32 1 %v2 = insertelement <8 x i16> %v1, i16 %a, i32 2 @@ -283,12 +286,11 @@ ; CHECK-LABEL: half_shuffle_i32x4: ; CHECK: .functype half_shuffle_i32x4 (v128) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i8x16.shuffle $push0=, $0, $0, 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK-NEXT: i32.const $push1=, 0 -; CHECK-NEXT: i32x4.replace_lane $push2=, $pop0, 0, $pop1 -; CHECK-NEXT: i32.const $push3=, 3 -; CHECK-NEXT: i32x4.replace_lane $push4=, $pop2, 3, $pop3 -; CHECK-NEXT: return $pop4 +; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0 +; CHECK-NEXT: i8x16.shuffle $push1=, $0, $pop0, 16, 17, 18, 19, 8, 9, 10, 11, 0, 1, 2, 3, 28, 29, 30, 31 +; CHECK-NEXT: i32.const $push2=, 3 +; CHECK-NEXT: i32x4.replace_lane $push3=, $pop1, 3, $pop2 +; CHECK-NEXT: return $pop3 %s0 = extractelement <4 x i32> %src, i32 0 %s2 = extractelement <4 x i32> %src, i32 2 %v0 = insertelement <4 x i32> undef, i32 0, i32 0 @@ -303,25 +305,28 @@ ; CHECK-LABEL: mashup_swizzle_i8x16: ; CHECK: .functype mashup_swizzle_i8x16 (v128, v128, i32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: global.get $push12=, __stack_pointer -; CHECK-NEXT: i32.const $push13=, 16 -; CHECK-NEXT: i32.sub $push16=, $pop12, $pop13 -; CHECK-NEXT: local.tee $push15=, $3=, $pop16 -; CHECK-NEXT: v128.store 0($pop15), $0 -; CHECK-NEXT: i8x16.extract_lane_u $push7=, $1, 7 +; CHECK-NEXT: global.get $push14=, __stack_pointer +; CHECK-NEXT: i32.const $push15=, 16 +; CHECK-NEXT: i32.sub $push19=, $pop14, $pop15 +; CHECK-NEXT: local.tee $push18=, $3=, $pop19 +; CHECK-NEXT: v128.store 0($pop18), $0 +; CHECK-NEXT: i8x16.extract_lane_u $push8=, $1, 7 ; CHECK-NEXT: i32.const $push1=, 15 -; CHECK-NEXT: i32.and $push8=, $pop7, $pop1 -; CHECK-NEXT: i32.or $push9=, $3, $pop8 +; CHECK-NEXT: i32.and $push9=, $pop8, $pop1 +; CHECK-NEXT: i32.or $push10=, $3, $pop9 ; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 0 -; CHECK-NEXT: i32.const $push14=, 15 -; CHECK-NEXT: i32.and $push2=, $pop0, $pop14 +; CHECK-NEXT: i32.const $push17=, 15 +; CHECK-NEXT: i32.and $push2=, $pop0, $pop17 ; CHECK-NEXT: i32.or $push3=, $3, $pop2 -; CHECK-NEXT: v128.const $push4=, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 0 -; CHECK-NEXT: v128.load8_lane $push5=, 0($pop3), $pop4, 0 -; CHECK-NEXT: i8x16.replace_lane $push6=, $pop5, 3, $2 -; CHECK-NEXT: v128.load8_lane $push10=, 0($pop9), $pop6, 7 -; CHECK-NEXT: i8x16.replace_lane $push11=, $pop10, 12, $2 -; CHECK-NEXT: return $pop11 +; CHECK-NEXT: v128.load8_splat 
$push4=, 0($pop3) +; CHECK-NEXT: i8x16.replace_lane $push5=, $pop4, 3, $2 +; CHECK-NEXT: i32.const $push6=, 42 +; CHECK-NEXT: i8x16.replace_lane $push7=, $pop5, 4, $pop6 +; CHECK-NEXT: v128.load8_lane $push11=, 0($pop10), $pop7, 7 +; CHECK-NEXT: i8x16.replace_lane $push12=, $pop11, 12, $2 +; CHECK-NEXT: i32.const $push16=, 42 +; CHECK-NEXT: i8x16.replace_lane $push13=, $pop12, 14, $pop16 +; CHECK-NEXT: return $pop13 %m0 = extractelement <16 x i8> %mask, i32 0 %s0 = extractelement <16 x i8> %src, i8 %m0 %v0 = insertelement <16 x i8> undef, i8 %s0, i32 0 @@ -345,20 +350,23 @@ ; CHECK-LABEL: mashup_const_i8x16: ; CHECK: .functype mashup_const_i8x16 (v128, v128, i32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: global.get $push8=, __stack_pointer -; CHECK-NEXT: i32.const $push9=, 16 -; CHECK-NEXT: i32.sub $push11=, $pop8, $pop9 -; CHECK-NEXT: local.tee $push10=, $3=, $pop11 -; CHECK-NEXT: v128.store 0($pop10), $0 +; CHECK-NEXT: global.get $push10=, __stack_pointer +; CHECK-NEXT: i32.const $push11=, 16 +; CHECK-NEXT: i32.sub $push14=, $pop10, $pop11 +; CHECK-NEXT: local.tee $push13=, $3=, $pop14 +; CHECK-NEXT: v128.store 0($pop13), $0 ; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 0 ; CHECK-NEXT: i32.const $push1=, 15 ; CHECK-NEXT: i32.and $push2=, $pop0, $pop1 ; CHECK-NEXT: i32.or $push3=, $3, $pop2 -; CHECK-NEXT: v128.const $push4=, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 0 -; CHECK-NEXT: v128.load8_lane $push5=, 0($pop3), $pop4, 0 -; CHECK-NEXT: i8x16.replace_lane $push6=, $pop5, 3, $2 -; CHECK-NEXT: i8x16.replace_lane $push7=, $pop6, 12, $2 -; CHECK-NEXT: return $pop7 +; CHECK-NEXT: v128.load8_splat $push4=, 0($pop3) +; CHECK-NEXT: i8x16.replace_lane $push5=, $pop4, 3, $2 +; CHECK-NEXT: i32.const $push6=, 42 +; CHECK-NEXT: i8x16.replace_lane $push7=, $pop5, 4, $pop6 +; CHECK-NEXT: i8x16.replace_lane $push8=, $pop7, 12, $2 +; CHECK-NEXT: i32.const $push12=, 42 +; CHECK-NEXT: i8x16.replace_lane $push9=, $pop8, 14, $pop12 +; CHECK-NEXT: return $pop9 %m0 = extractelement <16 x i8> %mask, i32 0 %s0 = extractelement <16 x i8> %src, i8 %m0 %v0 = insertelement <16 x i8> undef, i8 %s0, i32 0 @@ -378,20 +386,21 @@ ; CHECK-LABEL: mashup_splat_i8x16: ; CHECK: .functype mashup_splat_i8x16 (v128, v128, i32) -> (v128) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: global.get $push8=, __stack_pointer -; CHECK-NEXT: i32.const $push9=, 16 -; CHECK-NEXT: i32.sub $push11=, $pop8, $pop9 -; CHECK-NEXT: local.tee $push10=, $3=, $pop11 -; CHECK-NEXT: v128.store 0($pop10), $0 +; CHECK-NEXT: global.get $push9=, __stack_pointer +; CHECK-NEXT: i32.const $push10=, 16 +; CHECK-NEXT: i32.sub $push12=, $pop9, $pop10 +; CHECK-NEXT: local.tee $push11=, $3=, $pop12 +; CHECK-NEXT: v128.store 0($pop11), $0 ; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 0 ; CHECK-NEXT: i32.const $push1=, 15 ; CHECK-NEXT: i32.and $push2=, $pop0, $pop1 ; CHECK-NEXT: i32.or $push3=, $3, $pop2 -; CHECK-NEXT: i8x16.splat $push4=, $2 -; CHECK-NEXT: v128.load8_lane $push5=, 0($pop3), $pop4, 0 +; CHECK-NEXT: v128.load8_splat $push4=, 0($pop3) +; CHECK-NEXT: i8x16.replace_lane $push5=, $pop4, 3, $2 ; CHECK-NEXT: i32.const $push6=, 42 ; CHECK-NEXT: i8x16.replace_lane $push7=, $pop5, 4, $pop6 -; CHECK-NEXT: return $pop7 +; CHECK-NEXT: i8x16.replace_lane $push8=, $pop7, 12, $2 +; CHECK-NEXT: return $pop8 %m0 = extractelement <16 x i8> %mask, i32 0 %s0 = extractelement <16 x i8> %src, i8 %m0 %v0 = insertelement <16 x i8> undef, i8 %s0, i32 0 diff --git a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll 
--- a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll @@ -210,8 +210,15 @@ ; CHECK-LABEL: demote_zero_v4f32: ; CHECK: .functype demote_zero_v4f32 (v128) -> (v128) ; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.const 0x0p0, 0x0p0, 0x0p0, 0x0p0 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: f32x4.demote_f64x2_zero +; CHECK-NEXT: f64x2.extract_lane 0 +; CHECK-NEXT: f32.demote_f64 +; CHECK-NEXT: f32x4.replace_lane 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: f64x2.extract_lane 1 +; CHECK-NEXT: f32.demote_f64 +; CHECK-NEXT: f32x4.replace_lane 1 ; CHECK-NEXT: # fallthrough-return %v = shufflevector <2 x double> %x, <2 x double> zeroinitializer, <4 x i32> diff --git a/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll b/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll --- a/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll @@ -26,10 +26,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 15 ; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push4=, 15 -; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: v128.any_true $push3=, $pop2 -; CHECK-NEXT: return $pop3 +; CHECK-NEXT: i32.const $push6=, 15 +; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i16x8.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 0 +; CHECK-NEXT: i32.ne $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %ret = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x) ret i1 %ret } @@ -40,10 +42,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 15 ; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push4=, 15 -; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: i16x8.all_true $push3=, $pop2 -; CHECK-NEXT: return $pop3 +; CHECK-NEXT: i32.const $push6=, 15 +; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i16x8.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 255 +; CHECK-NEXT: i32.eq $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %ret = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x) ret i1 %ret } @@ -54,12 +58,11 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 15 ; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push6=, 15 -; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop6 -; CHECK-NEXT: v128.any_true $push3=, $pop2 -; CHECK-NEXT: i32.const $push4=, 1 -; CHECK-NEXT: i32.xor $push5=, $pop3, $pop4 -; CHECK-NEXT: return $pop5 +; CHECK-NEXT: i32.const $push5=, 15 +; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop5 +; CHECK-NEXT: i16x8.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.eqz $push4=, $pop3 +; CHECK-NEXT: return $pop4 %any = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x) %none = xor i1 %any, 1 ret i1 %none @@ -73,9 +76,9 @@ ; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 ; CHECK-NEXT: i32.const $push6=, 15 ; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop6 -; CHECK-NEXT: i16x8.all_true $push3=, $pop2 -; CHECK-NEXT: i32.const $push4=, 1 -; CHECK-NEXT: i32.xor $push5=, $pop3, $pop4 +; CHECK-NEXT: i16x8.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 255 +; CHECK-NEXT: i32.ne $push5=, $pop3, $pop4 ; CHECK-NEXT: return $pop5 %all = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x) %notall = xor i1 %all, 1 @@ -88,10 +91,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 7 ; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push4=, 7 -; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: v128.any_true $push3=, $pop2 -; CHECK-NEXT: return 
$pop3 +; CHECK-NEXT: i32.const $push6=, 7 +; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i8x16.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 0 +; CHECK-NEXT: i32.ne $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %ret = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %x) ret i1 %ret } @@ -102,10 +107,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 7 ; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push4=, 7 -; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: i8x16.all_true $push3=, $pop2 -; CHECK-NEXT: return $pop3 +; CHECK-NEXT: i32.const $push6=, 7 +; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i8x16.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 65535 +; CHECK-NEXT: i32.eq $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %ret = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %x) ret i1 %ret } @@ -120,10 +127,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 7 ; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push4=, 7 -; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: v128.any_true $push3=, $pop2 -; CHECK-NEXT: return $pop3 +; CHECK-NEXT: i32.const $push6=, 7 +; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i8x16.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 0 +; CHECK-NEXT: i32.ne $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %bits = trunc <16 x i8> %x to <16 x i1> %ret = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %bits) ret i1 %ret @@ -135,10 +144,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 7 ; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push4=, 7 -; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: i8x16.all_true $push3=, $pop2 -; CHECK-NEXT: return $pop3 +; CHECK-NEXT: i32.const $push6=, 7 +; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i8x16.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 65535 +; CHECK-NEXT: i32.eq $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %bits = trunc <16 x i8> %x to <16 x i1> %ret = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %bits) ret i1 %ret @@ -150,10 +161,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 15 ; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push4=, 15 -; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: v128.any_true $push3=, $pop2 -; CHECK-NEXT: return $pop3 +; CHECK-NEXT: i32.const $push6=, 15 +; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i16x8.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 0 +; CHECK-NEXT: i32.ne $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %bits = trunc <8 x i16> %x to <8 x i1> %ret = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %bits) ret i1 %ret @@ -165,10 +178,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 15 ; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push4=, 15 -; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: i16x8.all_true $push3=, $pop2 -; CHECK-NEXT: return $pop3 +; CHECK-NEXT: i32.const $push6=, 15 +; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i16x8.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 255 +; CHECK-NEXT: i32.eq $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %bits = trunc <8 x i16> %x to <8 x i1> %ret = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %bits) ret i1 %ret @@ -180,10 +195,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 31 ; CHECK-NEXT: i32x4.shl $push1=, 
$0, $pop0 -; CHECK-NEXT: i32.const $push4=, 31 -; CHECK-NEXT: i32x4.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: v128.any_true $push3=, $pop2 -; CHECK-NEXT: return $pop3 +; CHECK-NEXT: i32.const $push6=, 31 +; CHECK-NEXT: i32x4.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i32x4.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 0 +; CHECK-NEXT: i32.ne $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %bits = trunc <4 x i32> %x to <4 x i1> %ret = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %bits) ret i1 %ret @@ -195,10 +212,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 31 ; CHECK-NEXT: i32x4.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push4=, 31 -; CHECK-NEXT: i32x4.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: i32x4.all_true $push3=, $pop2 -; CHECK-NEXT: return $pop3 +; CHECK-NEXT: i32.const $push6=, 31 +; CHECK-NEXT: i32x4.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i32x4.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 15 +; CHECK-NEXT: i32.eq $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %bits = trunc <4 x i32> %x to <4 x i1> %ret = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %bits) ret i1 %ret @@ -210,10 +229,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 63 ; CHECK-NEXT: i64x2.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push4=, 63 -; CHECK-NEXT: i64x2.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: v128.any_true $push3=, $pop2 -; CHECK-NEXT: return $pop3 +; CHECK-NEXT: i32.const $push6=, 63 +; CHECK-NEXT: i64x2.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i64x2.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 0 +; CHECK-NEXT: i32.ne $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %bits = trunc <2 x i64> %x to <2 x i1> %ret = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %bits) ret i1 %ret @@ -225,10 +246,12 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: i32.const $push0=, 63 ; CHECK-NEXT: i64x2.shl $push1=, $0, $pop0 -; CHECK-NEXT: i32.const $push4=, 63 -; CHECK-NEXT: i64x2.shr_s $push2=, $pop1, $pop4 -; CHECK-NEXT: i64x2.all_true $push3=, $pop2 -; CHECK-NEXT: return $pop3 +; CHECK-NEXT: i32.const $push6=, 63 +; CHECK-NEXT: i64x2.shr_s $push2=, $pop1, $pop6 +; CHECK-NEXT: i64x2.bitmask $push3=, $pop2 +; CHECK-NEXT: i32.const $push4=, 3 +; CHECK-NEXT: i32.eq $push5=, $pop3, $pop4 +; CHECK-NEXT: return $pop5 %bits = trunc <2 x i64> %x to <2 x i1> %ret = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %bits) ret i1 %ret @@ -277,10 +300,12 @@ ; CHECK-NEXT: i8x16.shuffle $push0=, $0, $0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 ; CHECK-NEXT: i32.const $push1=, 15 ; CHECK-NEXT: i16x8.shl $push2=, $pop0, $pop1 -; CHECK-NEXT: i32.const $push5=, 15 -; CHECK-NEXT: i16x8.shr_s $push3=, $pop2, $pop5 -; CHECK-NEXT: v128.any_true $push4=, $pop3 -; CHECK-NEXT: return $pop4 +; CHECK-NEXT: i32.const $push7=, 15 +; CHECK-NEXT: i16x8.shr_s $push3=, $pop2, $pop7 +; CHECK-NEXT: i16x8.bitmask $push4=, $pop3 +; CHECK-NEXT: i32.const $push5=, 0 +; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5 +; CHECK-NEXT: return $pop6 %bits = trunc <8 x i8> %x to <8 x i1> %ret = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %bits) ret i1 %ret @@ -293,10 +318,12 @@ ; CHECK-NEXT: i8x16.shuffle $push0=, $0, $0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 ; CHECK-NEXT: i32.const $push1=, 15 ; CHECK-NEXT: i16x8.shl $push2=, $pop0, $pop1 -; CHECK-NEXT: i32.const $push5=, 15 -; CHECK-NEXT: i16x8.shr_s $push3=, $pop2, $pop5 -; CHECK-NEXT: i16x8.all_true $push4=, $pop3 -; CHECK-NEXT: return $pop4 +; CHECK-NEXT: i32.const $push7=, 15 +; CHECK-NEXT: i16x8.shr_s $push3=, $pop2, $pop7 
+; CHECK-NEXT: i16x8.bitmask $push4=, $pop3 +; CHECK-NEXT: i32.const $push5=, 255 +; CHECK-NEXT: i32.eq $push6=, $pop4, $pop5 +; CHECK-NEXT: return $pop6 %bits = trunc <8 x i8> %x to <8 x i1> %ret = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %bits) ret i1 %ret @@ -312,8 +339,10 @@ ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; CHECK-NEXT: i8x16.eq $push1=, $0, $pop0 -; CHECK-NEXT: v128.any_true $push2=, $pop1 -; CHECK-NEXT: return $pop2 +; CHECK-NEXT: i8x16.bitmask $push2=, $pop1 +; CHECK-NEXT: i32.const $push3=, 0 +; CHECK-NEXT: i32.ne $push4=, $pop2, $pop3 +; CHECK-NEXT: return $pop4 %zero = icmp eq <16 x i8> %x, zeroinitializer %ret = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %zero) ret i1 %ret diff --git a/llvm/test/CodeGen/WebAssembly/xor_reassociate.ll b/llvm/test/CodeGen/WebAssembly/xor_reassociate.ll --- a/llvm/test/CodeGen/WebAssembly/xor_reassociate.ll +++ b/llvm/test/CodeGen/WebAssembly/xor_reassociate.ll @@ -17,7 +17,7 @@ ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: f32.const 0x1p-23 ; CHECK-NEXT: f32.gt -; CHECK-NEXT: i32.ne +; CHECK-NEXT: i32.xor ; CHECK-NEXT: br_if 0 # 0: down to label0 ; CHECK-NEXT: # %bb.1: # %if.then.i ; CHECK-NEXT: i32.const 0 diff --git a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll --- a/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll +++ b/llvm/test/CodeGen/X86/2007-05-15-maskmovq.ll @@ -12,14 +12,14 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, (%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movq (%esp), %mm0 -; CHECK-NEXT: movq {{[0-9]+}}(%esp), %mm1 +; CHECK-NEXT: movq {{[0-9]+}}(%esp), %mm0 +; CHECK-NEXT: movq (%esp), %mm1 ; CHECK-NEXT: maskmovq %mm0, %mm1 ; CHECK-NEXT: addl $16, %esp ; CHECK-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll b/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll --- a/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll +++ b/llvm/test/CodeGen/X86/2008-06-13-VolatileLoadStore.ll @@ -10,11 +10,13 @@ ; CHECK-LABEL: f: ; CHECK: # %bb.0: ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd %xmm1, atomic -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: movsd %xmm1, atomic2 -; CHECK-NEXT: movsd %xmm0, anything +; CHECK-NEXT: movsd %xmm0, atomic +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movsd %xmm0, atomic2 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, anything+4 +; CHECK-NEXT: movl %eax, anything ; CHECK-NEXT: movl ioport, %ecx ; CHECK-NEXT: movl ioport, %eax ; CHECK-NEXT: shrl $16, %eax diff --git a/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll b/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll --- a/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll +++ b/llvm/test/CodeGen/X86/2008-12-02-dagcombine-1.ll @@ -8,8 +8,11 @@ ; CHECK-LABEL: test: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: subl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl $-2, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: 
addl %ecx, %edx +; CHECK-NEXT: subl %edx, %eax +; CHECK-NEXT: leal -2(%eax,%ecx), %eax ; CHECK-NEXT: retl entry: %0 = ptrtoint ptr %a to i32 diff --git a/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll b/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll --- a/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll +++ b/llvm/test/CodeGen/X86/2009-04-29-IndirectDestOperands.ll @@ -16,9 +16,9 @@ ; CHECK-NEXT: ## InlineAsm Start ; CHECK-NEXT: cpuid ; CHECK-NEXT: ## InlineAsm End -; CHECK-NEXT: movl %ebx, 8(%esi) -; CHECK-NEXT: movl %ecx, 12(%esi) ; CHECK-NEXT: movl %edx, 16(%esi) +; CHECK-NEXT: movl %ecx, 12(%esi) +; CHECK-NEXT: movl %ebx, 8(%esi) ; CHECK-NEXT: movl %eax, 4(%esi) ; CHECK-NEXT: popl %esi ; CHECK-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll --- a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll +++ b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll @@ -9,9 +9,8 @@ ; CHECK-NEXT: movl (%rdi,%rdx,4), %edx ; CHECK-NEXT: movzbl %dl, %r10d ; CHECK-NEXT: addl $4, %r10d -; CHECK-NEXT: shrq $6, %rdx -; CHECK-NEXT: andl $67108860, %edx # imm = 0x3FFFFFC -; CHECK-NEXT: movl (%rdi,%rdx), %edx +; CHECK-NEXT: shrq $8, %rdx +; CHECK-NEXT: movl (%rdi,%rdx,4), %edx ; CHECK-NEXT: movzbl %dl, %edi ; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: addl $5, %esi diff --git a/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll b/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll --- a/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll +++ b/llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll @@ -17,17 +17,17 @@ ; CHECK-NEXT: movq %rdx, (%rsp) ; CHECK-NEXT: movq 24(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 16(%rdi), %rdx +; CHECK-NEXT: movq 56(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 32(%rdi), %rdx +; CHECK-NEXT: movq 48(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq 40(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 48(%rdi), %rdx +; CHECK-NEXT: movq 32(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq 56(%rdi), %rdx +; CHECK-NEXT: movq 16(%rdi), %rdx ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %al, (%rsp) ; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq __stack_chk_guard(%rip), %rax diff --git a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll --- a/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll +++ b/llvm/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll @@ -17,21 +17,19 @@ define dso_local i32 @main() nounwind uwtable { ; CHECK-LABEL: main: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl i(%rip), %esi -; CHECK-NEXT: movl j(%rip), %eax -; CHECK-NEXT: movl %esi, %edx +; CHECK-NEXT: movq i(%rip), %rdx +; CHECK-NEXT: movq j(%rip), %rsi +; CHECK-NEXT: movsbl %sil, %eax +; CHECK-NEXT: idivb %dl +; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $8, %edx -; CHECK-NEXT: movsbl %al, %ecx -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: cbtw +; CHECK-NEXT: shrl $8, %esi +; CHECK-NEXT: movsbl %sil, %eax ; CHECK-NEXT: idivb %dl -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: idivb %sil -; CHECK-NEXT: movzbl %dl, %ecx +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: pinsrb $1, %ecx, %xmm0 +; CHECK-NEXT: pinsrb $1, 
%eax, %xmm0 ; CHECK-NEXT: pextrw $0, %xmm0, res(%rip) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll --- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -110,12 +110,12 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB2_2: # %.lr.ph ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vmovups %xmm0, (%rsi) +; CHECK-NEXT: vmovups %xmm0, 16(%rsi) ; CHECK-NEXT: addq $32, %rsi ; CHECK-NEXT: decl %edi ; CHECK-NEXT: jne .LBB2_2 ; CHECK-NEXT: .LBB2_3: # %._crit_edge -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge diff --git a/llvm/test/CodeGen/X86/WidenArith.ll b/llvm/test/CodeGen/X86/WidenArith.ll --- a/llvm/test/CodeGen/X86/WidenArith.ll +++ b/llvm/test/CodeGen/X86/WidenArith.ll @@ -9,8 +9,15 @@ ; X86-NEXT: vmulps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vsubps %ymm2, %ymm1, %ymm3 ; X86-NEXT: vcmpltps %ymm1, %ymm0, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; X86-NEXT: vcmpltps %ymm3, %ymm2, %ymm1 -; X86-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X86-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X86-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; X86-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; X86-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: retl ; @@ -20,8 +27,15 @@ ; X64-NEXT: vmulps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vsubps %ymm2, %ymm1, %ymm3 ; X64-NEXT: vcmpltps %ymm1, %ymm0, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; X64-NEXT: vcmpltps %ymm3, %ymm2, %ymm1 -; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: retq %c1 = fadd <8 x float> %a, %b diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -20,13 +20,15 @@ ; ; X64-LABEL: abd_ext_i8: ; X64: # %bb.0: -; X64-NEXT: movsbl %sil, %eax -; X64-NEXT: movsbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movsbq %dil, %rcx +; X64-NEXT: movsbq %sil, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = sext i8 %a to i64 %bext = sext i8 %b to i64 @@ -50,13 +52,15 @@ ; ; X64-LABEL: abd_ext_i8_undef: ; X64: # %bb.0: -; X64-NEXT: movsbl %sil, %eax -; X64-NEXT: movsbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax 
+; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movsbq %dil, %rcx +; X64-NEXT: movsbq %sil, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = sext i8 %a to i64 %bext = sext i8 %b to i64 @@ -80,13 +84,15 @@ ; ; X64-LABEL: abd_ext_i16: ; X64: # %bb.0: -; X64-NEXT: movswl %si, %eax -; X64-NEXT: movswl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movswq %di, %rcx +; X64-NEXT: movswq %si, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = sext i16 %a to i64 %bext = sext i16 %b to i64 @@ -110,13 +116,15 @@ ; ; X64-LABEL: abd_ext_i16_undef: ; X64: # %bb.0: -; X64-NEXT: movswl %si, %eax -; X64-NEXT: movswl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movswq %di, %rcx +; X64-NEXT: movswq %si, %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = sext i16 %a to i64 %bext = sext i16 %b to i64 @@ -129,13 +137,19 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32: ; X86: # %bb.0: +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovlel %edx, %eax +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -159,13 +173,19 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32_undef: ; X86: # %bb.0: +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovlel %edx, %eax +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: xorl %esi, %eax +; X86-NEXT: subl %esi, %eax +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -20,13 +20,13 @@ ; ; X64-LABEL: abd_ext_i8: ; X64: # %bb.0: -; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: 
movzbl %sil, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = zext i8 %a to i64 %bext = zext i8 %b to i64 @@ -50,13 +50,13 @@ ; ; X64-LABEL: abd_ext_i8_undef: ; X64: # %bb.0: -; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: movzbl %dil, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %aext = zext i8 %a to i64 %bext = zext i8 %b to i64 @@ -80,13 +80,13 @@ ; ; X64-LABEL: abd_ext_i16: ; X64: # %bb.0: -; X64-NEXT: movzwl %si, %eax ; X64-NEXT: movzwl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = zext i16 %a to i64 %bext = zext i16 %b to i64 @@ -110,13 +110,13 @@ ; ; X64-LABEL: abd_ext_i16_undef: ; X64: # %bb.0: -; X64-NEXT: movzwl %si, %eax ; X64-NEXT: movzwl %di, %ecx -; X64-NEXT: subl %eax, %ecx -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: negl %eax -; X64-NEXT: cmovsl %ecx, %eax -; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: movzwl %si, %eax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: negq %rax +; X64-NEXT: cmovsq %rcx, %rax +; X64-NEXT: # kill: def $ax killed $ax killed $rax ; X64-NEXT: retq %aext = zext i16 %a to i64 %bext = zext i16 %b to i64 @@ -129,13 +129,13 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -159,13 +159,13 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-LABEL: abd_ext_i32_undef: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: subl %ecx, %edx -; X86-NEXT: negl %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: subl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: diff --git a/llvm/test/CodeGen/X86/absolute-constant.ll b/llvm/test/CodeGen/X86/absolute-constant.ll --- a/llvm/test/CodeGen/X86/absolute-constant.ll +++ b/llvm/test/CodeGen/X86/absolute-constant.ll @@ -10,7 +10,10 @@ define void @bar(ptr %x) { ; CHECK-LABEL: bar: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: testb $foo, (%rdi) +; CHECK-NEXT: movsbl (%rdi), %eax +; CHECK-NEXT: movl $foo, %ecx +; CHECK-NEXT: movsbl %cl, %ecx +; CHECK-NEXT: testl %ecx, %eax ; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: xorl %eax, 
%eax @@ -20,7 +23,10 @@ ; ; PIC-LABEL: bar: ; PIC: # %bb.0: # %entry -; PIC-NEXT: testb $foo, (%rdi) +; PIC-NEXT: movsbl (%rdi), %eax +; PIC-NEXT: movl $foo, %ecx +; PIC-NEXT: movsbl %cl, %ecx +; PIC-NEXT: testl %ecx, %eax ; PIC-NEXT: je .LBB0_1 ; PIC-NEXT: # %bb.2: # %if.then ; PIC-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/add-cmov.ll b/llvm/test/CodeGen/X86/add-cmov.ll --- a/llvm/test/CodeGen/X86/add-cmov.ll +++ b/llvm/test/CodeGen/X86/add-cmov.ll @@ -368,7 +368,7 @@ ; CHECK-NEXT: addq $66, %rsi ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: cmovneq %rax, %rsi -; CHECK-NEXT: decw (%rdx,%rsi) +; CHECK-NEXT: decw (%rsi,%rdx) ; CHECK-NEXT: retq %i = ptrtoint ptr %ptr to i64 %i66 = add i64 %i, 66 @@ -414,7 +414,7 @@ ; CHECK-NEXT: addq $66, %rdx ; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: cmovneq %rax, %rdx -; CHECK-NEXT: decw (%rsi,%rdx) +; CHECK-NEXT: decw (%rdx,%rsi) ; CHECK-NEXT: retq %i = ptrtoint ptr %ptr to i64 %i66 = add i64 %idx, 66 diff --git a/llvm/test/CodeGen/X86/add-of-mul.ll b/llvm/test/CodeGen/X86/add-of-mul.ll --- a/llvm/test/CodeGen/X86/add-of-mul.ll +++ b/llvm/test/CodeGen/X86/add-of-mul.ll @@ -26,7 +26,8 @@ define <4 x i32> @test_vector(<4 x i32> %x) { ; CHECK-LABEL: test_vector: ; CHECK: # %bb.0: -; CHECK-NEXT: pslld $2, %xmm0 +; CHECK-NEXT: paddd %xmm0, %xmm0 +; CHECK-NEXT: paddd %xmm0, %xmm0 ; CHECK-NEXT: retq %mul = mul <4 x i32> %x, %add = add <4 x i32> %mul, %x diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll --- a/llvm/test/CodeGen/X86/add-sub-bool.ll +++ b/llvm/test/CodeGen/X86/add-sub-bool.ll @@ -344,7 +344,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -367,7 +367,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -430,10 +430,10 @@ ; X86-LABEL: test_i32_add_sub_var: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: retl ; @@ -455,10 +455,10 @@ ; X86-LABEL: test_i32_add_sub_commute_var: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: retl ; @@ -480,10 +480,10 @@ ; X86-LABEL: test_i32_sub_add_var: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: retl ; @@ -508,7 +508,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -559,7 +559,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), 
%eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl @@ -584,7 +584,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; @@ -605,10 +605,10 @@ ; X86-LABEL: test_i32_sub_sum_var: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: addl {{[0-9]+}}(%esp), %eax -; X86-NEXT: btl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: btl %edx, %ecx ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: negl %eax ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -316,21 +316,13 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq (%rsi), %rdx -; CHECK-NEXT: movq 8(%rsi), %rdi -; CHECK-NEXT: adcq $0, %rdi -; CHECK-NEXT: setb %r10b -; CHECK-NEXT: movzbl %r10b, %r10d -; CHECK-NEXT: addq %rcx, %rdi -; CHECK-NEXT: adcq 16(%rsi), %r10 -; CHECK-NEXT: setb %cl -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: addq %r8, %r10 -; CHECK-NEXT: adcq 24(%rsi), %rcx -; CHECK-NEXT: addq %r9, %rcx -; CHECK-NEXT: movq %rdx, (%rax) -; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq %r10, 16(%rax) -; CHECK-NEXT: movq %rcx, 24(%rax) +; CHECK-NEXT: adcq 8(%rsi), %rcx +; CHECK-NEXT: adcq 16(%rsi), %r8 +; CHECK-NEXT: adcq 24(%rsi), %r9 +; CHECK-NEXT: movq %rdx, (%rdi) +; CHECK-NEXT: movq %rcx, 8(%rdi) +; CHECK-NEXT: movq %r8, 16(%rdi) +; CHECK-NEXT: movq %r9, 24(%rdi) ; CHECK-NEXT: retq entry: %0 = extractvalue %S %arg.b, 0 @@ -391,15 +383,15 @@ define i128 @addcarry_to_subcarry(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: addcarry_to_subcarry: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rsi, %rcx +; CHECK-NEXT: notq %rcx +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: addq %rdi, %rcx +; CHECK-NEXT: setb %dl +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpq %rsi, %rdi -; CHECK-NEXT: notq %rsi -; CHECK-NEXT: setae %cl -; CHECK-NEXT: addb $-1, %cl -; CHECK-NEXT: adcq $0, %rax -; CHECK-NEXT: setb %cl -; CHECK-NEXT: movzbl %cl, %edx -; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: setae %al +; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: adcq $0, %rdx ; CHECK-NEXT: retq %notb = xor i64 %b, -1 @@ -418,9 +410,12 @@ ; CHECK-LABEL: addcarry_2x64: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) @@ -448,9 +443,12 @@ ; CHECK-LABEL: addcarry_hidden_2x64: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) @@ -482,9 +480,12 @@ ; CHECK-LABEL: addcarry_hidden2_2x64: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, 
%rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) @@ -516,9 +517,12 @@ ; CHECK-LABEL: addcarry_2x64_or_reversed: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) @@ -546,9 +550,12 @@ ; CHECK-LABEL: addcarry_2x64_xor_reversed: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %cl +; CHECK-NEXT: xorb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) @@ -576,10 +583,13 @@ ; CHECK-LABEL: addcarry_2x64_and_reversed: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: andb %dil, %cl ; CHECK-NEXT: movq %rsi, %rdx -; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: retq %t0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %x0, i64 %y0) %s0 = extractvalue { i64, i1 } %t0, 0 @@ -636,10 +646,13 @@ define { i64, i1 } @addcarry_fake_carry(i64 %a, i64 %b, i1 %carryin) nounwind { ; CHECK-LABEL: addcarry_fake_carry: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: btl $0, %edx -; CHECK-NEXT: adcq %rsi, %rax +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: addq %rsi, %rdi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: addq %rdi, %rax ; CHECK-NEXT: setb %dl +; CHECK-NEXT: orb %cl, %dl ; CHECK-NEXT: retq %t1 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %partial = extractvalue { i64, i1 } %t1, 0 @@ -742,17 +755,20 @@ define i32 @add_U320_without_i128_add(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_add: ; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq 16(%rdi), %rax ; CHECK-NEXT: movq 24(%rdi), %r10 ; CHECK-NEXT: movq 32(%rdi), %r11 +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: adcq %rcx, %rbx ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: adcq %rcx, %rdx +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: adcq $0, %rbx ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: movq %r10, %rcx ; CHECK-NEXT: adcq %r8, %rcx -; CHECK-NEXT: cmpq %rax, %rdx +; CHECK-NEXT: cmpq %rax, %rbx ; CHECK-NEXT: adcq $0, %rcx ; CHECK-NEXT: leaq (%r11,%r9), %rsi ; CHECK-NEXT: addq %r8, %r10 @@ -764,10 +780,12 @@ ; CHECK-NEXT: cmpq %rsi, %r8 ; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %r9, %r11 -; CHECK-NEXT: movq %rdx, 16(%rdi) +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: movq %rbx, 16(%rdi) ; CHECK-NEXT: movq %rcx, 24(%rdi) ; CHECK-NEXT: movq %r8, 32(%rdi) ; CHECK-NEXT: adcl $0, %eax +; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq %7 = load i64, ptr %0, align 8 %8 = getelementptr inbounds %struct.U320, ptr %0, i64 0, i32 
0, i64 1 @@ -820,10 +838,22 @@ define i32 @add_U320_without_i128_or(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_or: ; CHECK: # %bb.0: +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %sil +; CHECK-NEXT: orb %al, %sil +; CHECK-NEXT: addq 24(%rdi), %r8 +; CHECK-NEXT: setb %al +; CHECK-NEXT: addb $-1, %sil ; CHECK-NEXT: adcq %rcx, 16(%rdi) -; CHECK-NEXT: adcq %r8, 24(%rdi) +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: movq %r8, 24(%rdi) +; CHECK-NEXT: addb $-1, %cl ; CHECK-NEXT: adcq %r9, 32(%rdi) ; CHECK-NEXT: setb %al ; CHECK-NEXT: movzbl %al, %eax @@ -875,10 +905,22 @@ define i32 @add_U320_without_i128_xor(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_xor: ; CHECK: # %bb.0: +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %sil +; CHECK-NEXT: xorb %al, %sil +; CHECK-NEXT: addq 24(%rdi), %r8 +; CHECK-NEXT: setb %al +; CHECK-NEXT: addb $-1, %sil ; CHECK-NEXT: adcq %rcx, 16(%rdi) -; CHECK-NEXT: adcq %r8, 24(%rdi) +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: setb %cl +; CHECK-NEXT: xorb %al, %cl +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: movq %r8, 24(%rdi) +; CHECK-NEXT: addb $-1, %cl ; CHECK-NEXT: adcq %r9, 32(%rdi) ; CHECK-NEXT: setb %al ; CHECK-NEXT: movzbl %al, %eax @@ -932,9 +974,15 @@ define i32 @bogus_add_U320_without_i128_and(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: bogus_add_U320_without_i128_and: ; CHECK: # %bb.0: +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) -; CHECK-NEXT: addq %rcx, 16(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %sil +; CHECK-NEXT: andb %al, %sil +; CHECK-NEXT: addb $-1, %sil +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: adcq %rcx, 16(%rdi) ; CHECK-NEXT: addq %r8, 24(%rdi) ; CHECK-NEXT: addq %r9, 32(%rdi) ; CHECK-NEXT: xorl %eax, %eax @@ -986,11 +1034,25 @@ define void @add_U320_without_i128_or_no_ret(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_without_i128_or_no_ret: ; CHECK: # %bb.0: +; CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %sil +; CHECK-NEXT: orb %al, %sil +; CHECK-NEXT: addq 24(%rdi), %r8 +; CHECK-NEXT: setb %al +; CHECK-NEXT: addb $-1, %sil ; CHECK-NEXT: adcq %rcx, 16(%rdi) -; CHECK-NEXT: adcq %r8, 24(%rdi) -; CHECK-NEXT: adcq %r9, 32(%rdi) +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: setb %cl +; CHECK-NEXT: addq 32(%rdi), %r9 +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: movzbl %cl, %eax +; CHECK-NEXT: addq %r9, %rax +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: movq %r8, 24(%rdi) +; CHECK-NEXT: movq %rax, 32(%rdi) ; CHECK-NEXT: retq %7 = load i64, ptr %0, align 8 %8 = getelementptr inbounds %struct.U320, ptr %0, i64 0, i32 0, i64 1 @@ -1035,12 +1097,24 @@ define i32 @add_U320_uaddo(ptr nocapture dereferenceable(40) %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5) nounwind { ; CHECK-LABEL: add_U320_uaddo: ; CHECK: # %bb.0: +; 
CHECK-NEXT: addq 8(%rdi), %rdx +; CHECK-NEXT: setb %al ; CHECK-NEXT: addq %rsi, (%rdi) -; CHECK-NEXT: adcq %rdx, 8(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %sil +; CHECK-NEXT: orb %al, %sil +; CHECK-NEXT: addq 24(%rdi), %r8 +; CHECK-NEXT: setb %al +; CHECK-NEXT: addb $-1, %sil ; CHECK-NEXT: adcq %rcx, 16(%rdi) -; CHECK-NEXT: adcq %r8, 24(%rdi) +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: addb $-1, %cl ; CHECK-NEXT: adcq %r9, 32(%rdi) ; CHECK-NEXT: setb %al +; CHECK-NEXT: movq %rdx, 8(%rdi) +; CHECK-NEXT: movq %r8, 24(%rdi) ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: retq %7 = load i64, ptr %0, align 8 @@ -1103,14 +1177,22 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rsi), %rcx -; CHECK-NEXT: addq (%rdx), %rcx -; CHECK-NEXT: movq %rcx, (%rdi) -; CHECK-NEXT: movq 8(%rsi), %rcx -; CHECK-NEXT: adcq 8(%rdx), %rcx -; CHECK-NEXT: movq %rcx, 8(%rdi) -; CHECK-NEXT: movq 16(%rsi), %rcx -; CHECK-NEXT: adcq 16(%rdx), %rcx -; CHECK-NEXT: movq %rcx, 16(%rdi) +; CHECK-NEXT: movq (%rdx), %rdi +; CHECK-NEXT: leaq (%rcx,%rdi), %r8 +; CHECK-NEXT: movq %r8, (%rax) +; CHECK-NEXT: movq 8(%rsi), %r8 +; CHECK-NEXT: addq 8(%rdx), %r8 +; CHECK-NEXT: setb %r9b +; CHECK-NEXT: addq %rdi, %rcx +; CHECK-NEXT: adcq $0, %r8 +; CHECK-NEXT: setb %cl +; CHECK-NEXT: orb %r9b, %cl +; CHECK-NEXT: movzbl %cl, %ecx +; CHECK-NEXT: movq %r8, 8(%rax) +; CHECK-NEXT: movq 16(%rsi), %rsi +; CHECK-NEXT: addq 16(%rdx), %rsi +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: movq %rsi, 16(%rax) ; CHECK-NEXT: retq %4 = load i64, ptr %1, align 8 %5 = load i64, ptr %2, align 8 @@ -1150,9 +1232,12 @@ define zeroext i1 @uaddo_U128_without_i128_or(i64 %0, i64 %1, i64 %2, i64 %3, ptr nocapture %4) nounwind { ; CHECK-LABEL: uaddo_U128_without_i128_or: ; CHECK: # %bb.0: +; CHECK-NEXT: addq %rcx, %rsi +; CHECK-NEXT: setb %cl ; CHECK-NEXT: addq %rdx, %rdi -; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: adcq $0, %rsi ; CHECK-NEXT: setb %al +; CHECK-NEXT: orb %cl, %al ; CHECK-NEXT: movq %rsi, (%r8) ; CHECK-NEXT: movq %rdi, 8(%r8) ; CHECK-NEXT: retq @@ -1177,12 +1262,18 @@ ; CHECK-LABEL: add_U192_without_i128_or: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: addq %r9, %rdx +; CHECK-NEXT: setb %dil ; CHECK-NEXT: addq %r8, %rsi -; CHECK-NEXT: adcq %r9, %rdx -; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movq %rcx, (%rdi) -; CHECK-NEXT: movq %rdx, 8(%rdi) -; CHECK-NEXT: movq %rsi, 16(%rdi) +; CHECK-NEXT: adcq $0, %rdx +; CHECK-NEXT: setb %r8b +; CHECK-NEXT: orb %dil, %r8b +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: movzbl %r8b, %edi +; CHECK-NEXT: addq %rcx, %rdi +; CHECK-NEXT: movq %rdi, (%rax) +; CHECK-NEXT: movq %rdx, 8(%rax) +; CHECK-NEXT: movq %rsi, 16(%rax) ; CHECK-NEXT: retq %8 = add i64 %4, %1 %9 = icmp ult i64 %8, %1 @@ -1214,9 +1305,14 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rdx), %rcx ; CHECK-NEXT: movq 8(%rdx), %rdi +; CHECK-NEXT: addq 8(%rsi), %rdi +; CHECK-NEXT: setb %r8b ; CHECK-NEXT: addq (%rsi), %rcx -; CHECK-NEXT: adcq 8(%rsi), %rdi +; CHECK-NEXT: adcq $0, %rdi +; CHECK-NEXT: setb %r9b +; CHECK-NEXT: orb %r8b, %r9b ; CHECK-NEXT: movq 16(%rdx), %r8 +; CHECK-NEXT: addb $-1, %r9b ; CHECK-NEXT: adcq 16(%rsi), %r8 ; CHECK-NEXT: movq 24(%rdx), %rdx ; CHECK-NEXT: adcq 24(%rsi), %rdx @@ -1274,15 +1370,22 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movq (%rdx), %rcx ; CHECK-NEXT: movq 8(%rdx), %rdi +; CHECK-NEXT: addq 8(%rsi), %rdi +; CHECK-NEXT: setb %r8b ; CHECK-NEXT: addq (%rsi), %rcx -; CHECK-NEXT: 
adcq 8(%rsi), %rdi +; CHECK-NEXT: adcq $0, %rdi +; CHECK-NEXT: setb %r9b +; CHECK-NEXT: orb %r8b, %r9b ; CHECK-NEXT: movq 16(%rdx), %r8 ; CHECK-NEXT: movq 24(%rdx), %rdx -; CHECK-NEXT: adcq 16(%rsi), %r8 +; CHECK-NEXT: addq 16(%rsi), %r8 ; CHECK-NEXT: adcq 24(%rsi), %rdx +; CHECK-NEXT: movzbl %r9b, %esi +; CHECK-NEXT: addq %r8, %rsi +; CHECK-NEXT: adcq $0, %rdx ; CHECK-NEXT: movq %rcx, (%rax) ; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq %r8, 16(%rax) +; CHECK-NEXT: movq %rsi, 16(%rax) ; CHECK-NEXT: movq %rdx, 24(%rax) ; CHECK-NEXT: retq %4 = load i64, ptr %1, align 8 diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -316,7 +316,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -326,7 +326,7 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -336,7 +336,7 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -976,38 +976,77 @@ ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512F-SLOW: # %bb.0: +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-SLOW-NEXT: vzeroupper +; AVX512F-SLOW-NEXT: retq ; -; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq +; AVX512F-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; 
AVX512F-FAST: # %bb.0: +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,3] +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512F-FAST-NEXT: vzeroupper +; AVX512F-FAST-NEXT: retq ; -; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512DQ-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,7] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,5,0,3] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512BW-SLOW: # %bb.0: +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,7] +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-SLOW-NEXT: vzeroupper +; AVX512BW-SLOW-NEXT: retq +; +; AVX512BW-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: +; AVX512BW-FAST: # %bb.0: +; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,5,0,3] +; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512BW-FAST-NEXT: vzeroupper +; AVX512BW-FAST-NEXT: retq %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias @@ -1027,22 +1066,22 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; 
SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1050,21 +1089,21 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1072,21 +1111,21 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; 
AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -1160,20 +1199,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1181,19 +1220,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; SSE42-NEXT: pshufb %xmm1, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb %xmm1, %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] +; SSE42-NEXT: movdqa 
{{.*#+}} xmm2 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] +; SSE42-NEXT: pshufb %xmm2, %xmm3 +; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1201,18 +1240,18 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1285,20 +1324,20 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1306,19 +1345,19 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; SSE42-NEXT: pshufb %xmm1, %xmm3 -; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] -; SSE42-NEXT: pshufb %xmm1, %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm3 
-; SSE42-NEXT: movdqa %xmm3, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: palignr {{.*#+}} xmm3 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0] +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] +; SSE42-NEXT: pshufb %xmm2, %xmm3 +; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1326,18 +1365,18 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1410,19 +1449,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1430,17 +1469,17 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: movdqa 32(%rdi), %xmm2 ; SSE42-NEXT: movdqa 48(%rdi), %xmm3 -; SSE42-NEXT: paddb 48(%rsi), %xmm3 ; SSE42-NEXT: paddb 32(%rsi), %xmm2 +; SSE42-NEXT: paddb 48(%rsi), %xmm3 ; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE42-NEXT: movdqa %xmm1, %xmm4 -; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm4 -; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm1 -; SSE42-NEXT: paddb 16(%rdx), %xmm1 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm4 +; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; SSE42-NEXT: paddb (%rdx), 
%xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1448,16 +1487,16 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2 +; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1535,25 +1574,25 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1561,20 +1600,20 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; 
SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1582,19 +1621,20 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1665,21 +1705,21 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; 
SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1687,16 +1727,16 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1704,16 +1744,16 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1785,19 +1825,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1805,15 +1845,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: 
pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1821,15 +1861,15 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1901,19 +1941,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1921,16 +1961,16 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: paddb 16(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1945,11 +1985,11 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2059,15 +2099,15 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: @@ -2075,15 +2115,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: @@ -2097,11 +2137,11 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2211,15 +2251,15 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: paddb 48(%rsi), %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm2 +; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSE2-NEXT: shufpd {{.*#+}} xmm0 
= xmm0[0],xmm1[1] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: @@ -2227,15 +2267,15 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: @@ -2249,11 +2289,11 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2285,12 +2325,12 @@ ; ; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-FAST-NEXT: vzeroupper @@ -2311,12 +2351,12 @@ ; ; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpermi2q %ymm0, %ymm1, %ymm2 ; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-FAST-NEXT: vzeroupper @@ -2337,9 +2377,9 @@ ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; 
AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,13,0,15] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512BW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper @@ -2414,22 +2454,23 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -2439,11 +2480,11 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -2455,11 +2496,11 @@ ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -2548,10 +2589,10 @@ ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -2580,13 +2621,14 @@ ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] -; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2598,13 +2640,14 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] -; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2683,36 +2726,36 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,15,4,5,6,15,8,9,10,15,12,13,14] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 
48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -2725,10 +2768,10 @@ ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -2803,8 +2846,8 @@ ; SSE42-NEXT: paddb %xmm1, %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2818,11 +2861,11 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2851,7 +2894,6 @@ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -2869,7 +2911,6 @@ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -2953,36 +2994,36 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm2 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -2995,10 +3036,10 @@ ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 @@ -3084,13 +3125,13 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3120,7 +3161,6 @@ ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 
$1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -3138,7 +3178,6 @@ ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -3189,10 +3228,10 @@ ; SSE2-NEXT: paddb (%rdx), %xmm3 ; SSE2-NEXT: movdqa 16(%rdx), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: paddb 48(%rdx), %xmm1 ; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) +; SSE2-NEXT: paddb 48(%rdx), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 48(%rcx) +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq @@ -3211,10 +3250,10 @@ ; SSE42-NEXT: paddb (%rdx), %xmm4 ; SSE42-NEXT: movdqa 16(%rdx), %xmm0 ; SSE42-NEXT: paddb %xmm1, %xmm0 -; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 48(%rcx) +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: movdqa %xmm4, (%rcx) ; SSE42-NEXT: retq @@ -3230,12 +3269,12 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm2 ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; @@ -3345,8 +3384,8 @@ ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -3360,8 +3399,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -3486,18 +3525,18 @@ ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 
(%rdi), %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] @@ -3514,9 +3553,8 @@ ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3530,9 +3568,8 @@ ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3616,10 +3653,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3641,35 +3678,37 @@ ; ; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3746,26 +3785,26 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3879,10 +3918,10 @@ ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5],xmm3[6],xmm1[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; 
AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3904,35 +3943,35 @@ ; ; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4008,12 +4047,12 @@ ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; @@ 
-4116,8 +4155,8 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -4130,8 +4169,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -4150,13 +4189,14 @@ ; ; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4164,13 +4204,14 @@ ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4247,29 +4288,29 @@ ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb 
(%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd %xmm0, %xmm2 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4282,11 +4323,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4299,11 +4340,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,0,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4378,10 +4419,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -4411,11 +4452,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4428,11 +4469,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,0] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 
(%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4511,17 +4552,19 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -4548,11 +4591,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] ; AVX512F-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4565,11 +4608,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,13,14,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4608,8 +4651,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4622,8 +4665,8 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4636,8 +4679,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), 
%xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4760,15 +4803,15 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[2] -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4796,11 +4839,11 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4813,11 +4856,11 @@ ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4870,8 +4913,8 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -4884,8 +4927,8 @@ ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -4896,11 +4939,11 @@ ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa 
%xmm1, 16(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4988,17 +5031,17 @@ ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5007,14 +5050,14 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: @@ -5022,10 +5065,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5034,10 +5077,10 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5046,10 +5089,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: 
vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5078,17 +5121,17 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5096,14 +5139,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: @@ -5111,10 +5154,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5123,10 +5166,10 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5135,10 +5178,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5167,17 +5210,17 @@ ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb 
(%rsi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5185,14 +5228,14 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: @@ -5200,10 +5243,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5212,10 +5255,10 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5224,10 +5267,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5255,31 +5298,31 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa 16(%rdx), %xmm1 +; SSE-NEXT: movdqa (%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: movdqa 16(%rdx), 
%xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdx), %xmm3 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) -; SSE-NEXT: movdqa %xmm3, 48(%rcx) -; SSE-NEXT: movdqa %xmm2, (%rcx) -; SSE-NEXT: movdqa %xmm1, 16(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rcx) +; SSE-NEXT: movdqa %xmm3, 32(%rcx) +; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm1, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: @@ -5287,10 +5330,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5299,10 +5342,10 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5311,10 +5354,10 @@ ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5342,44 +5385,44 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rdx), %xmm0 -; 
SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5387,10 +5430,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5398,10 +5441,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5547,44 +5590,44 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: movdqa 16(%rdx), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rdx), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rdx), %xmm1 +; SSE-NEXT: paddb (%rsi), %xmm0 +; SSE-NEXT: movdqa (%rdx), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rdx), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rdx), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rcx) +; SSE-NEXT: paddb 48(%rdx), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rcx) -; SSE-NEXT: movdqa %xmm3, (%rcx) -; SSE-NEXT: movdqa %xmm2, 16(%rcx) +; SSE-NEXT: 
movdqa %xmm0, 32(%rcx) +; SSE-NEXT: movdqa %xmm3, 16(%rcx) +; SSE-NEXT: movdqa %xmm2, (%rcx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rcx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5592,10 +5635,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -5603,10 +5646,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -283,7 +283,7 @@ ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -292,7 +292,7 @@ ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; 
AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper @@ -301,7 +301,7 @@ ; AVX512BW-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-NEXT: vzeroupper @@ -609,16 +609,16 @@ ; ; AVX512F-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512F-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512F-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512F-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512F-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512F-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -626,40 +626,37 @@ ; ; AVX512DQ-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: movl (%rdi), %eax +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: -; 
AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] +; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7] +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7] +; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> @@ -711,12 +708,12 @@ ; ; AVX512F-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] -; AVX512F-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512F-NEXT: movl (%rdi), %eax +; AVX512F-NEXT: vmovd %eax, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512F-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper @@ -724,36 +721,29 @@ ; ; AVX512DQ-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512DQ-NEXT: movl (%rdi), %eax +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; 
AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq -; -; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX512BW-NEXT: movl (%rdi), %eax +; AVX512BW-NEXT: vmovd %eax, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16> %broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> @@ -855,19 +845,19 @@ ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1 ; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm2, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm3 -; SSE2-NEXT: movdqa %xmm3, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -876,17 +866,17 @@ ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 ; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: pshufb %xmm3, %xmm2 ; SSE42-NEXT: punpcklbw {{.*#+}} 
xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE42-NEXT: pshufb %xmm3, %xmm2 -; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE42-NEXT: pshufb %xmm3, %xmm1 +; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -896,16 +886,16 @@ ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16: @@ -966,18 +956,18 @@ ; SSE2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; 
SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -989,10 +979,10 @@ ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb %xmm2, %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1003,10 +993,10 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8: @@ -1064,18 +1054,18 @@ ; SSE2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1087,10 +1077,10 @@ ; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb %xmm2, %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1101,10 +1091,10 @@ ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4: @@ -1164,15 +1154,15 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = 
[0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pandn (%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1180,24 +1170,24 @@ ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; SSE42-NEXT: movdqa %xmm1, %xmm2 -; SSE42-NEXT: pblendvb %xmm0, 32(%rdi), %xmm2 -; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm2 +; SSE42-NEXT: pblendvb %xmm0, 32(%rdi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 16(%rsi), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpblendvb %xmm1, 48(%rdi), %xmm0, %xmm2 +; AVX-NEXT: vpblendvb %xmm1, 32(%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: @@ -1268,10 +1258,10 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1279,16 +1269,16 @@ ; SSE42-NEXT: movdqa 32(%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: pshufb %xmm2, %xmm1 ; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] ; SSE42-NEXT: movdqa %xmm3, %xmm4 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE42-NEXT: pshufb %xmm2, %xmm1 -; SSE42-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE42-NEXT: paddb 16(%rsi), %xmm3 -; SSE42-NEXT: paddb (%rsi), 
%xmm4 -; SSE42-NEXT: movdqa %xmm4, (%rdx) -; SSE42-NEXT: movdqa %xmm3, 16(%rdx) +; SSE42-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE42-NEXT: pshufb %xmm2, %xmm0 +; SSE42-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE42-NEXT: paddb (%rsi), %xmm3 +; SSE42-NEXT: paddb 16(%rsi), %xmm4 +; SSE42-NEXT: movdqa %xmm4, 16(%rdx) +; SSE42-NEXT: movdqa %xmm3, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1296,15 +1286,15 @@ ; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: @@ -1361,30 +1351,30 @@ ; SSE2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,0,65535,65535,65535] -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1392,10 +1382,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa 
%xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: @@ -1452,27 +1442,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pandn (%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand 48(%rdi), %xmm0 +; SSE2-NEXT: pand 32(%rdi), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1480,10 +1470,10 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: @@ -1542,22 +1532,22 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: paddb 16(%rsi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: 
vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: @@ -1565,11 +1555,11 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1626,35 +1616,35 @@ ; SSE2-NEXT: movaps (%rdi), %xmm0 ; SSE2-NEXT: movaps 32(%rdi), %xmm1 ; SSE2-NEXT: movaps 48(%rdi), %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: paddb 16(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1738,36 +1728,36 @@ ; SSE2-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; SSE2: # %bb.0: ; SSE2-NEXT: movapd (%rdi), %xmm0 -; SSE2-NEXT: movapd 32(%rdi), %xmm1 +; SSE2-NEXT: movapd 48(%rdi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 32(%rdi), %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), 
%xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1780,62 +1770,32 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-SLOW-NEXT: vzeroupper -; AVX512F-SLOW-NEXT: retq -; -; AVX512F-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512F-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq -; -; AVX512DQ-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-SLOW-NEXT: vzeroupper -; AVX512DQ-SLOW-NEXT: retq -; -; AVX512DQ-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512DQ-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0 -; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-FAST-NEXT: vzeroupper -; AVX512DQ-FAST-NEXT: retq +; AVX512F-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq ; -; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpbroadcastq (%rdi), %ymm0 -; AVX512BW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq +; AVX512DQ-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: 
vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: -; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,0,7] -; AVX512BW-FAST-NEXT: vpermi2q 32(%rdi), %ymm0, %ymm1 -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) -; AVX512BW-FAST-NEXT: vzeroupper -; AVX512BW-FAST-NEXT: retq +; AVX512BW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 %in.vec.cast = bitcast <64 x i8> %in.vec to <8 x i64> %broadcast.of.zextinreg = shufflevector <8 x i64> %in.vec.cast, <8 x i64> poison, <4 x i32> @@ -1930,11 +1890,10 @@ ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1945,11 +1904,10 @@ ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2020,10 +1978,10 @@ ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -2048,10 +2006,12 @@ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX512F-NEXT: 
vpbroadcastb (%rdi), %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2062,10 +2022,12 @@ ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2159,11 +2121,10 @@ ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512F-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2173,11 +2134,10 @@ ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX512DQ-NEXT: vpternlogd $202, (%rdi){1to8}, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2229,13 +2189,13 @@ ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2247,12 +2207,12 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: 
vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: @@ -2271,33 +2231,31 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2392,11 +2350,10 @@ ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpternlogq $202, (%rdi){1to4}, %ymm0, %ymm1 ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2406,11 +2363,10 @@ ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpternlogq 
$202, (%rdi){1to4}, %ymm0, %ymm1 ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2502,33 +2458,31 @@ ; ; AVX512F-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] +; AVX512F-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] -; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2567,10 +2521,10 @@ ; SSE2-NEXT: paddb (%rsi), %xmm2 ; SSE2-NEXT: movdqa 16(%rsi), %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movdqa %xmm1, 48(%rdx) +; SSE2-NEXT: movdqa %xmm0, 32(%rdx) ; SSE2-NEXT: movdqa %xmm3, 16(%rdx) ; SSE2-NEXT: movdqa %xmm2, (%rdx) ; SSE2-NEXT: retq @@ -2585,10 +2539,10 @@ ; SSE42-NEXT: paddb (%rsi), %xmm3 ; SSE42-NEXT: movdqa 16(%rsi), %xmm0 ; SSE42-NEXT: 
paddb %xmm1, %xmm0 -; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 48(%rdx) +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: movdqa %xmm3, (%rdx) ; SSE42-NEXT: retq @@ -2600,12 +2554,12 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX-NEXT: vpblendvb %xmm0, 48(%rdi), %xmm1, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -2700,8 +2654,8 @@ ; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: paddb 16(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -2712,8 +2666,8 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2: @@ -2927,10 +2881,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -2950,10 +2904,12 @@ ; AVX512F-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 +; AVX512F-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1,2],xmm2[3],mem[4,5],xmm2[6],mem[7] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -2963,10 +2919,12 @@ ; AVX512DQ-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] -; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0],mem[1,2],xmm2[3],mem[4,5],xmm2[6],mem[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -3029,10 +2987,10 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -3135,13 +3093,13 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; @@ -3243,13 +3201,13 @@ ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa (%rdi), %xmm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; @@ -3333,10 +3291,10 @@ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -3344,10 +3302,10 @@ ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -3440,13 +3398,13 @@ ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[1,3],ymm1[4,4],ymm0[5,7] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,1,0,1] -; AVX-NEXT: 
vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3468,11 +3426,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3480,11 +3438,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,0,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3571,11 +3529,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3583,11 +3541,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,0] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3644,14 +3602,14 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa (%rdi), %xmm2 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; 
AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -3673,11 +3631,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512F-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3685,11 +3643,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,13,14,15] ; AVX512DQ-NEXT: vpermd (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3715,14 +3673,14 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movaps 48(%rdi), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) ; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -3731,10 +3689,10 @@ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -3830,14 +3788,14 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa (%rdi), %xmm2 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -3858,11 
+3816,11 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -3870,11 +3828,11 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpermq (%rdi), %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -3904,10 +3862,10 @@ ; SSE2-NEXT: movapd 48(%rdi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -3916,10 +3874,10 @@ ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -3927,11 +3885,11 @@ ; AVX-NEXT: vmovapd (%rdi), %ymm0 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm0[0,1] ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2] -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3987,40 +3945,40 @@ ; SSE: # %bb.0: ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 
+; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX: # %bb.0: ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4028,10 +3986,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4039,10 +3997,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4065,40 +4023,40 @@ ; SSE-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss (%rdi), 
%ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4106,10 +4064,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4117,10 +4075,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4143,40 +4101,40 @@ ; SSE-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: 
vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4184,10 +4142,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4195,10 +4153,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4221,39 +4179,39 @@ ; SSE-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) -; AVX-NEXT: retq +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: retq ; ; AVX2-LABEL: 
vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4261,10 +4219,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4272,10 +4230,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4299,59 +4257,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: 
vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4374,40 +4332,40 @@ ; SSE-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastss (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4415,10 +4373,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) 
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4426,10 +4384,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4454,40 +4412,40 @@ ; SSE-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4495,10 +4453,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4506,10 +4464,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw 
(%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4534,39 +4492,39 @@ ; SSE-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4574,10 +4532,10 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -4585,10 +4543,10 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 
32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4614,59 +4572,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4691,60 +4649,60 @@ ; SSE-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 
16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4769,59 +4727,59 @@ ; SSE-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 
48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastd (%rdi), %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4847,59 +4805,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 
16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4924,59 +4882,59 @@ ; SSE-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm2 ; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rsi), %xmm3 +; SSE-NEXT: movdqa 32(%rsi), %xmm3 ; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) -; SSE-NEXT: movdqa %xmm3, 48(%rdx) -; SSE-NEXT: movdqa %xmm2, (%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm0 +; SSE-NEXT: movdqa %xmm0, 48(%rdx) +; SSE-NEXT: movdqa %xmm3, 32(%rdx) +; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm0 +; 
AVX-NEXT: vmovdqa %xmm0, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastq (%rdi), %zmm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastq (%rdi), %zmm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5002,59 +4960,59 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: movdqa (%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 ; SSE-NEXT: paddb 32(%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: paddb 48(%rsi), %xmm1 ; SSE-NEXT: movdqa %xmm1, 48(%rdx) -; SSE-NEXT: movdqa %xmm3, (%rdx) -; SSE-NEXT: movdqa %xmm2, 16(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 
%ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i64_widen_to_i256_factor4_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -5080,22 +5038,22 @@ ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 48(%rsi), %xmm2 -; SSE-NEXT: paddb %xmm1, %xmm2 -; SSE-NEXT: paddb 16(%rsi), %xmm1 -; SSE-NEXT: movdqa 32(%rsi), %xmm3 -; SSE-NEXT: paddb %xmm0, %xmm3 -; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa %xmm0, (%rdx) -; SSE-NEXT: movdqa %xmm3, 32(%rdx) -; SSE-NEXT: movdqa %xmm1, 16(%rdx) -; SSE-NEXT: movdqa %xmm2, 48(%rdx) +; SSE-NEXT: movdqa (%rsi), %xmm2 +; SSE-NEXT: paddb %xmm0, %xmm2 +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: paddb %xmm1, %xmm3 +; SSE-NEXT: paddb 32(%rsi), %xmm0 +; SSE-NEXT: paddb 48(%rsi), %xmm1 +; SSE-NEXT: movdqa %xmm1, 48(%rdx) +; SSE-NEXT: movdqa %xmm0, 32(%rdx) +; SSE-NEXT: movdqa %xmm3, 16(%rdx) +; SSE-NEXT: movdqa %xmm2, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX: # %bb.0: ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq 16(%rdi), %rax +; AVX-NEXT: movq (%rdi), %rax ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: movq %rax, %r8 ; AVX-NEXT: movq %rax, %r9 @@ -5115,7 +5073,7 @@ ; AVX-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 ; AVX-NEXT: shrq $48, %r8 ; AVX-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 -; AVX-NEXT: movq 24(%rdi), %rax +; AVX-NEXT: movq 8(%rdi), %rax ; AVX-NEXT: shrq $56, %rcx ; AVX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; AVX-NEXT: movl %eax, %ecx @@ -5137,7 +5095,7 @@ ; AVX-NEXT: movq %rax, %rcx ; AVX-NEXT: shrq $48, %rcx ; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq (%rdi), %rcx +; AVX-NEXT: movq 16(%rdi), %rcx ; AVX-NEXT: shrq $56, %rax ; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX-NEXT: movl %ecx, %eax @@ -5159,7 +5117,7 @@ ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: shrq $48, %rax ; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX-NEXT: movq 8(%rdi), %rax +; AVX-NEXT: movq 24(%rdi), %rax ; AVX-NEXT: shrq $56, %rcx ; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 ; AVX-NEXT: movl %eax, %ecx @@ -5183,14 +5141,14 @@ ; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 ; AVX-NEXT: shrq $56, %rax ; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; 
AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 48(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: popq %rbx ; AVX-NEXT: retq ; @@ -5286,39 +5244,310 @@ ; AVX2-NEXT: shrq $56, %rax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: movq 16(%rdi), %rax +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: movq %rax, %r8 +; AVX512F-NEXT: movq %rax, %r9 +; AVX512F-NEXT: movq %rax, %r10 +; AVX512F-NEXT: movl %eax, %r11d +; AVX512F-NEXT: movl %eax, %ebx +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: shrl $16, %ebx +; AVX512F-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512F-NEXT: shrl $24, %r11d +; AVX512F-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $32, %r10 +; AVX512F-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $40, %r9 +; AVX512F-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; AVX512F-NEXT: shrq $48, %r8 +; AVX512F-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512F-NEXT: movq 24(%rdi), %rax +; AVX512F-NEXT: shrq $56, %rcx +; AVX512F-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $8, %ecx +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $16, %ecx +; AVX512F-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $24, %ecx +; AVX512F-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $40, %rcx +; AVX512F-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $48, %rcx +; AVX512F-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX512F-NEXT: movq (%rdi), %rcx +; AVX512F-NEXT: shrq $56, %rax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: vmovd %ecx, %xmm1 +; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $16, %eax +; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movl %ecx, %eax +; AVX512F-NEXT: shrl $24, %eax +; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq 
$32, %rax +; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq $40, %rax +; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: shrq $48, %rax +; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: movq 8(%rdi), %rax +; AVX512F-NEXT: shrq $56, %rcx +; AVX512F-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $8, %ecx +; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $16, %ecx +; AVX512F-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movl %eax, %ecx +; AVX512F-NEXT: shrl $24, %ecx +; AVX512F-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $32, %rcx +; AVX512F-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $40, %rcx +; AVX512F-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: shrq $48, %rcx +; AVX512F-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX512F-NEXT: shrq $56, %rax +; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512F-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) -; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX512DQ-NEXT: pushq %rbx +; AVX512DQ-NEXT: movq 16(%rdi), %rax +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: movq %rax, %r8 +; AVX512DQ-NEXT: movq %rax, %r9 +; AVX512DQ-NEXT: movq %rax, %r10 +; AVX512DQ-NEXT: movl %eax, %r11d +; AVX512DQ-NEXT: movl %eax, %ebx +; AVX512DQ-NEXT: vmovd %eax, %xmm0 +; AVX512DQ-NEXT: shrl $8, %eax +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrl $16, %ebx +; AVX512DQ-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrl $24, %r11d +; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $32, %r10 +; AVX512DQ-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $40, %r9 +; AVX512DQ-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; AVX512DQ-NEXT: shrq $48, %r8 +; AVX512DQ-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq 24(%rdi), %rax +; AVX512DQ-NEXT: shrq $56, %rcx +; AVX512DQ-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $8, %ecx +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $16, %ecx +; AVX512DQ-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $24, %ecx +; AVX512DQ-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $32, %rcx +; AVX512DQ-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $40, %rcx +; AVX512DQ-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $48, %rcx +; AVX512DQ-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: movq (%rdi), %rcx +; AVX512DQ-NEXT: 
shrq $56, %rax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $8, %eax +; AVX512DQ-NEXT: vmovd %ecx, %xmm1 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $16, %eax +; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %ecx, %eax +; AVX512DQ-NEXT: shrl $24, %eax +; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $32, %rax +; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $40, %rax +; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rcx, %rax +; AVX512DQ-NEXT: shrq $48, %rax +; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq 8(%rdi), %rax +; AVX512DQ-NEXT: shrq $56, %rcx +; AVX512DQ-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $8, %ecx +; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $16, %ecx +; AVX512DQ-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movl %eax, %ecx +; AVX512DQ-NEXT: shrl $24, %ecx +; AVX512DQ-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $32, %rcx +; AVX512DQ-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $40, %rcx +; AVX512DQ-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: movq %rax, %rcx +; AVX512DQ-NEXT: shrq $48, %rcx +; AVX512DQ-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX512DQ-NEXT: shrq $56, %rax +; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm1 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx) +; AVX512DQ-NEXT: popq %rbx ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: movq 16(%rdi), %rax +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: movq %rax, %r9 +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: movl %eax, %r11d +; AVX512BW-NEXT: movl %eax, %ebx +; AVX512BW-NEXT: vmovd %eax, %xmm0 +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: shrl $16, %ebx +; AVX512BW-NEXT: vpinsrb $2, %ebx, %xmm0, %xmm0 +; AVX512BW-NEXT: shrl $24, %r11d +; AVX512BW-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $32, %r10 +; AVX512BW-NEXT: vpinsrb $4, %r10d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $40, %r9 +; AVX512BW-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 +; AVX512BW-NEXT: shrq $48, %r8 +; AVX512BW-NEXT: vpinsrb $6, %r8d, %xmm0, %xmm0 +; AVX512BW-NEXT: movq 24(%rdi), %rax +; AVX512BW-NEXT: shrq $56, %rcx +; AVX512BW-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $8, %ecx +; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $24, %ecx +; AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 +; 
AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $32, %rcx +; AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $40, %rcx +; AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $48, %rcx +; AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 +; AVX512BW-NEXT: movq (%rdi), %rcx +; AVX512BW-NEXT: shrq $56, %rax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: vmovd %ecx, %xmm1 +; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %ecx, %eax +; AVX512BW-NEXT: shrl $24, %eax +; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $32, %rax +; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $40, %rax +; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: shrq $48, %rax +; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: movq 8(%rdi), %rax +; AVX512BW-NEXT: shrq $56, %rcx +; AVX512BW-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $8, %ecx +; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $24, %ecx +; AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $32, %rcx +; AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $40, %rcx +; AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: shrq $48, %rcx +; AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX512BW-NEXT: shrq $56, %rax +; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) +; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64 diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll --- a/llvm/test/CodeGen/X86/atomic-fp.ll +++ b/llvm/test/CodeGen/X86/atomic-fp.ll @@ -93,8 +93,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl %ecx, (%esp) ; X86-NOSSE-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %ecx, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll (%eax) ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -258,8 +258,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll glob64 ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -421,8 +421,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: 
movl %eax, (%esp) ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll -559038737 ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -589,8 +589,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %eax, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl %ebp, %esp @@ -691,8 +691,8 @@ ; X86-NOSSE-NEXT: fstpl {{[0-9]+}}(%esp) ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl %edx, (%esp) ; X86-NOSSE-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: movl %edx, (%esp) ; X86-NOSSE-NEXT: fildll (%esp) ; X86-NOSSE-NEXT: fistpll (%ecx,%eax,8) ; X86-NOSSE-NEXT: leal -4(%ebp), %esp diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll --- a/llvm/test/CodeGen/X86/atomic-idempotent.ll +++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll @@ -202,10 +202,10 @@ ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: movl %edi, 8(%esi) -; X86-SSE2-NEXT: movl %edx, 12(%esi) -; X86-SSE2-NEXT: movl %eax, (%esi) +; X86-SSE2-NEXT: movl %edi, 12(%esi) +; X86-SSE2-NEXT: movl %edx, 8(%esi) ; X86-SSE2-NEXT: movl %ecx, 4(%esi) +; X86-SSE2-NEXT: movl %eax, (%esi) ; X86-SSE2-NEXT: movl %esi, %eax ; X86-SSE2-NEXT: leal -8(%ebp), %esp ; X86-SSE2-NEXT: popl %esi @@ -242,10 +242,10 @@ ; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SLM-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLM-NEXT: movl %edi, 8(%esi) -; X86-SLM-NEXT: movl %edx, 12(%esi) -; X86-SLM-NEXT: movl %eax, (%esi) +; X86-SLM-NEXT: movl %edi, 12(%esi) +; X86-SLM-NEXT: movl %edx, 8(%esi) ; X86-SLM-NEXT: movl %ecx, 4(%esi) +; X86-SLM-NEXT: movl %eax, (%esi) ; X86-SLM-NEXT: movl %esi, %eax ; X86-SLM-NEXT: leal -8(%ebp), %esp ; X86-SLM-NEXT: popl %esi @@ -282,11 +282,11 @@ ; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-ATOM-NEXT: movl %eax, 8(%esi) -; X86-ATOM-NEXT: movl %edi, 12(%esi) -; X86-ATOM-NEXT: movl %ecx, (%esi) -; X86-ATOM-NEXT: movl %esi, %eax +; X86-ATOM-NEXT: movl %eax, 12(%esi) +; X86-ATOM-NEXT: movl %edi, 8(%esi) ; X86-ATOM-NEXT: movl %edx, 4(%esi) +; X86-ATOM-NEXT: movl %esi, %eax +; X86-ATOM-NEXT: movl %ecx, (%esi) ; X86-ATOM-NEXT: leal -8(%ebp), %esp ; X86-ATOM-NEXT: popl %esi ; X86-ATOM-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/atomic-mi.ll b/llvm/test/CodeGen/X86/atomic-mi.ll --- a/llvm/test/CodeGen/X86/atomic-mi.ll +++ b/llvm/test/CodeGen/X86/atomic-mi.ll @@ -751,10 +751,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: andl 16(%ebp), %edx ; X32-NEXT: andl 12(%ebp), %ecx -; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: andl 16(%ebp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp @@ -973,10 +973,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: orl 16(%ebp), %edx ; X32-NEXT: orl 12(%ebp), %ecx -; X32-NEXT: 
movl %ecx, (%esp) +; X32-NEXT: orl 16(%ebp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp @@ -1195,10 +1195,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: xorl 16(%ebp), %edx ; X32-NEXT: xorl 12(%ebp), %ecx -; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: xorl 16(%ebp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp @@ -1603,10 +1603,10 @@ ; X32-NEXT: fistpll {{[0-9]+}}(%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: notl %edx ; X32-NEXT: notl %ecx -; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: notl %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) ; X32-NEXT: fildll (%esp) ; X32-NEXT: fistpll (%eax) ; X32-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -436,10 +436,10 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) -; X86-SSE-NEXT: movl %edx, 12(%esi) -; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %edi, 12(%esi) +; X86-SSE-NEXT: movl %edx, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %eax, (%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $20, %esp ; X86-SSE-NEXT: .cfi_def_cfa_offset 12 @@ -517,10 +517,10 @@ ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOSSE-NEXT: movl %edi, 8(%esi) -; X86-NOSSE-NEXT: movl %edx, 12(%esi) -; X86-NOSSE-NEXT: movl %eax, (%esi) +; X86-NOSSE-NEXT: movl %edi, 12(%esi) +; X86-NOSSE-NEXT: movl %edx, 8(%esi) ; X86-NOSSE-NEXT: movl %ecx, 4(%esi) +; X86-NOSSE-NEXT: movl %eax, (%esi) ; X86-NOSSE-NEXT: movl %esi, %eax ; X86-NOSSE-NEXT: addl $20, %esp ; X86-NOSSE-NEXT: .cfi_def_cfa_offset 12 diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll @@ -1497,13 +1497,12 @@ ; CHECK-NEXT: lock cmpxchgq %rcx, (%rdi) ; CHECK-NEXT: jne .LBB51_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: movl $123, %ecx ; CHECK-NEXT: testb $32, %al +; CHECK-NEXT: movl $123, %eax ; CHECK-NEXT: jne .LBB51_4 ; CHECK-NEXT: # %bb.3: # %if.then -; CHECK-NEXT: movq 32(%rdi), %rcx +; CHECK-NEXT: movq 32(%rdi), %rax ; CHECK-NEXT: .LBB51_4: # %return -; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: retq entry: %0 = atomicrmw xor ptr %v, i64 16 monotonic, align 8 diff --git a/llvm/test/CodeGen/X86/atomic-xor.ll b/llvm/test/CodeGen/X86/atomic-xor.ll --- a/llvm/test/CodeGen/X86/atomic-xor.ll +++ b/llvm/test/CodeGen/X86/atomic-xor.ll @@ -40,10 +40,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: leal 
-8(%ebp), %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/atomic128.ll b/llvm/test/CodeGen/X86/atomic128.ll --- a/llvm/test/CodeGen/X86/atomic128.ll +++ b/llvm/test/CodeGen/X86/atomic128.ll @@ -63,10 +63,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK32-NEXT: movl %edi, 8(%esi) -; CHECK32-NEXT: movl %edx, 12(%esi) -; CHECK32-NEXT: movl %eax, (%esi) +; CHECK32-NEXT: movl %edi, 12(%esi) +; CHECK32-NEXT: movl %edx, 8(%esi) ; CHECK32-NEXT: movl %ecx, 4(%esi) +; CHECK32-NEXT: movl %eax, (%esi) ; CHECK32-NEXT: movl %esi, %eax ; CHECK32-NEXT: addl $20, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 12 @@ -173,10 +173,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -241,10 +241,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -309,10 +309,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -377,10 +377,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -448,10 +448,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -519,10 +519,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; 
CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -590,10 +590,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -661,10 +661,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl %esi, var+8 -; CHECK32-NEXT: movl %edx, var+12 -; CHECK32-NEXT: movl %eax, var +; CHECK32-NEXT: movl %esi, var+12 +; CHECK32-NEXT: movl %edx, var+8 ; CHECK32-NEXT: movl %ecx, var+4 +; CHECK32-NEXT: movl %eax, var ; CHECK32-NEXT: addl $24, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %esi @@ -731,10 +731,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK32-NEXT: movl %edi, 8(%esi) -; CHECK32-NEXT: movl %edx, 12(%esi) -; CHECK32-NEXT: movl %eax, (%esi) +; CHECK32-NEXT: movl %edi, 12(%esi) +; CHECK32-NEXT: movl %edx, 8(%esi) ; CHECK32-NEXT: movl %ecx, 4(%esi) +; CHECK32-NEXT: movl %eax, (%esi) ; CHECK32-NEXT: movl %esi, %eax ; CHECK32-NEXT: addl $20, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 12 @@ -803,10 +803,10 @@ ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK32-NEXT: movl %edi, 8(%esi) -; CHECK32-NEXT: movl %edx, 12(%esi) -; CHECK32-NEXT: movl %eax, (%esi) +; CHECK32-NEXT: movl %edi, 12(%esi) +; CHECK32-NEXT: movl %edx, 8(%esi) ; CHECK32-NEXT: movl %ecx, 4(%esi) +; CHECK32-NEXT: movl %eax, (%esi) ; CHECK32-NEXT: movl %esi, %eax ; CHECK32-NEXT: addl $20, %esp ; CHECK32-NEXT: .cfi_def_cfa_offset 12 diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -16,9 +16,10 @@ ; ; AVX-LABEL: avg_v4i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovd %xmm0, (%rax) ; AVX-NEXT: retq %1 = load <4 x i8>, ptr %a @@ -42,13 +43,45 @@ ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i8>, ptr %a %2 = load <8 x i8>, ptr %b %3 = zext <8 x i8> %1 to <8 x i32> @@ -69,12 +102,54 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <16 x i8>, ptr %a %2 = load <16 x i8>, ptr %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -90,28 +165,28 @@ define void @avg_v24i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v24i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v24i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpavgb (%rsi), 
%xmm0, %xmm0 +; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v24i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovq %xmm1, (%rax) ; AVX2-NEXT: vmovdqu %xmm0, (%rax) @@ -120,8 +195,8 @@ ; ; AVX512-LABEL: avg_v24i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vmovq %xmm1, (%rax) ; AVX512-NEXT: vmovdqu %xmm0, (%rax) @@ -142,36 +217,89 @@ define void @avg_v32i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v32i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpavgb %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -190,12 +318,12 @@ define void @avg_v48i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v48i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 -; SSE2-NEXT: pavgb 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 +; SSE2-NEXT: pavgb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) @@ -203,12 +331,12 @@ ; ; AVX1-LABEL: avg_v48i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vmovdqu %xmm2, (%rax) @@ -216,10 +344,10 @@ ; ; AVX2-LABEL: avg_v48i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-NEXT: vpavgb 32(%rdi), %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: 
vmovdqa 32(%rdi), %xmm1 +; AVX2-NEXT: vpavgb 32(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vmovdqu %xmm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -227,10 +355,10 @@ ; ; AVX512F-LABEL: avg_v48i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512F-NEXT: vpavgb 32(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512F-NEXT: vpavgb 32(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vmovdqu %xmm1, (%rax) ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper @@ -238,8 +366,8 @@ ; ; AVX512BW-LABEL: avg_v48i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, (%rax) ; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -259,14 +387,14 @@ define void @avg_v64i8(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v64i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgb (%rdi), %xmm0 -; SSE2-NEXT: pavgb 16(%rdi), %xmm1 -; SSE2-NEXT: pavgb 32(%rdi), %xmm2 -; SSE2-NEXT: pavgb 48(%rdi), %xmm3 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rsi), %xmm1 +; SSE2-NEXT: pavgb 32(%rsi), %xmm2 +; SSE2-NEXT: pavgb 48(%rsi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) @@ -275,46 +403,173 @@ ; ; AVX1-LABEL: avg_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgb 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgb 48(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: 
vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; 
AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpavgb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; 
AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm7, %ymm6, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpavgb %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpavgb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm2, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero 
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovdb %zmm7, %xmm7 +; AVX512F-NEXT: vpmovdb %zmm6, %xmm6 +; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovdb %zmm5, %xmm3 +; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpavgb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm2, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovdb %zmm7, %xmm7 +; AVX512BW-NEXT: vpmovdb %zmm6, %xmm6 +; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 +; AVX512BW-NEXT: vpmovdb %zmm5, %xmm5 +; AVX512BW-NEXT: vpmovdb %zmm4, %xmm4 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512BW-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpavgb %zmm4, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -339,13 
+594,34 @@ ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: retq %1 = load <4 x i16>, ptr %a %2 = load <4 x i16>, ptr %b %3 = zext <4 x i16> %1 to <4 x i32> @@ -366,12 +642,41 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; 
AVX512-NEXT: retq %1 = load <8 x i16>, ptr %a %2 = load <8 x i16>, ptr %b %3 = zext <8 x i16> %1 to <8 x i32> @@ -387,36 +692,55 @@ define void @avg_v16i16(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v16i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: 
vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -435,14 +759,14 @@ define void @avg_v32i16(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v32i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 -; SSE2-NEXT: pavgw 32(%rdi), %xmm2 -; SSE2-NEXT: pavgw 48(%rdi), %xmm3 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 +; SSE2-NEXT: pavgw 32(%rsi), %xmm2 +; SSE2-NEXT: pavgw 48(%rsi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) @@ -451,46 +775,93 @@ ; ; AVX1-LABEL: avg_v32i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3 -; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgw 32(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgw 48(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 
32(%rsi), %ymm1 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpavgw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -509,16 +880,16 @@ define void @avg_v40i16(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: avg_v40i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pavgw (%rdi), %xmm0 -; SSE2-NEXT: pavgw 16(%rdi), %xmm1 -; SSE2-NEXT: pavgw 32(%rdi), %xmm2 -; SSE2-NEXT: pavgw 48(%rdi), %xmm3 -; SSE2-NEXT: movdqa 64(%rsi), %xmm4 -; SSE2-NEXT: pavgw 64(%rdi), %xmm4 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 +; SSE2-NEXT: pavgw 32(%rsi), %xmm2 +; SSE2-NEXT: pavgw 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa 64(%rdi), %xmm4 +; SSE2-NEXT: pavgw 64(%rsi), %xmm4 ; SSE2-NEXT: movdqu %xmm4, (%rax) ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) @@ -528,16 +899,16 @@ ; ; AVX1-LABEL: avg_v40i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX1-NEXT: vpavgw 64(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX1-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX1-NEXT: vpavgw (%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgw 32(%rdi), %xmm3, %xmm3 -; AVX1-NEXT: vpavgw 48(%rdi), %xmm4, %xmm4 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX1-NEXT: vpavgw 64(%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX1-NEXT: vpavgw (%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgw 16(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vpavgw 32(%rsi), %xmm3, %xmm3 +; AVX1-NEXT: vpavgw 48(%rsi), %xmm4, %xmm4 ; AVX1-NEXT: vmovdqu %xmm4, (%rax) ; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: vmovdqu %xmm2, (%rax) @@ -547,12 +918,12 @@ ; ; AVX2-LABEL: avg_v40i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rsi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX2-NEXT: vpavgw 64(%rdi), %xmm2, 
%xmm2 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-NEXT: vpavgw 64(%rsi), %xmm2, %xmm2 ; AVX2-NEXT: vmovdqu %xmm2, (%rax) ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) @@ -561,12 +932,12 @@ ; ; AVX512F-LABEL: avg_v40i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512F-NEXT: vpavgw 64(%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-NEXT: vpavgw 64(%rsi), %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vmovdqu %xmm2, (%rax) @@ -575,10 +946,10 @@ ; ; AVX512BW-LABEL: avg_v40i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512BW-NEXT: vpavgw 64(%rdi), %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512BW-NEXT: vpavgw 64(%rsi), %xmm1, %xmm1 ; AVX512BW-NEXT: vmovdqu %xmm1, (%rax) ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -606,9 +977,10 @@ ; ; AVX-LABEL: avg_v4i8_2: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vmovd %xmm0, (%rax) ; AVX-NEXT: retq %1 = load <4 x i8>, ptr %a @@ -632,13 +1004,45 @@ ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i8_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i8_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i8_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i8>, ptr %a %2 = load <8 x i8>, ptr %b %3 = zext <8 x i8> %1 to <8 x i32> @@ -659,12 +1063,54 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v16i8_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v16i8_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v16i8_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v16i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <16 x i8>, ptr %a %2 = load <16 x i8>, ptr %b %3 = zext <16 x i8> %1 to <16 x i32> @@ -690,26 +1136,79 @@ ; ; AVX1-LABEL: avg_v32i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd 
{{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v32i8_2: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpavgb %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -740,28 +1239,96 @@ ; ; AVX1-LABEL: avg_v64i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rsi), %ymm0 -; AVX1-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 -; AVX2-NEXT: vmovups %ymm1, (%rax) -; AVX2-NEXT: vmovups %ymm0, (%rax) +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm2, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: avg_v64i8_2: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rsi), %zmm0 -; AVX512-NEXT: vmovups %zmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: avg_v64i8_2: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovdb %zmm3, (%rax) +; AVX512F-NEXT: vpmovdb %zmm2, (%rax) +; AVX512F-NEXT: vpmovdb %zmm1, (%rax) +; AVX512F-NEXT: vpmovdb %zmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: avg_v64i8_2: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} 
zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %1 = load <64 x i8>, ptr %a %2 = load <64 x i8>, ptr %b %3 = zext <64 x i8> %1 to <64 x i32> @@ -784,13 +1351,34 @@ ; SSE2-NEXT: movq %xmm1, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v4i16_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v4i16_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v4i16_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v4i16_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] +; 
AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: retq %1 = load <4 x i16>, ptr %a %2 = load <4 x i16>, ptr %b %3 = zext <4 x i16> %1 to <4 x i32> @@ -811,12 +1399,41 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i16_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i16_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i16_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i16_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i16>, ptr %a %2 = load <8 x i16>, ptr %b %3 = zext <8 x i16> %1 to <8 x i32> @@ -842,26 +1459,45 @@ ; ; AVX1-LABEL: avg_v16i16_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
avg_v16i16_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: avg_v16i16_2: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -896,46 +1532,93 @@ ; ; AVX1-LABEL: avg_v32i16_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 -; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2 -; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) -; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; 
AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i16_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v32i16_2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpavgw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -961,7 +1644,9 @@ ; ; AVX-LABEL: avg_v4i8_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, (%rax) ; AVX-NEXT: retq @@ -982,12 +1667,35 @@ ; SSE2-NEXT: movq %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i8_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i8_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovq %xmm0, (%rax) +; 
AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i8_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i8_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i8>, ptr %a %2 = zext <8 x i8> %1 to <8 x i32> %3 = add nuw nsw <8 x i32> %2, @@ -1005,12 +1713,40 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v16i8_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v16i8_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v16i8_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v16i8_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vpavgb 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <16 x i8>, ptr %a %2 = zext <16 x i8> %1 to <16 x i32> %3 = add nuw nsw <16 x i32> %2, @@ -1033,17 +1769,40 @@ ; ; AVX1-LABEL: avg_v32i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX1-NEXT: # xmm0 = mem[0,0] -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i8_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1051,7 +1810,11 @@ ; ; AVX512-LABEL: avg_v32i8_const: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: 
vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper @@ -1084,41 +1847,109 @@ ; ; AVX1-LABEL: avg_v64i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX1-NEXT: # xmm0 = mem[0,0] -; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm3 -; AVX1-NEXT: vpavgb 48(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vpavgb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpavgb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1 -; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpavgb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} 
ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpavgb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqu %ymm2, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8_const: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpavgb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpavgb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm2, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v64i8_const: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512BW-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512BW-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpavgb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -1142,7 +1973,8 @@ ; ; AVX-LABEL: avg_v4i16_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, (%rax) ; AVX-NEXT: retq @@ -1163,12 +1995,33 @@ ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; -; AVX-LABEL: avg_v8i16_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-LABEL: avg_v8i16_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v8i16_const: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v8i16_const: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %1 = load <8 x i16>, ptr %a %2 = zext <8 x i16> %1 to <8 x i32> %3 = add nuw nsw <8 x i32> %2, 
@@ -1191,16 +2044,25 @@ ; ; AVX1-LABEL: avg_v16i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v16i16_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1208,7 +2070,8 @@ ; ; AVX512-LABEL: avg_v16i16_const: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, (%rax) ; AVX512-NEXT: vzeroupper @@ -1241,23 +2104,43 @@ ; ; AVX1-LABEL: avg_v32i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] -; AVX1-NEXT: vpavgw (%rdi), %xmm0, %xmm1 -; AVX1-NEXT: vpavgw 16(%rdi), %xmm0, %xmm2 -; AVX1-NEXT: vpavgw 32(%rdi), %xmm0, %xmm3 -; AVX1-NEXT: vpavgw 48(%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpavgw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpavgw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpavgw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpavgw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vmovdqu %xmm3, (%rax) -; AVX1-NEXT: vmovdqu %xmm2, (%rax) ; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu 
%xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm3, (%rax) ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX2-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1 -; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper @@ -1265,10 +2148,14 @@ ; ; AVX512F-LABEL: avg_v32i16_const: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1 -; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-NEXT: vpavgw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper @@ -1276,7 +2163,11 @@ ; ; AVX512BW-LABEL: avg_v32i16_const: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpavgw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper @@ -1739,141 +2630,114 @@ ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movaps (%rdi), %xmm1 -; SSE2-NEXT: movaps (%rsi), 
%xmm0 -; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps (%rsi), %xmm1 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: addq %rax, %rcx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: addq %rbp, %rax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r13,%rbp), %r13 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r12,%rbp), %r12 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r15,%rbp), %r15 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r14,%rbp), %r14 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rbx,%rbp), %rbx -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r11,%rbp), %r11 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r10,%rbp), %r10 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r9,%rbp), %r9 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%r8,%rbp), %r8 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rdi,%rbp), %rdi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rsi,%rbp), %rsi -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp -; SSE2-NEXT: leaq -1(%rdx,%rbp), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE2-NEXT: leaq -1(%rbp,%rdx), %rdx -; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: xorl %ebp, %ebp -; SSE2-NEXT: addq $-1, %rcx -; SSE2-NEXT: movl $0, %edx -; SSE2-NEXT: adcq $-1, %rdx -; SSE2-NEXT: addq $-1, %rax -; SSE2-NEXT: adcq $-1, %rbp -; SSE2-NEXT: shldq $63, %rax, %rbp -; SSE2-NEXT: shldq $63, %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: movq %rbp, %xmm0 -; SSE2-NEXT: shrq %r13 -; SSE2-NEXT: movq %r13, %xmm3 -; SSE2-NEXT: shrq %r12 -; SSE2-NEXT: movq %r12, %xmm2 -; SSE2-NEXT: shrq %r15 -; SSE2-NEXT: movq %r15, %xmm5 -; SSE2-NEXT: shrq %r14 -; SSE2-NEXT: movq %r14, %xmm4 -; SSE2-NEXT: shrq %rbx -; SSE2-NEXT: movq %rbx, %xmm6 -; SSE2-NEXT: shrq %r11 -; SSE2-NEXT: movq %r11, %xmm7 -; SSE2-NEXT: shrq %r10 -; 
SSE2-NEXT: movq %r10, %xmm9 -; SSE2-NEXT: shrq %r9 -; SSE2-NEXT: movq %r9, %xmm8 -; SSE2-NEXT: shrq %r8 -; SSE2-NEXT: movq %r8, %xmm10 -; SSE2-NEXT: shrq %rdi -; SSE2-NEXT: movq %rdi, %xmm11 -; SSE2-NEXT: shrq %rsi -; SSE2-NEXT: movq %rsi, %xmm12 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm13 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm14 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE2-NEXT: shrq %rax -; SSE2-NEXT: movq %rax, %xmm15 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rcx,%rsi), %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rdx,%rsi), %ecx +; SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rdi,%rsi), %edi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r9,%rsi), %r9d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rbx,%rsi), %ebx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rbp,%rsi), %ebp +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r12,%rsi), %r12d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%rax,%rsi), %eax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r13,%rsi), %r13d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r15,%rsi), %r15d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r14,%rsi), %r14d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r11,%rsi), %r11d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: leal -1(%r10,%rsi), %esi +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: leal -1(%r8,%r10), %r8d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; SSE2-NEXT: leal -1(%rcx,%r10), %r10d +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE2-NEXT: leal -1(%rdx,%rcx), %ecx +; SSE2-NEXT: shrl %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: shrl %r10d +; SSE2-NEXT: movd %r10d, %xmm1 +; SSE2-NEXT: shrl %r8d +; SSE2-NEXT: movd %r8d, %xmm3 +; SSE2-NEXT: shrl %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: shrl %r11d +; SSE2-NEXT: movd %r11d, %xmm4 +; SSE2-NEXT: shrl %r14d +; SSE2-NEXT: movd %r14d, %xmm5 +; SSE2-NEXT: shrl %r15d +; SSE2-NEXT: movd %r15d, %xmm6 +; SSE2-NEXT: shrl %r13d +; SSE2-NEXT: movd %r13d, %xmm7 +; SSE2-NEXT: shrl %eax +; SSE2-NEXT: movd %eax, %xmm8 +; SSE2-NEXT: shrl %r12d +; SSE2-NEXT: movd %r12d, %xmm9 +; SSE2-NEXT: shrl %ebp +; SSE2-NEXT: movd %ebp, %xmm10 +; SSE2-NEXT: shrl %ebx +; SSE2-NEXT: movd %ebx, %xmm11 +; SSE2-NEXT: shrl %r9d +; SSE2-NEXT: movd %r9d, %xmm12 +; SSE2-NEXT: shrl %edi +; SSE2-NEXT: movd %edi, %xmm13 +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SSE2-NEXT: shrl %eax +; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; SSE2-NEXT: shrl %eax +; SSE2-NEXT: movd %eax, %xmm15 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: psllq $48, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,0,1,1] -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm8, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; SSE2-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,0,1] -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm13, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: movupd %xmm2, (%rax) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = 
xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm7[0] +; SSE2-NEXT: movdqu %xmm15, (%rax) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -1894,102 +2758,92 @@ ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpextrw $4, %xmm0, %eax +; AVX1-NEXT: vpextrw $6, %xmm0, %eax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $5, %xmm0, %eax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $6, %xmm0, %r10d -; AVX1-NEXT: vpextrw $7, %xmm0, %edx -; AVX1-NEXT: vpextrw $0, %xmm3, %edi -; AVX1-NEXT: vpextrw $1, %xmm3, %r8d +; AVX1-NEXT: vpextrw $7, %xmm0, %ecx +; AVX1-NEXT: vpextrw $0, %xmm0, %edx +; AVX1-NEXT: vpextrw $1, %xmm0, %esi +; AVX1-NEXT: vpextrw $2, %xmm0, %edi +; AVX1-NEXT: vpextrw $3, %xmm0, %r8d +; AVX1-NEXT: vpextrw $0, %xmm3, %r10d +; AVX1-NEXT: vpextrw $1, %xmm3, %r11d +; AVX1-NEXT: vpextrw $4, %xmm3, %r14d +; AVX1-NEXT: vpextrw $5, %xmm3, %r15d +; AVX1-NEXT: vpextrw $7, %xmm3, %r12d +; AVX1-NEXT: vpextrw $6, %xmm3, %r13d +; AVX1-NEXT: vpextrw $3, %xmm3, %eax ; AVX1-NEXT: vpextrw $2, %xmm3, %r9d -; AVX1-NEXT: vpextrw $3, %xmm3, %r11d -; AVX1-NEXT: vpextrw $4, %xmm3, %ebx -; AVX1-NEXT: vpextrw $5, %xmm3, %r14d -; AVX1-NEXT: vpextrw $6, %xmm3, %r15d -; AVX1-NEXT: vpextrw $7, %xmm3, %esi -; AVX1-NEXT: vpextrw $1, %xmm0, %r13d -; AVX1-NEXT: vpextrw $0, %xmm0, %r12d -; AVX1-NEXT: vpextrw $1, %xmm1, %ecx -; AVX1-NEXT: addq %r13, %rcx -; AVX1-NEXT: vpextrw $0, %xmm1, %eax -; AVX1-NEXT: addq %r12, %rax -; AVX1-NEXT: vpextrw $7, %xmm2, %r12d -; AVX1-NEXT: leaq -1(%rsi,%r12), %rsi -; AVX1-NEXT: vpextrw $6, %xmm2, %r12d -; AVX1-NEXT: leaq -1(%r15,%r12), %rbp -; AVX1-NEXT: vpextrw $5, %xmm2, %r15d -; AVX1-NEXT: leaq -1(%r14,%r15), %r13 -; AVX1-NEXT: vpextrw $4, %xmm2, %r14d -; AVX1-NEXT: leaq -1(%rbx,%r14), %r12 -; AVX1-NEXT: vpextrw $3, %xmm2, %ebx -; AVX1-NEXT: leaq -1(%r11,%rbx), %r15 -; AVX1-NEXT: vpextrw $2, %xmm2, %r11d -; AVX1-NEXT: leaq -1(%r9,%r11), %r14 +; AVX1-NEXT: vpextrw $2, %xmm2, %ebx +; AVX1-NEXT: leal -1(%r9,%rbx), %r9d +; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $3, %xmm2, %r9d +; AVX1-NEXT: leal -1(%rax,%r9), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX1-NEXT: vpextrw $6, %xmm2, %eax +; AVX1-NEXT: leal -1(%r13,%rax), %ebp +; AVX1-NEXT: vpextrw $7, %xmm2, %eax +; AVX1-NEXT: leal -1(%r12,%rax), %eax +; AVX1-NEXT: vpextrw $5, %xmm2, %r9d +; AVX1-NEXT: leaq -1(%r15,%r9), %r13 +; AVX1-NEXT: vpextrw $4, %xmm2, %r9d +; AVX1-NEXT: leaq -1(%r14,%r9), %r12 ; AVX1-NEXT: vpextrw $1, %xmm2, %r9d -; AVX1-NEXT: leaq -1(%r8,%r9), %rbx -; AVX1-NEXT: vpextrw $0, %xmm2, %r8d -; AVX1-NEXT: leaq -1(%rdi,%r8), %r11 -; AVX1-NEXT: vpextrw $7, %xmm1, %edi -; AVX1-NEXT: leaq -1(%rdx,%rdi), %r9 -; AVX1-NEXT: vpextrw $6, %xmm1, %edx -; AVX1-NEXT: leaq -1(%r10,%rdx), %r8 -; AVX1-NEXT: vpextrw $5, %xmm1, %edx -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX1-NEXT: leaq -1(%rdi,%rdx), %rdi -; 
AVX1-NEXT: vpextrw $4, %xmm1, %edx -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX1-NEXT: leaq -1(%r10,%rdx), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $3, %xmm0, %edx -; AVX1-NEXT: vpextrw $3, %xmm1, %r10d -; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: vpextrw $2, %xmm0, %edx -; AVX1-NEXT: vpextrw $2, %xmm1, %r10d -; AVX1-NEXT: leaq -1(%rdx,%r10), %rdx -; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: xorl %edx, %edx -; AVX1-NEXT: addq $-1, %rcx -; AVX1-NEXT: movl $0, %r10d -; AVX1-NEXT: adcq $-1, %r10 -; AVX1-NEXT: addq $-1, %rax -; AVX1-NEXT: adcq $-1, %rdx -; AVX1-NEXT: shldq $63, %rax, %rdx -; AVX1-NEXT: shldq $63, %rcx, %r10 -; AVX1-NEXT: shrq %rsi -; AVX1-NEXT: vmovq %rsi, %xmm0 -; AVX1-NEXT: shrq %rbp -; AVX1-NEXT: vmovq %rbp, %xmm1 +; AVX1-NEXT: leaq -1(%r11,%r9), %r15 +; AVX1-NEXT: vpextrw $0, %xmm2, %r9d +; AVX1-NEXT: leaq -1(%r10,%r9), %r14 +; AVX1-NEXT: vpextrw $3, %xmm1, %r9d +; AVX1-NEXT: leaq -1(%r8,%r9), %r11 +; AVX1-NEXT: vpextrw $2, %xmm1, %r8d +; AVX1-NEXT: leaq -1(%rdi,%r8), %r10 +; AVX1-NEXT: vpextrw $1, %xmm1, %edi +; AVX1-NEXT: leaq -1(%rsi,%rdi), %r8 +; AVX1-NEXT: vpextrw $0, %xmm1, %esi +; AVX1-NEXT: leaq -1(%rdx,%rsi), %rdi +; AVX1-NEXT: vpextrw $7, %xmm1, %edx +; AVX1-NEXT: leaq -1(%rcx,%rdx), %rsi +; AVX1-NEXT: vpextrw $6, %xmm1, %ecx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX1-NEXT: leaq -1(%rdx,%rcx), %rdx +; AVX1-NEXT: vpextrw $5, %xmm0, %ecx +; AVX1-NEXT: vpextrw $5, %xmm1, %r9d +; AVX1-NEXT: leaq -1(%rcx,%r9), %rcx +; AVX1-NEXT: vpextrw $4, %xmm0, %r9d +; AVX1-NEXT: vpextrw $4, %xmm1, %ebx +; AVX1-NEXT: leaq -1(%r9,%rbx), %r9 +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: shrl %ebp +; AVX1-NEXT: vmovd %ebp, %xmm1 ; AVX1-NEXT: shrq %r13 ; AVX1-NEXT: vmovq %r13, %xmm2 ; AVX1-NEXT: shrq %r12 ; AVX1-NEXT: vmovq %r12, %xmm3 +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: vmovd %eax, %xmm4 +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX1-NEXT: shrl %eax +; AVX1-NEXT: vmovd %eax, %xmm5 ; AVX1-NEXT: shrq %r15 -; AVX1-NEXT: vmovq %r15, %xmm4 +; AVX1-NEXT: vmovq %r15, %xmm6 ; AVX1-NEXT: shrq %r14 -; AVX1-NEXT: vmovq %r14, %xmm5 -; AVX1-NEXT: shrq %rbx -; AVX1-NEXT: vmovq %rbx, %xmm6 +; AVX1-NEXT: vmovq %r14, %xmm7 ; AVX1-NEXT: shrq %r11 -; AVX1-NEXT: vmovq %r11, %xmm7 -; AVX1-NEXT: shrq %r9 -; AVX1-NEXT: vmovq %r9, %xmm8 +; AVX1-NEXT: vmovq %r11, %xmm8 +; AVX1-NEXT: shrq %r10 +; AVX1-NEXT: vmovq %r10, %xmm9 ; AVX1-NEXT: shrq %r8 -; AVX1-NEXT: vmovq %r8, %xmm9 +; AVX1-NEXT: vmovq %r8, %xmm10 ; AVX1-NEXT: shrq %rdi -; AVX1-NEXT: vmovq %rdi, %xmm10 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm11 -; AVX1-NEXT: vmovq %r10, %xmm12 +; AVX1-NEXT: vmovq %rdi, %xmm11 +; AVX1-NEXT: shrq %rsi +; AVX1-NEXT: vmovq %rsi, %xmm12 +; AVX1-NEXT: shrq %rdx ; AVX1-NEXT: vmovq %rdx, %xmm13 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm14 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shrq %rax -; AVX1-NEXT: vmovq %rax, %xmm15 +; AVX1-NEXT: shrq %rcx +; AVX1-NEXT: vmovq %rcx, %xmm14 +; AVX1-NEXT: shrq %r9 +; AVX1-NEXT: vmovq %r9, %xmm15 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] @@ -2003,14 +2857,13 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX1-NEXT: vpsllq $48, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX1-NEXT: vpslld $16, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: popq %rbx @@ -2029,187 +2882,140 @@ ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovq %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; 
AVX2-NEXT: vmovq %xmm7, %r13 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX2-NEXT: vmovq %xmm2, %rbp -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm8 -; AVX2-NEXT: vmovq %xmm8, %r8 -; AVX2-NEXT: vpextrq $1, %xmm8, %r15 -; AVX2-NEXT: vpextrq $1, %xmm2, %r14 -; AVX2-NEXT: vpextrq $1, %xmm7, %rbx -; AVX2-NEXT: vpextrq $1, %xmm6, %rsi -; AVX2-NEXT: vpextrq $1, %xmm5, %rdx -; AVX2-NEXT: vpextrq $1, %xmm1, %rcx -; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: vmovq %xmm3, %rdi -; AVX2-NEXT: vpextrq $1, %xmm0, %r10 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX2-NEXT: vpextrq $1, %xmm9, %r11 -; AVX2-NEXT: addq %r15, %r11 -; AVX2-NEXT: vpextrq $1, %xmm8, %r9 -; AVX2-NEXT: addq %r14, %r9 -; AVX2-NEXT: movq %r9, %r14 -; AVX2-NEXT: vpextrq $1, %xmm7, %r9 -; AVX2-NEXT: addq %rbx, %r9 -; AVX2-NEXT: movq %r9, %rbx -; AVX2-NEXT: vpextrq $1, %xmm4, %r15 -; AVX2-NEXT: addq %rsi, %r15 -; AVX2-NEXT: vpextrq $1, %xmm5, %r12 -; AVX2-NEXT: addq %rdx, %r12 -; AVX2-NEXT: vpextrq $1, %xmm3, %r9 -; AVX2-NEXT: addq %rcx, %r9 -; AVX2-NEXT: vpextrq $1, %xmm6, %rsi -; AVX2-NEXT: addq %rax, %rsi -; AVX2-NEXT: vmovq %xmm6, %rdx -; AVX2-NEXT: addq %rdi, %rdx +; AVX2-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX2-NEXT: vpextrq $1, %xmm2, %rcx -; AVX2-NEXT: addq %r10, %rcx -; AVX2-NEXT: vmovq %xmm9, %r10 -; AVX2-NEXT: leaq -1(%r8,%r10), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm8, %rdi -; AVX2-NEXT: leaq -1(%rbp,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm7, %rdi -; AVX2-NEXT: leaq -1(%r13,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vmovq %xmm4, %rdx +; AVX2-NEXT: vpextrq $1, %xmm4, %rsi +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX2-NEXT: vmovq %xmm4, %rdi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm5, %rdi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: leaq -1(%rax,%rdi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm1, %rdi -; AVX2-NEXT: vmovq %xmm3, %r8 -; AVX2-NEXT: leaq -1(%rdi,%r8), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: vmovq %xmm0, %rdi -; AVX2-NEXT: vmovq %xmm2, %r8 -; AVX2-NEXT: leaq -1(%rdi,%r8), %rdi -; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: addq $-1, %r11 -; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %r8d -; AVX2-NEXT: adcq $-1, %r8 -; AVX2-NEXT: addq $-1, %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %edi -; AVX2-NEXT: adcq $-1, %rdi -; AVX2-NEXT: addq $-1, %rbx -; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %r11d -; AVX2-NEXT: adcq $-1, %r11 -; AVX2-NEXT: addq $-1, %r15 -; AVX2-NEXT: movl $0, %r10d -; AVX2-NEXT: adcq $-1, %r10 -; AVX2-NEXT: addq $-1, %r12 -; AVX2-NEXT: movl $0, %r14d -; AVX2-NEXT: adcq $-1, %r14 -; AVX2-NEXT: addq $-1, %r9 -; AVX2-NEXT: movl $0, %ebp -; AVX2-NEXT: adcq $-1, %rbp -; AVX2-NEXT: addq $-1, %rsi -; AVX2-NEXT: movl $0, %r13d -; AVX2-NEXT: adcq $-1, %r13 -; AVX2-NEXT: addq $-1, %rdx -; AVX2-NEXT: movl $0, %ebx -; AVX2-NEXT: adcq $-1, %rbx -; AVX2-NEXT: addq $-1, %rcx -; AVX2-NEXT: movl $0, %eax -; AVX2-NEXT: adcq $-1, %rax -; AVX2-NEXT: shldq $63, %rcx, %rax -; AVX2-NEXT: shldq $63, %rdx, %rbx -; AVX2-NEXT: shldq $63, %rsi, %r13 -; AVX2-NEXT: shldq $63, %r9, %rbp -; AVX2-NEXT: shldq $63, %r12, %r14 -; AVX2-NEXT: shldq $63, %r15, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %rdi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r8 -; AVX2-NEXT: vmovq %r8, %xmm0 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vmovq %rdi, %xmm2 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm3 +; AVX2-NEXT: vpextrq $1, %xmm4, %r8 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vmovq %xmm1, %r10 +; AVX2-NEXT: vpextrq $1, %xmm1, %r11 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %r14 +; AVX2-NEXT: vpextrq $1, %xmm1, %r15 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %r9 +; AVX2-NEXT: vpextrq $1, %xmm1, %rbx +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %r13 +; AVX2-NEXT: vpextrq $1, %xmm1, %r12 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX2-NEXT: vmovq %xmm7, %rax +; AVX2-NEXT: leal -1(%r9,%rax), %eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vpextrq $1, %xmm7, %rax +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-NEXT: leal -1(%rbx,%rax), %eax +; AVX2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT: vmovq %xmm7, %rax +; AVX2-NEXT: leal -1(%r13,%rax), %ebp +; AVX2-NEXT: vpextrq $1, %xmm7, %rax +; AVX2-NEXT: leal -1(%r12,%rax), %r12d +; AVX2-NEXT: vpextrq $1, %xmm6, %rax +; AVX2-NEXT: leaq -1(%r15,%rax), %rax +; AVX2-NEXT: vmovq %xmm6, %r9 +; AVX2-NEXT: leaq -1(%r14,%r9), %r13 +; AVX2-NEXT: vpextrq $1, %xmm5, %r9 +; AVX2-NEXT: leaq -1(%r11,%r9), %r15 +; AVX2-NEXT: vmovq %xmm5, %r9 +; AVX2-NEXT: leaq -1(%r10,%r9), %r14 +; AVX2-NEXT: vpextrq $1, %xmm4, %r9 +; AVX2-NEXT: leaq -1(%r8,%r9), %r11 +; AVX2-NEXT: vmovq %xmm4, %r8 +; AVX2-NEXT: leaq -1(%rdi,%r8), %r10 +; AVX2-NEXT: vpextrq $1, %xmm3, %rdi +; AVX2-NEXT: leaq -1(%rsi,%rdi), %r8 +; AVX2-NEXT: vmovq %xmm3, %rsi +; AVX2-NEXT: leaq -1(%rdx,%rsi), %rdi +; AVX2-NEXT: vpextrq $1, %xmm2, %rdx +; AVX2-NEXT: leaq -1(%rcx,%rdx), %rsi +; AVX2-NEXT: vmovq %xmm2, %rcx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX2-NEXT: leaq -1(%rdx,%rcx), %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: leaq -1(%rcx,%r9), %rcx +; AVX2-NEXT: vmovq %xmm0, %r9 +; AVX2-NEXT: vmovq %xmm1, %rbx +; AVX2-NEXT: leaq -1(%r9,%rbx), %r9 +; AVX2-NEXT: shrq %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: shrq %r13 +; AVX2-NEXT: vmovq %r13, %xmm1 +; AVX2-NEXT: shrq %r15 +; AVX2-NEXT: vmovq %r15, %xmm2 +; AVX2-NEXT: shrq %r14 +; AVX2-NEXT: vmovq %r14, %xmm3 +; AVX2-NEXT: shrq %r11 ; AVX2-NEXT: vmovq %r11, %xmm4 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm5 -; AVX2-NEXT: vmovq %r10, %xmm6 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; AVX2-NEXT: shrq %r10 +; AVX2-NEXT: vmovq %r10, %xmm5 +; AVX2-NEXT: shrq %r8 +; AVX2-NEXT: vmovq %r8, %xmm6 +; AVX2-NEXT: shrq %rdi +; AVX2-NEXT: vmovq %rdi, %xmm7 +; AVX2-NEXT: shrl %r12d +; AVX2-NEXT: vmovd %r12d, %xmm8 +; AVX2-NEXT: shrl %ebp +; AVX2-NEXT: vmovd %ebp, %xmm9 +; AVX2-NEXT: shrq %rsi +; AVX2-NEXT: vmovq %rsi, %xmm10 +; AVX2-NEXT: shrq %rdx +; AVX2-NEXT: vmovq %rdx, %xmm11 +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: vmovd %eax, %xmm12 +; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX2-NEXT: shrl %eax +; AVX2-NEXT: vmovd %eax, %xmm13 ; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm7 -; AVX2-NEXT: vmovq %r14, %xmm8 -; AVX2-NEXT: 
movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm9 -; AVX2-NEXT: vmovq %rbp, %xmm10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: vmovq %rcx, %xmm11 -; AVX2-NEXT: vmovq %r13, %xmm12 -; AVX2-NEXT: vmovq %rbx, %xmm13 -; AVX2-NEXT: vmovq %rax, %xmm14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: shrq %rax -; AVX2-NEXT: vmovq %rax, %xmm15 +; AVX2-NEXT: vmovq %rcx, %xmm14 +; AVX2-NEXT: shrq %r9 +; AVX2-NEXT: vmovq %r9, %xmm15 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5],xmm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] +; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX2-NEXT: vpslld $16, %xmm3, %xmm3 -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: vpbroadcastw %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 @@ -2228,160 +3034,140 @@ ; AVX512-NEXT: pushq %r13 ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq 
%rbx +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vpextrq $1, %xmm2, %rcx +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: vmovq %xmm4, %rdx +; AVX512-NEXT: vpextrq $1, %xmm4, %rsi ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512-NEXT: vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: vpextrq $1, %xmm4, %rbp -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; AVX512-NEXT: vmovq %xmm4, %rdi -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vmovq %xmm5, %r8 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vmovq %xmm3, %r9 -; AVX512-NEXT: vpextrq $1, %xmm3, %r10 -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vmovq %xmm3, %r11 -; AVX512-NEXT: vpextrq $1, %xmm3, %rbx -; AVX512-NEXT: vpextrq $1, %xmm5, %rax -; AVX512-NEXT: vpextrq $1, %xmm4, %r12 +; AVX512-NEXT: vpextrq $1, %xmm4, %r8 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX512-NEXT: vmovq %xmm1, %r10 +; AVX512-NEXT: vpextrq $1, %xmm1, %r11 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %r14 ; AVX512-NEXT: vpextrq $1, %xmm1, %r15 -; AVX512-NEXT: vpextrq $1, %xmm0, %r14 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm5 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %r9 +; AVX512-NEXT: vpextrq $1, %xmm1, %rbx +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX512-NEXT: vmovq %xmm1, %r13 +; AVX512-NEXT: vpextrq $1, %xmm1, %r12 +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512-NEXT: vmovq %xmm7, %rax +; AVX512-NEXT: leal -1(%r9,%rax), %eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vpextrq $1, %xmm7, %rax ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX512-NEXT: vpextrq $1, %xmm8, %rsi -; AVX512-NEXT: addq %rax, %rsi -; AVX512-NEXT: vpextrq $1, %xmm7, %rdx -; AVX512-NEXT: addq %r12, %rdx -; AVX512-NEXT: vpextrq $1, %xmm4, %rcx -; AVX512-NEXT: addq %r15, %rcx -; AVX512-NEXT: vpextrq $1, %xmm3, %rax -; AVX512-NEXT: addq %r14, %rax -; AVX512-NEXT: vpextrq $1, %xmm9, %r14 -; AVX512-NEXT: leaq -1(%rbx,%r14), %r13 -; AVX512-NEXT: vmovq %xmm9, %rbx -; AVX512-NEXT: leaq -1(%r11,%rbx), %r12 -; AVX512-NEXT: vpextrq $1, %xmm2, %r11 -; AVX512-NEXT: leaq -1(%r10,%r11), %r15 -; AVX512-NEXT: vmovq %xmm2, %r10 -; AVX512-NEXT: leaq -1(%r9,%r10), %r14 -; AVX512-NEXT: vmovq %xmm8, %r9 +; AVX512-NEXT: leal -1(%rbx,%rax), %eax +; AVX512-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: vmovq %xmm7, %rax +; AVX512-NEXT: leal -1(%r13,%rax), %ebp +; AVX512-NEXT: vpextrq $1, %xmm7, %rax +; AVX512-NEXT: leal -1(%r12,%rax), %r12d +; AVX512-NEXT: vpextrq $1, %xmm6, %rax +; AVX512-NEXT: leaq -1(%r15,%rax), %rax +; AVX512-NEXT: vmovq %xmm6, %r9 +; AVX512-NEXT: leaq -1(%r14,%r9), %r13 +; AVX512-NEXT: vpextrq $1, %xmm5, %r9 +; AVX512-NEXT: leaq -1(%r11,%r9), %r15 +; AVX512-NEXT: vmovq %xmm5, %r9 +; AVX512-NEXT: leaq -1(%r10,%r9), %r14 +; AVX512-NEXT: vpextrq $1, %xmm4, %r9 ; AVX512-NEXT: leaq -1(%r8,%r9), %r11 -; AVX512-NEXT: vmovq %xmm7, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %r10 -; AVX512-NEXT: vpextrq $1, %xmm6, %rdi -; AVX512-NEXT: leaq -1(%rbp,%rdi), %r9 -; AVX512-NEXT: vmovq %xmm6, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi 
-; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vpextrq $1, %xmm5, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm5, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: leaq -1(%r8,%rdi), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm1, %rdi ; AVX512-NEXT: vmovq %xmm4, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: vmovq %xmm0, %rdi -; AVX512-NEXT: vmovq %xmm3, %r8 -; AVX512-NEXT: leaq -1(%rdi,%r8), %rdi -; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: addq $-1, %rsi -; AVX512-NEXT: movl $0, %edi -; AVX512-NEXT: adcq $-1, %rdi -; AVX512-NEXT: addq $-1, %rdx -; AVX512-NEXT: movl $0, %ebp -; AVX512-NEXT: adcq $-1, %rbp -; AVX512-NEXT: addq $-1, %rcx -; AVX512-NEXT: movl $0, %ebx -; AVX512-NEXT: adcq $-1, %rbx -; AVX512-NEXT: addq $-1, %rax -; AVX512-NEXT: adcq $-1, %r8 -; AVX512-NEXT: shldq $63, %rax, %r8 -; AVX512-NEXT: shldq $63, %rcx, %rbx -; AVX512-NEXT: shldq $63, %rdx, %rbp -; AVX512-NEXT: shldq $63, %rsi, %rdi +; AVX512-NEXT: leaq -1(%rdi,%r8), %r10 +; AVX512-NEXT: vpextrq $1, %xmm3, %rdi +; AVX512-NEXT: leaq -1(%rsi,%rdi), %r8 +; AVX512-NEXT: vmovq %xmm3, %rsi +; AVX512-NEXT: leaq -1(%rdx,%rsi), %rdi +; AVX512-NEXT: vpextrq $1, %xmm2, %rdx +; AVX512-NEXT: leaq -1(%rcx,%rdx), %rsi +; AVX512-NEXT: vmovq %xmm2, %rcx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX512-NEXT: leaq -1(%rdx,%rcx), %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %r9 +; AVX512-NEXT: leaq -1(%rcx,%r9), %rcx +; AVX512-NEXT: vmovq %xmm0, %r9 +; AVX512-NEXT: vmovq %xmm1, %rbx +; AVX512-NEXT: leaq -1(%r9,%rbx), %r9 +; AVX512-NEXT: shrq %rax +; AVX512-NEXT: vmovq %rax, %xmm0 ; AVX512-NEXT: shrq %r13 -; AVX512-NEXT: vmovq %r13, %xmm0 -; AVX512-NEXT: shrq %r12 -; AVX512-NEXT: vmovq %r12, %xmm1 +; AVX512-NEXT: vmovq %r13, %xmm1 ; AVX512-NEXT: shrq %r15 ; AVX512-NEXT: vmovq %r15, %xmm2 ; AVX512-NEXT: shrq %r14 ; AVX512-NEXT: vmovq %r14, %xmm3 -; AVX512-NEXT: vmovq %rdi, %xmm4 ; AVX512-NEXT: shrq %r11 -; AVX512-NEXT: vmovq %r11, %xmm5 -; AVX512-NEXT: vmovq %rbp, %xmm6 +; AVX512-NEXT: vmovq %r11, %xmm4 ; AVX512-NEXT: shrq %r10 -; AVX512-NEXT: vmovq %r10, %xmm7 +; AVX512-NEXT: vmovq %r10, %xmm5 +; AVX512-NEXT: shrq %r8 +; AVX512-NEXT: vmovq %r8, %xmm6 +; AVX512-NEXT: shrq %rdi +; AVX512-NEXT: vmovq %rdi, %xmm7 +; AVX512-NEXT: shrl %r12d +; AVX512-NEXT: vmovd %r12d, %xmm8 +; AVX512-NEXT: shrl %ebp +; AVX512-NEXT: vmovd %ebp, %xmm9 +; AVX512-NEXT: shrq %rsi +; AVX512-NEXT: vmovq %rsi, %xmm10 +; AVX512-NEXT: shrq %rdx +; AVX512-NEXT: vmovq %rdx, %xmm11 +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX512-NEXT: shrl %eax +; AVX512-NEXT: vmovd %eax, %xmm12 +; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload +; AVX512-NEXT: shrl %eax +; AVX512-NEXT: vmovd %eax, %xmm13 +; AVX512-NEXT: shrq %rcx +; AVX512-NEXT: vmovq %rcx, %xmm14 ; AVX512-NEXT: shrq %r9 -; AVX512-NEXT: vmovq %r9, %xmm8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; 
AVX512-NEXT: vmovq %rax, %xmm10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm11 -; AVX512-NEXT: vmovq %rbx, %xmm12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm13 -; AVX512-NEXT: vmovq %r8, %xmm14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shrq %rax -; AVX512-NEXT: vmovq %rax, %xmm15 +; AVX512-NEXT: vmovq %r9, %xmm15 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1 ; AVX512-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512-NEXT: vpsllq $48, %xmm2, %xmm2 +; AVX512-NEXT: vpbroadcastw %xmm2, %xmm2 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; AVX512-NEXT: vpbroadcastw %xmm3, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512-NEXT: vmovdqu %xmm0, (%rax) ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/avoid-sfb.ll b/llvm/test/CodeGen/X86/avoid-sfb.ll --- a/llvm/test/CodeGen/X86/avoid-sfb.ll +++ b/llvm/test/CodeGen/X86/avoid-sfb.ll 
@@ -561,12 +561,12 @@ ; CHECK-NEXT: movl %ecx, 28(%rdi) ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl %edx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl %esi, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: retq ; ; DISABLED-LABEL: test_stack: @@ -579,8 +579,8 @@ ; DISABLED-NEXT: movups %xmm0, 16(%rdi) ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 -; DISABLED-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; DISABLED-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) +; DISABLED-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; DISABLED-NEXT: retq ; ; AVX-LABEL: test_stack: diff --git a/llvm/test/CodeGen/X86/avx-logic.ll b/llvm/test/CodeGen/X86/avx-logic.ll --- a/llvm/test/CodeGen/X86/avx-logic.ll +++ b/llvm/test/CodeGen/X86/avx-logic.ll @@ -338,23 +338,25 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) { ; AVX1-LABEL: andn_disguised_i8_elts: ; AVX1: # %bb.0: -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255] -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpandn %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: andn_disguised_i8_elts: ; INT256: # %bb.0: ; INT256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; INT256-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; INT256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; INT256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; INT256-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq %add = add <8 x i32> %y, %x @@ -417,17 +419,17 @@ define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) { ; AVX1-LABEL: andn_variable_mask_operand_concat: ; AVX1: # %bb.0: -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm1 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, 
%xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: andn_variable_mask_operand_concat: diff --git a/llvm/test/CodeGen/X86/avx-shift.ll b/llvm/test/CodeGen/X86/avx-shift.ll --- a/llvm/test/CodeGen/X86/avx-shift.ll +++ b/llvm/test/CodeGen/X86/avx-shift.ll @@ -215,11 +215,12 @@ define <16 x i16> @sext_v16i16(<16 x i16> %a) { ; CHECK-LABEL: sext_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsllw $8, %xmm0, %xmm1 -; CHECK-NEXT: vpsraw $8, %xmm1, %xmm1 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0 -; CHECK-NEXT: vpsraw $8, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxbw %xmm1, %xmm1 +; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %b = trunc <16 x i16> %a to <16 x i8> @@ -230,11 +231,12 @@ define <8 x i32> @sext_v8i32(<8 x i32> %a) { ; CHECK-LABEL: sext_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vpslld $16, %xmm0, %xmm1 -; CHECK-NEXT: vpsrad $16, %xmm1, %xmm1 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm1 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpslld $16, %xmm0, %xmm0 -; CHECK-NEXT: vpsrad $16, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxwd %xmm1, %xmm1 +; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %b = trunc <8 x i32> %a to <8 x i16> diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -33,8 +33,8 @@ ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: movl 4(%ecx), %esi ; X86-NEXT: vbroadcastsd (%ecx), %ymm0 -; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -601,8 +601,8 @@ ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: movl 4(%ecx), %esi ; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/avx-vextractf128.ll b/llvm/test/CodeGen/X86/avx-vextractf128.ll --- a/llvm/test/CodeGen/X86/avx-vextractf128.ll +++ b/llvm/test/CodeGen/X86/avx-vextractf128.ll @@ -116,8 +116,8 @@ ; CHECK-LABEL: t9: ; CHECK: ## %bb.0: ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vmovups %ymm0, (%rdi) -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vmovups %xmm0, (%rdi) +; CHECK-NEXT: vmovups %xmm0, 16(%rdi) ; CHECK-NEXT: retq store i64 0, ptr %p %q = getelementptr i64, ptr %p, i64 1 diff --git a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll --- a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll +++ b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll @@ -8,14 +8,14 @@ ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovaps (%ecx), %xmm0 ; X86-NEXT: vandps 
{{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: test1: ; X64: ## %bb.0: -; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vmovaps (%rdi), %xmm0 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll --- a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll +++ b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll @@ -82,14 +82,19 @@ define <8 x float> @test7(float %a, <8 x float> %b, <8 x float> %c) { ; X86-LABEL: test7: ; X86: # %bb.0: -; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 -; X86-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm1 +; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: vxorps %xmm2, %xmm3, %xmm2 +; X86-NEXT: vbroadcastss %xmm2, %ymm2 +; X86-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; X86-NEXT: retl ; ; X64-LABEL: test7: ; X64: # %bb.0: +; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-NEXT: vxorps %xmm3, %xmm0, %xmm0 ; X64-NEXT: vbroadcastss %xmm0, %ymm0 -; X64-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 +; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; X64-NEXT: retq %t0 = insertelement <8 x float> undef, float %a, i32 0 %t1 = fsub <8 x float> , %t0 @@ -102,14 +107,19 @@ define <8 x float> @test8(float %a, <8 x float> %b, <8 x float> %c) { ; X86-LABEL: test8: ; X86: # %bb.0: -; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm2 -; X86-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm1 +; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: vxorps %xmm2, %xmm3, %xmm2 +; X86-NEXT: vbroadcastss %xmm2, %ymm2 +; X86-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; X86-NEXT: retl ; ; X64-LABEL: test8: ; X64: # %bb.0: +; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X64-NEXT: vxorps %xmm3, %xmm0, %xmm0 ; X64-NEXT: vbroadcastss %xmm0, %ymm0 -; X64-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 +; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 ; X64-NEXT: retq %t0 = fsub float -0.0, %a %t1 = insertelement <8 x float> undef, float %t0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx2-shift.ll b/llvm/test/CodeGen/X86/avx2-shift.ll --- a/llvm/test/CodeGen/X86/avx2-shift.ll +++ b/llvm/test/CodeGen/X86/avx2-shift.ll @@ -395,11 +395,21 @@ } define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind { -; CHECK-LABEL: sext_v16i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vpsllw $8, %ymm0, %ymm0 -; CHECK-NEXT: vpsraw $8, %ymm0, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: sext_v16i16: +; X86: # %bb.0: +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpmovsxbw %xmm0, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: sext_v16i16: +; X64: # %bb.0: +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpmovsxbw %xmm0, %ymm0 +; X64-NEXT: retq %b = trunc <16 x i16> %a to <16 x i8> %c = sext <16 x i8> %b to <16 x i16> ret <16 x i16> %c diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll 
--- a/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-arith.ll @@ -29,15 +29,15 @@ ; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 ; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0 -; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm3 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpandq %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogq $220, %zmm2, %zmm3, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-NEXT: vpaddb %ymm4, %ymm1, %ymm4 +; AVX512F-NEXT: vpaddb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm4 -; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm2, %zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: add_v64i8_broadcasts: diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -1695,13 +1695,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v4f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB49_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 -; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1 -; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %xmm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB49_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1766,13 +1766,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v8f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB51_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 -; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1 -; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm1 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB51_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1838,13 +1838,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v16f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 -; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB53_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 -; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1 -; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm0, %zmm1 +; CHECK-NEXT: 
vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB53_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1910,14 +1910,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v2f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [2.0E+0,2.0E+0] -; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB55_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm1 -; CHECK-NEXT: vfmadd231pd {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1 -; CHECK-NEXT: vmovupd %xmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm0, %xmm1 +; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovupd %xmm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB55_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1982,13 +1981,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v4f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB57_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 -; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1 -; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm0 +; CHECK-NEXT: vaddpd %ymm0, %ymm0, %ymm1 +; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB57_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -2054,13 +2053,13 @@ ; CHECK-LABEL: bcast_unfold_fma231_v8f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 -; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB59_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 -; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1 -; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm0, %zmm1 +; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB59_1 ; CHECK-NEXT: # %bb.2: # %bb10 diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll --- a/llvm/test/CodeGen/X86/avx512-build-vector.ll +++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll @@ -15,9 +15,9 @@ ; CHECK-LABEL: test3: ; CHECK: ## %bb.0: ; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] +; CHECK-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 ; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq %b = extractelement <4 x float> %a, i32 2 diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -3005,9 +3005,8 @@ ; KNL-LABEL: zext_4xi1_to_4x32: ; KNL: # %bb.0: ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; KNL-NEXT: vpmovzxbd 
{{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_4xi1_to_4x32: @@ -3020,8 +3019,8 @@ ; AVX512DQNOBW-LABEL: zext_4xi1_to_4x32: ; AVX512DQNOBW: # %bb.0: ; AVX512DQNOBW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512DQNOBW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512DQNOBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512DQNOBW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <4 x i8> %x, %y %1 = zext <4 x i1> %mask to <4 x i32> @@ -3032,8 +3031,8 @@ ; KNL-LABEL: zext_2xi1_to_2xi64: ; KNL: # %bb.0: ; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; KNL-NEXT: retq ; ; SKX-LABEL: zext_2xi1_to_2xi64: @@ -3046,8 +3045,8 @@ ; AVX512DQNOBW-LABEL: zext_2xi1_to_2xi64: ; AVX512DQNOBW: # %bb.0: ; AVX512DQNOBW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512DQNOBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512DQNOBW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <2 x i8> %x, %y %1 = zext <2 x i1> %mask to <2 x i64> diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll --- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -732,11 +732,12 @@ define void @load_v3i1_broadcast_2_v1i1_store(ptr %a0,ptr %a1) { ; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store: ; AVX512: # %bb.0: -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: testb $4, (%rdi) -; AVX512-NEXT: movl $255, %ecx -; AVX512-NEXT: cmovel %eax, %ecx -; AVX512-NEXT: kmovd %ecx, %k0 +; AVX512-NEXT: movzbl (%rdi), %eax +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: btl $2, %eax +; AVX512-NEXT: movl $255, %eax +; AVX512-NEXT: cmovael %ecx, %eax +; AVX512-NEXT: kmovd %eax, %k0 ; AVX512-NEXT: kshiftrb $2, %k0, %k0 ; AVX512-NEXT: kshiftlb $7, %k0, %k0 ; AVX512-NEXT: kshiftrb $7, %k0, %k0 @@ -745,11 +746,12 @@ ; ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: xorl %eax, %eax -; AVX512NOTDQ-NEXT: testb $4, (%rdi) -; AVX512NOTDQ-NEXT: movl $255, %ecx -; AVX512NOTDQ-NEXT: cmovel %eax, %ecx -; AVX512NOTDQ-NEXT: kmovd %ecx, %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: xorl %ecx, %ecx +; AVX512NOTDQ-NEXT: btl $2, %eax +; AVX512NOTDQ-NEXT: movl $255, %eax +; AVX512NOTDQ-NEXT: cmovael %ecx, %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll --- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll +++ 
b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -6,7 +6,7 @@ ; KNL-LABEL: hadd_16: ; KNL: # %bb.0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovd %xmm0, %eax @@ -15,7 +15,7 @@ ; SKX-LABEL: hadd_16: ; SKX: # %bb.0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovd %xmm0, %eax @@ -33,7 +33,7 @@ ; KNL-LABEL: hsub_16: ; KNL: # %bb.0: ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; KNL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vmovd %xmm0, %eax @@ -42,7 +42,7 @@ ; SKX-LABEL: hsub_16: ; SKX: # %bb.0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -973,20 +973,22 @@ ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: vpcmpnleuq %zmm1, %zmm0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: movl $4, %eax -; KNL-NEXT: subl %ecx, %eax +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_v2i1: ; SKX: ## %bb.0: ; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0 -; SKX-NEXT: kmovd %k0, %ecx -; SKX-NEXT: andl $1, %ecx -; SKX-NEXT: movl $4, %eax -; SKX-NEXT: subl %ecx, %eax +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: retq %t1 = icmp ugt <2 x i64> %a, %b %t2 = extractelement <2 x i1> %t1, i32 0 @@ -1090,10 +1092,11 @@ ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 -; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: movl $4, %eax -; KNL-NEXT: subl %ecx, %eax +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: andb $1, %al +; KNL-NEXT: movb $4, %cl +; KNL-NEXT: subb %al, %cl +; KNL-NEXT: movzbl %cl, %eax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -1101,10 +1104,11 @@ ; SKX: ## %bb.0: ; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 ; SKX-NEXT: kshiftrq $63, %k0, %k0 -; SKX-NEXT: kmovd %k0, %ecx -; SKX-NEXT: andl $1, %ecx -; SKX-NEXT: movl $4, %eax -; SKX-NEXT: subl %ecx, %eax +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: andb $1, %al +; SKX-NEXT: movb $4, %cl +; SKX-NEXT: subb %al, %cl +; SKX-NEXT: movzbl %cl, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t1 = icmp ugt <64 x i8> %a, %b diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1132,9 +1132,9 @@ define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; X86-LABEL: 
test_mask_pcmpeq_d: ; X86: ## %bb.0: -; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] ; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -1204,9 +1204,9 @@ define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpgt_d: ; X86: ## %bb.0: -; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] ; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -5975,31 +5975,74 @@ declare <8 x i64> @llvm.x86.avx512.movntdqa(ptr) nounwind readonly define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { -; CHECK-LABEL: test_cmp_d_512: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] -; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc8] -; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltd %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xe9] -; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-LABEL: test_cmp_d_512: +; X86: ## %bb.0: +; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: shll 
$16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X86-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X86-NEXT: vmovd %ecx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc0] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpled %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x02] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x05] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_cmp_d_512: +; X64: ## %bb.0: +; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc0] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpled %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## 
encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) @@ -6025,23 +6068,30 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0] -; X86-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02] -; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05] -; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9] -; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: shll $16, %edx ## encoding: [0xc1,0xe2,0x10] +; X86-NEXT: orl %ecx, %edx ## encoding: [0x09,0xca] +; X86-NEXT: vmovd %edx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: 
[0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -6050,23 +6100,33 @@ ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0] -; X64-NEXT: vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02] -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05] -; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9] -; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xc0] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpled %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: 
vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] @@ -6092,31 +6152,74 @@ declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) { -; CHECK-LABEL: test_ucmp_d_512: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] -; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc9,0x01] -; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe9,0x06] -; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-LABEL: test_ucmp_d_512: +; X86: ## %bb.0: +; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: shll $16, %ecx ## encoding: 
[0xc1,0xe1,0x10] +; X86-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X86-NEXT: vmovd %ecx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x01] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x02] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x05] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x06] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_ucmp_d_512: +; X64: ## %bb.0: +; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x01] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## 
encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc1,0x06] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq ## encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) @@ -6142,23 +6245,30 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01] -; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02] -; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05] -; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06] -; X86-NEXT: kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] ; X86-NEXT: kmovw %k0, %edx ## encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: shll $16, %edx ## encoding: [0xc1,0xe2,0x10] +; X86-NEXT: orl %ecx, %edx ## encoding: [0x09,0xca] +; X86-NEXT: vmovd %edx, %xmm2 ## encoding: [0xc5,0xf9,0x6e,0xd2] +; X86-NEXT: vpbroadcastd %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x58,0xd2] +; X86-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x01] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x02] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, 
%xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x05] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x06] +; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl ## encoding: [0xc3] @@ -6167,23 +6277,33 @@ ; X64: ## %bb.0: ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1] -; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01] -; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02] -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04] -; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05] -; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06] -; X64-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X64-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: shll $16, %ecx ## encoding: [0xc1,0xe1,0x10] +; X64-NEXT: orl %eax, %ecx ## encoding: [0x09,0xc1] +; X64-NEXT: movq %rcx, %rax ## encoding: [0x48,0x89,0xc8] +; X64-NEXT: shlq $32, %rax ## encoding: [0x48,0xc1,0xe0,0x20] +; X64-NEXT: orq %rcx, %rax ## encoding: [0x48,0x09,0xc8] +; X64-NEXT: vmovq %rax, %xmm2 ## encoding: [0xc4,0xe1,0xf9,0x6e,0xd0] +; X64-NEXT: vpbroadcastq %xmm2, %xmm2 ## encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X64-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x01] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x02] +; X64-NEXT: kmovw %k0, %eax ## 
encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: ## xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xc1,0x04] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x05] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc1,0x06] +; X64-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 ## encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq ## encoding: [0xc3] @@ -6797,9 +6917,10 @@ ; X86-LABEL: test_vptestmd: ; X86: ## %bb.0: ; X86-NEXT: vptestmd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x41,0xc9] ; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax ## encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1] ; X86-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -6827,9 +6948,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_d_512: ; X86: ## %bb.0: ; X86-NEXT: vptestnmd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x48,0x27,0xc1] -; X86-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax ## encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 ## encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] @@ -6882,8 +7004,10 @@ define i16 @test_kand(i16 %a0, i16 %a1) { ; X86-LABEL: test_kand: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x41,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: andl $8, %eax ## encoding: [0x83,0xe0,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] @@ -6904,17 +7028,21 @@ define i16 
@test_kandn(i16 %a0, i16 %a1) { ; X86-LABEL: test_kandn: ; X86: ## %bb.0: -; X86-NEXT: movl $65527, %eax ## encoding: [0xb8,0xf7,0xff,0x00,0x00] -; X86-NEXT: ## imm = 0xFFF7 -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ## encoding: [0x0b,0x44,0x24,0x04] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: movw $8, %ax ## encoding: [0x66,0xb8,0x08,0x00] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: kandnw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x42,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandnw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x42,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_kandn: ; X64: ## %bb.0: ; X64-NEXT: movl %edi, %eax ## encoding: [0x89,0xf8] -; X64-NEXT: orl $-9, %eax ## encoding: [0x83,0xc8,0xf7] +; X64-NEXT: orl $65527, %eax ## encoding: [0x0d,0xf7,0xff,0x00,0x00] +; X64-NEXT: ## imm = 0xFFF7 ; X64-NEXT: andl %esi, %eax ## encoding: [0x21,0xf0] ; X64-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ## encoding: [0xc3] @@ -6946,8 +7074,10 @@ define i16 @test_kor(i16 %a0, i16 %a1) { ; X86-LABEL: test_kor: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: orw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x0b,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: korw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x45,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: orl $8, %eax ## encoding: [0x83,0xc8,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] @@ -6970,8 +7100,10 @@ define i16 @test_kxnor(i16 %a0, i16 %a1) { ; X86-LABEL: test_kxnor: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x33,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kxorw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x47,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: xorl $8, %eax ## encoding: [0x83,0xf0,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] @@ -6992,8 +7124,10 @@ define i16 @test_kxor(i16 %a0, i16 %a1) { ; X86-LABEL: test_kxor: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x33,0x44,0x24,0x08] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 ## encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kxorw %k1, %k0, %k0 ## encoding: [0xc5,0xfc,0x47,0xc1] +; X86-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; X86-NEXT: xorl $8, %eax ## encoding: [0x83,0xf0,0x08] ; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ## encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll 
b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -86,17 +86,13 @@ define void @mask16_mem(ptr %ptr) { ; CHECK-LABEL: mask16_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw (%rdi), %k0 -; CHECK-NEXT: knotw %k0, %k0 -; CHECK-NEXT: kmovw %k0, (%rdi) +; CHECK-NEXT: notw (%rdi) ; CHECK-NEXT: retq ; ; X86-LABEL: mask16_mem: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovw (%eax), %k0 -; X86-NEXT: knotw %k0, %k0 -; X86-NEXT: kmovw %k0, (%eax) +; X86-NEXT: notw (%eax) ; X86-NEXT: retl %x = load i16, ptr %ptr, align 4 %m0 = bitcast i16 %x to <16 x i1> @@ -107,36 +103,15 @@ } define void @mask8_mem(ptr %ptr) { -; KNL-LABEL: mask8_mem: -; KNL: ## %bb.0: -; KNL-NEXT: notb (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: mask8_mem: -; SKX: ## %bb.0: -; SKX-NEXT: kmovb (%rdi), %k0 -; SKX-NEXT: knotb %k0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq -; -; AVX512BW-LABEL: mask8_mem: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: notb (%rdi) -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: mask8_mem: -; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: kmovb (%rdi), %k0 -; AVX512DQ-NEXT: knotb %k0, %k0 -; AVX512DQ-NEXT: kmovb %k0, (%rdi) -; AVX512DQ-NEXT: retq +; CHECK-LABEL: mask8_mem: +; CHECK: ## %bb.0: +; CHECK-NEXT: notb (%rdi) +; CHECK-NEXT: retq ; ; X86-LABEL: mask8_mem: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: kmovb (%eax), %k0 -; X86-NEXT: knotb %k0, %k0 -; X86-NEXT: kmovb %k0, (%eax) +; X86-NEXT: notb (%eax) ; X86-NEXT: retl %x = load i8, ptr %ptr, align 4 %m0 = bitcast i8 %x to <8 x i1> @@ -156,8 +131,11 @@ ; ; X86-LABEL: mand16: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: korw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %ma = bitcast i16 %x to <16 x i1> %mb = bitcast i16 %y to <16 x i1> @@ -1352,8 +1330,8 @@ ; ; X86-LABEL: test17: ; X86: ## %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: setg %al ; X86-NEXT: kshiftrq $6, %k0, %k1 @@ -3882,8 +3860,11 @@ ; ; X86-LABEL: test_v16i1_add: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> @@ -3902,8 +3883,11 @@ ; ; X86-LABEL: test_v16i1_sub: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-NEXT: retl %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> @@ -3922,8 +3906,11 @@ ; ; X86-LABEL: test_v16i1_mul: ; X86: ## %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kandw %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $ax killed $ax killed $eax ; 
X86-NEXT: retl %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> @@ -3942,8 +3929,11 @@ ; ; X86-LABEL: test_v8i1_add: ; X86: ## %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorb %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $al killed $al killed $eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> @@ -3962,8 +3952,11 @@ ; ; X86-LABEL: test_v8i1_sub: ; X86: ## %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kxorb %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $al killed $al killed $eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> @@ -3982,8 +3975,11 @@ ; ; X86-LABEL: test_v8i1_mul: ; X86: ## %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: andb {{[0-9]+}}(%esp), %al +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: kandb %k1, %k0, %k0 +; X86-NEXT: kmovd %k0, %eax +; X86-NEXT: ## kill: def $al killed $al killed $eax ; X86-NEXT: retl %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> @@ -4712,6 +4708,8 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z) { ; KNL-LABEL: ktest_6: ; KNL: ## %bb.0: +; KNL-NEXT: pushq %rax +; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; KNL-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4 @@ -4731,22 +4729,17 @@ ; KNL-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3 ; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; KNL-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2 -; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; KNL-NEXT: vpor %ymm0, %ymm2, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %k0 ; KNL-NEXT: kortestw %k0, %k0 ; KNL-NEXT: je LBB77_1 ; KNL-NEXT: ## %bb.2: ## %exit +; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; KNL-NEXT: LBB77_1: ## %bar -; KNL-NEXT: pushq %rax -; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo -; KNL-NEXT: addq $8, %rsp +; KNL-NEXT: popq %rax ; KNL-NEXT: retq ; ; SKX-LABEL: ktest_6: @@ -4793,6 +4786,8 @@ ; ; AVX512DQ-LABEL: ktest_6: ; AVX512DQ: ## %bb.0: +; AVX512DQ-NEXT: pushq %rax +; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4 @@ -4812,22 +4807,17 @@ ; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512DQ-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 -; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %k0 ; AVX512DQ-NEXT: kortestw %k0, %k0 ; AVX512DQ-NEXT: je LBB77_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit +; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; AVX512DQ-NEXT: LBB77_1: ## %bar -; AVX512DQ-NEXT: pushq %rax -; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 ; AVX512DQ-NEXT: 
vzeroupper ; AVX512DQ-NEXT: callq _foo -; AVX512DQ-NEXT: addq $8, %rsp +; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: retq ; ; X86-LABEL: ktest_6: diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1316,9 +1316,10 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,0,8,4,6,4,12] ; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1327,8 +1328,9 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,0,8,4,6,4,12] ; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -1342,10 +1344,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,0,8,4,6,4,12] +; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1718,9 +1721,15 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,4,3,6] -; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0 +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm0 +; CHECK-NEXT: vmovdqa (%rdi), %xmm1 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 +; CHECK-NEXT: vmovd %xmm1, %eax +; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vextractps $3, %xmm2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vpextrd $2, %xmm1, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1729,11 +1738,17 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,3,6] -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm2 +; CHECK-NEXT: vmovdqa (%rdi), %xmm3 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm4 +; CHECK-NEXT: vmovd %xmm3, %eax +; CHECK-NEXT: vpinsrd $1, %eax, 
%xmm2, %xmm2 +; CHECK-NEXT: vextractps $3, %xmm4, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpextrd $2, %xmm3, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1745,11 +1760,17 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,4,3,6] +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm1 +; CHECK-NEXT: vmovdqa (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm3 +; CHECK-NEXT: vmovd %xmm2, %eax +; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vextractps $3, %xmm3, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpextrd $2, %xmm2, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -2695,40 +2716,24 @@ } define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) { -; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [4,1] -; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 -; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; CHECK-FAST-NEXT: vzeroupper -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm0 -; CHECK-FAST-PERLANE-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3] -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero +; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> ret <2 x i64> %res } define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { -; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,1] -; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm2 -; CHECK-FAST-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-FAST-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} -; CHECK-FAST-NEXT: vzeroupper -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3] -; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq 8(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vmovq 32(%rdi), %xmm3 # xmm3 = mem[0],zero +; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 
{%k1} = xmm3[0],xmm2[0] +; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -2737,22 +2742,13 @@ } define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) { -; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1] -; CHECK-FAST-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} -; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; CHECK-FAST-NEXT: vzeroupper -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1 -; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3] -; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovq 8(%rdi), %xmm1 # xmm1 = mem[0],zero +; CHECK-NEXT: vmovq 32(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0] +; CHECK-NEXT: retq %vec = load <8 x i64>, ptr %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -3167,11 +3163,12 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] -; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,8,9,6,1,4,4] +; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1 +; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3182,10 +3179,11 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] -; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] +; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermt2ps %ymm2, %ymm3, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> @@ -3482,26 +3480,16 @@ } define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { -; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm3 = 
[9,5,2,3,2,8,8,1] -; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 -; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-FAST-NEXT: vmovaps %ymm3, %ymm0 {%k1} -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 -; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm0 {%k1} -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 32(%rdi), %ymm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [9,5,2,3,2,8,8,1] +; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1} +; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3510,26 +3498,16 @@ } define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) { -; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1 -; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-FAST-NEXT: vmovaps %ymm1, %ymm0 -; CHECK-FAST-NEXT: retq -; -; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: -; CHECK-FAST-PERLANE: # %bb.0: -; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 -; CHECK-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] -; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm4, %ymm0, %k1 -; CHECK-FAST-PERLANE-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z} -; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm0 -; CHECK-FAST-PERLANE-NEXT: retq +; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 32(%rdi), %ymm3 +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] +; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: retq %vec = load <16 x float>, ptr %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -4672,10 +4650,11 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 -; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0] +; CHECK-NEXT: vmovsd 8(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vmovsd 32(%rdi), %xmm3 # xmm3 
= mem[0],zero +; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1 +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm2[0],xmm3[0] ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> @@ -4687,10 +4666,11 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0] -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1 -; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0] +; CHECK-NEXT: vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero +; CHECK-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1 +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],xmm2[0] ; CHECK-NEXT: retq %vec = load <8 x double>, ptr %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll --- a/llvm/test/CodeGen/X86/avx512-trunc.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -629,7 +629,8 @@ ; ; SKX-LABEL: usat_trunc_wb_128_mem: ; SKX: ## %bb.0: -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) +; SKX-NEXT: vpmovuswb %xmm0, %xmm0 +; SKX-NEXT: vmovq %xmm0, (%rdi) ; SKX-NEXT: retq %x3 = icmp ult <8 x i16> %i, %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> @@ -654,7 +655,8 @@ define void @usat_trunc_qb_512_mem(<8 x i64> %i, ptr %res) { ; ALL-LABEL: usat_trunc_qb_512_mem: ; ALL: ## %bb.0: -; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: vpmovusqb %zmm0, %xmm0 +; ALL-NEXT: vmovq %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x3 = icmp ult <8 x i64> %i, @@ -864,18 +866,11 @@ } define void @smax_usat_trunc_wb_128_mem(<8 x i16> %i, ptr %res) { -; KNL-LABEL: smax_usat_trunc_wb_128_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: smax_usat_trunc_wb_128_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) -; SKX-NEXT: retq +; ALL-LABEL: smax_usat_trunc_wb_128_mem: +; ALL: ## %bb.0: +; ALL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; ALL-NEXT: vmovq %xmm0, (%rdi) +; ALL-NEXT: retq %x1 = icmp sgt <8 x i16> %i, %x2 = select <8 x i1> %x1, <8 x i16> %i, <8 x i16> %x3 = icmp slt <8 x i16> %x2, @@ -907,7 +902,8 @@ ; ALL: ## %bb.0: ; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; ALL-NEXT: vpmovusqb %zmm0, (%rdi) +; ALL-NEXT: vpmovusqb %zmm0, %xmm0 +; ALL-NEXT: vmovq %xmm0, (%rdi) ; ALL-NEXT: vzeroupper ; ALL-NEXT: retq %x1 = icmp sgt <8 x i64> %i, diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll @@ -311,10 +311,6 @@ ; We implement the scalar broadcast intrinsics with vector initializers. ; Verify that the IR generated will produce the broadcast at the end. 
define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) { -; ALL-LABEL: test_mm512_broadcastsd_pd: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 -; ALL-NEXT: retq entry: %0 = extractelement <2 x double> %a, i32 0 %vecinit.i = insertelement <8 x double> undef, double %0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1187,14 +1187,22 @@ } define <2 x i64> @test45(<2 x i16> %x, <2 x i16> %y) #0 { -; AVX512-LABEL: test45: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1] -; AVX512-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] -; AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] -; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512-NEXT: retq ## encoding: [0xc3] +; KNL-LABEL: test45: +; KNL: ## %bb.0: +; KNL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1] +; KNL-NEXT: vpsrlw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xd0,0x0f] +; KNL-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; KNL-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; KNL-NEXT: retq ## encoding: [0xc3] +; +; AVX512BW-LABEL: test45: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x75,0xc1] +; AVX512BW-NEXT: vpmovzxwq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; AVX512BW-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A] +; AVX512BW-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; AVX512BW-NEXT: retq ## encoding: [0xc3] ; ; SKX-LABEL: test45: ; SKX: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -49,8 +49,8 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) nounwind { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x5c,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpblendmb %zmm3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0xcb] ; X86-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0xd3] ; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] @@ -108,8 +108,8 @@ ; X86-LABEL: test_int_x86_avx512_mask_storeu_b_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1} # encoding: 
[0x62,0xf1,0x7f,0x49,0x7f,0x01] ; X86-NEXT: vmovdqu64 %zmm0, (%eax) # encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -187,9 +187,9 @@ ; X86-LABEL: test_int_x86_avx512_mask_loadu_b_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X86-NEXT: vmovdqu64 (%ecx), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vpblendmb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x66,0x08] ; X86-NEXT: vmovdqu8 (%ecx), %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0x11] ; X86-NEXT: retl # encoding: [0xc3] @@ -455,12 +455,11 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind { ; X86-LABEL: test_mask_pcmpeq_b: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] -; X86-NEXT: andl {{[0-9]+}}(%esp), %edx # encoding: [0x23,0x54,0x24,0x08] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -491,9 +490,9 @@ define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind { ; X86-LABEL: test_mask_pcmpeq_w: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -533,12 +532,11 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind { ; X86-LABEL: test_mask_pcmpgt_b: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] -; X86-NEXT: andl {{[0-9]+}}(%esp), %edx # encoding: [0x23,0x54,0x24,0x08] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -569,9 +567,9 @@ define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind { ; X86-LABEL: test_mask_pcmpgt_w: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtw %zmm1, %zmm0, 
%k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1941,45 +1939,66 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind { ; X86-LABEL: test_mask_cmp_b_512: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp # encoding: [0x55] +; X86-NEXT: pushl %ebx # encoding: [0x53] +; X86-NEXT: pushl %edi # encoding: [0x57] ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] -; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] -; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] -; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] -; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] -; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x14] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x18] +; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xd1] +; X86-NEXT: kmovd %esi, %k0 # encoding: [0xc5,0xfb,0x92,0xc6] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: 
kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: vpcmpgtb %zmm0, %zmm1, %k2 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xd0] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpleb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x02] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: addl %ebx, %edi # encoding: [0x01,0xdf] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] +; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x04] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x05] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebp # encoding: [0xc5,0xfb,0x93,0xea] +; X86-NEXT: addl %ebx, %ebp # encoding: [0x01,0xdd] +; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] +; X86-NEXT: vpcmpgtb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xd1] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k1 # encoding: [0xc4,0xe1,0xe5,0x41,0xc9] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kandd %k0, %k2, %k0 # encoding: [0xc4,0xe1,0xed,0x41,0xc0] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %ebp, %eax # encoding: [0x01,0xe8] +; X86-NEXT: adcl %edi, %edx # encoding: [0x11,0xfa] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] ; X86-NEXT: popl %esi # encoding: [0x5e] +; X86-NEXT: popl %edi # encoding: [0x5f] +; X86-NEXT: popl %ebx # encoding: [0x5b] +; X86-NEXT: popl %ebp # encoding: [0x5d] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2114,45 +2133,66 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind { ; X86-LABEL: test_mask_x86_avx512_ucmp_b_512: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp 
# encoding: [0x55] +; X86-NEXT: pushl %ebx # encoding: [0x53] +; X86-NEXT: pushl %edi # encoding: [0x57] ; X86-NEXT: pushl %esi # encoding: [0x56] -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] -; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] -; X86-NEXT: adcl %eax, %ecx # encoding: [0x11,0xc1] -; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] -; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] -; X86-NEXT: adcl %edx, %ecx # encoding: [0x11,0xd1] -; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06] -; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x14] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x18] +; X86-NEXT: vpcmpeqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xd1] +; X86-NEXT: kmovd %esi, %k0 # encoding: [0xc5,0xfb,0x92,0xc6] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: vpcmpltub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x01] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; 
X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpleub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x02] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edx # encoding: [0xc5,0xfb,0x93,0xd3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %edi # encoding: [0xc5,0xfb,0x93,0xfa] +; X86-NEXT: addl %ebx, %edi # encoding: [0x01,0xdf] +; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] +; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x04] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda] +; X86-NEXT: addl %edi, %ebx # encoding: [0x01,0xfb] +; X86-NEXT: adcl %edx, %eax # encoding: [0x11,0xd0] +; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x05] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9] +; X86-NEXT: kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb] +; X86-NEXT: kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0] +; X86-NEXT: kmovd %k2, %ebp # encoding: [0xc5,0xfb,0x93,0xea] +; X86-NEXT: addl %ebx, %ebp # encoding: [0x01,0xdd] +; X86-NEXT: adcl %eax, %edi # encoding: [0x11,0xc7] +; X86-NEXT: vpcmpnleub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x06] +; X86-NEXT: kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20] +; X86-NEXT: kandd %k1, %k3, %k1 # encoding: [0xc4,0xe1,0xe5,0x41,0xc9] ; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kandd %k0, %k2, %k0 # encoding: [0xc4,0xe1,0xed,0x41,0xc0] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: addl %ebp, %eax # encoding: [0x01,0xe8] +; X86-NEXT: adcl %edi, %edx # encoding: [0x11,0xfa] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] -; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # encoding: [0x03,0x44,0x24,0x08] -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x0c] ; X86-NEXT: popl %esi # encoding: [0x5e] +; X86-NEXT: popl %edi # encoding: [0x5f] +; X86-NEXT: popl %ebx # encoding: [0x5b] +; X86-NEXT: popl %ebp # encoding: [0x5d] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2589,13 +2629,14 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi # encoding: [0x56] ; X86-NEXT: vptestmb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandq %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x41,0xc9] +; X86-NEXT: kshiftrq $32, %k1, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd1,0x20] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k1, %esi # encoding: [0xc5,0xfb,0x93,0xf1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] -; X86-NEXT: andl %ecx, %edx # encoding: [0x21,0xca] -; 
X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: andl %esi, %eax # encoding: [0x21,0xf0] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: popl %esi # encoding: [0x5e] @@ -2622,9 +2663,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_w_512: ; X86: # %bb.0: ; X86-NEXT: vptestmw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2650,13 +2692,14 @@ ; X86: # %bb.0: ; X86-NEXT: pushl %esi # encoding: [0x56] ; X86-NEXT: vptestnmb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x48,0x26,0xc1] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: kandq %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfc,0x41,0xc9] +; X86-NEXT: kshiftrq $32, %k1, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd1,0x20] +; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] +; X86-NEXT: kmovd %k1, %esi # encoding: [0xc5,0xfb,0x93,0xf1] ; X86-NEXT: kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] -; X86-NEXT: andl %ecx, %edx # encoding: [0x21,0xca] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: andl %esi, %eax # encoding: [0x21,0xf0] +; X86-NEXT: kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: adcl %ecx, %edx # encoding: [0x11,0xca] ; X86-NEXT: popl %esi # encoding: [0x5e] @@ -2683,9 +2726,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_w_512: ; X86: # %bb.0: ; X86-NEXT: vptestnmw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x48,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512bw-mask-op.ll b/llvm/test/CodeGen/X86/avx512bw-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512bw-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512bw-mask-op.ll @@ -38,9 +38,7 @@ define void @mask32_mem(ptr %ptr) { ; 
CHECK-LABEL: mask32_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd (%rdi), %k0 -; CHECK-NEXT: knotd %k0, %k0 -; CHECK-NEXT: kmovd %k0, (%rdi) +; CHECK-NEXT: notl (%rdi) ; CHECK-NEXT: retq %x = load i32, ptr %ptr, align 4 %m0 = bitcast i32 %x to <32 x i1> @@ -56,9 +54,7 @@ define void @mask64_mem(ptr %ptr) { ; CHECK-LABEL: mask64_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovq (%rdi), %k0 -; CHECK-NEXT: knotq %k0, %k0 -; CHECK-NEXT: kmovq %k0, (%rdi) +; CHECK-NEXT: notq (%rdi) ; CHECK-NEXT: retq %x = load i64, ptr %ptr, align 4 %m0 = bitcast i64 %x to <64 x i1> diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -927,9 +927,9 @@ define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { ; X86-LABEL: test_mask_pcmpeq_b_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -961,9 +961,9 @@ define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpeq_w_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -996,9 +996,9 @@ define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { ; X86-LABEL: test_mask_pcmpgt_b_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1030,9 +1030,9 @@ define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpgt_w_256: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -1065,9 +1065,9 @@ define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> 
%b, i16 %mask) { ; X86-LABEL: test_mask_pcmpeq_b_128: ; X86: # %bb.0: -; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1131,9 +1131,9 @@ define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { ; X86-LABEL: test_mask_pcmpgt_b_128: ; X86: # %bb.0: -; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax # encoding: [0x66,0x23,0x44,0x24,0x04] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] ; @@ -4846,7 +4846,7 @@ ; X64-LABEL: test_cmp_b_256: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc1,0x02] @@ -4946,7 +4946,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02] @@ -5040,7 +5040,7 @@ ; X64-LABEL: test_ucmp_b_256: ; X64: # %bb.0: ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x02] @@ -5140,7 +5140,7 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1] -; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02] @@ -5186,31 +5186,68 @@ declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> 
%a1) { -; CHECK-LABEL: test_cmp_w_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] -; CHECK-NEXT: vpcmpgtw %ymm0, %ymm1, %k1 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc8] -; CHECK-NEXT: vpcmplew %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xe9] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_cmp_w_256: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmplew %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 # encoding: 
[0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_cmp_w_256: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 # encoding: [0x62,0xf1,0x75,0x28,0x65,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmplew %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) @@ -5236,23 +5273,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X86-NEXT: vpcmpgtw 
%ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0] -; X86-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02] -; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x05] -; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc9] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmplew %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper # 
encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -5261,23 +5304,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0] -; X64-NEXT: vpcmplew %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02] -; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x05] -; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc9] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x29,0x65,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmplew %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -5303,31 +5351,68 @@ declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: test_ucmp_w_256: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] -; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k1 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x01] -; CHECK-NEXT: vpcmpleuw %ymm1, %ymm0, %k2 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k3 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltuw %ymm1, %ymm0, %k4 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleuw %ymm1, %ymm0, %k5 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe9,0x06] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_ucmp_w_256: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, 
%xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_ucmp_w_256: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, 
<16 x i16> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) @@ -5353,23 +5438,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01] -; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02] -; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x05] -; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x06] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, 
%ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -5378,23 +5469,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1] -; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01] -; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k3 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02] -; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k5 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x05] -; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x06] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] 
+; X64-NEXT: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -5420,30 +5516,66 @@ declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { -; CHECK-LABEL: test_cmp_b_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] -; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %k1 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc8] -; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x02] -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltb %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe1,0x05] -; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k5 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xe9] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_cmp_b_128: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression 
encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_cmp_b_128: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x75,0x08,0x64,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd 
%xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1) @@ -5469,23 +5601,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0] -; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02] -; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x05] -; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc9] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw 
$4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5493,23 +5631,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k2 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0] -; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02] -; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x05] -; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc9] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpgtb %xmm0, %xmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x09,0x64,0xc0] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: 
[0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) @@ -5534,30 +5677,66 @@ declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { -; CHECK-LABEL: test_ucmp_b_128: -; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] -; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k1 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x01] -; CHECK-NEXT: vpcmpleub %xmm1, %xmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x02] -; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k3 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x04] -; CHECK-NEXT: vpcmpnltub %xmm1, %xmm0, %k4 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe1,0x05] -; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k5 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe9,0x06] -; CHECK-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; CHECK-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; CHECK-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; CHECK-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; CHECK-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; CHECK-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; CHECK-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; CHECK-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; CHECK-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; CHECK-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; CHECK-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] -; CHECK-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] -; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; X86-LABEL: test_ucmp_b_128: +; X86: # %bb.0: +; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vmovd %eax, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd0] +; X86-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd0,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X86-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X86-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_ucmp_b_128: +; X64: # %bb.0: +; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: 
[0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] +; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9] +; X64-NEXT: vpblendw $128, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80] +; X64-NEXT: # xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1) %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1) @@ -5583,23 +5762,29 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01] -; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02] -; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x05] -; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x06] -; X86-NEXT: kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca] -; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] -; X86-NEXT: vmovd %edx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc2] -; X86-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x01] -; X86-NEXT: kmovd %k3, %ecx # encoding: [0xc5,0xfb,0x93,0xcb] -; X86-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x02] -; X86-NEXT: kmovd %k4, %ecx # encoding: [0xc5,0xfb,0x93,0xcc] -; X86-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04] -; X86-NEXT: kmovd %k5, %ecx # encoding: [0xc5,0xfb,0x93,0xcd] -; X86-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x05] -; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] -; X86-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x06] +; X86-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vmovd %ecx, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xd1] +; X86-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x22,0xd1,0x01] +; X86-NEXT: vpbroadcastq %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd2] +; X86-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x01] +; X86-NEXT: vpcmpleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x02] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x02] +; X86-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: 
[0xc5,0xe1,0xef,0xdb] +; X86-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X86-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X86-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x04] +; X86-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd1,0x05] +; X86-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc1,0x06] ; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -5607,23 +5792,28 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1] -; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01] -; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k3 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02] -; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x04] -; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k5 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x05] -; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x06] -; X64-NEXT: kmovd %k2, %eax # encoding: [0xc5,0xfb,0x93,0xc2] -; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8] -; X64-NEXT: vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] -; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01] -; X64-NEXT: kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3] -; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02] -; X64-NEXT: kmovd %k4, %eax # encoding: [0xc5,0xfb,0x93,0xc4] -; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04] -; X64-NEXT: kmovd %k5, %eax # encoding: [0xc5,0xfb,0x93,0xc5] -; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05] -; X64-NEXT: kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1] -; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06] +; X64-NEXT: kunpckwd %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc0] +; X64-NEXT: kunpckdq %k0, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc0] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpbroadcastq %rax, %xmm2 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd0] +; X64-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x01] +; X64-NEXT: vpcmpleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x02] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 # 
EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x02] +; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb] +; X64-NEXT: vpblendw $8, %xmm3, %xmm2, %xmm2 # encoding: [0xc4,0xe3,0x69,0x0e,0xd3,0x08] +; X64-NEXT: # xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7] +; X64-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x04] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x04] +; X64-NEXT: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xd0,0x05] +; X64-NEXT: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpinsrw $6, %eax, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xc4,0xc0,0x06] ; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask) @@ -6119,9 +6309,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_b_128: ; X86: # %bb.0: ; X86-NEXT: vptestmb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] @@ -6146,9 +6337,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_b_256: ; X86: # %bb.0: ; X86-NEXT: vptestmb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -6199,9 +6391,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestm_w_256: ; X86: # %bb.0: ; X86-NEXT: vptestmw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: 
[0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] @@ -6228,9 +6421,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_b_128: ; X86: # %bb.0: ; X86-NEXT: vptestnmb %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl # encoding: [0xc3] @@ -6255,9 +6449,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_b_256: ; X86: # %bb.0: ; X86-NEXT: vptestnmb %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7e,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: andl %ecx, %eax # encoding: [0x21,0xc8] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandd %k1, %k0, %k1 # encoding: [0xc4,0xe1,0xfd,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -6308,9 +6503,10 @@ ; X86-LABEL: test_int_x86_avx512_ptestnm_w_256: ; X86: # %bb.0: ; X86-NEXT: vptestnmw %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: andw %cx, %ax # encoding: [0x66,0x21,0xc8] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] +; X86-NEXT: kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-fast-isel.ll @@ -7,9 +7,9 @@ define zeroext i8 @test_mm512_mask_fpclass_pd_mask(i8 zeroext %__U, <8 x double> %__A) { ; X86-LABEL: test_mm512_mask_fpclass_pd_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vfpclasspd $4, %zmm0, %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vfpclasspd $4, %zmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andb {{[0-9]+}}(%esp), %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -49,9 +49,9 @@ define zeroext i16 @test_mm512_mask_fpclass_ps_mask(i16 zeroext %__U, <16 x float> %__A) { ; X86-LABEL: test_mm512_mask_fpclass_ps_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vfpclassps $4, %zmm0, %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vfpclassps 
$4, %zmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll @@ -17,9 +17,7 @@ define void @mask8_mem(ptr %ptr) { ; CHECK-LABEL: mask8_mem: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovb (%rdi), %k0 -; CHECK-NEXT: knotb %k0, %k0 -; CHECK-NEXT: kmovb %k0, (%rdi) +; CHECK-NEXT: notb (%rdi) ; CHECK-NEXT: retq %x = load i8, ptr %ptr, align 4 %m0 = bitcast i8 %x to <8 x i1> diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-fast-isel.ll @@ -336,9 +336,9 @@ define zeroext i8 @test_mm256_mask_fpclass_ps_mask(i8 zeroext %__U, <8 x float> %__A) { ; X86-LABEL: test_mm256_mask_fpclass_ps_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vfpclassps $2, %ymm0, %k0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vfpclassps $2, %ymm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andb {{[0-9]+}}(%esp), %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll --- a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll +++ b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll @@ -92,9 +92,9 @@ ; ; X86-LABEL: TEST_mm512_mask_test_epi32_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -198,9 +198,9 @@ ; ; X86-LABEL: TEST_mm512_mask_testn_epi32_mask: ; X86: # %bb.0: # %entry -; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-fma-intrinsics.ll @@ -528,12 +528,12 @@ define void @fmadd_sh_mask_memfold(ptr %a, ptr %b, i8 %c) { ; X86-LABEL: fmadd_sh_mask_memfold: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] ; X86-NEXT: vmovsh (%ecx), %xmm0 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x01] ; X86-NEXT: vmovsh (%eax), %xmm1 # encoding: [0x62,0xf5,0x7e,0x08,0x10,0x08] ; X86-NEXT: vfmadd213sh %xmm0, %xmm0, %xmm1 # encoding: [0x62,0xf6,0x7d,0x08,0xa9,0xc8] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x0c] ; X86-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7e,0x09,0x10,0xc1] ; X86-NEXT: vmovsh %xmm0, (%ecx) # encoding: [0x62,0xf5,0x7e,0x08,0x11,0x01] ; X86-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll 
b/llvm/test/CodeGen/X86/avx512fp16-mov.ll --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -1995,25 +1995,27 @@ define <8 x half> @test21(half %a, half %b, half %c) nounwind { ; X64-LABEL: test21: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-NEXT: vmovsh %xmm2, %xmm3, %xmm2 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-NEXT: vpbroadcastw %xmm1, %xmm1 -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: vpbroadcastw %xmm1, %xmm1 +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq ; ; X86-LABEL: test21: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 -; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 -; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X86-NEXT: vpbroadcastw %xmm1, %xmm1 -; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpbroadcastw %xmm2, %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-NEXT: vpbroadcastw %xmm2, %xmm1 +; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X86-NEXT: retl %1 = insertelement <8 x half> , half %a, i32 0 %2 = insertelement <8 x half> %1, half %b, i32 1 @@ -2099,7 +2101,9 @@ ; X64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112] ; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7,8,9,10,11,12,13],ymm1[14],ymm2[15] +; X64-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X64-NEXT: vmovsh %xmm0, %xmm2, %xmm0 ; X64-NEXT: retq @@ -2115,7 +2119,9 @@ ; X86-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112] ; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 +; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X86-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7,8,9,10,11,12,13],ymm1[14],ymm2[15] +; X86-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-NEXT: vmovsh %xmm0, %xmm2, %xmm0 ; X86-NEXT: movl %ebp, %esp @@ -2130,8 +2136,9 @@ define <8 x i16> @pr59628_xmm(i16 %arg) { ; X64-LABEL: pr59628_xmm: ; X64: # %bb.0: -; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vpbroadcastw %edi, %xmm1 +; X64-NEXT: vmovsh %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1 ; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-upgrade.ll @@ -283,8 +283,8 @@ define void @test_mask_compress_store_b_512(ptr %addr, <64 x i8> %data, i64 %mask) { ; X86-LABEL: test_mask_compress_store_b_512: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpcompressb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x63,0x00] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics.ll @@ -282,8 +282,8 @@ define void @test_mask_compress_store_b_512(ptr %addr, <64 x i8> %data, i64 %mask) { ; X86-LABEL: test_mask_compress_store_b_512: ; X86: # %bb.0: -; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpcompressb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x63,0x00] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll --- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -19496,8 +19496,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19682,8 +19682,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -19859,8 +19859,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -20033,8 +20033,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovaps (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -21162,8 +21162,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed 
$xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl $3, %eax @@ -21343,8 +21343,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -21529,8 +21529,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -21706,8 +21706,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -21880,8 +21880,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %xmm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $14, %k0, %k0 ; NoVLX-NEXT: kshiftrw $14, %k0, %k0 @@ -22068,8 +22068,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -22260,8 +22260,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -22443,8 +22443,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 @@ -22623,8 +22623,8 @@ ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem: ; NoVLX: # %bb.0: # %entry ; NoVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vmovapd (%rsi), %ymm1 +; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kshiftlw $12, %k0, %k0 ; NoVLX-NEXT: kshiftrw $12, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/bit-test-shift.ll b/llvm/test/CodeGen/X86/bit-test-shift.ll --- a/llvm/test/CodeGen/X86/bit-test-shift.ll +++ b/llvm/test/CodeGen/X86/bit-test-shift.ll @@ -5,10 +5,12 @@ define i32 @x(i32 %t) nounwind readnone ssp { ; CHECK-LABEL: x: ; 
CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: shll $23, %eax -; CHECK-NEXT: sarl $31, %eax -; CHECK-NEXT: andl $-26, %eax +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp) +; CHECK-NEXT: je .LBB0_2 +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: movl $-26, %eax +; CHECK-NEXT: .LBB0_2: # %entry ; CHECK-NEXT: retl entry: %and = and i32 %t, 256 diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -163,17 +163,17 @@ ; ; AVX1-LABEL: v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpcmpgtw %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -182,10 +182,12 @@ ; AVX2-LABEL: v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -357,20 +357,17 @@ ; AVX2-LABEL: v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -426,27 +423,46 @@ ; SSE-NEXT: # kill: 
def $ax killed $ax killed $eax ; SSE-NEXT: retq ; -; AVX12-LABEL: v16f32: -; AVX12: # %bb.0: -; AVX12-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 -; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX12-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 -; AVX12-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 -; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX12-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vcmpltps %ymm5, %ymm7, %ymm1 -; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX12-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX12-NEXT: vcmpltps %ymm4, %ymm6, %ymm2 -; AVX12-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX12-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX12-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 -; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX12-NEXT: vpmovmskb %xmm0, %eax -; AVX12-NEXT: # kill: def $ax killed $ax killed $eax -; AVX12-NEXT: vzeroupper -; AVX12-NEXT: retq +; AVX1-LABEL: v16f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vcmpltps %ymm5, %ymm7, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vcmpltps %ymm4, %ymm6, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: # kill: def $ax killed $ax killed $eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vcmpltps %ymm5, %ymm7, %ymm1 +; AVX2-NEXT: vcmpltps %ymm4, %ymm6, %ymm2 +; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: # kill: def $ax killed $ax killed $eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: v16f32: ; AVX512F: # %bb.0: @@ -585,3 +601,5 @@ %res = bitcast <64 x i1> %y to i64 ret i64 %res } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX12: {{.*}} diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-128.ll @@ -566,18 +566,14 @@ ; ; AVX512F-LABEL: bitcast_16i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 -; AVX512F-NEXT: kmovw %k0, (%rdi) -; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: vpmovmskb %xmm0, %eax +; AVX512F-NEXT: movw %ax, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: bitcast_16i8_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovb2m %xmm0, %k0 -; AVX512BW-NEXT: kmovw %k0, (%rdi) +; AVX512BW-NEXT: vpmovmskb %xmm0, %eax +; AVX512BW-NEXT: movw %ax, (%rdi) ; AVX512BW-NEXT: retq %a1 = icmp slt <16 x i8> %a0, zeroinitializer %a2 = bitcast <16 x i1> %a1 to i16 @@ -638,17 +634,13 @@ ; ; AVX512F-LABEL: bitcast_4i32_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vmovmskps %xmm0, %eax ; AVX512F-NEXT: movb %al, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: bitcast_4i32_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpgtd %xmm0, %xmm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vmovmskps %xmm0, %eax ; AVX512BW-NEXT: movb %al, (%rdi) ; AVX512BW-NEXT: retq %a1 = icmp slt <4 x i32> %a0, zeroinitializer @@ -672,17 +664,13 @@ ; ; AVX512F-LABEL: bitcast_2i64_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: vmovmskpd %xmm0, %eax ; AVX512F-NEXT: movb %al, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: bitcast_2i64_store: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: vmovmskpd %xmm0, %eax ; AVX512BW-NEXT: movb %al, (%rdi) ; AVX512BW-NEXT: retq %a1 = icmp slt <2 x i64> %a0, zeroinitializer diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll @@ -330,26 +330,12 @@ ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: bitcast_32i8_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: kmovw %k1, 2(%rdi) -; AVX512F-NEXT: kmovw %k0, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: bitcast_32i8_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovb2m %ymm0, %k0 -; AVX512BW-NEXT: kmovd %k0, (%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: bitcast_32i8_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: movl %eax, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a1 = icmp slt <32 x i8> %a0, zeroinitializer %a2 = bitcast <32 x i1> %a1 to i32 store i32 %a2, ptr %p @@ -447,23 +433,12 @@ ; AVX12-NEXT: vzeroupper ; AVX12-NEXT: retq ; -; AVX512F-LABEL: bitcast_4i64_store: -; AVX512F: # 
%bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtq %ymm0, %ymm1, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: movb %al, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: bitcast_4i64_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpgtq %ymm0, %ymm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %eax -; AVX512BW-NEXT: movb %al, (%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: bitcast_4i64_store: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovmskpd %ymm0, %eax +; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %a1 = icmp slt <4 x i64> %a0, zeroinitializer %a2 = bitcast <4 x i1> %a1 to i4 store i4 %a2, ptr %p diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll --- a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll @@ -450,24 +450,12 @@ ; ; AVX512F-LABEL: bitcast_64i8_store: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k2 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k3 -; AVX512F-NEXT: kmovw %k3, 6(%rdi) -; AVX512F-NEXT: kmovw %k2, 4(%rdi) -; AVX512F-NEXT: kmovw %k1, 2(%rdi) -; AVX512F-NEXT: kmovw %k0, (%rdi) +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpmovmskb %ymm1, %eax +; AVX512F-NEXT: shlq $32, %rax +; AVX512F-NEXT: vpmovmskb %ymm0, %ecx +; AVX512F-NEXT: orq %rax, %rcx +; AVX512F-NEXT: movq %rcx, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -615,13 +603,10 @@ ; ; AVX1-LABEL: bitcast_8i64_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: movb %al, (%rdi) diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -46,30 +46,27 @@ } define i1 @trunc_v2i64_cmp(<2 x i64> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v2i64_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllq $63, %xmm0 -; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax -; SSE2-SSSE3-NEXT: testl %eax, %eax -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v2i64_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v2i64_cmp: +; SSE: # %bb.0: +; SSE-NEXT: psllq $63, %xmm0 +; SSE-NEXT: movmskpd %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX12-LABEL: trunc_v2i64_cmp: ; AVX12: # %bb.0: 
-; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX12-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX12-NEXT: vtestpd %xmm0, %xmm0 ; AVX12-NEXT: sete %al ; AVX12-NEXT: retq ; ; AVX512-LABEL: trunc_v2i64_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] -; AVX512-NEXT: vptest %xmm1, %xmm0 +; AVX512-NEXT: vpsllq $63, %xmm0, %xmm0 +; AVX512-NEXT: vptestmq %xmm0, %xmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: retq %1 = trunc <2 x i64> %a0 to <2 x i1> @@ -79,15 +76,30 @@ } define i2 @bitcast_v4i32_to_v2i2(<4 x i32> %a0) nounwind { -; SSE-LABEL: bitcast_v4i32_to_v2i2: -; SSE: # %bb.0: -; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $2, %cl -; SSE-NEXT: andb $3, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v4i32_to_v2i2: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $2, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $3, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v4i32_to_v2i2: +; SSE41: # %bb.0: +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $2, %cl +; SSE41-NEXT: andb $3, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX-LABEL: bitcast_v4i32_to_v2i2: ; AVX: # %bb.0: @@ -107,31 +119,29 @@ } define i1 @trunc_v4i32_cmp(<4 x i32> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v4i32_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $15, %eax -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v4i32_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i32_cmp: +; SSE: # %bb.0: +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: cmpl $15, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX12-LABEL: trunc_v4i32_cmp: ; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX12-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX12-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX12-NEXT: vtestps %xmm1, %xmm0 ; AVX12-NEXT: setb %al ; AVX12-NEXT: retq ; ; AVX512-LABEL: trunc_v4i32_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967297,4294967297] -; AVX512-NEXT: vptest %xmm1, %xmm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: testb $15, %al +; AVX512-NEXT: sete %al ; AVX512-NEXT: retq %1 = trunc <4 x i32> %a0 to <4 x i1> %2 = bitcast <4 x i1> %1 to i4 @@ -140,16 +150,32 @@ } define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind { -; SSE-LABEL: bitcast_v8i16_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: 
bitcast_v8i16_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $4, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $15, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i16_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $4, %cl +; SSE41-NEXT: andb $15, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX12-LABEL: bitcast_v8i16_to_v2i4: ; AVX12: # %bb.0: @@ -181,23 +207,19 @@ } define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v8i16_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v8i16_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v8i16_cmp: +; SSE: # %bb.0: +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX12-LABEL: trunc_v8i16_cmp: ; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX12-NEXT: vpsllw $15, %xmm0, %xmm0 +; AVX12-NEXT: vpmovmskb %xmm0, %eax +; AVX12-NEXT: testl $43690, %eax # imm = 0xAAAA ; AVX12-NEXT: setne %al ; AVX12-NEXT: retq ; @@ -232,24 +254,14 @@ ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: retq ; -; AVX12-LABEL: bitcast_v16i8_to_v2i8: -; AVX12: # %bb.0: -; AVX12-NEXT: vpmovmskb %xmm0, %ecx -; AVX12-NEXT: movl %ecx, %eax -; AVX12-NEXT: shrl $8, %eax -; AVX12-NEXT: addb %cl, %al -; AVX12-NEXT: # kill: def $al killed $al killed $eax -; AVX12-NEXT: retq -; -; AVX512-LABEL: bitcast_v16i8_to_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovb2m %xmm0, %k0 -; AVX512-NEXT: kshiftrw $8, %k0, %k1 -; AVX512-NEXT: kmovd %k0, %ecx -; AVX512-NEXT: kmovd %k1, %eax -; AVX512-NEXT: addb %cl, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq +; AVX-LABEL: bitcast_v16i8_to_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vpmovmskb %xmm0, %ecx +; AVX-NEXT: movl %ecx, %eax +; AVX-NEXT: shrl $8, %eax +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq %1 = icmp slt <16 x i8> %a0, zeroinitializer %2 = bitcast <16 x i1> %1 to <2 x i8> %3 = extractelement <2 x i8> %2, i32 0 @@ -259,32 +271,21 @@ } define i1 @trunc_v16i8_cmp(<16 x i8> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v16i8_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v16i8_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setae %al -; SSE41-NEXT: retq -; -; AVX12-LABEL: trunc_v16i8_cmp: -; AVX12: # %bb.0: -; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX12-NEXT: 
setae %al -; AVX12-NEXT: retq +; SSE-LABEL: trunc_v16i8_cmp: +; SSE: # %bb.0: +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; -; AVX512-LABEL: trunc_v16i8_cmp: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %xmm1, %xmm0 -; AVX512-NEXT: setae %al -; AVX512-NEXT: retq +; AVX-LABEL: trunc_v16i8_cmp: +; AVX: # %bb.0: +; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX-NEXT: vpmovmskb %xmm0, %eax +; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX-NEXT: setne %al +; AVX-NEXT: retq %1 = trunc <16 x i8> %a0 to <16 x i1> %2 = bitcast <16 x i1> %1 to i16 %3 = icmp ne i16 %2, -1 @@ -296,16 +297,32 @@ ; define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind { -; SSE-LABEL: bitcast_v4i64_to_v2i2: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: movmskps %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $2, %cl -; SSE-NEXT: andb $3, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v4i64_to_v2i2: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $2, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $3, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v4i64_to_v2i2: +; SSE41: # %bb.0: +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: movmskps %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $2, %cl +; SSE41-NEXT: andb $3, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX-LABEL: bitcast_v4i64_to_v2i2: ; AVX: # %bb.0: @@ -326,41 +343,39 @@ } define i1 @trunc_v4i64_cmp(<4 x i64> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v4i64_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: testl %eax, %eax -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v4i64_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setne %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v4i64_cmp: +; SSE: # %bb.0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: pslld $31, %xmm0 +; SSE-NEXT: movmskps %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v4i64_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vtestps %xmm0, %xmm0 ; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v4i64_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX2-NEXT: vtestpd %ymm0, %ymm0 ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: 
trunc_v4i64_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX512-NEXT: vptest %ymm1, %ymm0 +; AVX512-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512-NEXT: vptestmq %ymm0, %ymm0, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -371,17 +386,34 @@ } define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind { -; SSE-LABEL: bitcast_v8i32_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v8i32_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $4, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $15, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i32_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $4, %cl +; SSE41-NEXT: andb $15, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX-LABEL: bitcast_v8i32_to_v2i4: ; AVX: # %bb.0: @@ -402,33 +434,35 @@ } define i1 @trunc_v8i132_cmp(<8 x i32> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v8i132_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $15, %eax -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; ; SSE41-LABEL: trunc_v8i132_cmp: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setae %al +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i132_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setae %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i132_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 ; AVX2-NEXT: setae %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -510,33 +544,38 @@ } define i1 
@trunc_v16i16_cmp(<16 x i16> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v16i16_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v16i16_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: sete %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v16i16_cmp: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v16i16_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i16_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -585,10 +624,9 @@ ; ; AVX512-LABEL: bitcast_v32i8_to_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovb2m %ymm0, %k0 -; AVX512-NEXT: kshiftrd $16, %k0, %k1 -; AVX512-NEXT: kmovd %k0, %ecx -; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: vpmovmskb %ymm0, %ecx +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: shrl $16, %eax ; AVX512-NEXT: addl %ecx, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -602,42 +640,41 @@ } define i1 @trunc_v32i8_cmp(<32 x i8> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v32i8_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: xorl $65535, %eax # imm = 0xFFFF -; SSE2-SSSE3-NEXT: sete %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v32i8_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i8_cmp: +; SSE: # %bb.0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: sete %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i8_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i8_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512-LABEL: trunc_v32i8_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: setb %al +; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512-NEXT: vpmovmskb %ymm0, %eax +; AVX512-NEXT: cmpl $-1, %eax +; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = trunc <32 x i8> %a0 to <32 x i1> @@ -651,29 +688,45 @@ ; define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind { -; SSE-LABEL: bitcast_v8i64_to_v2i4: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packssdw %xmm2, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb $4, %cl -; SSE-NEXT: andb $15, %al -; SSE-NEXT: addb %cl, %al -; SSE-NEXT: # kill: def $al killed $al killed $eax -; SSE-NEXT: retq +; SSE2-SSSE3-LABEL: bitcast_v8i64_to_v2i4: +; SSE2-SSSE3: # %bb.0: +; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax +; SSE2-SSSE3-NEXT: movl %eax, %ecx +; SSE2-SSSE3-NEXT: shrb $4, %cl +; SSE2-SSSE3-NEXT: movzbl %cl, %ecx +; SSE2-SSSE3-NEXT: andb $15, %al +; SSE2-SSSE3-NEXT: movzbl %al, %eax +; SSE2-SSSE3-NEXT: movd %eax, %xmm0 +; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 +; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-SSSE3-NEXT: addb -{{[0-9]+}}(%rsp), %al +; SSE2-SSSE3-NEXT: retq +; +; SSE41-LABEL: bitcast_v8i64_to_v2i4: +; SSE41: # %bb.0: +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: packssdw %xmm1, %xmm0 +; SSE41-NEXT: packssdw %xmm2, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: movl %eax, %ecx +; SSE41-NEXT: shrb $4, %cl +; SSE41-NEXT: andb $15, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: # kill: def $al killed $al killed $eax +; SSE41-NEXT: retq ; ; AVX1-LABEL: bitcast_v8i64_to_v2i4: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovmskps %ymm0, %eax ; AVX1-NEXT: movl %eax, %ecx @@ -740,26 +793,43 @@ ; ; SSE41-LABEL: trunc_v8i64_cmp: ; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setb %al +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packusdw %xmm2, %xmm0 +; SSE41-NEXT: psllw $15, %xmm0 +; SSE41-NEXT: packsswb %xmm0, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: cmpb $-1, %al +; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v8i64_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vtestps %xmm1, %xmm0 ; AVX1-NEXT: setb %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v8i64_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] +; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vtestps %ymm1, %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -858,37 +928,65 @@ define i1 @trunc_v16i32_cmp(<16 x i32> %a0) nounwind { ; SSE2-SSSE3-LABEL: trunc_v16i32_cmp: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: por %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: por %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: por %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: pslld $31, %xmm0 -; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax +; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 +; SSE2-SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1 +; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm0 +; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: psllw $7, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax ; SSE2-SSSE3-NEXT: testl %eax, %eax ; SSE2-SSSE3-NEXT: sete %al ; SSE2-SSSE3-NEXT: retq ; ; SSE41-LABEL: trunc_v16i32_cmp: ; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pand %xmm4, %xmm1 +; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pmovmskb %xmm0, %eax +; SSE41-NEXT: testl %eax, %eax ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc_v16i32_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; 
AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v16i32_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967297,4294967297,4294967297,4294967297] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX2-NEXT: vpmovmskb %xmm0, %eax +; AVX2-NEXT: testl %eax, %eax ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -961,41 +1059,50 @@ } define i1 @trunc_v32i16_cmp(<32 x i16> %a0) nounwind { -; SSE2-SSSE3-LABEL: trunc_v32i16_cmp: -; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1 -; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 -; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: psllw $7, %xmm0 -; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax -; SSE2-SSSE3-NEXT: notl %eax -; SSE2-SSSE3-NEXT: testl $21845, %eax # imm = 0x5555 -; SSE2-SSSE3-NEXT: setne %al -; SSE2-SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_v32i16_cmp: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: setae %al -; SSE41-NEXT: retq +; SSE-LABEL: trunc_v32i16_cmp: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: psllw $7, %xmm2 +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; SSE-NEXT: setne %al +; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_v32i16_cmp: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setae %al +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovmskb %xmm0, %eax +; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF +; AVX1-NEXT: setne %al ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_v32i16_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [281479271743489,281479271743489,281479271743489,281479271743489] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX2-NEXT: vpmovmskb %ymm0, %eax +; AVX2-NEXT: cmpl $-1, %eax +; 
AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1026,7 +1133,6 @@ ; SSE2-SSSE3-NEXT: shll $16, %edx ; SSE2-SSSE3-NEXT: orl %eax, %edx ; SSE2-SSSE3-NEXT: shlq $32, %rdx -; SSE2-SSSE3-NEXT: orq %rcx, %rdx ; SSE2-SSSE3-NEXT: movq %rdx, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-SSSE3-NEXT: movd %xmm0, %eax @@ -1473,14 +1579,10 @@ ; SSE-NEXT: packssdw %xmm1, %xmm0 ; SSE-NEXT: movdqu (%rdi), %xmm1 ; SSE-NEXT: movdqu 16(%rdi), %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE-NEXT: packssdw %xmm4, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pmovmskb %xmm3, %eax -; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: packssdw %xmm2, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA ; SSE-NEXT: setne %al ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll --- a/llvm/test/CodeGen/X86/bitselect.ll +++ b/llvm/test/CodeGen/X86/bitselect.ll @@ -35,21 +35,21 @@ define i16 @bitselect_i16(i16 %a, i16 %b, i16 %m) nounwind { ; X86-LABEL: bitselect_i16: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorw %ax, %cx +; X86-NEXT: andw %cx, %ax +; X86-NEXT: notl %ecx ; X86-NEXT: andw {{[0-9]+}}(%esp), %cx -; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-NOBMI-LABEL: bitselect_i16: ; X64-NOBMI: # %bb.0: -; X64-NOBMI-NEXT: movl %edx, %eax -; X64-NOBMI-NEXT: andl %edx, %esi -; X64-NOBMI-NEXT: notl %eax -; X64-NOBMI-NEXT: andl %edi, %eax -; X64-NOBMI-NEXT: orl %esi, %eax +; X64-NOBMI-NEXT: movl %esi, %eax +; X64-NOBMI-NEXT: xorl %edi, %eax +; X64-NOBMI-NEXT: andl %edx, %eax +; X64-NOBMI-NEXT: xorl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NOBMI-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/bool-ext-inc.ll b/llvm/test/CodeGen/X86/bool-ext-inc.ll --- a/llvm/test/CodeGen/X86/bool-ext-inc.ll +++ b/llvm/test/CodeGen/X86/bool-ext-inc.ll @@ -6,8 +6,8 @@ define i32 @sext_inc(i1 zeroext %x) nounwind { ; CHECK-LABEL: sext_inc: ; CHECK: # %bb.0: -; CHECK-NEXT: xorb $1, %dil -; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: xorl $1, %eax ; CHECK-NEXT: retq %ext = sext i1 %x to i32 %add = add i32 %ext, 1 @@ -19,8 +19,10 @@ define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind { ; CHECK-LABEL: sext_inc_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $31, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> %add = add <4 x i32> %ext, @@ -31,8 +33,8 @@ ; CHECK-LABEL: cmpgt_sext_inc_vec: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i32> %x, %y %ext = sext <4 x i1> %cmp to <4 x i32> @@ -44,7 +46,8 @@ ; CHECK-LABEL: cmpne_sext_inc_vec: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, 
%xmm1 +; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %cmp = icmp ne <4 x i32> %x, %y %ext = sext <4 x i1> %cmp to <4 x i32> @@ -56,8 +59,8 @@ ; CHECK-LABEL: cmpgt_sext_inc_vec256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i64> %x, %y %ext = sext <4 x i1> %cmp to <4 x i64> diff --git a/llvm/test/CodeGen/X86/bool-math.ll b/llvm/test/CodeGen/X86/bool-math.ll --- a/llvm/test/CodeGen/X86/bool-math.ll +++ b/llvm/test/CodeGen/X86/bool-math.ll @@ -12,8 +12,9 @@ ; ; X32-LABEL: sub_zext_cmp_mask_same_size_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: andb $1, %al +; X32-NEXT: movzbl %al, %eax ; X32-NEXT: orl $-28, %eax ; X32-NEXT: retl %a = and i32 %x, 1 @@ -141,7 +142,7 @@ ; ; X32-LABEL: low_bit_select_constants_bigger_false_same_size_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: andl $1, %eax ; X32-NEXT: orl $42, %eax ; X32-NEXT: retl @@ -161,7 +162,7 @@ ; ; X32-LABEL: low_bit_select_constants_bigger_false_wider_result: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: andl $1, %eax ; X32-NEXT: orl $26, %eax ; X32-NEXT: xorl %edx, %edx @@ -183,7 +184,7 @@ ; ; X32-LABEL: low_bit_select_constants_bigger_false_narrower_result: ; X32: # %bb.0: -; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: andl $1, %eax ; X32-NEXT: orl $36, %eax ; X32-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll --- a/llvm/test/CodeGen/X86/bswap.ll +++ b/llvm/test/CodeGen/X86/bswap.ll @@ -126,14 +126,21 @@ ; CHECK-LABEL: test2: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shll $24, %ecx +; CHECK-NEXT: shll $8, %eax +; CHECK-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: sarl $16, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test2: ; CHECK64: # %bb.0: ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: bswapl %eax +; CHECK64-NEXT: shll $24, %eax +; CHECK64-NEXT: shll $8, %edi +; CHECK64-NEXT: andl $16711680, %edi # imm = 0xFF0000 +; CHECK64-NEXT: orl %edi, %eax ; CHECK64-NEXT: sarl $16, %eax ; CHECK64-NEXT: retq %and = lshr i32 %a, 8 diff --git a/llvm/test/CodeGen/X86/bswap_tree2.ll b/llvm/test/CodeGen/X86/bswap_tree2.ll --- a/llvm/test/CodeGen/X86/bswap_tree2.ll +++ b/llvm/test/CodeGen/X86/bswap_tree2.ll @@ -11,20 +11,28 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: andl $16711935, %ecx # imm = 0xFF00FF +; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000 +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000 ; CHECK-NEXT: shll $8, %ecx -; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: shrl $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: orl %edx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test1: ; CHECK64: # %bb.0: +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: andl 
$16711680, %ecx # imm = 0xFF0000 ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: andl $16711935, %eax # imm = 0xFF00FF -; CHECK64-NEXT: shll $8, %eax -; CHECK64-NEXT: orl $-16777216, %edi # imm = 0xFF000000 -; CHECK64-NEXT: shrl $8, %edi +; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %ecx +; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: orl %ecx, %eax +; CHECK64-NEXT: bswapl %edi +; CHECK64-NEXT: shrl $16, %edi ; CHECK64-NEXT: orl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff diff --git a/llvm/test/CodeGen/X86/bt.ll b/llvm/test/CodeGen/X86/bt.ll --- a/llvm/test/CodeGen/X86/bt.ll +++ b/llvm/test/CodeGen/X86/bt.ll @@ -1064,7 +1064,7 @@ ; X86-LABEL: extend: ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: btl %ecx, %eax ; X86-NEXT: setb %al ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/btc_bts_btr.ll b/llvm/test/CodeGen/X86/btc_bts_btr.ll --- a/llvm/test/CodeGen/X86/btc_bts_btr.ll +++ b/llvm/test/CodeGen/X86/btc_bts_btr.ll @@ -859,8 +859,8 @@ ; X86-NEXT: .LBB33_2: ; X86-NEXT: notl %esi ; X86-NEXT: notl %edx -; X86-NEXT: andl %edx, (%eax) ; X86-NEXT: andl %esi, 4(%eax) +; X86-NEXT: andl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -899,8 +899,8 @@ ; X86-NEXT: movl %edx, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB34_2: -; X86-NEXT: orl %edx, (%eax) ; X86-NEXT: orl %esi, 4(%eax) +; X86-NEXT: orl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -938,8 +938,8 @@ ; X86-NEXT: movl %edx, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB35_2: -; X86-NEXT: xorl %edx, (%eax) ; X86-NEXT: xorl %esi, 4(%eax) +; X86-NEXT: xorl %edx, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -1027,8 +1027,8 @@ ; ; X86-LABEL: btr_64_mask_zeros: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $2, %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shlb $2, %cl ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shldl %cl, %eax, %edx @@ -1062,8 +1062,8 @@ ; ; X86-LABEL: bts_64_mask_zeros: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $2, %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shlb $2, %cl ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shldl %cl, %eax, %edx @@ -1094,8 +1094,8 @@ ; ; X86-LABEL: btc_64_mask_zeros: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $2, %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shlb $2, %cl ; X86-NEXT: movl $1, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: shldl %cl, %eax, %edx diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -8,22 +8,17 @@ ; SSE2-LABEL: foo: ; SSE2: # %bb.0: ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: shll $8, %ecx -; SSE2-NEXT: orl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl $65280, %eax # imm = 0xFF00 -; SSE2-NEXT: orl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: pinsrw $1, %eax, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; 
SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE41-LABEL: foo: ; SSE41: # %bb.0: ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: packusdw %xmm0, %xmm0 +; SSE41-NEXT: packuswb %xmm0, %xmm0 ; SSE41-NEXT: movl $255, %eax ; SSE41-NEXT: pinsrb $3, %eax, %xmm0 ; SSE41-NEXT: movd %xmm0, (%rdi) @@ -32,7 +27,8 @@ ; AVX-LABEL: foo: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: movl $255, %eax ; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, (%rdi) @@ -50,12 +46,12 @@ define <4 x float> @test_negative_zero_1(<4 x float> %A) { ; SSE2-LABEL: test_negative_zero_1: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_negative_zero_1: @@ -80,19 +76,14 @@ ; FIXME: This could be 'movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]'. define <2 x double> @test_negative_zero_2(<2 x double> %A) { -; SSE2-LABEL: test_negative_zero_2: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_negative_zero_2: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; SSE41-NEXT: retq +; SSE-LABEL: test_negative_zero_2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; SSE-NEXT: retq ; ; AVX-LABEL: test_negative_zero_2: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; AVX-NEXT: retq entry: %0 = extractelement <2 x double> %A, i32 0 @@ -788,9 +779,10 @@ define i32 @PR46586(ptr %p, <4 x i32> %v) { ; SSE2-LABEL: PR46586: ; SSE2: # %bb.0: -; SSE2-NEXT: movzbl 3(%rdi), %eax -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pinsrw $6, %eax, %xmm1 +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: movzbl 3(%rdi), %ecx +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pinsrw $6, %ecx, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] @@ -802,9 +794,10 @@ ; ; SSE41-LABEL: PR46586: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pinsrb $12, 3(%rdi), %xmm1 +; SSE41-NEXT: pextrd $3, %xmm1, %eax ; SSE41-NEXT: extractps $3, %xmm0, %ecx -; SSE41-NEXT: pextrb $3, %xmm1, %eax ; SSE41-NEXT: xorl %edx, %edx ; SSE41-NEXT: divl %ecx ; SSE41-NEXT: movl %edx, %eax @@ -812,9 +805,10 @@ ; ; AVX-LABEL: PR46586: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrb $12, 3(%rdi), %xmm1, %xmm1 +; AVX-NEXT: vpextrd $3, %xmm1, %eax ; AVX-NEXT: vextractps $3, %xmm0, %ecx -; AVX-NEXT: vpextrb $3, %xmm1, %eax ; AVX-NEXT: xorl %edx, %edx ; AVX-NEXT: divl %ecx ; 
AVX-NEXT: movl %edx, %eax diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-32.ll b/llvm/test/CodeGen/X86/bypass-slow-division-32.ll --- a/llvm/test/CodeGen/X86/bypass-slow-division-32.ll +++ b/llvm/test/CodeGen/X86/bypass-slow-division-32.ll @@ -174,13 +174,10 @@ ; CHECK-NEXT: imull %edx ; CHECK-NEXT: movl %edx, %eax ; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: sarl $3, %edx -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shll $5, %edx +; CHECK-NEXT: shrl $3, %edx ; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: shll $5, %edx ; CHECK-NEXT: subl %edx, %ecx -; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: retl %resultdiv = sdiv i32 %a, 33 diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll --- a/llvm/test/CodeGen/X86/cast-vsel.ll +++ b/llvm/test/CodeGen/X86/cast-vsel.ll @@ -343,9 +343,16 @@ ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovups da+4096(%rax), %ymm1 ; AVX1-NEXT: vcmpltps db+4096(%rax), %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovups dc+4096(%rax), %ymm2 ; AVX1-NEXT: vcmpltps dd+4096(%rax), %ymm2, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovups %ymm1, dj+4096(%rax) ; AVX1-NEXT: addq $32, %rax @@ -357,16 +364,22 @@ ; AVX2-LABEL: example25: ; AVX2: # %bb.0: # %vector.ph ; AVX2-NEXT: movq $-4096, %rax # imm = 0xF000 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB5_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovups da+4096(%rax), %ymm0 -; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm0, %ymm0 -; AVX2-NEXT: vmovups dc+4096(%rax), %ymm1 -; AVX2-NEXT: vcmpltps dd+4096(%rax), %ymm1, %ymm1 -; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, dj+4096(%rax) +; AVX2-NEXT: vmovups da+4096(%rax), %ymm1 +; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm1, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovups dc+4096(%rax), %ymm2 +; AVX2-NEXT: vcmpltps dd+4096(%rax), %ymm2, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, dj+4096(%rax) ; AVX2-NEXT: addq $32, %rax ; AVX2-NEXT: jne .LBB5_1 ; AVX2-NEXT: # %bb.2: # %for.end diff --git a/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll b/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll --- a/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll +++ b/llvm/test/CodeGen/X86/cfguard-x86-64-vectorcall.ll @@ -14,12 +14,12 @@ ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movups (%rdx), %xmm0 ; X64-NEXT: movups 16(%rdx), %xmm1 -; X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; 
X64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) ; X64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; X64-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: callq *__guard_dispatch_icall_fptr(%rip) ; X64-NEXT: nop ; X64-NEXT: addq $72, %rsp diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll --- a/llvm/test/CodeGen/X86/clz.ll +++ b/llvm/test/CodeGen/X86/clz.ll @@ -831,7 +831,6 @@ ; X86-NOCMOV-LABEL: cttz_i64_zero_test: ; X86-NOCMOV: # %bb.0: ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOCMOV-NOT: rep ; X86-NOCMOV-NEXT: bsfl {{[0-9]+}}(%esp), %edx ; X86-NOCMOV-NEXT: movl $32, %eax ; X86-NOCMOV-NEXT: je .LBB15_2 @@ -852,12 +851,10 @@ ; X86-CMOV-LABEL: cttz_i64_zero_test: ; X86-CMOV: # %bb.0: ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-CMOV-NOT: rep ; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx ; X86-CMOV-NEXT: movl $32, %edx ; X86-CMOV-NEXT: cmovnel %ecx, %edx ; X86-CMOV-NEXT: addl $32, %edx -; X86-CMOV-NOT: rep ; X86-CMOV-NEXT: bsfl %eax, %eax ; X86-CMOV-NEXT: cmovel %edx, %eax ; X86-CMOV-NEXT: xorl %edx, %edx @@ -1395,15 +1392,13 @@ ; ; X86-CLZ-LABEL: PR47603_trunc: ; X86-CLZ: # %bb.0: -; X86-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax -; X86-CLZ-NEXT: xorb $31, %al +; X86-CLZ-NEXT: bsrl {{[0-9]+}}(%esp), %eax ; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X86-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: PR47603_trunc: ; X64-CLZ: # %bb.0: -; X64-CLZ-NEXT: lzcntl %edi, %eax -; X64-CLZ-NEXT: xorb $31, %al +; X64-CLZ-NEXT: bsrl %edi, %eax ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq ; @@ -1481,13 +1476,11 @@ define i32 @cttz_i32_osize(i32 %x) optsize { ; X86-LABEL: cttz_i32_osize: ; X86: # %bb.0: -; X86-NOT: rep ; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: cttz_i32_osize: ; X64: # %bb.0: -; X64-NOT: rep ; X64-NEXT: bsfl %edi, %eax ; X64-NEXT: retq ; @@ -1517,13 +1510,11 @@ define i32 @cttz_i32_msize(i32 %x) minsize { ; X86-LABEL: cttz_i32_msize: ; X86: # %bb.0: -; X86-NOT: rep ; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: cttz_i32_msize: ; X64: # %bb.0: -; X64-NOT: rep ; X64-NEXT: bsfl %edi, %eax ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/cmov-promotion.ll b/llvm/test/CodeGen/X86/cmov-promotion.ll --- a/llvm/test/CodeGen/X86/cmov-promotion.ll +++ b/llvm/test/CodeGen/X86/cmov-promotion.ll @@ -30,20 +30,19 @@ define i32 @cmov_zpromotion_8_to_32(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_8_to_32: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $126, %ecx -; CMOV-NEXT: movl $255, %eax -; CMOV-NEXT: cmovnel %ecx, %eax +; CMOV-NEXT: andb $1, %dil +; CMOV-NEXT: decb %dil +; CMOV-NEXT: orb $126, %dil +; CMOV-NEXT: movzbl %dil, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_8_to_32: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $126, %eax -; NO_CMOV-NEXT: jne .LBB1_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $255, %eax -; NO_CMOV-NEXT: .LBB1_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andb $1, %al +; NO_CMOV-NEXT: decb %al +; NO_CMOV-NEXT: orb $126, %al +; NO_CMOV-NEXT: movzbl %al, %eax ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i8 12414, i8 -1 %ret = zext i8 %t0 to i32 @@ -53,20 +52,19 @@ define i64 @cmov_zpromotion_8_to_64(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_8_to_64: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $126, %ecx -; CMOV-NEXT: movl $255, %eax -; CMOV-NEXT: cmovneq %rcx, 
%rax +; CMOV-NEXT: andb $1, %dil +; CMOV-NEXT: decb %dil +; CMOV-NEXT: orb $126, %dil +; CMOV-NEXT: movzbl %dil, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_8_to_64: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $126, %eax -; NO_CMOV-NEXT: jne .LBB2_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $255, %eax -; NO_CMOV-NEXT: .LBB2_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andb $1, %al +; NO_CMOV-NEXT: decb %al +; NO_CMOV-NEXT: orb $126, %al +; NO_CMOV-NEXT: movzbl %al, %eax ; NO_CMOV-NEXT: xorl %edx, %edx ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i8 12414, i8 -1 @@ -77,20 +75,19 @@ define i32 @cmov_zpromotion_16_to_32(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_16_to_32: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E -; CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; CMOV-NEXT: cmovnel %ecx, %eax +; CMOV-NEXT: andl $1, %edi +; CMOV-NEXT: decl %edi +; CMOV-NEXT: orl $12414, %edi # imm = 0x307E +; CMOV-NEXT: movzwl %di, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_16_to_32: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E -; NO_CMOV-NEXT: jne .LBB3_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; NO_CMOV-NEXT: .LBB3_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andl $1, %eax +; NO_CMOV-NEXT: decl %eax +; NO_CMOV-NEXT: orl $12414, %eax # imm = 0x307E +; NO_CMOV-NEXT: movzwl %ax, %eax ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i16 12414, i16 -1 %ret = zext i16 %t0 to i32 @@ -100,20 +97,19 @@ define i64 @cmov_zpromotion_16_to_64(i1 %c) { ; CMOV-LABEL: cmov_zpromotion_16_to_64: ; CMOV: # %bb.0: -; CMOV-NEXT: testb $1, %dil -; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E -; CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; CMOV-NEXT: cmovneq %rcx, %rax +; CMOV-NEXT: andl $1, %edi +; CMOV-NEXT: decl %edi +; CMOV-NEXT: orl $12414, %edi # imm = 0x307E +; CMOV-NEXT: movzwl %di, %eax ; CMOV-NEXT: retq ; ; NO_CMOV-LABEL: cmov_zpromotion_16_to_64: ; NO_CMOV: # %bb.0: -; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp) -; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E -; NO_CMOV-NEXT: jne .LBB4_2 -; NO_CMOV-NEXT: # %bb.1: -; NO_CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF -; NO_CMOV-NEXT: .LBB4_2: +; NO_CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; NO_CMOV-NEXT: andl $1, %eax +; NO_CMOV-NEXT: decl %eax +; NO_CMOV-NEXT: orl $12414, %eax # imm = 0x307E +; NO_CMOV-NEXT: movzwl %ax, %eax ; NO_CMOV-NEXT: xorl %edx, %edx ; NO_CMOV-NEXT: retl %t0 = select i1 %c, i16 12414, i16 -1 diff --git a/llvm/test/CodeGen/X86/cmov.ll b/llvm/test/CodeGen/X86/cmov.ll --- a/llvm/test/CodeGen/X86/cmov.ll +++ b/llvm/test/CodeGen/X86/cmov.ll @@ -216,7 +216,7 @@ ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: notl %edi ; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovnsl %edi, %eax +; CHECK-NEXT: cmovgl %edi, %eax ; CHECK-NEXT: retq %not_x = xor i32 %x, -1 %1 = icmp slt i32 %not_x, -1 diff --git a/llvm/test/CodeGen/X86/cmp-bool.ll b/llvm/test/CodeGen/X86/cmp-bool.ll --- a/llvm/test/CodeGen/X86/cmp-bool.ll +++ b/llvm/test/CodeGen/X86/cmp-bool.ll @@ -25,8 +25,9 @@ define void @bool_ne(i1 zeroext %a, i1 zeroext %b, ptr nocapture %c) nounwind { ; CHECK-LABEL: bool_ne: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpb %sil, %dil -; CHECK-NEXT: je .LBB1_1 +; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: cmpl $1, %edi +; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: jmpq *%rdx # TAILCALL ; 
CHECK-NEXT: .LBB1_1: # %if.end diff --git a/llvm/test/CodeGen/X86/cmp-concat.ll b/llvm/test/CodeGen/X86/cmp-concat.ll --- a/llvm/test/CodeGen/X86/cmp-concat.ll +++ b/llvm/test/CodeGen/X86/cmp-concat.ll @@ -36,7 +36,7 @@ ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: movzwl %si, %ecx ; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: orq %rcx, %rax ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %zx = zext i16 %x to i64 @@ -54,7 +54,7 @@ ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: movzwl %si, %ecx ; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: orq %rcx, %rax ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %zx = zext i16 %x to i64 diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll --- a/llvm/test/CodeGen/X86/cmp.ll +++ b/llvm/test/CodeGen/X86/cmp.ll @@ -310,8 +310,10 @@ define i8 @signbit_i16(i16 signext %L) { ; CHECK-LABEL: signbit_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff] -; CHECK-NEXT: setns %al # encoding: [0x0f,0x99,0xc0] +; CHECK-NEXT: movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7] +; CHECK-NEXT: shrl $15, %eax # encoding: [0xc1,0xe8,0x0f] +; CHECK-NEXT: xorb $1, %al # encoding: [0x34,0x01] +; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq # encoding: [0xc3] %lshr = lshr i16 %L, 15 %trunc = trunc i16 %lshr to i8 diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -589,8 +589,9 @@ ; ; AVX512-LABEL: neg_scalar_broadcast_v8i64_arg: ; AVX512: # %bb.0: +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <8 x i64> undef, i64 %1, i64 0 @@ -618,35 +619,38 @@ ; AVX1-LABEL: neg_scalar_broadcast_v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: notq %rdi +; AVX1-NEXT: vmovq %rdi, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,3] -; AVX1-NEXT: vmovq %rdi, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; AVX1-NEXT: vandnpd %ymm0, %ymm2, %ymm0 -; AVX1-NEXT: vandnpd %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,0] +; AVX2-NEXT: notq %rdi +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,0] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1] -; AVX2-NEXT: vmovq %rdi, %xmm2 -; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-NEXT: vpandn %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: 
vpbroadcastq %rdi, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,1,1,0,1,0,0] ; AVX512-NEXT: vpermq %zmm0, %zmm2, %zmm0 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <8 x i64> undef, i64 %1, i64 0 @@ -668,23 +672,26 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v4i64_arg: ; AVX1: # %bb.0: +; AVX1-NEXT: notq %rdi ; AVX1-NEXT: vmovq %rdi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v4i64_arg: ; AVX2: # %bb.0: +; AVX2-NEXT: notq %rdi ; AVX2-NEXT: vmovq %rdi, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v4i64_arg: ; AVX512: # %bb.0: +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <4 x i64> undef, i64 %1, i64 0 @@ -708,29 +715,32 @@ ; AVX1-LABEL: neg_scalar_broadcast_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: notq %rdi ; AVX1-NEXT: vmovq %rdi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,3] -; AVX1-NEXT: vandnpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: notq %rdi ; AVX2-NEXT: vmovq %rdi, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1] -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %ymm1 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1] -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <4 x i64> undef, i64 %1, i64 0 @@ -743,30 +753,33 @@ define <2 x i64> @neg_scalar_broadcast_v2i64(i64 %a0, <2 x i64> %a1) { ; SSE-LABEL: neg_scalar_broadcast_v2i64: ; SSE: # %bb.0: +; SSE-NEXT: notq %rdi ; SSE-NEXT: movq %rdi, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v2i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notq %rdi ; AVX1-NEXT: vmovq %rdi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v2i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notq %rdi ; AVX2-NEXT: vmovq %rdi, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v2i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notq %rdi ; AVX512-NEXT: vpbroadcastq %rdi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: 
vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = xor i64 %a0, -1 %2 = insertelement <2 x i64> undef, i64 %1, i64 0 @@ -819,23 +832,26 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v8i32: ; AVX1: # %bb.0: +; AVX1-NEXT: notl %edi ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v8i32: ; AVX2: # %bb.0: +; AVX2-NEXT: notl %edi ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i32: ; AVX512: # %bb.0: +; AVX512-NEXT: notl %edi ; AVX512-NEXT: vpbroadcastd %edi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i32 %a0, -1 %2 = insertelement <8 x i32> undef, i32 %1, i64 0 @@ -847,32 +863,35 @@ define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) { ; SSE-LABEL: neg_scalar_broadcast_v8i16: ; SSE: # %bb.0: +; SSE-NEXT: notl %edi ; SSE-NEXT: movd %edi, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v8i16: ; AVX1: # %bb.0: +; AVX1-NEXT: notl %edi ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v8i16: ; AVX2: # %bb.0: +; AVX2-NEXT: notl %edi ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i16: ; AVX512: # %bb.0: +; AVX512-NEXT: notl %edi ; AVX512-NEXT: vpbroadcastw %edi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = xor i16 %a0, -1 %2 = insertelement <8 x i16> undef, i16 %1, i64 0 @@ -884,32 +903,36 @@ define <16 x i8> @neg_scalar_broadcast_v16i8(i8 %a0, <16 x i8> %a1) { ; SSE-LABEL: neg_scalar_broadcast_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: notb %dil +; SSE-NEXT: movzbl %dil, %eax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: pshufb %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v16i8: ; AVX1: # %bb.0: +; AVX1-NEXT: notb %dil ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v16i8: ; AVX2: # %bb.0: +; AVX2-NEXT: notb %dil ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v16i8: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = 
xor i8 %a0, -1 %2 = insertelement <16 x i8> undef, i8 %1, i64 0 @@ -954,8 +977,9 @@ ; ; AVX512-LABEL: neg_scalar_broadcast_v64i8: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <64 x i8> undef, i8 %1, i64 0 @@ -1000,8 +1024,9 @@ ; ; AVX512-LABEL: neg_scalar_broadcast_v64i8_v8i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %zmm1 -; AVX512-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpandq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <64 x i8> undef, i8 %1, i64 0 @@ -1025,24 +1050,27 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v32i8_v4i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notb %dil ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v32i8_v4i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notb %dil ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v32i8_v4i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <32 x i8> undef, i8 %1, i64 0 @@ -1055,32 +1083,36 @@ define <2 x i64> @neg_scalar_broadcast_v16i8_v2i64(i8 %a0, <2 x i64> %a1) { ; SSE-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; SSE: # %bb.0: -; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: notb %dil +; SSE-NEXT: movzbl %dil, %eax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: pshufb %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notb %dil ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notb %dil ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v16i8_v2i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notb %dil ; AVX512-NEXT: vpbroadcastb %edi, %xmm1 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = xor i8 %a0, -1 %2 = insertelement <16 x i8> undef, i8 %1, i64 0 @@ -1102,23 +1134,26 @@ ; ; AVX1-LABEL: neg_scalar_broadcast_v8i32_v4i64: ; AVX1: # %bb.0: +; AVX1-NEXT: notl %edi ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: neg_scalar_broadcast_v8i32_v4i64: ; AVX2: # %bb.0: +; AVX2-NEXT: notl %edi ; AVX2-NEXT: vmovd %edi, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; 
AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: neg_scalar_broadcast_v8i32_v4i64: ; AVX512: # %bb.0: +; AVX512-NEXT: notl %edi ; AVX512-NEXT: vpbroadcastd %edi, %ymm1 -; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: retq %1 = xor i32 %a0, -1 %2 = insertelement <8 x i32> undef, i32 %1, i64 0 diff --git a/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll b/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll --- a/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll +++ b/llvm/test/CodeGen/X86/combine-avx2-intrinsics.ll @@ -113,6 +113,7 @@ define <4 x i64> @demandedelts_vpsrlvq(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: demandedelts_vpsrlvq: ; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastq %xmm1, %xmm1 ; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll --- a/llvm/test/CodeGen/X86/combine-bitreverse.ll +++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll @@ -75,9 +75,9 @@ ; X86-NEXT: andl $858993408, %eax # imm = 0x33333300 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $1431655744, %ecx # imm = 0x55555540 ; X86-NEXT: shrl %eax -; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: andl $1431655680, %eax # imm = 0x55555500 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: retl ; @@ -115,9 +115,9 @@ ; X64-NEXT: andl $858993408, %eax # imm = 0x33333300 ; X64-NEXT: leal (%rax,%rcx,4), %eax ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X64-NEXT: andl $1431655744, %ecx # imm = 0x55555540 ; X64-NEXT: shrl %eax -; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: andl $1431655680, %eax # imm = 0x55555500 ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: retq %b = call i32 @llvm.bitreverse.i32(i32 %a0) @@ -163,7 +163,7 @@ ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax -; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: andl $1431655764, %eax # imm = 0x55555554 ; X86-NEXT: leal (%eax,%ecx,2), %edx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: retl @@ -183,34 +183,33 @@ ; X64-NEXT: andq %rax, %rcx ; X64-NEXT: shrq $2, %rdi ; X64-NEXT: andq %rax, %rdi -; X64-NEXT: leaq (%rdi,%rcx,4), %rax -; X64-NEXT: movabsq $6148914689804861440, %rcx # imm = 0x5555555500000000 -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: shrq %rax -; X64-NEXT: movabsq $6148914685509894144, %rdx # imm = 0x5555555400000000 -; X64-NEXT: andq %rax, %rdx -; X64-NEXT: leaq (%rdx,%rcx,2), %rax -; X64-NEXT: shrq $33, %rax -; X64-NEXT: bswapq %rax -; X64-NEXT: movabsq $1085102592318504960, %rcx # imm = 0xF0F0F0F00000000 -; X64-NEXT: andq %rax, %rcx -; X64-NEXT: shrq $4, %rax -; X64-NEXT: movabsq $1085102557958766592, %rdx # imm = 0xF0F0F0700000000 -; X64-NEXT: andq %rax, %rdx -; X64-NEXT: shlq $4, %rcx -; X64-NEXT: orq %rdx, %rcx -; X64-NEXT: movabsq $3689348813882916864, %rax # imm = 0x3333333300000000 -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: shrq $2, %rcx -; X64-NEXT: movabsq $3689348805292982272, %rdx # imm = 0x3333333100000000 +; X64-NEXT: leaq (%rdi,%rcx,4), %rdx +; X64-NEXT: movabsq $6148914689804861440, %rax # imm = 0x5555555500000000 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: andq %rax, %rsi +; X64-NEXT: shrq %rdx +; X64-NEXT: movabsq $6148914685509894144, %rcx # imm = 0x5555555400000000 ; X64-NEXT: andq 
%rcx, %rdx -; X64-NEXT: leaq (%rdx,%rax,4), %rax -; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 -; X64-NEXT: movq %rax, %rdx +; X64-NEXT: leaq (%rdx,%rsi,2), %rdx +; X64-NEXT: shrq $33, %rdx +; X64-NEXT: bswapq %rdx +; X64-NEXT: movabsq $1085102592318504960, %rsi # imm = 0xF0F0F0F00000000 +; X64-NEXT: andq %rdx, %rsi +; X64-NEXT: shrq $4, %rdx +; X64-NEXT: movabsq $1085102557958766592, %rdi # imm = 0xF0F0F0700000000 +; X64-NEXT: andq %rdx, %rdi +; X64-NEXT: shlq $4, %rsi +; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: movabsq $3689348813882916864, %rdx # imm = 0x3333333300000000 +; X64-NEXT: andq %rsi, %rdx +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: movabsq $3689348805292982272, %rdi # imm = 0x3333333100000000 +; X64-NEXT: andq %rsi, %rdi +; X64-NEXT: leaq (%rdi,%rdx,4), %rdx +; X64-NEXT: andq %rdx, %rax +; X64-NEXT: shrq %rdx ; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: shrq %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: leaq (%rax,%rdx,2), %rax +; X64-NEXT: leaq (%rdx,%rax,2), %rax ; X64-NEXT: retq %1 = call i64 @llvm.bitreverse.i64(i64 %a) %2 = lshr i64 %1, 33 @@ -254,9 +253,9 @@ ; X86-NEXT: andl $36909875, %eax # imm = 0x2333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $5592405, %ecx # imm = 0x555555 ; X86-NEXT: shrl %eax -; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: andl $22369621, %eax # imm = 0x1555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax ; X86-NEXT: retl ; @@ -294,9 +293,9 @@ ; X64-NEXT: andl $36909875, %ecx # imm = 0x2333333 ; X64-NEXT: leal (%rcx,%rax,4), %eax ; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X64-NEXT: andl $5592405, %ecx # imm = 0x555555 ; X64-NEXT: shrl %eax -; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: andl $22369621, %eax # imm = 0x1555555 ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: retq %b = call i32 @llvm.bitreverse.i32(i32 %a0) @@ -338,7 +337,7 @@ ; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $357913941, %ecx # imm = 0x15555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 ; X86-NEXT: leal (%eax,%ecx,2), %eax @@ -377,12 +376,11 @@ ; X64-NEXT: shrq $2, %rax ; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X64-NEXT: leaq (%rax,%rcx,4), %rax -; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $357913941, %ecx # imm = 0x15555555 ; X64-NEXT: shrq %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: leaq (%rax,%rdx,2), %rax +; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: leaq (%rax,%rcx,2), %rax ; X64-NEXT: retq %1 = call i64 @llvm.bitreverse.i64(i64 %a) %2 = shl i64 %1, 33 diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -541,8 +541,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq %rdi, %xmm2 ; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm3 +; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; 
AVX2-NEXT: retq ; @@ -551,14 +554,20 @@ ; AVX512F-NEXT: vmovq %rdi, %xmm2 ; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrr: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastq %rdi, %ymm2 -; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VL-NEXT: vpternlogq $248, %ymm2, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %1 = insertelement <4 x i64> undef, i64 %a2, i32 0 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer @@ -590,25 +599,43 @@ ; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v4i64_broadcast_rrm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm2 -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v4i64_broadcast_rrm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v4i64_broadcast_rrm: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm3 +; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vbroadcastsd (%rdi), %ymm2 -; AVX512F-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpternlogq $228, (%rdi){1to4}, %ymm1, %ymm0 +; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512VL-NEXT: vpternlogq $248, %ymm2, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %a2 = load i64, ptr %p2 %1 = insertelement <4 x i64> undef, i64 %a2, i32 0 @@ -914,19 +941,35 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq %rdi, %xmm4 ; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpxor %xmm5, %xmm4, %xmm5 +; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: 
retq ; -; AVX512-LABEL: bitselect_v8i64_broadcast_rrr: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq %rdi, %zmm2 -; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: bitselect_v8i64_broadcast_rrr: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq %rdi, %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512F-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: bitselect_v8i64_broadcast_rrr: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq %rdi, %zmm2 +; AVX512VL-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512VL-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512VL-NEXT: retq %1 = insertelement <8 x i64> undef, i64 %a2, i32 0 %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer %3 = xor <8 x i64> %1, @@ -966,21 +1009,49 @@ ; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v8i64_broadcast_rrm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm4 -; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq ; -; AVX512-LABEL: bitselect_v8i64_broadcast_rrm: -; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogq $228, (%rdi){1to8}, %zmm1, %zmm0 -; AVX512-NEXT: retq +; AVX2-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq (%rdi), %ymm4 +; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpxor %xmm5, %xmm4, %xmm5 +; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq (%rdi), %zmm2 +; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512F-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq (%rdi), %zmm2 +; AVX512VL-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; AVX512VL-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 +; AVX512VL-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm0 +; AVX512VL-NEXT: retq %a2 = load i64, ptr %p2 %1 = insertelement <8 x i64> undef, i64 %a2, i32 0 %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/combine-bswap.ll b/llvm/test/CodeGen/X86/combine-bswap.ll --- a/llvm/test/CodeGen/X86/combine-bswap.ll +++ 
b/llvm/test/CodeGen/X86/combine-bswap.ll @@ -42,15 +42,15 @@ define i16 @test_bswap_srli_8_bswap_i16(i16 %a) nounwind { ; X86-LABEL: test_bswap_srli_8_bswap_i16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $8, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rolw $8, %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: test_bswap_srli_8_bswap_i16: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $8, %eax +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: rolw $8, %ax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %1 = call i16 @llvm.bswap.i16(i16 %a) @@ -106,7 +106,8 @@ ; X64-LABEL: test_bswap_shli_8_bswap_i16: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movzbl %ah, %eax +; X64-NEXT: andl $65280, %eax # imm = 0xFF00 +; X64-NEXT: rolw $8, %ax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %1 = call i16 @llvm.bswap.i16(i16 %a) @@ -136,8 +137,12 @@ define i64 @test_bswap_shli_16_bswap_i64(i64 %a) nounwind { ; X86-LABEL: test_bswap_shli_16_bswap_i64: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shrl $16, %edx ; X86-NEXT: retl ; ; X64-LABEL: test_bswap_shli_16_bswap_i64: @@ -220,7 +225,7 @@ define i64 @test_bswap64_shift48(i64 %a0) { ; X86-LABEL: test_bswap64_shift48: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: rolw $8, %ax ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: xorl %edx, %edx diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -48,8 +48,7 @@ ; AVX1-NEXT: movl $1091567616, 30256(%rax) # imm = 0x41100000 ; AVX1-NEXT: movabsq $4294967297, %rcx # imm = 0x100000001 ; AVX1-NEXT: movq %rcx, 46348(%rax) -; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3] -; AVX1-NEXT: # ymm0 = mem[0,1,0,1] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm0 = [?,?,?,?] 
; AVX1-NEXT: vmovups %ymm0, 48296(%rax) ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovsd %xmm0, 47372(%rax) @@ -91,25 +90,24 @@ ; AVX1-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-NEXT: vmovaps (%rsi), %ymm1 ; AVX1-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0,2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,0],xmm0[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: concat_of_broadcast_v4f32_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,0] -; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [6,7,4,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4],mem[5,6],ymm0[7] -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vmovaps (%rsi), %ymm1 +; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = <6,0,u,3> +; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],mem[2,3,4,5,6,7] +; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %ld0 = load volatile <8 x float>, ptr %a0 diff --git a/llvm/test/CodeGen/X86/combine-multiplies.ll b/llvm/test/CodeGen/X86/combine-multiplies.ll --- a/llvm/test/CodeGen/X86/combine-multiplies.ll +++ b/llvm/test/CodeGen/X86/combine-multiplies.ll @@ -105,21 +105,21 @@ define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind { ; CHECK-LABEL: testCombineMultiplies_splat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11] -; CHECK-NEXT: paddd %xmm0, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,22,22,22] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [22,22,22,22] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-NEXT: pmuludq %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,242,242,242] -; CHECK-NEXT: paddd %xmm0, %xmm2 +; CHECK-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; CHECK-NEXT: movdqa %xmm2, v2 -; CHECK-NEXT: movdqa %xmm0, v3 -; CHECK-NEXT: movdqa %xmm1, x +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [242,242,242,242] +; CHECK-NEXT: paddd %xmm2, %xmm1 +; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; CHECK-NEXT: movdqa %xmm1, v2 +; CHECK-NEXT: movdqa %xmm2, v3 +; CHECK-NEXT: movdqa %xmm0, x ; CHECK-NEXT: retl entry: %add1 = add <4 x i32> %v1, @@ -139,20 +139,20 @@ define void 
@testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind { ; CHECK-LABEL: testCombineMultiplies_non_splat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44] -; CHECK-NEXT: paddd %xmm0, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [22,33,44,55] +; CHECK-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420] -; CHECK-NEXT: paddd %xmm0, %xmm2 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420] +; CHECK-NEXT: paddd %xmm1, %xmm2 +; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; CHECK-NEXT: movdqa %xmm2, v2 -; CHECK-NEXT: movdqa %xmm0, v3 -; CHECK-NEXT: movdqa %xmm1, x +; CHECK-NEXT: movdqa %xmm1, v3 +; CHECK-NEXT: movdqa %xmm0, x ; CHECK-NEXT: retl entry: %add1 = add <4 x i32> %v1, diff --git a/llvm/test/CodeGen/X86/combine-or.ll b/llvm/test/CodeGen/X86/combine-or.ll --- a/llvm/test/CodeGen/X86/combine-or.ll +++ b/llvm/test/CodeGen/X86/combine-or.ll @@ -345,7 +345,9 @@ define <4 x float> @test25(<4 x float> %a0) { ; CHECK-LABEL: test25: ; CHECK: # %bb.0: -; CHECK-NEXT: blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast <4 x float> %a0 to <4 x i32> %bc2 = bitcast <4 x float> to <4 x i32> diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -254,18 +254,18 @@ ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: 
vmovd %xmm0, %edi ; AVX2-NEXT: vpextrd $1, %xmm0, %esi @@ -277,18 +277,18 @@ ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX512VL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512VL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, %edi ; AVX512VL-NEXT: vpextrd $1, %xmm0, %esi @@ -300,18 +300,18 @@ ; AVX512DQVL: # %bb.0: # %entry ; AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512DQVL-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX512DQVL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX512DQVL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] -; AVX512DQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; AVX512DQVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX512DQVL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512DQVL-NEXT: vmovd %xmm0, %edi ; AVX512DQVL-NEXT: vpextrd $1, %xmm0, %esi @@ -597,7 +597,7 @@ ; AVX512VL-NEXT: .p2align 4, 0x90 ; AVX512VL-NEXT: .LBB8_1: # %loop ; AVX512VL-NEXT: # =>This 
Inner Loop Header: Depth=1 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512VL-NEXT: vpmovsxdq 2097152(%rdi,%rax), %zmm2 ; AVX512VL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2 ; AVX512VL-NEXT: vpsrlq $32, %zmm2, %zmm2 ; AVX512VL-NEXT: vpmovqd %zmm2, %ymm2 @@ -616,7 +616,7 @@ ; AVX512DQVL-NEXT: .p2align 4, 0x90 ; AVX512DQVL-NEXT: .LBB8_1: # %loop ; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512DQVL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512DQVL-NEXT: vpmovsxdq 2097152(%rdi,%rax), %zmm2 ; AVX512DQVL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2 ; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2 ; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll --- a/llvm/test/CodeGen/X86/combine-rotates.ll +++ b/llvm/test/CodeGen/X86/combine-rotates.ll @@ -163,8 +163,10 @@ ; ; AVX512-LABEL: combine_vec_rot_select_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vptestnmd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm2 {%k1} +; AVX512-NEXT: vmovdqa %xmm2, %xmm0 ; AVX512-NEXT: retq %3 = and <4 x i32> %1, %4 = shl <4 x i32> %0, %3 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -1031,19 +1031,19 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrld $28, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld $29, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrld $30, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: psrld $30, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: psrad $4, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm3 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE2-NEXT: psrad $2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1115,37 +1115,37 @@ ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrld $28, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld $29, %xmm4 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE2-NEXT: psrld $30, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm0 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1] +; SSE2-NEXT: psrld $30, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3] +; SSE2-NEXT: paddd %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: psrad $4, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,1] ; 
SSE2-NEXT: psrad $3, %xmm4 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE2-NEXT: psrad $2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: psrld $28, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrld $29, %xmm4 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE2-NEXT: psrld $30, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm2 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE2-NEXT: psrld $30, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3] +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: psrad $4, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm4 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] ; SSE2-NEXT: psrad $2, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -1258,73 +1258,73 @@ ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: psrld $28, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: psrld $30, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm0 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm0[0,3] +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: psrad $4, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE2-NEXT: psrad $2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: psrld $28, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: psrld $30, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm1[0,3] +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: psrad $4, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE2-NEXT: psrad $2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] ; 
SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: psrld $28, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: psrld $30, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm4 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm4[0,3] +; SSE2-NEXT: paddd %xmm2, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: psrad $4, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] ; SSE2-NEXT: psrad $2, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: psrld $28, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: psrld $29, %xmm6 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE2-NEXT: psrld $30, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] -; SSE2-NEXT: paddd %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psrld $28, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm2 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm5[1] +; SSE2-NEXT: psrld $30, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm2[0,3] +; SSE2-NEXT: paddd %xmm3, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm2 ; SSE2-NEXT: psrad $4, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] ; SSE2-NEXT: psrad $2, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] ; SSE2-NEXT: movaps %xmm4, %xmm2 ; SSE2-NEXT: movaps %xmm5, %xmm3 @@ -1988,25 +1988,25 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrld $28, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrld $29, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrld $30, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $4, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1] +; SSE2-NEXT: psrld $29, %xmm1 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: psrld $30, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1] ; SSE2-NEXT: psrad $3, %xmm3 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: psrad $2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] 
-; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: psubd %xmm1, %xmm2 +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE2-NEXT: psrad $2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubd %xmm2, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: @@ -3055,7 +3055,8 @@ ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15] +; XOP-NEXT: vpsrlw $8, %xmm2, %xmm2 +; XOP-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -3159,7 +3160,7 @@ ; CHECK-NEXT: testw %di, %di ; CHECK-NEXT: cmovnsl %edi, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: sarl $8, %eax +; CHECK-NEXT: shrl $8, %eax ; CHECK-NEXT: negl %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -711,13 +711,10 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) { ; SSE2-LABEL: combine_vec_shl_mul0: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [20,20,20,20] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pslld $2, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pslld $2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_mul0: diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -225,7 +225,8 @@ ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -247,23 +248,33 @@ define <16 x i8> @combine_vec_ashr_trunc_lshr_splat(<16 x i32> %x) { ; SSE-LABEL: combine_vec_ashr_trunc_lshr_splat: ; SSE: # %bb.0: -; SSE-NEXT: psrad $26, %xmm3 -; SSE-NEXT: psrad $26, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: psrad $26, %xmm1 -; SSE-NEXT: psrad $26, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 +; SSE-NEXT: psrld $24, %xmm1 +; SSE-NEXT: psrld $24, %xmm0 +; SSE-NEXT: packusdw %xmm1, %xmm0 +; SSE-NEXT: psrld $24, %xmm3 +; SSE-NEXT: psrld $24, %xmm2 +; SSE-NEXT: packusdw %xmm3, %xmm2 +; 
SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: psrlw $2, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_ashr_trunc_lshr_splat: ; AVX: # %bb.0: -; AVX-NEXT: vpsrad $26, %ymm1, %ymm1 -; AVX-NEXT: vpsrad $26, %ymm0, %ymm0 -; AVX-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrld $24, %ymm1, %ymm1 +; AVX-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %1 = lshr <16 x i32> %x, @@ -297,7 +308,8 @@ ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -319,16 +331,18 @@ define <8 x i16> @combine_vec_ashr_trunc_ashr_splat(<8 x i32> %x) { ; SSE-LABEL: combine_vec_ashr_trunc_ashr_splat: ; SSE: # %bb.0: -; SSE-NEXT: psrad $19, %xmm1 -; SSE-NEXT: psrad $19, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: packusdw %xmm1, %xmm0 +; SSE-NEXT: psraw $3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_ashr_trunc_ashr_splat: ; AVX: # %bb.0: -; AVX-NEXT: vpsrad $19, %ymm0, %ymm0 +; AVX-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %1 = ashr <8 x i32> %x, diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll --- a/llvm/test/CodeGen/X86/combine-srem.ll +++ b/llvm/test/CodeGen/X86/combine-srem.ll @@ -494,7 +494,7 @@ ; CHECK-NEXT: leal 15(%rax), %ecx ; CHECK-NEXT: testw %ax, %ax ; CHECK-NEXT: cmovnsl %edi, %ecx -; CHECK-NEXT: andl $-16, %ecx +; CHECK-NEXT: andl $65520, %ecx # imm = 0xFFF0 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $rax ; CHECK-NEXT: retq @@ -509,7 +509,7 @@ ; CHECK-NEXT: leal 255(%rax), %ecx ; CHECK-NEXT: testw %ax, %ax ; CHECK-NEXT: cmovnsl %edi, %ecx -; CHECK-NEXT: andl $-256, %ecx +; CHECK-NEXT: andl $65280, %ecx # imm = 0xFF00 ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll --- a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll +++ b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll @@ -139,18 +139,18 @@ ; SSE-LABEL: demandedelts_pblendvb: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: pblendvb %xmm0, %xmm1, %xmm3 -; SSE-NEXT: 
pxor %xmm0, %xmm0 -; SSE-NEXT: pshufb %xmm0, %xmm3 +; SSE-NEXT: pshufb %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: demandedelts_pblendvb: ; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer %2 = shufflevector <16 x i8> %a1, <16 x i8> undef, <16 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll --- a/llvm/test/CodeGen/X86/combine-sub-usat.ll +++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll @@ -212,17 +212,17 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] ; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm6 +; SSE2-NEXT: pandn %xmm4, %xmm6 ; SSE2-NEXT: por %xmm2, %xmm6 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: packssdw %xmm6, %xmm5 diff --git a/llvm/test/CodeGen/X86/combine-sub.ll b/llvm/test/CodeGen/X86/combine-sub.ll --- a/llvm/test/CodeGen/X86/combine-sub.ll +++ b/llvm/test/CodeGen/X86/combine-sub.ll @@ -286,10 +286,10 @@ ; SSE-NEXT: movdqu (%rdi), %xmm0 ; SSE-NEXT: movdqu 16(%rdi), %xmm1 ; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: psubd %xmm2, %xmm0 -; SSE-NEXT: movdqu %xmm0, (%rdi) +; SSE-NEXT: psubd %xmm2, %xmm1 ; SSE-NEXT: movdqu %xmm1, 16(%rdi) +; SSE-NEXT: movdqu %xmm0, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: PR52032_oneuse_constant: @@ -317,14 +317,14 @@ ; SSE-NEXT: movdqu 16(%rdi), %xmm2 ; SSE-NEXT: movdqu 32(%rdi), %xmm3 ; SSE-NEXT: movdqu 48(%rdi), %xmm4 -; SSE-NEXT: psubd %xmm0, %xmm2 ; SSE-NEXT: psubd %xmm0, %xmm1 -; SSE-NEXT: movdqu %xmm1, (%rdi) +; SSE-NEXT: psubd %xmm0, %xmm2 ; SSE-NEXT: movdqu %xmm2, 16(%rdi) -; SSE-NEXT: psubd %xmm0, %xmm4 +; SSE-NEXT: movdqu %xmm1, (%rdi) ; SSE-NEXT: psubd %xmm0, %xmm3 -; SSE-NEXT: movdqu %xmm3, 32(%rdi) +; SSE-NEXT: psubd %xmm0, %xmm4 ; SSE-NEXT: movdqu %xmm4, 48(%rdi) +; SSE-NEXT: movdqu %xmm3, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: PR52032: diff --git a/llvm/test/CodeGen/X86/commute-blend-sse41.ll b/llvm/test/CodeGen/X86/commute-blend-sse41.ll --- a/llvm/test/CodeGen/X86/commute-blend-sse41.ll +++ b/llvm/test/CodeGen/X86/commute-blend-sse41.ll @@ -54,11 +54,11 @@ define void @baz(ptr %arg, ptr %arg1) optsize { ; CHECK-LABEL: baz: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movaps (%rdi), %xmm0 -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [3,3] -; CHECK-NEXT: andps %xmm0, %xmm1 -; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; CHECK-NEXT: movups %xmm1, (%rsi) +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: movq 8(%rdi), %rcx +; CHECK-NEXT: movq %rax, (%rsi) +; CHECK-NEXT: andl $3, %ecx +; CHECK-NEXT: movq %rcx, 8(%rsi) ; CHECK-NEXT: retq bb: %tmp = load <2 x i64>, ptr %arg, align 16 diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll 
--- a/llvm/test/CodeGen/X86/conditional-tailcall.ll +++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll @@ -476,8 +476,8 @@ ; WIN64-NEXT: # %bb.5: # %sw.bb ; WIN64-NEXT: # in Loop: Header=BB3_1 Depth=1 ; WIN64-NEXT: movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09] -; WIN64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] ; WIN64-NEXT: movl $1, %r8d # encoding: [0x41,0xb8,0x01,0x00,0x00,0x00] +; WIN64-NEXT: cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b] ; WIN64-NEXT: je .LBB3_10 # encoding: [0x74,A] ; WIN64-NEXT: # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1 ; WIN64-NEXT: # %bb.6: # %sw.bb diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll --- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll @@ -1595,10 +1595,10 @@ define i64 @test_i64_140737488289792_mask_lshr_15(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_lshr_15: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $16, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $17, %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1615,10 +1615,11 @@ define i64 @test_i64_140737488289792_mask_lshr_16(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_lshr_16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $32767, %eax # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $16, %ecx, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1769,10 +1770,10 @@ define i64 @test_i64_140737488289792_mask_ashr_15(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_ashr_15: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $16, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $17, %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1789,10 +1790,11 @@ define i64 @test_i64_140737488289792_mask_ashr_16(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_ashr_16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $32767, %eax # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shldl $16, %ecx, %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: retl ; @@ -1996,12 +1998,13 @@ define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_shl_15: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shll $16, %ecx ; X86-NEXT: movl $32767, %edx # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %edx -; X86-NEXT: shldl $15, %eax, %edx -; X86-NEXT: andl $65536, %eax # imm = 0x10000 -; X86-NEXT: shll $15, %eax +; X86-NEXT: shldl $15, %ecx, %edx +; X86-NEXT: shll $31, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_i64_140737488289792_mask_shl_15: @@ -2017,7 +2020,8 @@ define i64 @test_i64_140737488289792_mask_shl_16(i64 %a0) { ; X86-LABEL: test_i64_140737488289792_mask_shl_16: ; X86: # %bb.0: -; X86-NEXT: 
movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $16, %eax ; X86-NEXT: movl $32767, %edx # imm = 0x7FFF ; X86-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NEXT: shldl $16, %eax, %edx diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -17,7 +17,8 @@ ; X64-NEXT: imull %ecx, %esi ; X64-NEXT: addl %edx, %esi ; X64-NEXT: movslq %esi, %rax -; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: retq entry: %tmp7 = mul i32 %idxY, %ref_frame_stride ; [#uses=2] @@ -70,13 +71,13 @@ ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll --- a/llvm/test/CodeGen/X86/dagcombine-select.ll +++ b/llvm/test/CodeGen/X86/dagcombine-select.ll @@ -7,7 +7,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: cmovgel %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: andl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -20,7 +22,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: cmovgel %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: andl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -33,7 +37,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: cmovll %esi, %eax +; CHECK-NEXT: setl %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: andl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 -1, i32 0 @@ -61,9 +67,11 @@ define i32 @select_or1(i32 %x, i32 %y) { ; CHECK-LABEL: select_or1: ; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovll %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -74,9 +82,11 @@ define i32 @select_or2(i32 %x, i32 %y) { ; CHECK-LABEL: select_or2: ; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovll %esi, %eax +; CHECK-NEXT: setge %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 0, i32 -1 @@ -87,9 +97,11 @@ define i32 @select_or3(i32 %x, i32 %y) { ; CHECK-LABEL: select_or3: ; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $11, %edi -; CHECK-NEXT: movl $-1, %eax -; CHECK-NEXT: cmovgel %esi, %eax +; CHECK-NEXT: setl %al +; CHECK-NEXT: negl %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: retq %c = icmp slt i32 %x, 11 %s = select i1 %c, i32 -1, i32 0 @@ -180,10 +192,9 @@ define i32 @sel_constants_shl_constant(i1 %cond) { ; CHECK-LABEL: 
sel_constants_shl_constant: ; CHECK: # %bb.0: -; CHECK-NEXT: notb %dil -; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: orl $2, %eax +; CHECK-NEXT: xorl $3, %eax ; CHECK-NEXT: shll $8, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 @@ -194,10 +205,12 @@ define i32 @shl_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: shl_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: notb %dil -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: leal 4(,%rax,4), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = shl i32 1, %sel @@ -207,10 +220,12 @@ define i32 @shl_constant_sel_setcc(i32 %a) { ; CHECK-LABEL: shl_constant_sel_setcc: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: leal 4(,%rax,4), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: retq %m = and i32 %a, 1 %cond = icmp ne i32 %m, 0 @@ -222,9 +237,12 @@ define i32 @lshr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: lshr_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: leal 8(,%rdi,8), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $64, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = lshr i32 64, %sel @@ -234,9 +252,12 @@ define i32 @lshr_constant_sel_setcc(i32 %a) { ; CHECK-LABEL: lshr_constant_sel_setcc: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: leal 8(,%rdi,8), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $64, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %m = and i32 %a, 1 %cond = icmp ne i32 %m, 0 @@ -248,10 +269,12 @@ define i32 @ashr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: ashr_constant_sel_constants: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: shll $4, %edi -; CHECK-NEXT: leal 16(%rdi), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $128, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %sel = select i1 %cond, i32 2, i32 3 %bo = ashr i32 128, %sel @@ -261,10 +284,12 @@ define i32 @ashr_constant_sel_setcc(i32 %a) { ; CHECK-LABEL: ashr_constant_sel_setcc: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: shll $4, %edi -; CHECK-NEXT: leal 16(%rdi), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: andb $1, %cl +; CHECK-NEXT: xorb $3, %cl +; CHECK-NEXT: movl $128, %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax ; CHECK-NEXT: retq %m = and i32 %a, 1 %cond = icmp ne i32 %m, 0 diff --git 
a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -589,8 +589,8 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl %ebx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %esi, 8(%eax) ; X86-NEXT: movl %edi, 12(%eax) ; X86-NEXT: addl $156, %esp @@ -1033,35 +1033,35 @@ ; X86-NEXT: movd %xmm2, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: movd %xmm2, %eax -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-NEXT: movd %xmm2, %esi +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-NEXT: movd %xmm3, %eax +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-NEXT: movd %xmm3, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X86-NEXT: movd %xmm0, %eax ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X86-NEXT: movd %xmm4, %eax -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X86-NEXT: movd %xmm4, %esi +; X86-NEXT: movd %eax, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X86-NEXT: movd %xmm5, %eax +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X86-NEXT: movd %xmm5, %esi ; X86-NEXT: cltd ; X86-NEXT: idivl %esi -; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; X86-NEXT: movdqa %xmm3, (%ecx) -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X86-NEXT: movd %eax, %xmm5 +; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X86-NEXT: movdqa %xmm4, (%ecx) +; X86-NEXT: pmuludq %xmm1, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq %xmm2, %xmm1 +; X86-NEXT: pmuludq %xmm5, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; X86-NEXT: psubd %xmm3, %xmm0 @@ -1089,25 +1089,25 @@ ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: cltd ; X64-NEXT: idivl %ecx -; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm4, %eax -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: movd %eax, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X64-NEXT: movd %xmm5, %eax +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X64-NEXT: movd %xmm5, %ecx ; X64-NEXT: cltd ; X64-NEXT: idivl %ecx -; X64-NEXT: movd %eax, %xmm4 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-NEXT: movdqa %xmm2, (%rdi) -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[0,2,2,3] +; X64-NEXT: movd %eax, %xmm5 +; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X64-NEXT: movdqa %xmm4, (%rdi) +; X64-NEXT: pmuludq %xmm1, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-NEXT: pmuludq %xmm3, %xmm1 +; X64-NEXT: pmuludq %xmm5, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: psubd %xmm2, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X64-NEXT: psubd %xmm3, %xmm0 ; X64-NEXT: retq %div = sdiv <4 x i32> %x, %y store <4 x i32> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -540,8 +540,8 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: movl %edi, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: addl $132, %esp @@ -984,35 +984,35 @@ ; X86-NEXT: movd %xmm2, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-NEXT: movd %xmm2, %eax -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-NEXT: movd %xmm2, %esi +; X86-NEXT: movd %eax, %xmm2 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-NEXT: movd %xmm3, %eax +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-NEXT: movd %xmm3, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X86-NEXT: movd %eax, %xmm3 +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; X86-NEXT: movd %xmm0, %eax ; X86-NEXT: movd %xmm1, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X86-NEXT: movd %xmm4, %eax -; X86-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X86-NEXT: movd %xmm4, %esi +; X86-NEXT: movd %eax, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X86-NEXT: movd %xmm5, %eax +; X86-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X86-NEXT: movd %xmm5, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: divl %esi -; X86-NEXT: movd %eax, %xmm4 -; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; X86-NEXT: movdqa %xmm3, (%ecx) -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; X86-NEXT: movd %eax, %xmm5 +; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X86-NEXT: movdqa %xmm4, (%ecx) +; X86-NEXT: pmuludq %xmm1, %xmm4 +; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq %xmm2, %xmm1 +; X86-NEXT: pmuludq %xmm5, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; 
X86-NEXT: psubd %xmm3, %xmm0 @@ -1040,25 +1040,25 @@ ; X64-NEXT: movd %xmm1, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %ecx -; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; X64-NEXT: movd %xmm4, %eax -; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; X64-NEXT: movd %xmm4, %ecx +; X64-NEXT: movd %eax, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1] +; X64-NEXT: movd %xmm5, %eax +; X64-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; X64-NEXT: movd %xmm5, %ecx ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: divl %ecx -; X64-NEXT: movd %eax, %xmm4 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X64-NEXT: movdqa %xmm2, (%rdi) -; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; X64-NEXT: movd %eax, %xmm5 +; X64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; X64-NEXT: movdqa %xmm4, (%rdi) +; X64-NEXT: pmuludq %xmm1, %xmm4 +; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] +; X64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-NEXT: pmuludq %xmm3, %xmm1 +; X64-NEXT: pmuludq %xmm5, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: psubd %xmm2, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X64-NEXT: psubd %xmm3, %xmm0 ; X64-NEXT: retq %div = udiv <4 x i32> %x, %y store <4 x i32> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -320,7 +320,10 @@ ; X64-FAST-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493 ; X64-FAST-NEXT: movq %rdi, %rax ; X64-FAST-NEXT: mulq %rcx -; X64-FAST-NEXT: movq %rdx, %rax +; X64-FAST-NEXT: subq %rdx, %rdi +; X64-FAST-NEXT: shrq %rdi +; X64-FAST-NEXT: leaq (%rdi,%rdx), %rax +; X64-FAST-NEXT: shrq $2, %rax ; X64-FAST-NEXT: retq ; ; X64-SLOW-LABEL: PR23590: @@ -329,10 +332,14 @@ ; X64-SLOW-NEXT: movq %rdi, %rax ; X64-SLOW-NEXT: mulq %rcx ; X64-SLOW-NEXT: shrq $12, %rdx -; X64-SLOW-NEXT: imulq $12345, %rdx, %rax # imm = 0x3039 -; X64-SLOW-NEXT: subq %rax, %rdi +; X64-SLOW-NEXT: imull $12345, %edx, %eax # imm = 0x3039 +; X64-SLOW-NEXT: subl %eax, %edi ; X64-SLOW-NEXT: imulq $613566757, %rdi, %rax # imm = 0x24924925 ; X64-SLOW-NEXT: shrq $32, %rax +; X64-SLOW-NEXT: subl %eax, %edi +; X64-SLOW-NEXT: shrl %edi +; X64-SLOW-NEXT: addl %edi, %eax +; X64-SLOW-NEXT: shrl $2, %eax ; X64-SLOW-NEXT: retq entry: %rem = urem i64 %x, 12345 diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -19,8 +19,8 @@ ; WIN64-NEXT: subq $72, %rsp ; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __modti3 @@ -49,8 +49,8 @@ ; WIN64-NEXT: subq $72, %rsp ; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; 
WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $3, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __divti3 @@ -79,8 +79,8 @@ ; WIN64-NEXT: subq $72, %rsp ; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp) ; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) +; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; WIN64-NEXT: callq __umodti3 @@ -969,8 +969,8 @@ ; WIN64-NEXT: movq %rdx, 8(%rax) ; WIN64-NEXT: movq %rcx, (%rax) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: andq $0, 8(%rdx) +; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: movq %rax, %rcx ; WIN64-NEXT: callq __umodti3 ; WIN64-NEXT: movq %xmm0, %rax @@ -1001,8 +1001,8 @@ ; WIN64-NEXT: movq %rdx, 8(%rax) ; WIN64-NEXT: movq %rcx, (%rax) ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: movq $0, 8(%rdx) +; WIN64-NEXT: movq $3, (%rdx) ; WIN64-NEXT: movq %rax, %rcx ; WIN64-NEXT: callq __umodti3 ; WIN64-NEXT: movq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll b/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll --- a/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll +++ b/llvm/test/CodeGen/X86/dont-trunc-store-double-to-float.ll @@ -10,10 +10,13 @@ ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: .cfi_def_cfa_register %ebp ; CHECK-NEXT: andl $-8, %esp -; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: subl $24, %esp ; CHECK-NEXT: movl $1074339512, {{[0-9]+}}(%esp) # imm = 0x40091EB8 -; CHECK-NEXT: movl $1374389535, (%esp) # imm = 0x51EB851F -; CHECK-NEXT: movl $1078523331, {{[0-9]+}}(%esp) # imm = 0x4048F5C3 +; CHECK-NEXT: movl $1374389535, {{[0-9]+}}(%esp) # imm = 0x51EB851F +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -6,13 +6,18 @@ define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: no_dpbusd: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -22,13 +27,16 @@ ; ; AVX512-LABEL: no_dpbusd: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -49,44 +57,41 @@ define i32 @vpdpbusd_mutate(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_mutate: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovdqa (%rsi), %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rdi), %xmm0, %xmm1 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpmovsxbd 8(%rdi), %ymm0 +; AVXVNNI-NEXT: vpmovsxbd (%rdi), %ymm1 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_mutate: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VNNI-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_mutate: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd (%rdi), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_mutate: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovsxbd (%rdi), %zmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 %1 = sext <16 x i8> %0 to <16 x i32> @@ -109,9 +114,9 @@ ; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -128,9 +133,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -163,9 +168,9 @@ ; AVXVNNI-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -182,9 +187,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -210,44 +215,41 @@ define i32 @vpdpbusd_512(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_512: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rsi), %xmm0, %xmm1 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd 8(%rsi), %ymm2 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm2 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_512: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VNNI-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm1, %zmm0, %zmm2 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, 
%xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_512: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd (%rsi), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_512: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %zmm1 +; AVX512-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 %1 = zext <16 x i8> %0 to <16 x i32> @@ -264,40 +266,35 @@ define i32 @vpdpbusd_256(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_256: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm1, %xmm2 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm1 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_256: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; 
AVX512VLVNNI-LABEL: vpdpbusd_256: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_256: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %ymm1 +; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <8 x i8>, ptr %a, align 8 %1 = zext <8 x i8> %0 to <8 x i32> @@ -314,42 +311,29 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_128: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 -; AVXVNNI-NEXT: vmovd %xmm2, %eax +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %xmm1 +; AVXVNNI-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_128: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vmovd %xmm2, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_128: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: 
vpdpbusd_128: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %xmm1 +; AVX512-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: retq entry: %0 = load <4 x i8>, ptr %a, align 8 %1 = zext <4 x i8> %0 to <4 x i32> @@ -367,40 +351,28 @@ ; AVXVNNI-LABEL: vpdpbusd_2xi32: ; AVXVNNI: # %bb.0: # %entry ; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 -; AVXVNNI-NEXT: vmovd %xmm2, %eax +; AVXVNNI-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVXVNNI-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edx, %eax ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_2xi32: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] -; AVX512VNNI-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512VNNI-NEXT: vpandq %zmm1, %zmm2, %zmm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vmovd %xmm2, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_2xi32: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VLVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_2xi32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NEXT: vpmovsxbd %xmm1, %xmm1 +; AVX512-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: retq entry: %0 = load <2 x i8>, ptr %a, align 8 %1 = zext <2 x i8> %0 to <2 x i32> @@ -417,13 +389,25 @@ define i32 @vpdpbusd_32xi32(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_32xi32: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rsi), %ymm0, %ymm1 -; AVXVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; 
AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd 16(%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm4, %ymm0 +; AVXVNNI-NEXT: vpmovsxbd 24(%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm4, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd 8(%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm2 +; AVXVNNI-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -431,38 +415,27 @@ ; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: vpdpbusd_32xi32: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VNNI-NEXT: vmovdqu (%rsi), %ymm1 -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm1, %zmm0, %zmm2 -; AVX512VNNI-NEXT: vextracti128 $1, %ymm2, %xmm0 -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edx, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: vpdpbusd_32xi32: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd (%rsi), %ymm0, %ymm1 -; AVX512VLVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edx, %eax -; AVX512VLVNNI-NEXT: vzeroupper -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: vpdpbusd_32xi32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovsxbd (%rsi), %zmm2 +; AVX512-NEXT: vpmulld %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vpmovsxbd 16(%rsi), %zmm2 +; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edx, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = load <32 x i8>, ptr %a, align 16 %1 = zext <32 x i8> %0 to <32 x i32> @@ -479,17 +452,41 @@ define i32 @vpdpbusd_64xi32(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVXVNNI-LABEL: vpdpbusd_64xi32: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovdqu (%rdi), %ymm0 -; AVXVNNI-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVXVNNI-NEXT: {vex} vpdpbusd 32(%rsi), %ymm1, %ymm3 -; AVXVNNI-NEXT: {vex} vpdpbusd (%rsi), %ymm0, %ymm2 -; AVXVNNI-NEXT: vpaddd %ymm3, %ymm2, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: 
vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovsxbd 40(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm8, %ymm0 +; AVXVNNI-NEXT: vpmovsxbd 56(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm8, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd 32(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm8, %ymm2 +; AVXVNNI-NEXT: vpmovsxbd 48(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm3, %ymm8, %ymm3 +; AVXVNNI-NEXT: vpmovsxbd 16(%rsi), %ymm8 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm8, %ymm4 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm4, %ymm3 +; AVXVNNI-NEXT: vpmovsxbd (%rsi), %ymm4 +; AVXVNNI-NEXT: vpmaddwd %ymm5, %ymm4, %ymm4 +; AVXVNNI-NEXT: vpaddd %ymm2, %ymm4, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm2, %ymm2 +; AVXVNNI-NEXT: vpmovsxbd 24(%rsi), %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm6, %ymm3, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; AVXVNNI-NEXT: vpmovsxbd 8(%rsi), %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm7, %ymm3, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm3, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -499,15 +496,27 @@ ; ; AVX512-LABEL: vpdpbusd_64xi32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpdpbusd (%rsi), %zmm0, %zmm1 -; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovsxbd 16(%rsi), %zmm4 +; AVX512-NEXT: vpmulld 
%zmm0, %zmm4, %zmm0 +; AVX512-NEXT: vpmovsxbd 48(%rsi), %zmm4 +; AVX512-NEXT: vpmulld %zmm1, %zmm4, %zmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpmovsxbd (%rsi), %zmm1 +; AVX512-NEXT: vpmulld %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vpmovsxbd 32(%rsi), %zmm2 +; AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -526,3 +535,6 @@ } declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512VLVNNI: {{.*}} +; AVX512VNNI: {{.*}} diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -24,35 +24,17 @@ } define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq -; -; AVX512VNNI-LABEL: mul_4xi8_zc: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512VNNI-NEXT: vmovd %xmm1, %eax -; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: mul_4xi8_zc: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax -; AVX512VLVNNI-NEXT: addl %edi, %eax -; AVX512VLVNNI-NEXT: retq +; ALL-LABEL: mul_4xi8_zc: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: addl %edi, %eax +; ALL-NEXT: retq entry: %0 = zext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> %0, @@ -64,35 +46,39 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) { ; AVXVNNI-LABEL: mul_4xi4_cz: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: 
{vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax +; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] +; AVXVNNI-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edi, %eax ; AVXVNNI-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi4_cz: ; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512VNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512VNNI-NEXT: vmovd %xmm1, %eax +; AVX512VNNI-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] +; AVX512VNNI-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vmovd %xmm0, %eax ; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper ; AVX512VNNI-NEXT: retq ; ; AVX512VLVNNI-LABEL: mul_4xi4_cz: ; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpmovdb %xmm0, %xmm0 ; AVX512VLVNNI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax +; AVX512VLVNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax ; AVX512VLVNNI-NEXT: addl %edi, %eax ; AVX512VLVNNI-NEXT: retq entry: @@ -104,38 +90,17 @@ } define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_cs: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq -; -; AVX512VNNI-LABEL: mul_4xi8_cs: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vmovd %xmm2, %eax -; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: mul_4xi8_cs: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vmovdqa {{.*#+}} xmm1 = 
[0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 -; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax -; AVX512VLVNNI-NEXT: addl %edi, %eax -; AVX512VLVNNI-NEXT: retq +; ALL-LABEL: mul_4xi8_cs: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vpmovsxbd %xmm0, %xmm0 +; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: addl %edi, %eax +; ALL-NEXT: retq entry: %0 = sext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> , %0 @@ -167,41 +132,41 @@ define i32 @mul_16xi8_zc(<16 x i8> %a, i32 %c) { ; AVXVNNI-LABEL: mul_16xi8_zc: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,64,0,0,0,1,0,2,0,64,0] +; AVXVNNI-NEXT: # ymm2 = mem[0,1,0,1] +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edi, %eax +; AVXVNNI-NEXT: vzeroupper ; AVXVNNI-NEXT: retq ; -; AVX512VNNI-LABEL: mul_16xi8_zc: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vmovdqa %xmm0, %xmm0 -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VNNI-NEXT: vmovd %xmm0, %eax -; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: mul_16xi8_zc: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax -; AVX512VLVNNI-NEXT: addl %edi, %eax -; AVX512VLVNNI-NEXT: retq +; AVX512-LABEL: mul_16xi8_zc: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: addl %edi, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %0 = zext <16 x i8> %a to <16 x i32> %1 = mul nsw <16 x i32> %0, @@ -213,12 +178,26 @@ define i32 @mul_32xi8_zc(<32 x i8> %a, i32 %c) { ; AVXVNNI-LABEL: mul_32xi8_zc: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVXVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,1,0,2,0,64,0,0,0,1,0,2,0,64,0] +; AVXVNNI-NEXT: # ymm4 = mem[0,1,0,1] +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm1, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm2, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -232,9 +211,9 @@ ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512VNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VNNI-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; 
AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512VNNI-NEXT: vmovd %xmm0, %eax @@ -247,9 +226,9 @@ ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX512VLVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VLVNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax @@ -267,16 +246,41 @@ define i32 @mul_64xi8_zc(<64 x i8> %a, i32 %c) { ; AVXVNNI-LABEL: mul_64xi8_zc: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] -; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVXVNNI-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 -; AVXVNNI-NEXT: vpaddd %ymm4, %ymm3, %ymm0 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero +; AVXVNNI-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVXVNNI-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,1,0,2,0,64,0,0,0,1,0,2,0,64,0] +; AVXVNNI-NEXT: # ymm8 = 
mem[0,1,0,1] +; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm8, %ymm0 +; AVXVNNI-NEXT: vpmaddwd %ymm5, %ymm8, %ymm5 +; AVXVNNI-NEXT: vpmaddwd %ymm7, %ymm8, %ymm7 +; AVXVNNI-NEXT: vpmaddwd %ymm6, %ymm8, %ymm6 +; AVXVNNI-NEXT: vpmaddwd %ymm3, %ymm8, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm6, %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm1, %ymm8, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm7, %ymm1 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVXVNNI-NEXT: vpmaddwd %ymm4, %ymm8, %ymm3 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm5, %ymm3 +; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm8, %ymm2 +; AVXVNNI-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm3, %ymm0, %ymm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVXVNNI-NEXT: vmovd %xmm0, %eax @@ -291,9 +295,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll --- a/llvm/test/CodeGen/X86/dpbusd_i4.ll +++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll @@ -6,15 +6,20 @@ define i32 @mul_i8i8(ptr%a, <16 x i8> %b, i32 %c) { ; CHECK-LABEL: mul_i8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 +; CHECK-NEXT: vpmulld %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %esi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 @@ -30,14 +35,20 @@ ; CHECK-LABEL: mul_i4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1 +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = zext <16 x i4> %a to <16 x i32> @@ -51,20 +62,23 @@ define i32 @mul_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-LABEL: mul_i4i4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpsllw $4, %xmm1, %xmm1 -; CHECK-NEXT: vpsrlw $4, %xmm1, %xmm1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; CHECK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 -; CHECK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpslld $28, %zmm1, %zmm1 +; CHECK-NEXT: vpsrad $28, %zmm1, %zmm1 +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = zext <16 x i4> %a to <16 x i32> @@ -78,17 +92,20 @@ define i32 @mul_sext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-LABEL: mul_sext_i4i4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; CHECK-NEXT: vpsllw $12, %ymm1, %ymm1 -; CHECK-NEXT: vpsraw $12, %ymm1, %ymm1 -; CHECK-NEXT: vpsllw $12, %ymm0, %ymm0 -; CHECK-NEXT: vpsraw $12, %ymm0, %ymm0 -; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpslld $28, %zmm0, %zmm0 +; CHECK-NEXT: vpsrad $28, %zmm0, %zmm0 +; CHECK-NEXT: vpslld $28, %zmm1, %zmm1 +; CHECK-NEXT: vpsrad $28, %zmm1, %zmm1 +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax @@ -108,16 +125,22 @@ ; CHECK-LABEL: mul_zext_i4i4: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpdpbusd %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = zext <16 x i4> %a to <16 x i32> diff --git a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll --- a/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll +++ b/llvm/test/CodeGen/X86/elementwise-store-of-scalar-splat.ll @@ -625,10 +625,10 @@ ; ALL: # %bb.0: ; ALL-NEXT: movq (%rdi), %rax ; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) +; ALL-NEXT: notq %rcx ; ALL-NEXT: movq %rcx, 8(%rsi) +; ALL-NEXT: movq %rax, (%rsi) ; ALL-NEXT: movq %rcx, 24(%rsi) ; ALL-NEXT: movq %rax, 16(%rsi) ; ALL-NEXT: retq @@ -1038,10 +1038,10 @@ ; ALL: # %bb.0: ; ALL-NEXT: movq (%rdi), %rax ; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) +; ALL-NEXT: notq %rcx ; ALL-NEXT: movq %rcx, 8(%rsi) +; ALL-NEXT: movq %rax, (%rsi) ; ALL-NEXT: movq %rcx, 24(%rsi) ; ALL-NEXT: movq %rax, 16(%rsi) ; ALL-NEXT: movq %rcx, 40(%rsi) @@ -1563,10 +1563,10 @@ ; ALL: # %bb.0: ; ALL-NEXT: movq (%rdi), %rax ; ALL-NEXT: movq 8(%rdi), %rcx -; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, (%rsi) +; ALL-NEXT: notq %rcx ; ALL-NEXT: movq %rcx, 8(%rsi) +; ALL-NEXT: movq %rax, (%rsi) ; ALL-NEXT: movq %rcx, 24(%rsi) ; ALL-NEXT: movq %rax, 16(%rsi) ; ALL-NEXT: movq %rcx, 40(%rsi) @@ -1590,22 +1590,22 @@ define void @vec512_i256(ptr %in.elt.ptr, ptr %out.vec.ptr) nounwind { ; ALL-LABEL: vec512_i256: ; ALL: # %bb.0: -; ALL-NEXT: movq 16(%rdi), %rax -; ALL-NEXT: movq 24(%rdi), %rcx +; ALL-NEXT: movq 24(%rdi), %rax +; ALL-NEXT: movq 16(%rdi), %rcx ; ALL-NEXT: movq (%rdi), %rdx ; ALL-NEXT: movq 8(%rdi), %rdi -; ALL-NEXT: notq %rdi ; ALL-NEXT: notq %rdx +; ALL-NEXT: notq %rdi ; ALL-NEXT: notq %rcx ; ALL-NEXT: notq %rax -; ALL-NEXT: movq %rax, 16(%rsi) -; ALL-NEXT: movq %rcx, 24(%rsi) -; ALL-NEXT: movq %rdx, (%rsi) +; ALL-NEXT: movq %rax, 24(%rsi) +; ALL-NEXT: movq %rcx, 16(%rsi) ; ALL-NEXT: movq %rdi, 8(%rsi) -; ALL-NEXT: movq %rax, 48(%rsi) -; ALL-NEXT: movq %rcx, 56(%rsi) -; ALL-NEXT: movq %rdx, 32(%rsi) +; ALL-NEXT: movq %rdx, (%rsi) +; ALL-NEXT: movq %rax, 56(%rsi) +; ALL-NEXT: movq %rcx, 48(%rsi) ; ALL-NEXT: movq %rdi, 40(%rsi) +; ALL-NEXT: movq %rdx, 32(%rsi) ; ALL-NEXT: retq %in.elt.not = load i256, ptr %in.elt.ptr, align 64 %in.elt = xor i256 %in.elt.not, -1 diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll --- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll @@ -72,39 +72,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd 
{{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %eax, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_sdiv_v4i32: @@ -215,39 +214,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %eax, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_udiv_v4i32: @@ -358,39 +356,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 
-; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %edx, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: cltd ; SSE-NEXT: idivl %ecx ; SSE-NEXT: movd %edx, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_srem_v4i32: @@ -501,39 +498,38 @@ ; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE-NEXT: psubd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; SSE-NEXT: movd %xmm2, %ecx -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; SSE-NEXT: movd %xmm2, %eax +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movd %xmm3, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE-NEXT: movd %xmm3, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %edx, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl %edx, %edx ; SSE-NEXT: divl %ecx -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movd %xmm1, %ecx +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; SSE-NEXT: movd %xmm2, %ecx ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: xorl 
%edx, %edx ; SSE-NEXT: divl %ecx ; SSE-NEXT: movd %edx, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, (%rdi) +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; ; AVX1-LABEL: vp_urem_v4i32: diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll --- a/llvm/test/CodeGen/X86/extract-bits.ll +++ b/llvm/test/CodeGen/X86/extract-bits.ll @@ -6214,13 +6214,13 @@ ; X64-NOBMI-LABEL: bextr64_32_c0: ; X64-NOBMI: # %bb.0: ; X64-NOBMI-NEXT: movq %rsi, %rcx +; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NOBMI-NEXT: shrq %cl, %rdi +; X64-NOBMI-NEXT: shrq %cl, %rax ; X64-NOBMI-NEXT: negb %dl -; X64-NOBMI-NEXT: movq $-1, %rax ; X64-NOBMI-NEXT: movl %edx, %ecx +; X64-NOBMI-NEXT: shlq %cl, %rax ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: andl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NOBMI-NEXT: retq ; @@ -6236,7 +6236,8 @@ ; X64-BMI2-LABEL: bextr64_32_c0: ; X64-BMI2: # %bb.0: ; X64-BMI2-NEXT: shrxq %rsi, %rdi, %rax -; X64-BMI2-NEXT: bzhil %edx, %eax, %eax +; X64-BMI2-NEXT: bzhiq %rdx, %rax, %rax +; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax ; X64-BMI2-NEXT: retq %shifted = lshr i64 %val, %numskipbits %numhighbits = sub i64 64, %numlowbits @@ -8130,22 +8131,22 @@ ; ; X64-NOBMI-LABEL: pr38938: ; X64-NOBMI: # %bb.0: -; X64-NOBMI-NEXT: movq (%rsi), %rax -; X64-NOBMI-NEXT: shrq $19, %rax -; X64-NOBMI-NEXT: andl $4092, %eax # imm = 0xFFC -; X64-NOBMI-NEXT: incl (%rdi,%rax) +; X64-NOBMI-NEXT: movl (%rsi), %eax +; X64-NOBMI-NEXT: shrl $21, %eax +; X64-NOBMI-NEXT: andl $1023, %eax # imm = 0x3FF +; X64-NOBMI-NEXT: incl (%rdi,%rax,4) ; X64-NOBMI-NEXT: retq ; ; X64-BMINOTBM-LABEL: pr38938: ; X64-BMINOTBM: # %bb.0: ; X64-BMINOTBM-NEXT: movl $2581, %eax # imm = 0xA15 -; X64-BMINOTBM-NEXT: bextrq %rax, (%rsi), %rax +; X64-BMINOTBM-NEXT: bextrl %eax, (%rsi), %eax ; X64-BMINOTBM-NEXT: incl (%rdi,%rax,4) ; X64-BMINOTBM-NEXT: retq ; ; X64-BMITBM-LABEL: pr38938: ; X64-BMITBM: # %bb.0: -; X64-BMITBM-NEXT: bextrq $2581, (%rsi), %rax # imm = 0xA15 +; X64-BMITBM-NEXT: bextrl $2581, (%rsi), %eax # imm = 0xA15 ; X64-BMITBM-NEXT: incl (%rdi,%rax,4) ; X64-BMITBM-NEXT: retq %tmp = load i64, ptr %a1, align 8 diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -9,22 +9,17 @@ ; SSE2-LABEL: foo: ; SSE2: # %bb.0: ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: shll $8, %ecx -; SSE2-NEXT: orl %eax, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl $65280, %eax # imm = 0xFF00 -; SSE2-NEXT: orl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: pinsrw $1, %eax, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: foo: ; SSE42: # %bb.0: ; SSE42-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = 
xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE42-NEXT: movl $255, %eax ; SSE42-NEXT: pinsrb $3, %eax, %xmm0 ; SSE42-NEXT: movd %xmm0, (%rdi) @@ -33,7 +28,7 @@ ; AVX-LABEL: foo: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: movl $255, %eax ; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, (%rdi) @@ -162,11 +157,32 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; SSE-NEXT: retq ; -; AVX-LABEL: cat_ext_straddle: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; AVX-NEXT: retq +; AVX1-LABEL: cat_ext_straddle: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: cat_ext_straddle: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vbroadcastsd (%rsi), %ymm1 +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: cat_ext_straddle: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vbroadcastsd (%rsi), %ymm1 +; AVX512F-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq %x = load <6 x i32>, ptr %px %y = load <6 x i32>, ptr %py %cat = shufflevector <6 x i32> %x, <6 x i32> %y, <12 x i32> diff --git a/llvm/test/CodeGen/X86/extract-fp.ll b/llvm/test/CodeGen/X86/extract-fp.ll --- a/llvm/test/CodeGen/X86/extract-fp.ll +++ b/llvm/test/CodeGen/X86/extract-fp.ll @@ -86,8 +86,8 @@ define float @ext_maxnum_v4f32(<4 x float> %x) nounwind { ; CHECK-LABEL: ext_maxnum_v4f32: ; CHECK: # %bb.0: +; CHECK-NEXT: maxps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: maxss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %v = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> ) %r = extractelement <4 x float> %v, i32 2 diff --git a/llvm/test/CodeGen/X86/extract-insert.ll b/llvm/test/CodeGen/X86/extract-insert.ll --- a/llvm/test/CodeGen/X86/extract-insert.ll +++ b/llvm/test/CodeGen/X86/extract-insert.ll @@ -32,8 +32,8 @@ define i8 @extractelt_bitcast_extra_use(i32 %x, ptr %p) nounwind { ; X86-LABEL: extractelt_bitcast_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/extract-lowbits.ll b/llvm/test/CodeGen/X86/extract-lowbits.ll --- a/llvm/test/CodeGen/X86/extract-lowbits.ll +++ b/llvm/test/CodeGen/X86/extract-lowbits.ll @@ -3031,23 +3031,25 @@ ; X64-NOBMI-LABEL: bzhi64_32_c0: ; X64-NOBMI: # %bb.0: ; X64-NOBMI-NEXT: movq %rsi, %rcx +; X64-NOBMI-NEXT: movq %rdi, %rax ; X64-NOBMI-NEXT: negb %cl -; X64-NOBMI-NEXT: movq $-1, %rax +; X64-NOBMI-NEXT: shlq %cl, %rax ; X64-NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NOBMI-NEXT: shrq %cl, %rax -; X64-NOBMI-NEXT: andl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NOBMI-NEXT: retq ; ; 
X64-BMI1-LABEL: bzhi64_32_c0: ; X64-BMI1: # %bb.0: ; X64-BMI1-NEXT: shll $8, %esi -; X64-BMI1-NEXT: bextrl %esi, %edi, %eax +; X64-BMI1-NEXT: bextrq %rsi, %rdi, %rax +; X64-BMI1-NEXT: # kill: def $eax killed $eax killed $rax ; X64-BMI1-NEXT: retq ; ; X64-BMI2-LABEL: bzhi64_32_c0: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: bzhil %esi, %edi, %eax +; X64-BMI2-NEXT: bzhiq %rsi, %rdi, %rax +; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax ; X64-BMI2-NEXT: retq %numhighbits = sub i64 64, %numlowbits %mask = lshr i64 -1, %numhighbits diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -571,16 +571,16 @@ define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { ; X64-LABEL: fmaxnum_v4f32: ; X64: # %bb.0: -; X64-NEXT: vmaxss %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X64-NEXT: vmaxps %xmm0, %xmm1, %xmm2 +; X64-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fmaxnum_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm2 +; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -594,9 +594,9 @@ define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { ; X64-LABEL: fmaxnum_v4f64: ; X64: # %bb.0: -; X64-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X64-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X64-NEXT: vmaxpd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -606,9 +606,9 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X86-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X86-NEXT: vmaxpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -623,16 +623,16 @@ define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind { ; X64-LABEL: fminnum_v4f32: ; X64: # %bb.0: -; X64-NEXT: vminss %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X64-NEXT: vminps %xmm0, %xmm1, %xmm2 +; X64-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fminnum_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vminss %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vminps %xmm0, %xmm1, %xmm2 +; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -646,9 +646,9 @@ define double @fminnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind { ; X64-LABEL: fminnum_v4f64: ; X64: # %bb.0: -; X64-NEXT: vminsd %xmm0, %xmm1, %xmm2 -; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X64-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X64-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -658,9 +658,9 @@ ; 
X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vminsd %xmm0, %xmm1, %xmm2 -; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X86-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm2 +; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -406,10 +406,10 @@ ; X32-SSE2-NEXT: andl $-32, %esp ; X32-SSE2-NEXT: subl $64, %esp ; X32-SSE2-NEXT: movdqa zero, %xmm0 -; X32-SSE2-NEXT: movaps n1+16, %xmm1 -; X32-SSE2-NEXT: movaps n1, %xmm2 -; X32-SSE2-NEXT: movaps %xmm2, zero -; X32-SSE2-NEXT: movaps %xmm1, zero+16 +; X32-SSE2-NEXT: movaps n1, %xmm1 +; X32-SSE2-NEXT: movaps n1+16, %xmm2 +; X32-SSE2-NEXT: movaps %xmm2, zero+16 +; X32-SSE2-NEXT: movaps %xmm1, zero ; X32-SSE2-NEXT: movaps {{.*#+}} xmm1 = [2,2,2,2] ; X32-SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X32-SSE2-NEXT: movaps %xmm1, (%esp) @@ -444,8 +444,8 @@ ; X64-SSSE3-NEXT: movq n1@GOTPCREL(%rip), %rax ; X64-SSSE3-NEXT: movaps (%rax), %xmm1 ; X64-SSSE3-NEXT: movaps 16(%rax), %xmm2 -; X64-SSSE3-NEXT: movaps %xmm1, zero(%rip) ; X64-SSSE3-NEXT: movaps %xmm2, zero+16(%rip) +; X64-SSSE3-NEXT: movaps %xmm1, zero(%rip) ; X64-SSSE3-NEXT: movaps {{.*#+}} xmm1 = [2,2,2,2] ; X64-SSSE3-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; X64-SSSE3-NEXT: movaps %xmm1, (%rsp) diff --git a/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/f16c-intrinsics-upgrade.ll @@ -92,7 +92,9 @@ ; X86-LABEL: test_x86_vcvtph2ps_128_scalar: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-NEXT: vmovsd (%eax), %xmm0 # encoding: [0xc5,0xfb,0x10,0x00] +; X86-NEXT: # xmm0 = mem[0],zero +; X86-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_128_scalar: @@ -103,7 +105,9 @@ ; X86-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar: ; X86-AVX512VL: # %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-AVX512VL-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] +; X86-AVX512VL-NEXT: # xmm0 = mem[0],zero +; X86-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar: @@ -122,7 +126,9 @@ ; X86-LABEL: test_x86_vcvtph2ps_128_scalar2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-NEXT: vmovsd (%eax), %xmm0 # encoding: [0xc5,0xfb,0x10,0x00] +; X86-NEXT: # xmm0 = mem[0],zero +; X86-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtph2ps_128_scalar2: @@ -133,7 +139,9 @@ ; X86-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar2: ; X86-AVX512VL: 
# %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00] +; X86-AVX512VL-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] +; X86-AVX512VL-NEXT: # xmm0 = mem[0],zero +; X86-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_scalar2: diff --git a/llvm/test/CodeGen/X86/fdiv.ll b/llvm/test/CodeGen/X86/fdiv.ll --- a/llvm/test/CodeGen/X86/fdiv.ll +++ b/llvm/test/CodeGen/X86/fdiv.ll @@ -85,11 +85,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, %xmm3 ; CHECK-NEXT: subss %xmm1, %xmm3 +; CHECK-NEXT: mulss %xmm2, %xmm3 ; CHECK-NEXT: subss %xmm0, %xmm1 -; CHECK-NEXT: mulss %xmm2, %xmm1 -; CHECK-NEXT: subss %xmm2, %xmm3 -; CHECK-NEXT: divss %xmm3, %xmm1 -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: addss %xmm2, %xmm1 +; CHECK-NEXT: divss %xmm1, %xmm3 +; CHECK-NEXT: movaps %xmm3, %xmm0 ; CHECK-NEXT: retq %sub1 = fsub fast float %a0, %a1 %mul2 = fmul fast float %sub1, %a2 diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll --- a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll +++ b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll @@ -68,14 +68,14 @@ define float @test_fneg_fma_subx_negy_negz_f32(float %w, float %x, float %y, float %z) { ; FMA3-LABEL: test_fneg_fma_subx_negy_negz_f32: ; FMA3: # %bb.0: # %entry -; FMA3-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; FMA3-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm3 +; FMA3-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; FMA3-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3 ; FMA3-NEXT: retq ; ; FMA4-LABEL: test_fneg_fma_subx_negy_negz_f32: ; FMA4: # %bb.0: # %entry -; FMA4-NEXT: vsubss %xmm1, %xmm0, %xmm0 -; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm3 +; FMA4-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm0 * xmm2) + xmm3 ; FMA4-NEXT: retq entry: %subx = fsub nsz float %w, %x diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -443,20 +443,20 @@ ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x54] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10] +; FMACALL32_BDVER2-NEXT: 
vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] ; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] @@ -756,43 +756,43 @@ ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x68] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x74] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x8c,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x98,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa4,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c] +; 
FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x1c,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] ; FMACALL32_BDVER2-NEXT: addl $284, %esp ## encoding: [0x81,0xc4,0x1c,0x01,0x00,0x00] @@ -1336,84 +1336,84 @@ ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xd0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xdb,0xac,0x24,0xdc,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x60] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x5c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x5c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x58] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x54] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x54] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xdb,0xac,0x24,0x48,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c] +; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x40,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x4c,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x4c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x54] ; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x48,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0],xmm2[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x3c,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x44,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0],xmm2[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, 
{{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x40,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x2c,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c] +; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero -; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x58,0x10] +; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x54,0x20] +; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0],xmm1[3] -; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x50,0x30] +; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] ; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] @@ -1508,13 +1508,13 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x28] ; 
FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] ; FMACALL32_BDVER2-NEXT: retl ## encoding: [0xc3] @@ -1723,23 +1723,23 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x38] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x44] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x28] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x18] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x18] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] ; FMACALL32_BDVER2-NEXT: addl $236, %esp ## encoding: [0x81,0xc4,0xec,0x00,0x00,0x00] @@ -2048,44 +2048,44 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x48,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; 
FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x30] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xa0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x88,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xac,0x24,0x94,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x70] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x78] ; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x58] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x60] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x48] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70] ; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0x54,0x24,0x68] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0x54,0x24,0x48] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## 
encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x68] ; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x78] +; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] ; FMACALL32_BDVER2-NEXT: movl %ebp, %esp ## encoding: [0x89,0xec] diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1293,20 +1293,20 @@ ; ; FMA-NOINFS-LABEL: test_f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_f32_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213ss {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz float 1.0, %t %tx = fmul nsz float %x, %t @@ -1342,20 +1342,20 @@ ; ; FMA-NOINFS-LABEL: test_v4f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v4f32_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v4f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <4 x float> , %t %tx = fmul nsz <4 x float> %x, %t @@ -1391,20 +1391,20 @@ ; ; FMA-NOINFS-LABEL: test_v8f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v8f32_interp: ; FMA4-NOINFS: 
# %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v8f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <8 x float> , %t %tx = fmul nsz <8 x float> %x, %t @@ -1440,20 +1440,20 @@ ; ; FMA-NOINFS-LABEL: test_f64_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubsd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubsd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddsd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213sd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz double 1.0, %t %tx = fmul nsz double %x, %t @@ -1492,20 +1492,20 @@ ; ; FMA-NOINFS-LABEL: test_v2f64_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v2f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v2f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} xmm1 = -(xmm2 * xmm1) + xmm1 +; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <2 x double> , %t %tx = fmul nsz <2 x double> %x, %t @@ -1541,20 +1541,20 @@ ; ; FMA-NOINFS-LABEL: test_v4f64_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; FMA-NOINFS-NEXT: retq ; ; 
FMA4-NOINFS-LABEL: test_v4f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v4f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm1 +; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <4 x double> , %t %tx = fmul nsz <4 x double> %x, %t @@ -1612,17 +1612,26 @@ define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; FMA-LABEL: test_v4f32_fneg_fnmadd: ; FMA: # %bb.0: -; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; FMA-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; FMA-NEXT: vfnmsub231ps {{.*#+}} xmm3 = -(xmm1 * xmm0) - xmm3 +; FMA-NEXT: vaddps %xmm2, %xmm3, %xmm0 +; FMA-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: test_v4f32_fneg_fnmadd: ; FMA4: # %bb.0: -; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 +; FMA4-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm3 +; FMA4-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; FMA4-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: test_v4f32_fneg_fnmadd: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231ps {{.*#+}} xmm3 = -(xmm1 * xmm0) - xmm3 +; AVX512-NEXT: vaddps %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: retq %mul = fmul nsz <4 x float> %a0, %a1 %neg0 = fsub nsz <4 x float> , %mul @@ -1634,17 +1643,23 @@ define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; FMA-LABEL: test_v4f64_fneg_fnmsub: ; FMA: # %bb.0: -; FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 +; FMA-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; FMA-NEXT: vfnmsub231pd {{.*#+}} ymm3 = -(ymm1 * ymm0) - ymm3 +; FMA-NEXT: vsubpd %ymm3, %ymm2, %ymm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: test_v4f64_fneg_fnmsub: ; FMA4: # %bb.0: -; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2 +; FMA4-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; FMA4-NEXT: vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm3 +; FMA4-NEXT: vsubpd %ymm0, %ymm2, %ymm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: test_v4f64_fneg_fnmsub: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 +; AVX512-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231pd {{.*#+}} ymm3 = -(ymm1 * ymm0) - ymm3 +; AVX512-NEXT: vsubpd %ymm3, %ymm2, %ymm0 ; AVX512-NEXT: retq %mul = fmul nsz <4 x double> %a0, %a1 %neg0 = fsub nsz <4 x double> , %mul @@ -1888,28 +1903,26 @@ define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x double> %x3, <2 x double> %x4, <2 x double> %x5, <2 x double> %x6, <2 x double> %x7, <2 x double> %x8) nounwind { ; FMA-LABEL: fadd_fma_fmul_3: ; FMA: # %bb.0: -; FMA-NEXT: vmulpd %xmm3, %xmm2, %xmm2 -; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2 -; FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2 -; 
FMA-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2 -; FMA-NEXT: vmovapd %xmm2, %xmm0 +; FMA-NEXT: vmulpd %xmm7, %xmm6, %xmm6 +; FMA-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm5 * xmm4) + xmm6 +; FMA-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm3 * xmm2) + xmm6 +; FMA-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm6 ; FMA-NEXT: retq ; ; FMA4-LABEL: fadd_fma_fmul_3: ; FMA4: # %bb.0: -; FMA4-NEXT: vmulpd %xmm3, %xmm2, %xmm2 +; FMA4-NEXT: vmulpd %xmm7, %xmm6, %xmm6 +; FMA4-NEXT: vfmaddpd {{.*#+}} xmm4 = (xmm4 * xmm5) + xmm6 +; FMA4-NEXT: vfmaddpd {{.*#+}} xmm2 = (xmm2 * xmm3) + xmm4 ; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 -; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm6 * xmm7) + xmm0 -; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm4 * xmm5) + xmm0 ; FMA4-NEXT: retq ; ; AVX512-LABEL: fadd_fma_fmul_3: ; AVX512: # %bb.0: -; AVX512-NEXT: vmulpd %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2 -; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2 -; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2 -; AVX512-NEXT: vmovapd %xmm2, %xmm0 +; AVX512-NEXT: vmulpd %xmm7, %xmm6, %xmm6 +; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm5 * xmm4) + xmm6 +; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm6 = (xmm3 * xmm2) + xmm6 +; AVX512-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm6 ; AVX512-NEXT: retq %m1 = fmul fast <2 x double> %x1, %x2 %m2 = fmul fast <2 x double> %x3, %x4 diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -849,24 +849,24 @@ ; ; FMA-NOINFS-LABEL: test_v16f32_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v16f32_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm5) - ymm3 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v16f32_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 +; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} zmm1 = -(zmm2 * zmm1) + zmm1 +; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <16 x float> , %t %tx = fmul nsz <16 x float> %x, %t @@ -908,24 +908,24 @@ ; ; FMA-NOINFS-LABEL: test_v8f64_interp: ; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm3 = (ymm5 * ymm3) - 
ymm3 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2 +; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3 ; FMA-NOINFS-NEXT: retq ; ; FMA4-NOINFS-LABEL: test_v8f64_interp: ; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm5) - ymm3 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} ymm3 = -(ymm5 * ymm3) + ymm3 +; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} ymm2 = -(ymm4 * ymm2) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2 +; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3 ; FMA4-NOINFS-NEXT: retq ; ; AVX512-NOINFS-LABEL: test_v8f64_interp: ; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 +; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} zmm1 = -(zmm2 * zmm1) + zmm1 +; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1 ; AVX512-NOINFS-NEXT: retq %t1 = fsub nsz <8 x double> , %t %tx = fmul nsz <8 x double> %x, %t @@ -999,7 +999,10 @@ ; ; AVX512-LABEL: test_v16f32_fneg_fnmadd: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 +; AVX512-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231ps {{.*#+}} zmm3 = -(zmm1 * zmm0) - zmm3 +; AVX512-NEXT: vaddps %zmm2, %zmm3, %zmm0 +; AVX512-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512-NEXT: retq %mul = fmul nsz <16 x float> %a0, %a1 %neg0 = fsub nsz <16 x float> , %mul @@ -1023,7 +1026,9 @@ ; ; AVX512-LABEL: test_v8f64_fneg_fnmsub: ; AVX512: # %bb.0: -; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 +; AVX512-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vfnmsub231pd {{.*#+}} zmm3 = -(zmm1 * zmm0) - zmm3 +; AVX512-NEXT: vsubpd %zmm3, %zmm2, %zmm0 ; AVX512-NEXT: retq %mul = fmul nsz <8 x double> %a0, %a1 %neg0 = fsub nsz <8 x double> , %mul diff --git a/llvm/test/CodeGen/X86/fmul-combines.ll b/llvm/test/CodeGen/X86/fmul-combines.ll --- a/llvm/test/CodeGen/X86/fmul-combines.ll +++ b/llvm/test/CodeGen/X86/fmul-combines.ll @@ -114,10 +114,12 @@ ret <4 x float> %z } -; CHECK: float 5 -; CHECK: float 12 -; CHECK: float 21 -; CHECK: float 32 + +; CHECK: .LCPI12_0: +; CHECK-NEXT: .long 0x40a00000 +; CHECK-NEXT: .long 0x41400000 +; CHECK-NEXT: .long 0x41a80000 +; CHECK-NEXT: .long 0x42000000 ; We should be able to pre-multiply the two constant vectors. define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) { @@ -165,17 +167,26 @@ ret <4 x float> %z } -; CHECK: float 6 -; CHECK: float 14 -; CHECK: float 24 -; CHECK: float 36 +; CHECK: .LCPI16_0: +; CHECK-NEXT: .long 0x3f800000 +; CHECK-NEXT: .long 0x40000000 +; CHECK-NEXT: .long 0x40400000 +; CHECK-NEXT: .long 0x40800000 +; CHECK: .LCPI16_1: +; CHECK-NEXT: .long 0x40a00000 +; CHECK-NEXT: .long 0x41400000 +; CHECK-NEXT: .long 0x41a80000 +; CHECK-NEXT: .long 0x42000000 ; More than one use of a constant multiply should not inhibit the optimization. 
; Instead of a chain of 2 dependent mults, this test will have 2 independent mults. define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) { ; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use: ; CHECK: # %bb.0: +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; CHECK-NEXT: mulps %xmm0, %xmm1 ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: addps %xmm1, %xmm0 ; CHECK-NEXT: retq %y = fmul fast <4 x float> %x, %z = fmul fast <4 x float> %y, @@ -186,10 +197,11 @@ ; PR22698 - http://llvm.org/bugs/show_bug.cgi?id=22698 ; Make sure that we don't infinite loop swapping constants back and forth. -; CHECK: float 24 -; CHECK: float 24 -; CHECK: float 24 -; CHECK: float 24 +; CHECK: .LCPI17_0: +; CHECK-NEXT: .long 0x41c00000 +; CHECK-NEXT: .long 0x41c00000 +; CHECK-NEXT: .long 0x41c00000 +; CHECK-NEXT: .long 0x41c00000 define <4 x float> @PR22698_splats(<4 x float> %a) { ; CHECK-LABEL: PR22698_splats: @@ -204,10 +216,11 @@ ; Same as above, but verify that non-splat vectors are handled correctly too. -; CHECK: float 45 -; CHECK: float 120 -; CHECK: float 231 -; CHECK: float 384 +; CHECK: .LCPI18_0: +; CHECK-NEXT: .long 0x42340000 +; CHECK-NEXT: .long 0x42f00000 +; CHECK-NEXT: .long 0x43670000 +; CHECK-NEXT: .long 0x43c00000 define <4 x float> @PR22698_no_splats(<4 x float> %a) { ; CHECK-LABEL: PR22698_no_splats: @@ -269,7 +282,14 @@ ; CHECK-LABEL: getNegatedExpression_crash: ; CHECK: # %bb.0: ; CHECK-NEXT: movl $0, (%rdi) -; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: mulss %xmm0, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: mulss %xmm0, %xmm2 +; CHECK-NEXT: mulss %xmm2, %xmm0 +; CHECK-NEXT: mulss %xmm2, %xmm0 +; CHECK-NEXT: mulss %xmm1, %xmm0 ; CHECK-NEXT: retq store float 0.0, ptr %p, align 1 %real = load float, ptr %p, align 1 diff --git a/llvm/test/CodeGen/X86/fold-call-3.ll b/llvm/test/CodeGen/X86/fold-call-3.ll --- a/llvm/test/CodeGen/X86/fold-call-3.ll +++ b/llvm/test/CodeGen/X86/fold-call-3.ll @@ -60,8 +60,8 @@ ; pre-RA-NEXT: movq %rax, %rsi ; pre-RA-NEXT: callq *560(%rcx) ; pre-RA-NEXT: incl %ebp -; pre-RA-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; pre-RA-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; pre-RA-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; pre-RA-NEXT: cmpl _NumTrials(%rip), %ebp ; pre-RA-NEXT: jb LBB0_2 ; pre-RA-NEXT: ## %bb.3: diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll --- a/llvm/test/CodeGen/X86/fold-masked-merge.ll +++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll @@ -30,11 +30,10 @@ define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) { ; NOBMI-LABEL: masked_merge1: ; NOBMI: # %bb.0: -; NOBMI-NEXT: movl %edi, %eax -; NOBMI-NEXT: andl %edi, %esi -; NOBMI-NEXT: notl %eax -; NOBMI-NEXT: andl %edx, %eax -; NOBMI-NEXT: orl %esi, %eax +; NOBMI-NEXT: movl %esi, %eax +; NOBMI-NEXT: xorl %edx, %eax +; NOBMI-NEXT: andl %edi, %eax +; NOBMI-NEXT: xorl %edx, %eax ; NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; NOBMI-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fold-rmw-ops.ll b/llvm/test/CodeGen/X86/fold-rmw-ops.ll --- a/llvm/test/CodeGen/X86/fold-rmw-ops.ll +++ b/llvm/test/CodeGen/X86/fold-rmw-ops.ll @@ -1041,9 +1041,12 @@ define void @and32_imm_br() nounwind { ; CHECK-LABEL: and32_imm_br: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andl $-2147483648, g32(%rip) # encoding: [0x81,0x25,A,A,A,A,0x00,0x00,0x00,0x80] -; CHECK-NEXT: # fixup A - 
offset: 2, value: g32-8, kind: reloc_riprel_4byte +; CHECK-NEXT: movl $-2147483648, %eax # encoding: [0xb8,0x00,0x00,0x00,0x80] ; CHECK-NEXT: # imm = 0x80000000 +; CHECK-NEXT: andl g32(%rip), %eax # encoding: [0x23,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte_relax +; CHECK-NEXT: movl %eax, g32(%rip) # encoding: [0x89,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte ; CHECK-NEXT: jne b # TAILCALL ; CHECK-NEXT: # encoding: [0x75,A] ; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1 @@ -1127,9 +1130,12 @@ define void @and16_imm_br() nounwind { ; CHECK-LABEL: and16_imm_br: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andw $-32768, g16(%rip) # encoding: [0x66,0x81,0x25,A,A,A,A,0x00,0x80] -; CHECK-NEXT: # fixup A - offset: 3, value: g16-6, kind: reloc_riprel_4byte +; CHECK-NEXT: movzwl g16(%rip), %eax # encoding: [0x0f,0xb7,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte +; CHECK-NEXT: andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00] ; CHECK-NEXT: # imm = 0x8000 +; CHECK-NEXT: movw %ax, g16(%rip) # encoding: [0x66,0x89,0x05,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte ; CHECK-NEXT: jne b # TAILCALL ; CHECK-NEXT: # encoding: [0x75,A] ; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1 diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1133,10 +1133,10 @@ ; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edi -; X87-NEXT: movl %edi, 8(%esi) -; X87-NEXT: movl %edx, 12(%esi) -; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %edi, 12(%esi) +; X87-NEXT: movl %edx, 8(%esi) ; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %eax, (%esi) ; X87-NEXT: movl %esi, %eax ; X87-NEXT: addl $36, %esp ; X87-NEXT: popl %esi @@ -1159,10 +1159,10 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) -; X86-SSE-NEXT: movl %edx, 12(%esi) -; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %edi, 12(%esi) +; X86-SSE-NEXT: movl %edx, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %eax, (%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $36, %esp ; X86-SSE-NEXT: popl %esi @@ -1478,10 +1478,10 @@ ; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edx ; X87-NEXT: movl {{[0-9]+}}(%esp), %edi -; X87-NEXT: movl %edi, 8(%esi) -; X87-NEXT: movl %edx, 12(%esi) -; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %edi, 12(%esi) +; X87-NEXT: movl %edx, 8(%esi) ; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %eax, (%esi) ; X87-NEXT: movl %esi, %eax ; X87-NEXT: addl $36, %esp ; X87-NEXT: popl %esi @@ -1504,10 +1504,10 @@ ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) -; X86-SSE-NEXT: movl %edx, 12(%esi) -; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %edi, 12(%esi) +; X86-SSE-NEXT: movl %edx, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %eax, (%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $36, %esp ; X86-SSE-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fp-logic.ll b/llvm/test/CodeGen/X86/fp-logic.ll --- 
a/llvm/test/CodeGen/X86/fp-logic.ll +++ b/llvm/test/CodeGen/X86/fp-logic.ll @@ -231,8 +231,9 @@ define float @movmsk(float %x) { ; CHECK-LABEL: movmsk: ; CHECK: # %bb.0: -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: andps %xmm1, %xmm0 +; CHECK-NEXT: movmskps %xmm0, %eax +; CHECK-NEXT: shll $31, %eax +; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast float %x to i32 %and = and i32 %bc1, 2147483648 diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -496,10 +496,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -635,10 +635,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -675,10 +675,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -715,10 +715,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -753,10 +753,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -792,10 +792,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -833,10 +833,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl 
%edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -873,10 +873,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -913,10 +913,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -951,10 +951,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -990,10 +990,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -1031,10 +1031,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -1123,11 +1123,9 @@ ; X64-SSE-NEXT: movaps %xmm0, %xmm1 ; X64-SSE-NEXT: callq __multf3@PLT ; X64-SSE-NEXT: movaps %xmm0, (%rsp) -; X64-SSE-NEXT: movq (%rsp), %rcx -; X64-SSE-NEXT: movq %rcx, %rdx -; X64-SSE-NEXT: shrq $32, %rdx +; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; X64-SSE-NEXT: xorl %eax, %eax -; X64-SSE-NEXT: orl %ecx, %edx +; X64-SSE-NEXT: orl (%rsp), %ecx ; X64-SSE-NEXT: sete %al ; X64-SSE-NEXT: addq $24, %rsp ; X64-SSE-NEXT: retq @@ -1169,11 +1167,9 @@ ; X64-AVX-NEXT: vmovaps %xmm0, %xmm1 ; X64-AVX-NEXT: callq __multf3@PLT ; X64-AVX-NEXT: vmovaps %xmm0, (%rsp) -; X64-AVX-NEXT: movq (%rsp), %rcx -; X64-AVX-NEXT: movq %rcx, %rdx -; X64-AVX-NEXT: shrq $32, %rdx +; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; X64-AVX-NEXT: xorl %eax, %eax -; X64-AVX-NEXT: orl %ecx, %edx +; X64-AVX-NEXT: orl (%rsp), %ecx ; X64-AVX-NEXT: sete %al ; X64-AVX-NEXT: addq $24, %rsp ; X64-AVX-NEXT: retq @@ -1221,14 +1217,14 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl 
{{[0-9]+}}(%esp), %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: addl $3, %ecx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: addl $3, %esi ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, 8(%eax) -; X32-NEXT: movl %edx, 4(%eax) -; X32-NEXT: movl %ecx, (%eax) -; X32-NEXT: movl %edi, 12(%eax) +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %ecx, 8(%eax) +; X32-NEXT: movl %edi, 4(%eax) +; X32-NEXT: movl %esi, (%eax) +; X32-NEXT: movl %edx, 12(%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl $4 diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -42,10 +42,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -87,10 +87,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -132,10 +132,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -177,10 +177,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -226,10 +226,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -271,10 +271,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %edi, 12(%esi) +; X86-NEXT: movl %edx, 8(%esi) ; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $20, %esp ; X86-NEXT: popl %esi @@ -312,10 +312,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -353,10 +353,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -394,10 +394,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -435,10 +435,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -476,10 +476,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -517,10 +517,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -558,10 +558,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -599,10 +599,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -644,10 +644,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -689,10 +689,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -730,10 +730,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -775,10 +775,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -817,10 +817,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -858,10 +858,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -899,10 +899,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -940,10 +940,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -981,10 +981,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -1022,10 +1022,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
@@ -1063,10 +1063,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %edi, 12(%esi)
+; X86-NEXT: movl %edx, 8(%esi)
 ; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %eax, (%esi)
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: addl $20, %esp
 ; X86-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll
--- a/llvm/test/CodeGen/X86/fp128-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll
@@ -84,10 +84,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+8
-; X86-NEXT: movl %edx, vf128+12
-; X86-NEXT: movl %eax, vf128
+; X86-NEXT: movl %esi, vf128+12
+; X86-NEXT: movl %edx, vf128+8
 ; X86-NEXT: movl %ecx, vf128+4
+; X86-NEXT: movl %eax, vf128
 ; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
@@ -171,10 +171,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+8
-; X86-NEXT: movl %edx, vf128+12
-; X86-NEXT: movl %eax, vf128
+; X86-NEXT: movl %esi, vf128+12
+; X86-NEXT: movl %edx, vf128+8
 ; X86-NEXT: movl %ecx, vf128+4
+; X86-NEXT: movl %eax, vf128
 ; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
@@ -258,10 +258,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+8
-; X86-NEXT: movl %edx, vf128+12
-; X86-NEXT: movl %eax, vf128
+; X86-NEXT: movl %esi, vf128+12
+; X86-NEXT: movl %edx, vf128+8
 ; X86-NEXT: movl %ecx, vf128+4
+; X86-NEXT: movl %eax, vf128
 ; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
@@ -345,10 +345,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+8
-; X86-NEXT: movl %edx, vf128+12
-; X86-NEXT: movl %eax, vf128
+; X86-NEXT: movl %esi, vf128+12
+; X86-NEXT: movl %edx, vf128+8
 ; X86-NEXT: movl %ecx, vf128+4
+; X86-NEXT: movl %eax, vf128
 ; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
@@ -432,10 +432,10 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+8
-; X86-NEXT: movl %edx, vf128+12
-; X86-NEXT: movl %eax, vf128
+; X86-NEXT: movl %esi, vf128+12
+; X86-NEXT: movl %edx, vf128+8
 ; X86-NEXT: movl %ecx, vf128+4
+; X86-NEXT: movl %eax, vf128
 ; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
diff --git
a/llvm/test/CodeGen/X86/fpclamptosat.ll b/llvm/test/CodeGen/X86/fpclamptosat.ll --- a/llvm/test/CodeGen/X86/fpclamptosat.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat.ll @@ -55,7 +55,7 @@ ; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: cmovlel %ecx, %eax +; CHECK-NEXT: cmovsl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq entry: @@ -121,7 +121,7 @@ ; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: cmovlel %ecx, %eax +; CHECK-NEXT: cmovsl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq entry: @@ -203,7 +203,7 @@ ; CHECK-NEXT: cmovlq %rcx, %rax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rax, %rax -; CHECK-NEXT: cmovlel %ecx, %eax +; CHECK-NEXT: cmovsl %ecx, %eax ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -266,7 +266,7 @@ ; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: cmovgl %ecx, %eax +; CHECK-NEXT: cmovnsl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq entry: @@ -325,7 +325,7 @@ ; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: cmovgl %ecx, %eax +; CHECK-NEXT: cmovnsl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq entry: @@ -402,7 +402,7 @@ ; CHECK-NEXT: cmovll %eax, %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl %ecx, %ecx -; CHECK-NEXT: cmovgl %ecx, %eax +; CHECK-NEXT: cmovnsl %ecx, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -423,23 +423,13 @@ define i64 @stest_f64i64(double %x) { ; CHECK-LABEL: stest_f64i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq __fixdfti@PLT -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmpq %rsi, %rax -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovlq %rdx, %rcx -; CHECK-NEXT: cmovgeq %rsi, %rax -; CHECK-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000 -; CHECK-NEXT: cmpq %rax, %rdx -; CHECK-NEXT: movq $-1, %rsi -; CHECK-NEXT: sbbq %rcx, %rsi -; CHECK-NEXT: cmovgeq %rdx, %rax -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: cvttsd2si %xmm0, %rax +; CHECK-NEXT: ucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovbeq %rax, %rcx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: ucomisd %xmm0, %xmm0 +; CHECK-NEXT: cmovnpq %rcx, %rax ; CHECK-NEXT: retq entry: %conv = fptosi double %x to i128 @@ -479,14 +469,11 @@ ; CHECK-NEXT: callq __fixdfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovgq %rcx, %rax ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: cmovleq %rdx, %rsi -; CHECK-NEXT: cmovgq %rcx, %rax -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: negq %rdx -; CHECK-NEXT: movl $0, %edx -; CHECK-NEXT: sbbq %rsi, %rdx -; CHECK-NEXT: cmovgeq %rcx, %rax +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovsq %rcx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -504,23 +491,13 @@ define i64 @stest_f32i64(float %x) { ; CHECK-LABEL: stest_f32i64: ; CHECK: # %bb.0: 
# %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq __fixsfti@PLT -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmpq %rsi, %rax -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovlq %rdx, %rcx -; CHECK-NEXT: cmovgeq %rsi, %rax -; CHECK-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000 -; CHECK-NEXT: cmpq %rax, %rdx -; CHECK-NEXT: movq $-1, %rsi -; CHECK-NEXT: sbbq %rcx, %rsi -; CHECK-NEXT: cmovgeq %rdx, %rax -; CHECK-NEXT: popq %rcx -; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovbeq %rax, %rcx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovnpq %rcx, %rax ; CHECK-NEXT: retq entry: %conv = fptosi float %x to i128 @@ -560,14 +537,11 @@ ; CHECK-NEXT: callq __fixsfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovgq %rcx, %rax ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: cmovleq %rdx, %rsi -; CHECK-NEXT: cmovgq %rcx, %rax -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: negq %rdx -; CHECK-NEXT: movl $0, %edx -; CHECK-NEXT: sbbq %rsi, %rdx -; CHECK-NEXT: cmovgeq %rcx, %rax +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovsq %rcx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -587,19 +561,17 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: callq __fixhfti@PLT -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF -; CHECK-NEXT: cmpq %rsi, %rax -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovlq %rdx, %rcx -; CHECK-NEXT: cmovgeq %rsi, %rax -; CHECK-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000 -; CHECK-NEXT: cmpq %rax, %rdx -; CHECK-NEXT: movq $-1, %rsi -; CHECK-NEXT: sbbq %rcx, %rsi -; CHECK-NEXT: cmovgeq %rdx, %rax +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: cvttss2si %xmm0, %rax +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; CHECK-NEXT: cmovaeq %rax, %rcx +; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmovbeq %rcx, %rdx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: ucomiss %xmm0, %xmm0 +; CHECK-NEXT: cmovnpq %rdx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -641,14 +613,11 @@ ; CHECK-NEXT: callq __fixhfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovgq %rcx, %rax ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: cmovleq %rdx, %rsi -; CHECK-NEXT: cmovgq %rcx, %rax -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: negq %rdx -; CHECK-NEXT: movl $0, %edx -; CHECK-NEXT: sbbq %rsi, %rdx -; CHECK-NEXT: cmovgeq %rcx, %rax +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: cmovsq %rcx, %rax ; CHECK-NEXT: popq %rcx ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -15,31 +15,32 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = 
[2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-NEXT: por %xmm1, %xmm3 ; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: por %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: pand %xmm4, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: por %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; CHECK-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -76,11 +77,12 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -107,14 +109,14 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -163,27 +165,27 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm4, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm7, %xmm8 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm9 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; CHECK-NEXT: por %xmm9, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm7, %xmm8 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm8, %xmm1 ; CHECK-NEXT: pand %xmm1, %xmm4 ; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm4, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm4 ; CHECK-NEXT: pxor %xmm0, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm7, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; CHECK-NEXT: por %xmm4, %xmm5 ; CHECK-NEXT: pand %xmm5, %xmm2 ; CHECK-NEXT: pandn %xmm3, %xmm5 @@ -191,30 +193,31 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: movdqa %xmm5, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm3, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm5 -; CHECK-NEXT: pandn %xmm2, %xmm3 -; CHECK-NEXT: por %xmm5, %xmm3 +; CHECK-NEXT: pand %xmm7, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm6 +; CHECK-NEXT: pand %xmm6, %xmm5 +; CHECK-NEXT: pandn %xmm2, %xmm6 +; CHECK-NEXT: por %xmm5, %xmm6 ; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm7, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: 
pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm5, %xmm0 +; CHECK-NEXT: pand %xmm5, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm4, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; CHECK-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> @@ -274,22 +277,22 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm4 ; CHECK-NEXT: pxor %xmm3, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm6, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm5, %xmm7 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 -; CHECK-NEXT: pand %xmm7, %xmm4 +; CHECK-NEXT: pand %xmm6, %xmm4 ; CHECK-NEXT: pand %xmm4, %xmm0 ; CHECK-NEXT: pandn %xmm2, %xmm4 ; CHECK-NEXT: por %xmm0, %xmm4 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm5, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm3 +; CHECK-NEXT: movdqa %xmm5, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn %xmm2, %xmm0 @@ -326,27 +329,27 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm4, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm7, %xmm8 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm9 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; CHECK-NEXT: por %xmm9, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm7, %xmm8 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm8, %xmm1 ; CHECK-NEXT: pand %xmm1, %xmm4 ; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm4, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm4 ; CHECK-NEXT: pxor %xmm0, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; CHECK-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; 
CHECK-NEXT: pand %xmm7, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; CHECK-NEXT: por %xmm4, %xmm5 ; CHECK-NEXT: pand %xmm5, %xmm2 ; CHECK-NEXT: pandn %xmm3, %xmm5 @@ -421,28 +424,28 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: movdqa %xmm3, %xmm8 +; CHECK-NEXT: movdqa %xmm3, %xmm7 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm6, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm8, %xmm1 +; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm3, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 @@ -453,30 +456,31 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: movdqa %xmm4, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm3, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm5, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm4 -; CHECK-NEXT: pandn %xmm2, %xmm3 -; CHECK-NEXT: por %xmm4, %xmm3 +; CHECK-NEXT: pand %xmm7, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm6 +; CHECK-NEXT: pand %xmm6, %xmm4 +; CHECK-NEXT: pandn %xmm2, %xmm6 +; CHECK-NEXT: por %xmm4, %xmm6 ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm7, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm5, %xmm3 
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm5, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] ; CHECK-NEXT: pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; CHECK-NEXT: por %xmm5, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -554,23 +558,23 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm6, %xmm3 +; CHECK-NEXT: pand %xmm5, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: pandn %xmm1, %xmm3 ; CHECK-NEXT: por %xmm0, %xmm3 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm6, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm6, %xmm2 +; CHECK-NEXT: movdqa %xmm4, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; CHECK-NEXT: pand %xmm5, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm6 ; CHECK-NEXT: pandn %xmm1, %xmm0 @@ -625,28 +629,28 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: movdqa %xmm3, %xmm8 +; CHECK-NEXT: movdqa %xmm3, %xmm7 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm8 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm6, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm7 ; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm8, %xmm1 +; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = 
xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm3, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; CHECK-NEXT: por %xmm3, %xmm4 @@ -984,27 +988,27 @@ ; CHECK-NEXT: por %xmm0, %xmm2 ; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm2, %xmm3 -; CHECK-NEXT: pxor %xmm1, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; CHECK-NEXT: movdqa %xmm4, %xmm0 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-NEXT: movdqa %xmm2, %xmm4 +; CHECK-NEXT: pxor %xmm3, %xmm4 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] +; CHECK-NEXT: movdqa %xmm5, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 -; CHECK-NEXT: pxor %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; CHECK-NEXT: pxor %xmm2, %xmm1 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pxor %xmm3, %xmm4 -; CHECK-NEXT: por %xmm2, %xmm4 -; CHECK-NEXT: pslld $16, %xmm4 -; CHECK-NEXT: psrad $16, %xmm4 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pandn %xmm1, %xmm5 +; CHECK-NEXT: por %xmm2, %xmm5 +; CHECK-NEXT: pslld $16, %xmm5 +; CHECK-NEXT: psrad $16, %xmm5 ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: packssdw %xmm4, %xmm0 +; CHECK-NEXT: packssdw %xmm5, %xmm0 ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -1591,31 +1595,32 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-NEXT: por %xmm1, %xmm3 ; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm2, 
%xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: por %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: pand %xmm4, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm3 +; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: por %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; CHECK-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -1650,11 +1655,12 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -1680,14 +1686,14 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -1733,59 +1739,60 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm6, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; CHECK-NEXT: 
pand %xmm6, %xmm7 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm7, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] ; CHECK-NEXT: pand %xmm1, %xmm3 -; CHECK-NEXT: pandn %xmm4, %xmm1 +; CHECK-NEXT: pandn %xmm5, %xmm1 ; CHECK-NEXT: por %xmm3, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm7 +; CHECK-NEXT: movdqa %xmm4, %xmm6 ; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: pand %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm5 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pandn %xmm4, %xmm5 -; CHECK-NEXT: por %xmm2, %xmm5 -; CHECK-NEXT: movdqa %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm5, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: pand %xmm2, %xmm5 -; CHECK-NEXT: pandn %xmm3, %xmm2 -; CHECK-NEXT: por %xmm5, %xmm2 -; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm6, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm5 +; CHECK-NEXT: por %xmm4, %xmm5 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; CHECK-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> @@ -1842,26 +1849,26 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; CHECK-NEXT: movdqa 
{{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm6 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; CHECK-NEXT: pand %xmm5, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] ; CHECK-NEXT: pand %xmm3, %xmm0 -; CHECK-NEXT: pandn %xmm4, %xmm3 +; CHECK-NEXT: pandn %xmm5, %xmm3 ; CHECK-NEXT: por %xmm0, %xmm3 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: movdqa %xmm4, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-NEXT: pand %xmm5, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm4, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm4, %xmm0 +; CHECK-NEXT: pandn %xmm5, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: retq @@ -1893,33 +1900,33 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm3, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm6, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm4, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm7 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm7, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] ; CHECK-NEXT: pand %xmm1, %xmm3 -; CHECK-NEXT: pandn %xmm4, %xmm1 +; CHECK-NEXT: pandn %xmm5, %xmm1 ; CHECK-NEXT: por %xmm3, %xmm1 ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm7 +; CHECK-NEXT: movdqa %xmm4, %xmm6 ; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] +; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: pand %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm5 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pandn %xmm4, %xmm5 -; CHECK-NEXT: por %xmm2, %xmm5 -; CHECK-NEXT: movdqa %xmm5, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm5, %xmm4 +; CHECK-NEXT: por %xmm2, 
%xmm4 +; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 @@ -1928,7 +1935,7 @@ ; CHECK-NEXT: pand %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm5, %xmm3 +; CHECK-NEXT: pand %xmm4, %xmm3 ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm4 @@ -1986,63 +1993,64 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm2, %xmm7 +; CHECK-NEXT: movdqa %xmm2, %xmm6 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm3, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] +; CHECK-NEXT: pand %xmm1, %xmm6 +; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm6, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] -; CHECK-NEXT: pand %xmm1, %xmm7 -; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm5 -; CHECK-NEXT: pxor %xmm0, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm2 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm3, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 ; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320] +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 ; 
CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: pand %xmm2, %xmm4 -; CHECK-NEXT: pandn %xmm3, %xmm2 -; CHECK-NEXT: por %xmm4, %xmm2 +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; CHECK-NEXT: pand %xmm5, %xmm4 +; CHECK-NEXT: pandn %xmm2, %xmm5 +; CHECK-NEXT: por %xmm4, %xmm5 ; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm6, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm5, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -2117,27 +2125,27 @@ ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pxor %xmm1, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259455,9223372039002259455] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm1, %xmm2 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] ; CHECK-NEXT: pand %xmm2, %xmm0 -; CHECK-NEXT: pandn %xmm3, %xmm2 +; CHECK-NEXT: pandn %xmm4, %xmm2 ; CHECK-NEXT: por %xmm0, %xmm2 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm6, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm6, %xmm1 +; CHECK-NEXT: movdqa %xmm3, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pcmpgtd %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm4, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm3, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm6 -; CHECK-NEXT: pandn %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm4, %xmm0 ; CHECK-NEXT: por %xmm6, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; CHECK-NEXT: addq $72, %rsp @@ -2187,36 +2195,36 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] ; CHECK-NEXT: movdqa %xmm2, %xmm1 -; CHECK-NEXT: movdqa 
%xmm2, %xmm7 +; CHECK-NEXT: movdqa %xmm2, %xmm6 ; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm3, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] +; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] +; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; CHECK-NEXT: pand %xmm1, %xmm6 +; CHECK-NEXT: pandn %xmm3, %xmm1 ; CHECK-NEXT: por %xmm6, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; CHECK-NEXT: pand %xmm1, %xmm7 -; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm7, %xmm1 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm5 -; CHECK-NEXT: pxor %xmm0, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 +; CHECK-NEXT: movdqa %xmm2, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa %xmm7, %xmm2 +; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pandn %xmm3, %xmm4 +; CHECK-NEXT: por %xmm2, %xmm4 ; CHECK-NEXT: movdqa %xmm4, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm3 @@ -2540,14 +2548,14 @@ ; CHECK-NEXT: movdqa %xmm4, %xmm0 ; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 -; CHECK-NEXT: pxor %xmm3, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] +; CHECK-NEXT: pandn %xmm3, %xmm0 ; CHECK-NEXT: por %xmm2, %xmm0 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; CHECK-NEXT: pxor %xmm2, %xmm1 ; CHECK-NEXT: pcmpgtd %xmm1, %xmm4 ; CHECK-NEXT: pand %xmm4, %xmm2 -; CHECK-NEXT: pxor %xmm3, %xmm4 +; CHECK-NEXT: pandn %xmm3, %xmm4 ; CHECK-NEXT: por %xmm2, %xmm4 ; CHECK-NEXT: pslld $16, %xmm4 ; CHECK-NEXT: psrad $16, %xmm4 diff --git a/llvm/test/CodeGen/X86/fpenv-combine.ll b/llvm/test/CodeGen/X86/fpenv-combine.ll --- a/llvm/test/CodeGen/X86/fpenv-combine.ll +++ b/llvm/test/CodeGen/X86/fpenv-combine.ll @@ -22,13 +22,13 @@ ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; X64-NEXT: movq %rsi, 24(%r14) -; X64-NEXT: movq %rcx, (%r14) -; X64-NEXT: movq %rdx, 8(%r14) -; X64-NEXT: movq %rax, 16(%r14) -; X64-NEXT: movq %rax, 16(%rbx) +; X64-NEXT: movq %rdx, 16(%r14) +; X64-NEXT: movq %rcx, 8(%r14) +; X64-NEXT: 
movq %rax, (%r14)
 ; X64-NEXT: movq %rsi, 24(%rbx)
-; X64-NEXT: movq %rcx, (%rbx)
-; X64-NEXT: movq %rdx, 8(%rbx)
+; X64-NEXT: movq %rdx, 16(%rbx)
+; X64-NEXT: movq %rcx, 8(%rbx)
+; X64-NEXT: movq %rax, (%rbx)
 ; X64-NEXT: addq $40, %rsp
 ; X64-NEXT: popq %rbx
 ; X64-NEXT: popq %r14
@@ -72,8 +72,8 @@
 ; X64-NEXT: movq (%rsp), %rax
 ; X64-NEXT: andl $1, %eax
 ; X64-NEXT: movq %rax, (%rbx)
-; X64-NEXT: movq $0, 16(%rbx)
 ; X64-NEXT: movq $0, 24(%rbx)
+; X64-NEXT: movq $0, 16(%rbx)
 ; X64-NEXT: movq $0, 8(%rbx)
 ; X64-NEXT: addq $32, %rsp
 ; X64-NEXT: popq %rbx
diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll
--- a/llvm/test/CodeGen/X86/fpenv.ll
+++ b/llvm/test/CodeGen/X86/fpenv.ll
@@ -13,7 +13,9 @@
 ; X86-NOSSE: # %bb.0:
 ; X86-NOSSE-NEXT: pushl %eax
 ; X86-NOSSE-NEXT: fnstcw (%esp)
-; X86-NOSSE-NEXT: orb $12, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: movzwl (%esp), %eax
+; X86-NOSSE-NEXT: orl $3072, %eax # imm = 0xC00
+; X86-NOSSE-NEXT: movw %ax, (%esp)
 ; X86-NOSSE-NEXT: fldcw (%esp)
 ; X86-NOSSE-NEXT: popl %eax
 ; X86-NOSSE-NEXT: retl
@@ -22,7 +24,9 @@
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: pushl %eax
 ; X86-SSE-NEXT: fnstcw (%esp)
-; X86-SSE-NEXT: orb $12, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT: movzwl (%esp), %eax
+; X86-SSE-NEXT: orl $3072, %eax # imm = 0xC00
+; X86-SSE-NEXT: movw %ax, (%esp)
 ; X86-SSE-NEXT: fldcw (%esp)
 ; X86-SSE-NEXT: stmxcsr (%esp)
 ; X86-SSE-NEXT: orb $96, {{[0-9]+}}(%esp)
@@ -33,7 +37,9 @@
 ; X64-LABEL: func_01:
 ; X64: # %bb.0:
 ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp)
-; X64-NEXT: orb $12, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: orl $3072, %eax # imm = 0xC00
+; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
 ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp)
 ; X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp)
 ; X64-NEXT: orb $96, -{{[0-9]+}}(%rsp)
@@ -83,8 +89,8 @@
 ; X86-NOSSE: # %bb.0:
 ; X86-NOSSE-NEXT: pushl %eax
 ; X86-NOSSE-NEXT: fnstcw (%esp)
-; X86-NOSSE-NEXT: movl $-3073, %eax # imm = 0xF3FF
-; X86-NOSSE-NEXT: andl (%esp), %eax
+; X86-NOSSE-NEXT: movzwl (%esp), %eax
+; X86-NOSSE-NEXT: andl $62463, %eax # imm = 0xF3FF
 ; X86-NOSSE-NEXT: orl $2048, %eax # imm = 0x800
 ; X86-NOSSE-NEXT: movw %ax, (%esp)
 ; X86-NOSSE-NEXT: fldcw (%esp)
@@ -95,8 +101,8 @@
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: pushl %eax
 ; X86-SSE-NEXT: fnstcw (%esp)
-; X86-SSE-NEXT: movl $-3073, %eax # imm = 0xF3FF
-; X86-SSE-NEXT: andl (%esp), %eax
+; X86-SSE-NEXT: movzwl (%esp), %eax
+; X86-SSE-NEXT: andl $62463, %eax # imm = 0xF3FF
 ; X86-SSE-NEXT: orl $2048, %eax # imm = 0x800
 ; X86-SSE-NEXT: movw %ax, (%esp)
 ; X86-SSE-NEXT: fldcw (%esp)
@@ -112,8 +118,8 @@
 ; X64-LABEL: func_03:
 ; X64: # %bb.0:
 ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl $-3073, %eax # imm = 0xF3FF
-; X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: andl $62463, %eax # imm = 0xF3FF
 ; X64-NEXT: orl $2048, %eax # imm = 0x800
 ; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
 ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp)
@@ -133,8 +139,8 @@
 ; X86-NOSSE: # %bb.0:
 ; X86-NOSSE-NEXT: pushl %eax
 ; X86-NOSSE-NEXT: fnstcw (%esp)
-; X86-NOSSE-NEXT: movl $-3073, %eax # imm = 0xF3FF
-; X86-NOSSE-NEXT: andl (%esp), %eax
+; X86-NOSSE-NEXT: movzwl (%esp), %eax
+; X86-NOSSE-NEXT: andl $62463, %eax # imm = 0xF3FF
 ; X86-NOSSE-NEXT: orl $1024, %eax # imm = 0x400
 ; X86-NOSSE-NEXT: movw %ax, (%esp)
 ; X86-NOSSE-NEXT: fldcw (%esp)
@@ -145,8 +151,8 @@
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: pushl %eax
 ; X86-SSE-NEXT: fnstcw (%esp)
-; X86-SSE-NEXT: movl $-3073, %eax # imm = 0xF3FF
-; X86-SSE-NEXT: andl (%esp), %eax
+; X86-SSE-NEXT: movzwl (%esp), %eax
+; X86-SSE-NEXT: andl $62463, %eax # imm = 0xF3FF
 ; X86-SSE-NEXT: orl $1024, %eax # imm = 0x400
 ; X86-SSE-NEXT: movw %ax, (%esp)
 ; X86-SSE-NEXT: fldcw (%esp)
@@ -162,8 +168,8 @@
 ; X64-LABEL: func_04:
 ; X64: # %bb.0:
 ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl $-3073, %eax # imm = 0xF3FF
-; X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: andl $62463, %eax # imm = 0xF3FF
 ; X64-NEXT: orl $1024, %eax # imm = 0x400
 ; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
 ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp)
@@ -189,8 +195,8 @@
 ; X86-NOSSE-NEXT: shll %cl, %eax
 ; X86-NOSSE-NEXT: andl $3072, %eax # imm = 0xC00
 ; X86-NOSSE-NEXT: fnstcw (%esp)
-; X86-NOSSE-NEXT: movl $-3073, %ecx # imm = 0xF3FF
-; X86-NOSSE-NEXT: andl (%esp), %ecx
+; X86-NOSSE-NEXT: movzwl (%esp), %ecx
+; X86-NOSSE-NEXT: andl $62463, %ecx # imm = 0xF3FF
 ; X86-NOSSE-NEXT: orl %eax, %ecx
 ; X86-NOSSE-NEXT: movw %cx, (%esp)
 ; X86-NOSSE-NEXT: fldcw (%esp)
@@ -207,8 +213,8 @@
 ; X86-SSE-NEXT: shll %cl, %eax
 ; X86-SSE-NEXT: andl $3072, %eax # imm = 0xC00
 ; X86-SSE-NEXT: fnstcw (%esp)
-; X86-SSE-NEXT: movl $-3073, %ecx # imm = 0xF3FF
-; X86-SSE-NEXT: andl (%esp), %ecx
+; X86-SSE-NEXT: movzwl (%esp), %ecx
+; X86-SSE-NEXT: andl $62463, %ecx # imm = 0xF3FF
 ; X86-SSE-NEXT: orl %eax, %ecx
 ; X86-SSE-NEXT: movw %cx, (%esp)
 ; X86-SSE-NEXT: fldcw (%esp)
@@ -230,8 +236,8 @@
 ; X64-NEXT: shll %cl, %eax
 ; X64-NEXT: andl $3072, %eax # imm = 0xC00
 ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl $-3073, %ecx # imm = 0xF3FF
-; X64-NEXT: andl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT: andl $62463, %ecx # imm = 0xF3FF
 ; X64-NEXT: orl %eax, %ecx
 ; X64-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
 ; X64-NEXT: fldcw -{{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/freeze-unary.ll b/llvm/test/CodeGen/X86/freeze-unary.ll
--- a/llvm/test/CodeGen/X86/freeze-unary.ll
+++ b/llvm/test/CodeGen/X86/freeze-unary.ll
@@ -6,6 +6,7 @@
 ; X86-LABEL: freeze_sext:
 ; X86: # %bb.0:
 ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cwtl
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: freeze_sext:
@@ -40,6 +41,7 @@
 ; X86-LABEL: freeze_zext:
 ; X86: # %bb.0:
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl %ax, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: freeze_zext:
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -349,15 +349,16 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl (%edx), %edx
 ; X86-NEXT: andl $15, %edx
-; X86-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
-; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
-; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X86-NEXT: vmovdqa %xmm0, (%ecx)
 ; X86-NEXT: vmovd %edx, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
 ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X86-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7]
+; X86-NEXT: vpand %xmm3, %xmm1, %xmm1
+; X86-NEXT: vmovdqa %xmm1, (%ecx)
 ; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
-; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpand %xmm3, %xmm0, %xmm0
 ; X86-NEXT: vmovdqa %xmm0, (%eax)
 ; X86-NEXT: retl
 ;
@@ -365,15 +366,15 @@
 ; X64: # %bb.0:
 ; X64-NEXT: movl (%rdi), %eax
 ; X64-NEXT: andl $15, %eax
-; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X64-NEXT: vpbroadcastd 
{{.*#+}} xmm1 = [7,7,7,7] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-NEXT: vmovdqa %xmm0, (%rdx) ; X64-NEXT: vmovd %eax, %xmm0 ; X64-NEXT: vpbroadcastd %xmm0, %xmm0 -; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3] +; X64-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7] +; X64-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X64-NEXT: vmovdqa %xmm2, (%rdx) +; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] +; X64-NEXT: vpand %xmm3, %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%rcx) ; X64-NEXT: retq %i0.src = load i32, ptr %origin0 diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -180,7 +180,7 @@ ; X64-AVX2-NEXT: andq %rdx, %rax ; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5 ; X64-AVX2-NEXT: mulq %rdx -; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax +; X64-AVX2-NEXT: leaq (%rdx,%rdx,8), %rax ; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax ; X64-AVX2-NEXT: subl %eax, %ecx ; X64-AVX2-NEXT: shlq $27, %rsi @@ -347,7 +347,7 @@ ; X64-AVX2-NEXT: andq %rdx, %rax ; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5 ; X64-AVX2-NEXT: mulq %rdx -; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax +; X64-AVX2-NEXT: leaq (%rdx,%rdx,8), %rax ; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax ; X64-AVX2-NEXT: subl %eax, %ecx ; X64-AVX2-NEXT: addl $27, %ecx @@ -980,20 +980,18 @@ ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: leal (%eax,%eax,2), %edx -; X86-SSE2-NEXT: movzwl 8(%ecx,%edx,4), %esi -; X86-SSE2-NEXT: movl 4(%ecx,%edx,4), %edi -; X86-SSE2-NEXT: shrdl $8, %esi, %edi +; X86-SSE2-NEXT: leal (%eax,%eax,2), %esi +; X86-SSE2-NEXT: movzwl 8(%ecx,%esi,4), %edx +; X86-SSE2-NEXT: movl 4(%ecx,%esi,4), %edi +; X86-SSE2-NEXT: shrdl $8, %edx, %edi ; X86-SSE2-NEXT: xorl %eax, %edi ; X86-SSE2-NEXT: sarl $31, %eax -; X86-SSE2-NEXT: movzbl 10(%ecx,%edx,4), %ecx -; X86-SSE2-NEXT: shll $16, %ecx -; X86-SSE2-NEXT: orl %esi, %ecx -; X86-SSE2-NEXT: shll $8, %ecx -; X86-SSE2-NEXT: movl %ecx, %edx -; X86-SSE2-NEXT: sarl $8, %edx +; X86-SSE2-NEXT: movsbl 10(%ecx,%esi,4), %ecx +; X86-SSE2-NEXT: movl %ecx, %esi +; X86-SSE2-NEXT: shll $16, %esi +; X86-SSE2-NEXT: orl %edx, %esi ; X86-SSE2-NEXT: sarl $31, %ecx -; X86-SSE2-NEXT: shldl $24, %edx, %ecx +; X86-SSE2-NEXT: shldl $24, %esi, %ecx ; X86-SSE2-NEXT: xorl %eax, %ecx ; X86-SSE2-NEXT: orl %ecx, %edi ; X86-SSE2-NEXT: jne .LBB46_1 diff --git a/llvm/test/CodeGen/X86/h-registers-2.ll b/llvm/test/CodeGen/X86/h-registers-2.ll --- a/llvm/test/CodeGen/X86/h-registers-2.ll +++ b/llvm/test/CodeGen/X86/h-registers-2.ll @@ -8,8 +8,7 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movzbl %ah, %eax +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movb $77, (%ecx,%eax,8) ; CHECK-NEXT: shll $3, %eax ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -195,12 +195,12 @@ ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE3-NEXT: movd %xmm0, %esi ; SSE3-NEXT: addl %eax, %esi +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; 
SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: movd %xmm1, %edi +; SSE3-NEXT: addl %eax, %edi ; SSE3-NEXT: movd %esi, %xmm0 -; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE3-NEXT: movd %xmm2, %eax -; SSE3-NEXT: movd %xmm1, %esi -; SSE3-NEXT: addl %eax, %esi -; SSE3-NEXT: movd %esi, %xmm1 +; SSE3-NEXT: movd %edi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE3-NEXT: movd %ecx, %xmm2 ; SSE3-NEXT: movd %edx, %xmm0 @@ -311,12 +311,12 @@ ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; SSE3-NEXT: movd %xmm0, %esi ; SSE3-NEXT: subl %esi, %edx -; SSE3-NEXT: movd %edx, %xmm0 -; SSE3-NEXT: movd %xmm1, %edx -; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; SSE3-NEXT: movd %xmm1, %esi -; SSE3-NEXT: subl %esi, %edx -; SSE3-NEXT: movd %edx, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE3-NEXT: movd %xmm0, %edi +; SSE3-NEXT: subl %edi, %esi +; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: movd %esi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE3-NEXT: movd %eax, %xmm2 ; SSE3-NEXT: movd %ecx, %xmm0 diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -72,11 +72,11 @@ ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; SSE2-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: movapd %xmm2, %xmm3 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: addpd %xmm3, %xmm2 -; SSE2-NEXT: divpd %xmm2, %xmm1 -; SSE2-NEXT: divpd %xmm2, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE2-NEXT: addsd %xmm2, %xmm3 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0,0] +; SSE2-NEXT: divpd %xmm3, %xmm1 +; SSE2-NEXT: divpd %xmm3, %xmm0 ; SSE2-NEXT: xorpd %xmm2, %xmm2 ; SSE2-NEXT: addpd %xmm2, %xmm0 ; SSE2-NEXT: addpd %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll --- a/llvm/test/CodeGen/X86/haddsub-4.ll +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -121,25 +121,26 @@ define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) { ; SSE-LABEL: hadd_reverse3_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2] -; SSE-NEXT: haddps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0,3,2] -; SSE-NEXT: movaps %xmm3, %xmm0 -; SSE-NEXT: movaps %xmm2, %xmm1 +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: haddps %xmm2, %xmm4 +; SSE-NEXT: haddps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2,1,0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse3_v8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vhaddps %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse3_v8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vhaddps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll 
b/llvm/test/CodeGen/X86/haddsub-shuf.ll --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -554,6 +554,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_v8i32b: @@ -670,6 +671,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hsub_v8i32b: @@ -814,6 +816,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_v16i16b: @@ -954,6 +957,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hsub_v16i16b: @@ -1013,15 +1017,45 @@ } define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) { -; SSE-LABEL: PR34724_2: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE_SLOW-LABEL: PR34724_2: +; SSE_SLOW: # %bb.0: +; SSE_SLOW-NEXT: haddps %xmm1, %xmm0 +; SSE_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE_SLOW-NEXT: addps %xmm1, %xmm2 +; SSE_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSE_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE_SLOW-NEXT: retq ; -; AVX-LABEL: PR34724_2: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE_FAST-LABEL: PR34724_2: +; SSE_FAST: # %bb.0: +; SSE_FAST-NEXT: haddps %xmm1, %xmm0 +; SSE_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: PR34724_2: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: PR34724_2: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: PR34724_2: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: PR34724_2: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %t2 = fadd <4 x float> %t0, %t1 diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -216,7 +216,7 @@ ; AVX-FAST-LABEL: test8_undef: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,1] ; AVX-FAST-NEXT: retq %vecext = extractelement <4 x float> %a, i32 0 %vecext1 = extractelement <4 x float> %a, i32 1 @@ -504,32 +504,17 @@ } define <4 x float> @add_ps_030(<4 x float> %x) { -; SSE-SLOW-LABEL: add_ps_030: -; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: 
movaps %xmm0, %xmm1 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,2,3] -; SSE-SLOW-NEXT: addps %xmm1, %xmm0 -; SSE-SLOW-NEXT: retq -; -; SSE-FAST-LABEL: add_ps_030: -; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: haddps %xmm0, %xmm0 -; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,2,3] -; SSE-FAST-NEXT: retq -; -; AVX-SLOW-LABEL: add_ps_030: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: retq +; SSE-LABEL: add_ps_030: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm0, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSE-NEXT: retq ; -; AVX-FAST-LABEL: add_ps_030: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] -; AVX-FAST-NEXT: retq +; AVX-LABEL: add_ps_030: +; AVX: # %bb.0: +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r @@ -584,14 +569,14 @@ ; SSE-LABEL: add_ps_016: ; SSE: # %bb.0: ; SSE-NEXT: haddps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,1] ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: add_ps_016: ; AVX: # %bb.0: ; AVX-NEXT: vhaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,1] ; AVX-NEXT: retq %3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> %4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> @@ -1127,40 +1112,68 @@ ; SSE-SLOW-LABEL: PR34724_add_v4f64_u123: ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: haddpd %xmm2, %xmm1 +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE-SLOW-NEXT: movapd %xmm3, %xmm2 ; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSE-SLOW-NEXT: addsd %xmm3, %xmm2 -; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR34724_add_v4f64_u123: ; SSE-FAST: # %bb.0: ; SSE-FAST-NEXT: movapd %xmm1, %xmm0 -; SSE-FAST-NEXT: haddpd %xmm3, %xmm2 ; SSE-FAST-NEXT: haddpd %xmm1, %xmm0 +; SSE-FAST-NEXT: haddpd %xmm3, %xmm2 ; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_u123: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] -; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_u123: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = xmm0[0,0] +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm3, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: 
PR34724_add_v4f64_u123: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX-FAST-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_u123: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2 +; AVX1-FAST-NEXT: vhaddpd %ymm0, %ymm2, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_u123: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_u123: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX512-FAST-NEXT: vhaddpd %ymm0, %ymm2, %ymm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %5 = fadd <2 x double> %3, %4 @@ -1193,21 +1206,48 @@ ; SSE-FAST-NEXT: haddpd %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_0u23: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_0u23: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_0u23: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_0u23: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; 
AVX1-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_0u23: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_0u23: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> %5 = fadd <2 x double> %3, %4 @@ -1239,28 +1279,42 @@ ; SSE-FAST-NEXT: movapd %xmm3, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_01u3: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_01u3: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; ; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX1-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[2,3] +; AVX1-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: retq ; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_01u3: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] +; AVX512-SLOW-NEXT: retq +; ; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3: ; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX512-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3] +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-FAST-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX512-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX512-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3] ; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> @@ -1292,22 +1346,39 @@ 
; SSE-FAST-NEXT: movapd %xmm2, %xmm1 ; SSE-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: PR34724_add_v4f64_012u: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: PR34724_add_v4f64_012u: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_012u: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_012u: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-SLOW-LABEL: PR34724_add_v4f64_012u: +; AVX512-SLOW: # %bb.0: +; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0 +; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX512-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX512-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] +; AVX512-SLOW-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_012u: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> %5 = fadd <2 x double> %3, %4 diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -1638,33 +1638,25 @@ ; ; SSE3-FAST-LABEL: fadd_reduce_v8f32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: haddps %xmm1, %xmm2 -; SSE3-FAST-NEXT: haddps %xmm2, %xmm2 +; SSE3-FAST-NEXT: addps %xmm2, %xmm1 +; SSE3-FAST-NEXT: movaps %xmm1, %xmm2 +; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE3-FAST-NEXT: addps %xmm1, %xmm2 ; SSE3-FAST-NEXT: haddps %xmm2, %xmm2 ; SSE3-FAST-NEXT: addss %xmm2, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: fadd_reduce_v8f32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: fadd_reduce_v8f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; 
AVX-FAST-NEXT: retq +; AVX-LABEL: fadd_reduce_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) ret float %r } @@ -1681,29 +1673,20 @@ ; ; SSE3-FAST-LABEL: fadd_reduce_v4f64: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: haddpd %xmm1, %xmm2 -; SSE3-FAST-NEXT: haddpd %xmm2, %xmm2 -; SSE3-FAST-NEXT: addsd %xmm2, %xmm0 +; SSE3-FAST-NEXT: addpd %xmm2, %xmm1 +; SSE3-FAST-NEXT: haddpd %xmm1, %xmm1 +; SSE3-FAST-NEXT: addsd %xmm1, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: fadd_reduce_v4f64: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: fadd_reduce_v4f64: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 -; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: fadd_reduce_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vaddpd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vaddsd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) ret double %r } @@ -1751,15 +1734,6 @@ ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: PR39936_v8f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq %2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> %4 = fadd <8 x float> %2, %3 @@ -1830,22 +1804,14 @@ ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: hadd32_8: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: hadd32_8: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: hadd32_8: +; AVX: # %bb.0: +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> %x227 = fadd <8 x float> %x225, %x226 %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x 
i32> @@ -1880,14 +1846,6 @@ ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: hadd32_16: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> %x227 = fadd <16 x float> %x225, %x226 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> @@ -1932,7 +1890,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> @@ -1951,14 +1910,6 @@ ; SSE3-NEXT: addps %xmm1, %xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq -; -; AVX-LABEL: hadd32_16_optsize: -; AVX: # %bb.0: -; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> %x227 = fadd <16 x float> %x225, %x226 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> @@ -2003,7 +1954,8 @@ ; AVX: # %bb.0: ; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> @@ -2022,14 +1974,6 @@ ; SSE3-NEXT: addps %xmm1, %xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq -; -; AVX-LABEL: hadd32_16_pgso: -; AVX: # %bb.0: -; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> %x227 = fadd <16 x float> %x225, %x226 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> @@ -2056,21 +2000,14 @@ ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: partial_reduction_fadd_v8f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: partial_reduction_fadd_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %x0213 = fadd <8 x float> %x, %x23 %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> @@ -2100,22 +2037,14 @@ ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags: -; 
AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vzeroupper -; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq +; AVX-LABEL: partial_reduction_fadd_v8f32_wrong_flags: +; AVX: # %bb.0: +; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %x0213 = fadd fast <8 x float> %x, %x23 %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> @@ -2150,13 +2079,6 @@ ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq -; -; AVX-FAST-LABEL: partial_reduction_fadd_v16f32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vzeroupper -; AVX-FAST-NEXT: retq %x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> %x0213 = fadd <16 x float> %x, %x23 %x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -64,9 +64,10 @@ ; ; CHECK-I686-LABEL: test_bitcast_to_half: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: movw {{[0-9]+}}(%esp), %ax -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-I686-NEXT: movw %ax, (%ecx) +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-I686-NEXT: movw %cx, (%eax) ; CHECK-I686-NEXT: retl %val_fp = bitcast i16 %in to half store half %val_fp, ptr %addr @@ -1235,7 +1236,7 @@ ; CHECK-LIBCALL-LABEL: fcopysign: ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm1, %eax -; CHECK-LIBCALL-NEXT: andl $-32768, %eax # imm = 0x8000 +; CHECK-LIBCALL-NEXT: andl $32768, %eax # imm = 0x8000 ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %ecx ; CHECK-LIBCALL-NEXT: andl $32767, %ecx # imm = 0x7FFF ; CHECK-LIBCALL-NEXT: orl %eax, %ecx @@ -1245,7 +1246,7 @@ ; BWON-F16C-LABEL: fcopysign: ; BWON-F16C: # %bb.0: ; BWON-F16C-NEXT: vpextrw $0, %xmm1, %eax -; BWON-F16C-NEXT: andl $-32768, %eax # imm = 0x8000 +; BWON-F16C-NEXT: andl $32768, %eax # imm = 0x8000 ; BWON-F16C-NEXT: vpextrw $0, %xmm0, %ecx ; BWON-F16C-NEXT: andl $32767, %ecx # imm = 0x7FFF ; BWON-F16C-NEXT: orl %eax, %ecx @@ -1254,8 +1255,8 @@ ; ; CHECK-I686-LABEL: fcopysign: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: movl $-32768, %eax # imm = 0x8000 -; CHECK-I686-NEXT: andl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: andl $32768, %eax # imm = 0x8000 ; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; CHECK-I686-NEXT: andl $32767, %ecx # imm = 0x7FFF ; CHECK-I686-NEXT: orl %eax, %ecx @@ -2113,37 +2114,37 @@ define void @pr63114() { ; CHECK-LIBCALL-LABEL: pr63114: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: movdqu (%rax), %xmm4 -; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,3,4,5,6,7] +; 
CHECK-LIBCALL-NEXT: movdqu (%rax), %xmm3 +; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,3,4,5,6,7] ; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm0 ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm0 -; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm0 +; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm0 ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] ; CHECK-LIBCALL-NEXT: por %xmm5, %xmm0 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,7,7] +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,7,7] +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,5,5,5,5] +; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3,0,3] +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm3 +; CHECK-LIBCALL-NEXT: por %xmm2, %xmm3 +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm3 +; CHECK-LIBCALL-NEXT: por %xmm5, %xmm3 ; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm6 ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm6 -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm6 +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm6 ; CHECK-LIBCALL-NEXT: por %xmm5, %xmm6 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,5,5,5] -; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3,0,3] -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] -; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm4 -; CHECK-LIBCALL-NEXT: por %xmm2, %xmm4 -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm4 -; CHECK-LIBCALL-NEXT: por %xmm5, %xmm4 ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm7 ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm7 -; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm7 +; CHECK-LIBCALL-NEXT: pand %xmm4, %xmm7 ; CHECK-LIBCALL-NEXT: por %xmm5, %xmm7 ; CHECK-LIBCALL-NEXT: movdqu %xmm7, 0 -; CHECK-LIBCALL-NEXT: movdqu %xmm4, 32 ; CHECK-LIBCALL-NEXT: movdqu %xmm6, 48 +; CHECK-LIBCALL-NEXT: movdqu %xmm3, 32 ; CHECK-LIBCALL-NEXT: movdqu %xmm0, 16 ; CHECK-LIBCALL-NEXT: retq ; @@ -2154,61 +2155,61 @@ ; BWON-F16C-NEXT: vbroadcastss (%rax), %xmm2 ; BWON-F16C-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; BWON-F16C-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0] -; BWON-F16C-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 -; BWON-F16C-NEXT: vpsllq $48, %xmm3, %xmm4 -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5,6,7] -; BWON-F16C-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1] -; BWON-F16C-NEXT: vpor %xmm3, %xmm2, %xmm2 -; BWON-F16C-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,3],xmm1[2,0] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7] -; BWON-F16C-NEXT: vpor %xmm3, %xmm1, %xmm1 -; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,3,3,4,5,6,7] -; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5,6,7] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] -; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; BWON-F16C-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3],xmm0[4,5,6,7] -; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm3[7] +; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,3,3,4,5,6,7] +; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,1] +; BWON-F16C-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; BWON-F16C-NEXT: vpsllq $48, %xmm4, %xmm5 +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7] +; BWON-F16C-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] +; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,5,5,5,5] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3],xmm6[4,5,6,7] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm4[7] +; BWON-F16C-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5,6,7] +; BWON-F16C-NEXT: vpor %xmm4, %xmm2, %xmm2 +; BWON-F16C-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] +; BWON-F16C-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3],xmm0[4,5,6,7] +; BWON-F16C-NEXT: vpor %xmm4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; BWON-F16C-NEXT: vmovups %ymm0, 0 -; BWON-F16C-NEXT: vmovups %ymm1, 32 +; BWON-F16C-NEXT: vmovups %ymm0, 32 +; BWON-F16C-NEXT: vmovups %ymm3, 0 ; BWON-F16C-NEXT: vzeroupper ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: pr63114: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: movdqu (%eax), %xmm6 -; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,3,4,5,6,7] +; CHECK-I686-NEXT: movdqu (%eax), %xmm3 +; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,3,3,4,5,6,7] ; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-I686-NEXT: pand %xmm1, %xmm0 ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] ; CHECK-I686-NEXT: por %xmm2, %xmm0 -; CHECK-I686-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] -; CHECK-I686-NEXT: pand %xmm3, %xmm0 -; CHECK-I686-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] -; CHECK-I686-NEXT: por %xmm4, %xmm0 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,7] -; CHECK-I686-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; CHECK-I686-NEXT: pand %xmm1, %xmm5 -; CHECK-I686-NEXT: por %xmm2, %xmm5 -; CHECK-I686-NEXT: pand %xmm3, %xmm5 -; CHECK-I686-NEXT: por %xmm4, %xmm5 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,5,5,5,5] -; CHECK-I686-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3,0,3] -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; CHECK-I686-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0] +; CHECK-I686-NEXT: pand %xmm4, %xmm0 +; CHECK-I686-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] +; CHECK-I686-NEXT: por %xmm5, %xmm0 +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,7,7] +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,5,5,5,5] +; CHECK-I686-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3,0,3] +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; CHECK-I686-NEXT: pand %xmm1, %xmm3 +; CHECK-I686-NEXT: por %xmm2, %xmm3 +; CHECK-I686-NEXT: pand %xmm4, %xmm3 +; CHECK-I686-NEXT: por %xmm5, %xmm3 +; CHECK-I686-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; CHECK-I686-NEXT: pand %xmm1, %xmm6 ; CHECK-I686-NEXT: por %xmm2, %xmm6 -; CHECK-I686-NEXT: pand %xmm3, %xmm6 -; 
CHECK-I686-NEXT: por %xmm4, %xmm6 +; CHECK-I686-NEXT: pand %xmm4, %xmm6 +; CHECK-I686-NEXT: por %xmm5, %xmm6 ; CHECK-I686-NEXT: pand %xmm1, %xmm7 ; CHECK-I686-NEXT: por %xmm2, %xmm7 -; CHECK-I686-NEXT: pand %xmm3, %xmm7 -; CHECK-I686-NEXT: por %xmm4, %xmm7 +; CHECK-I686-NEXT: pand %xmm4, %xmm7 +; CHECK-I686-NEXT: por %xmm5, %xmm7 ; CHECK-I686-NEXT: movdqu %xmm7, 0 -; CHECK-I686-NEXT: movdqu %xmm6, 32 -; CHECK-I686-NEXT: movdqu %xmm5, 48 +; CHECK-I686-NEXT: movdqu %xmm6, 48 +; CHECK-I686-NEXT: movdqu %xmm3, 32 ; CHECK-I686-NEXT: movdqu %xmm0, 16 ; CHECK-I686-NEXT: retl %1 = load <24 x half>, ptr poison, align 2 diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -409,16 +409,18 @@ ; X64-BMI1-LABEL: scalar_i64_lowestbit_eq: ; X64-BMI1: # %bb.0: ; X64-BMI1-NEXT: movq %rsi, %rcx +; X64-BMI1-NEXT: movl $1, %eax ; X64-BMI1-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-BMI1-NEXT: shlq %cl, %rdi -; X64-BMI1-NEXT: testb $1, %dil +; X64-BMI1-NEXT: shrq %cl, %rax +; X64-BMI1-NEXT: testl %edi, %eax ; X64-BMI1-NEXT: sete %al ; X64-BMI1-NEXT: retq ; ; X64-BMI2-LABEL: scalar_i64_lowestbit_eq: ; X64-BMI2: # %bb.0: -; X64-BMI2-NEXT: shlxq %rsi, %rdi, %rax -; X64-BMI2-NEXT: testb $1, %al +; X64-BMI2-NEXT: movl $1, %eax +; X64-BMI2-NEXT: shrxq %rsi, %rax, %rax +; X64-BMI2-NEXT: testl %edi, %eax ; X64-BMI2-NEXT: sete %al ; X64-BMI2-NEXT: retq %t0 = lshr i64 1, %y @@ -497,45 +499,45 @@ define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_splat_eq: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pxor %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X86-SSE2-NEXT: retl ; ; AVX2-LABEL: vec_4xi32_splat_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_splat_eq: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pxor %xmm2, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = 
xmm0[1,1,3,3] ; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X64-SSE2-NEXT: retq %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x @@ -581,45 +583,45 @@ define <4 x i1> @vec_4xi32_nonsplat_undef0_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pxor %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $23, %xmm1 ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: pxor %xmm0, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X86-SSE2-NEXT: retl ; ; AVX2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pxor %xmm2, %xmm2 ; X64-SSE2-NEXT: pslld $23, %xmm1 ; X64-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE2-NEXT: pmuludq %xmm2, %xmm0 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; 
X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 ; X64-SSE2-NEXT: retq %t0 = lshr <4 x i32> , %y %t1 = and <4 x i32> %t0, %x diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -471,10 +471,10 @@ ; AVX2-LABEL: vec_4xi32_splat_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_splat_eq: @@ -559,10 +559,10 @@ ; AVX2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: ret{{[l|q]}} ; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll @@ -99,7 +99,7 @@ ; AVX1-FAST-LABEL: PR37890_v4f64: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -235,7 +235,7 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -480,8 +480,8 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -553,8 +553,8 @@ ; X64-AVX2-LABEL: test_reduce_v4i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -565,7 +565,7 @@ ; X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsq %xmm1, 
%xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -629,9 +629,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -685,9 +685,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -697,9 +697,9 @@ ; X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -758,11 +758,14 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -806,11 +809,14 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -818,11 +824,14 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -910,13 +919,16 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorb $127, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -988,13 +1000,16 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorb $127, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1002,13 +1017,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1147,8 +1165,8 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; 
X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1264,8 +1282,8 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1278,7 +1296,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1361,9 +1379,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1433,9 +1451,9 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1447,9 +1465,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1519,11 +1537,14 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X86-AVX2-NEXT: 
## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1575,11 +1596,14 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1587,13 +1611,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1700,13 +1727,16 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorb $127, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1794,13 +1824,16 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; 
X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorb $127, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1808,15 +1841,18 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1869,15 +1905,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1901,22 +1950,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1-NEXT: ## kill: def $ax killed $ax 
killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1956,15 +2021,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1988,22 +2066,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32767, %eax ## imm = 0x7FFF -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32767, %eax ## imm = 0x7FFF +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: 
vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2065,17 +2159,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorb $127, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorb $127, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2121,26 +2230,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorb $127, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1: ## %bb.0: +; 
X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorb $127, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2205,17 +2332,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorb $127, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorb $127, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2261,26 +2403,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; 
X64-AVX1OR2-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorb $127, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorb $127, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorb $127, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -483,8 +483,8 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -557,8 +557,8 @@ ; X64-AVX2-LABEL: test_reduce_v4i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -569,7 +569,7 @@ ; 
X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -633,9 +633,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -689,9 +689,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -701,9 +701,9 @@ ; X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -762,11 +762,14 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -810,11 +813,14 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -822,11 +828,14 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; 
X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -914,13 +923,16 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: addb $-128, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -992,13 +1004,16 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: addb $-128, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1006,13 +1021,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1151,8 +1169,8 @@ ; 
X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1268,8 +1286,8 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 ; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1282,7 +1300,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1365,9 +1383,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1437,9 +1455,9 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1451,9 +1469,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1523,11 +1541,14 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 
-; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1579,11 +1600,14 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1591,13 +1615,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1704,13 +1731,16 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: addb $-128, %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1798,13 +1828,16 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; 
X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: addb $-128, %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1812,15 +1845,18 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1873,15 +1909,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1905,22 +1954,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; 
X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1960,15 +2025,28 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1992,22 +2070,38 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: xorl $32768, %eax ## imm = 0x8000 -; X64-AVX1OR2-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, 
%eax +; X64-AVX1-NEXT: xorl $32768, %eax ## imm = 0x8000 +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2069,17 +2163,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: addb $-128, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: addb $-128, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2125,26 +2234,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: addb $-128, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; 
X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: addb $-128, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2209,17 +2336,32 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: addb $-128, %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: addb $-128, %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: 
## %bb.0: @@ -2265,26 +2407,44 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX1OR2-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1OR2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX1OR2-NEXT: vmovd %xmm0, %eax -; X64-AVX1OR2-NEXT: addb $-128, %al -; X64-AVX1OR2-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX1OR2-NEXT: vzeroupper -; X64-AVX1OR2-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: addb $-128, %al +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: addb $-128, %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -545,11 +545,11 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -636,10 +636,10 @@ ; X64-AVX2: ## %bb.0: 
; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -652,7 +652,7 @@ ; X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -725,9 +725,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -790,9 +790,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -802,9 +802,9 @@ ; X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -869,12 +869,14 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notl %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -924,12 +926,14 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -937,11 +941,14 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1011,14 +1018,16 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notb %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1072,14 +1081,16 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1087,13 +1098,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; 
X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1256,10 +1270,10 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1401,10 +1415,10 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1419,7 +1433,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1517,9 +1531,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1604,9 +1618,9 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1618,9 +1632,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, 
%ymm1 ; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1698,12 +1712,14 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notl %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1763,12 +1779,14 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1776,13 +1794,16 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -1863,14 +1884,16 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: 
vpmaxub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: notb %al ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -1932,14 +1955,16 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -1947,15 +1972,18 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2012,16 +2040,29 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notl %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notl %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, 
%ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -2062,21 +2103,26 @@ ; ; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2120,16 +2166,29 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notl %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notl %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -2170,21 +2229,26 @@ ; ; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, 
%xmm1 +; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notl %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notl %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2231,18 +2295,33 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notb %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notb %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2288,25 +2367,30 @@ ; ; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq @@ -2356,18 +2440,33 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: notb %al -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: notb %al +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2413,25 +2512,30 @@ ; ; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: notb %al ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, 
%zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax -; X64-AVX512-NEXT: notb %al ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -489,11 +489,11 @@ ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -582,10 +582,10 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -598,7 +598,7 @@ ; X64-AVX512-LABEL: test_reduce_v4i64: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -671,9 +671,9 @@ ; X86-AVX2-LABEL: test_reduce_v8i32: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -736,9 +736,9 @@ ; X64-AVX2-LABEL: test_reduce_v8i32: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -748,9 +748,9 @@ ; 
X64-AVX512-LABEL: test_reduce_v8i32: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -813,8 +813,13 @@ ; X86-AVX2-LABEL: test_reduce_v16i16: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -863,8 +868,13 @@ ; X64-AVX2-LABEL: test_reduce_v16i16: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -873,8 +883,13 @@ ; X64-AVX512-LABEL: test_reduce_v16i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -939,10 +954,15 @@ ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -991,10 +1011,15 @@ ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -1003,10 +1028,15 @@ ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -1172,10 +1202,10 @@ ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1319,10 +1349,10 @@ ; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 +; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 @@ -1337,7 +1367,7 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovq %xmm0, %rax @@ -1435,9 +1465,9 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1522,9 +1552,9 @@ ; X64-AVX2: ## %bb.0: ; 
X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax @@ -1536,9 +1566,9 @@ ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax @@ -1616,8 +1646,13 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -1678,8 +1713,13 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -1688,10 +1728,15 @@ ; X64-AVX512-LABEL: test_reduce_v32i16: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -1767,10 +1812,15 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; 
X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X86-AVX2-NEXT: vzeroupper @@ -1827,10 +1877,15 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX2-NEXT: vzeroupper @@ -1839,12 +1894,17 @@ ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vmovd %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax ; X64-AVX512-NEXT: vzeroupper @@ -1902,13 +1962,26 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v16i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -1936,13 +2009,39 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v16i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; 
X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v16i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> %2 = icmp ult <16 x i16> %a0, %1 %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1 @@ -1983,13 +2082,26 @@ ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i16_v8i16: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X64-SSE2: ## %bb.0: @@ -2017,13 +2129,39 @@ ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i16_v8i16: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $ax killed $ax killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: 
vpminuw %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i16_v8i16: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $ax killed $ax killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> %2 = icmp ult <32 x i16> %a0, %1 %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1 @@ -2064,15 +2202,30 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v32i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2100,15 +2253,45 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v32i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; 
X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v32i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> %2 = icmp ult <32 x i8> %a0, %1 %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1 @@ -2152,15 +2335,30 @@ ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl ; -; X86-AVX-LABEL: test_reduce_v64i8_v16i8: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd %xmm0, %eax -; X86-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X86-AVX-NEXT: vzeroupper -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %xmm0, %eax +; X86-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovd %xmm0, %eax +; X86-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X64-SSE2: ## %bb.0: @@ -2188,15 +2386,45 @@ ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq ; -; X64-AVX-LABEL: test_reduce_v64i8_v16i8: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd %xmm0, %eax -; X64-AVX-NEXT: ## kill: def $al killed $al killed $eax -; X64-AVX-NEXT: vzeroupper -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %xmm0, %eax +; X64-AVX1-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrld $16, %xmm0, 
%xmm1 +; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovd %xmm0, %eax +; X64-AVX2-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: test_reduce_v64i8_v16i8: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovd %xmm0, %eax +; X64-AVX512-NEXT: ## kill: def $al killed $al killed $eax +; X64-AVX512-NEXT: vzeroupper +; X64-AVX512-NEXT: retq %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> %2 = icmp ult <64 x i8> %a0, %1 %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1 diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -20,9 +20,16 @@ ; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm0 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm2 +; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,1] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm3 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32: @@ -32,17 +39,17 @@ ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0 ; SSSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: pair_sum_v4f32_v4f32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] -; AVX-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 -; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1] +; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 +; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: pair_sum_v4f32_v4f32: ; AVX-FAST: # %bb.0: @@ -50,6 +57,25 @@ ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vhaddps 
%xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX2-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX2-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; AVX2-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> %7 = fadd <2 x float> %5, %6 @@ -82,13 +108,19 @@ ; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] +; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm2 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm3 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm0 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32: @@ -103,15 +135,15 @@ ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] +; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm2 +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: pair_sum_v4i32_v4i32: @@ -123,18 +155,18 @@ ; ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1] ; 
AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> @@ -173,24 +205,22 @@ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5 -; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; SSSE3-SLOW-NEXT: movaps %xmm5, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm2 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6 -; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1] -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm4 +; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm4 +; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm1 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0 -; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2 +; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm5, %xmm4 -; SSSE3-FAST-NEXT: haddps %xmm4, %xmm2 -; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6 ; SSSE3-FAST-NEXT: haddps %xmm6, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm4, %xmm1 @@ -352,16 +382,12 @@ ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2 +; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0 ; SSSE3-FAST-NEXT: phaddd %xmm5, %xmm4 -; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm2 -; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm6 -; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm7 ; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm6 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2] -; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1 +; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm4 +; SSSE3-FAST-NEXT: movdqa %xmm4, %xmm1 ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32: @@ -425,8 +451,10 @@ ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = 
xmm2[1,3],xmm1[1,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -448,8 +476,10 @@ ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] @@ -524,7 +554,7 @@ ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[3,2] ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm0[0,1] @@ -550,7 +580,7 @@ ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 ; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[3,2] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2 @@ -638,20 +668,23 @@ ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm4 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-SLOW-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSSE3-SLOW-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm0 +; SSSE3-SLOW-NEXT: pshufd 
{{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm3 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: sequential_sum_v4i32_v4i32: @@ -660,19 +693,22 @@ ; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm4 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSSE3-FAST-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-FAST-NEXT: paddd %xmm0, %xmm4 ; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1 +; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3] +; SSSE3-FAST-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] +; SSSE3-FAST-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1 -; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm5 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-FAST-NEXT: paddd %xmm5, %xmm6 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0] -; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0 +; SSSE3-FAST-NEXT: paddd %xmm4, %xmm1 +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm3 +; SSSE3-FAST-NEXT: paddd %xmm3, %xmm0 +; SSSE3-FAST-NEXT: paddd %xmm0, %xmm2 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] +; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32: @@ -948,24 +984,25 @@ ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] ; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: addss %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-SLOW-NEXT: addss %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; 
SSSE3-SLOW-NEXT: addss %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm3 +; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -983,27 +1020,31 @@ ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; SSSE3-FAST-NEXT: addps %xmm3, %xmm2 -; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-FAST-NEXT: haddps %xmm1, %xmm2 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,0] ; SSSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1] -; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0] -; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -1015,10 +1056,12 @@ ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0] -; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] +; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0] +; AVX-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1 +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-FAST-NEXT: retq %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0) %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1) @@ 
-1035,24 +1078,23 @@ ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32: @@ -1066,69 +1108,73 @@ ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] ; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2 -; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm2 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,0] ; SSSE3-FAST-NEXT: retq ; -; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-SLOW-NEXT: retq 
+; AVX-SLOW-LABEL: reduction_sum_v4i32_v4i32: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %eax +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %ecx +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vmovd %xmm1, %edx +; AVX-SLOW-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1 -; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: reduction_sum_v4i32_v4i32: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX1-FAST-NEXT: vmovd %xmm2, %eax +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,1] +; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; AVX1-FAST-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq ; -; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm1 -; 
AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: retq +; AVX2-FAST-LABEL: reduction_sum_v4i32_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovd %xmm2, %eax +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-FAST-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0) %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1) %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2) diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll --- a/llvm/test/CodeGen/X86/i128-add.ll +++ b/llvm/test/CodeGen/X86/i128-add.ll @@ -74,13 +74,9 @@ ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: addq %rdx, %rax ; X64-NEXT: adcq %rcx, %rsi -; X64-NEXT: movq %rax, %xmm0 -; X64-NEXT: movq %rsi, %xmm1 -; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; X64-NEXT: movq %xmm0, %rdx ; X64-NEXT: addq $1, %rax -; X64-NEXT: adcq $0, %rdx +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %rsi, %rdx ; X64-NEXT: retq %t0 = add <1 x i128> %x, %t1 = add <1 x i128> %y, %t0 diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -323,31 +323,32 @@ ; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] ; X64-SSE-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE-NEXT: pxor %xmm1, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X64-SSE-NEXT: pcmpeqd %xmm4, %xmm4 -; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm4 -; X64-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; X64-SSE-NEXT: pand %xmm4, %xmm3 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067713,18446744071562067713] +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE-NEXT: pcmpeqd %xmm3, %xmm2 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X64-SSE-NEXT: por %xmm3, %xmm2 -; X64-SSE-NEXT: pand %xmm2, %xmm0 -; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE-NEXT: por %xmm0, %xmm2 -; X64-SSE-NEXT: pxor %xmm2, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pxor %xmm3, %xmm3 -; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm3 +; X64-SSE-NEXT: pand %xmm5, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X64-SSE-NEXT: por %xmm2, %xmm3 +; X64-SSE-NEXT: pand %xmm3, %xmm0 +; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; X64-SSE-NEXT: por %xmm0, %xmm3 +; X64-SSE-NEXT: pxor %xmm3, %xmm1 ; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903] -; 
X64-SSE-NEXT: pcmpgtd %xmm1, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] -; X64-SSE-NEXT: pand %xmm3, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-SSE-NEXT: por %xmm1, %xmm0 -; X64-SSE-NEXT: pand %xmm0, %xmm2 -; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: por %xmm2, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] +; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pand %xmm4, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X64-SSE-NEXT: por %xmm0, %xmm1 +; X64-SSE-NEXT: pand %xmm1, %xmm3 +; X64-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE-NEXT: por %xmm3, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; X64-SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll --- a/llvm/test/CodeGen/X86/iabs.ll +++ b/llvm/test/CodeGen/X86/iabs.ll @@ -39,7 +39,7 @@ ; X86-NO-CMOV: # %bb.0: ; X86-NO-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NO-CMOV-NEXT: movswl %ax, %ecx -; X86-NO-CMOV-NEXT: sarl $15, %ecx +; X86-NO-CMOV-NEXT: shrl $15, %ecx ; X86-NO-CMOV-NEXT: xorl %ecx, %eax ; X86-NO-CMOV-NEXT: subl %ecx, %eax ; X86-NO-CMOV-NEXT: # kill: def $ax killed $ax killed $eax diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll --- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll @@ -513,11 +513,13 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -572,9 +574,16 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] ; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: eq_or_to_abs_vec4x64_sext: @@ -645,12 +654,15 @@ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] ; AVX2-NEXT: vpcmpeqq %ymm3, %ymm0, %ymm0 ; 
AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -713,10 +725,18 @@ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] ; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] ; AVX2-NEXT: vpcmpeqq %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: ne_and_to_abs_vec4x64_sext: @@ -1002,6 +1022,22 @@ ; AVX2-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: negb %al +; AVX2-NEXT: vpextrb $0, %xmm0, %ecx +; AVX2-NEXT: andb $1, %cl +; AVX2-NEXT: negb %cl +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: negb %al +; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: negb %al +; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: eq_or_to_abs_vec4x8_sext: @@ -1010,6 +1046,27 @@ ; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: andb $1, %al +; SSE41-NEXT: negb %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pextrb $0, %xmm0, %ecx +; SSE41-NEXT: andb $1, %cl +; SSE41-NEXT: negb %cl +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrb $1, %eax, %xmm1 +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: andb $1, %al +; SSE41-NEXT: negb %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm1 +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: andb $1, %al +; SSE41-NEXT: negb %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: eq_or_to_abs_vec4x8_sext: @@ -1018,6 +1075,19 @@ ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: shll $8, %ecx +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: orl %ecx, %edx +; SSE2-NEXT: shll $8, %eax +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: orl %eax, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: pinsrw $1, %edx, %xmm0 ; SSE2-NEXT: retq 
%cmp1 = icmp eq <4 x i8> %x, %cmp2 = icmp eq <4 x i8> %x, @@ -1114,6 +1184,22 @@ ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpextrw $1, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vpextrw $0, %xmm0, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: negl %ecx +; AVX2-NEXT: vmovd %ecx, %xmm1 +; AVX2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpextrw $2, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpextrw $3, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negl %eax +; AVX2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: ne_and_to_abs_vec4x16_sext: @@ -1124,7 +1210,22 @@ ; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 ; SSE41-NEXT: pandn %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pextrw $1, %xmm1, %eax +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: negl %eax +; SSE41-NEXT: pextrw $0, %xmm1, %ecx +; SSE41-NEXT: andl $1, %ecx +; SSE41-NEXT: negl %ecx +; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: pinsrw $1, %eax, %xmm0 +; SSE41-NEXT: pextrw $2, %xmm1, %eax +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: negl %eax +; SSE41-NEXT: pinsrw $2, %eax, %xmm0 +; SSE41-NEXT: pextrw $3, %xmm1, %eax +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: negl %eax +; SSE41-NEXT: pinsrw $3, %eax, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: ne_and_to_abs_vec4x16_sext: @@ -1135,7 +1236,22 @@ ; SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pextrw $1, %xmm1, %eax +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pextrw $0, %xmm1, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: negl %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: pinsrw $1, %eax, %xmm0 +; SSE2-NEXT: pextrw $2, %xmm1, %eax +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pinsrw $2, %eax, %xmm0 +; SSE2-NEXT: pextrw $3, %xmm1, %eax +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: negl %eax +; SSE2-NEXT: pinsrw $3, %eax, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp ne <4 x i16> %x, %cmp2 = icmp ne <4 x i16> %x, diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll --- a/llvm/test/CodeGen/X86/icmp-abs-C.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll @@ -163,7 +163,7 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax +; X86-NEXT: shrl $15, %eax ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll --- a/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-logic-npow2.ll @@ -198,7 +198,7 @@ ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movswl %ax, %ecx -; X86-NEXT: sarl $15, %ecx +; X86-NEXT: shrl $15, %ecx ; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movzwl %ax, %eax diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -13,30 +13,34 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB0_1: # %loop ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: addl $1, %edi +; X86-NEXT: addl $1, %ecx ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: orl %edx, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: shrdl $28, %ebx, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: shldl $4, %edx, %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: shldl $4, %esi, %ebp +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: shrl $28, %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: jne .LBB0_1 ; X86-NEXT: # %bb.2: # %exit -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -52,9 +56,11 @@ ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: addq $1, %rax ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $60, %rcx -; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: shldq $4, %rax, %rcx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: shrq $60, %rsi +; X64-NEXT: orq %rcx, %rsi ; X64-NEXT: jne .LBB0_1 ; X64-NEXT: # %bb.2: # %exit ; X64-NEXT: retq @@ -73,21 +79,27 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: shldl $15, %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl $15, %ecx, %esi +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: shrl $17, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrdl $17, %ecx, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: sete %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_eq_zero: ; X64: # %bb.0: -; X64-NEXT: shrq $17, %rdi -; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: shrdq $17, %rsi, %rdi +; X64-NEXT: shrq $17, %rsi +; X64-NEXT: orq %rdi, %rsi ; X64-NEXT: sete %al ; X64-NEXT: retq %srl = lshr i128 %a, 17 @@ -98,21 +110,27 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: shldl $15, %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: shldl $15, %ecx, %esi +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: shrl $17, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrdl $17, %ecx, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al +; X86-NEXT: popl %esi ; 
X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_ne_zero: ; X64: # %bb.0: -; X64-NEXT: shrq $17, %rdi -; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: shrdq $17, %rsi, %rdi +; X64-NEXT: shrq $17, %rsi +; X64-NEXT: orq %rdi, %rsi ; X64-NEXT: setne %al ; X64-NEXT: retq %srl = lshr i128 %a, 17 @@ -123,19 +141,27 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: shldl $17, %eax, %edx +; X86-NEXT: shll $17, %esi +; X86-NEXT: orl %edx, %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: sete %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_eq_zero: ; X64: # %bb.0: -; X64-NEXT: shlq $17, %rsi -; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: shldq $17, %rdi, %rsi +; X64-NEXT: shlq $17, %rdi +; X64-NEXT: orq %rsi, %rdi ; X64-NEXT: sete %al ; X64-NEXT: retq %shl = shl i128 %a, 17 @@ -146,19 +172,27 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: shldl $17, %eax, %edx +; X86-NEXT: shll $17, %esi +; X86-NEXT: orl %edx, %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %ecx, %eax +; X86-NEXT: shldl $17, %ecx, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: setne %al +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_ne_zero: ; X64: # %bb.0: -; X64-NEXT: shlq $17, %rsi -; X64-NEXT: orq %rdi, %rsi +; X64-NEXT: shldq $17, %rdi, %rsi +; X64-NEXT: shlq $17, %rdi +; X64-NEXT: orq %rsi, %rdi ; X64-NEXT: setne %al ; X64-NEXT: retq %shl = shl i128 %a, 17 @@ -233,8 +267,9 @@ ; ; X64-LABEL: opt_setcc_expanded_shl_correct_shifts: ; X64: # %bb.0: -; X64-NEXT: shlq $17, %rdi -; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: shldq $17, %rsi, %rdi +; X64-NEXT: shlq $17, %rsi +; X64-NEXT: orq %rdi, %rsi ; X64-NEXT: sete %al ; X64-NEXT: retq %shl.a = shl i64 %a, 17 diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll --- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -108,7 +108,15 @@ ; ; X64-LABEL: i56_or: ; X64: # %bb.0: -; X64-NEXT: orl $384, (%rdi) # imm = 0x180 +; X64-NEXT: movzwl 4(%rdi), %eax +; X64-NEXT: movzbl 6(%rdi), %ecx +; X64-NEXT: shll $16, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: shlq $32, %rcx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: orq %rcx, %rax +; X64-NEXT: orq $384, %rax # imm = 0x180 +; X64-NEXT: movl %eax, (%rdi) ; X64-NEXT: retq %aa = load i56, ptr %a, align 1 %b = or i56 %aa, 384 @@ -163,19 +171,20 @@ ; ; X64-LABEL: i56_insert_bit: ; X64: # %bb.0: -; X64-NEXT: movzwl 4(%rdi), %eax -; X64-NEXT: movzbl 6(%rdi), %ecx -; X64-NEXT: shll $16, %ecx -; X64-NEXT: orl %eax, %ecx -; X64-NEXT: shlq $32, %rcx -; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movzwl 4(%rdi), %ecx 
+; X64-NEXT: movzbl 6(%rdi), %edx +; X64-NEXT: shll $16, %edx +; X64-NEXT: orl %ecx, %edx +; X64-NEXT: shlq $32, %rdx +; X64-NEXT: movl (%rdi), %ecx +; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: shlq $13, %rax +; X64-NEXT: andq $-8193, %rcx # imm = 0xDFFF ; X64-NEXT: orq %rcx, %rax -; X64-NEXT: shll $13, %esi -; X64-NEXT: andq $-8193, %rax # imm = 0xDFFF -; X64-NEXT: orl %eax, %esi -; X64-NEXT: shrq $32, %rax -; X64-NEXT: movw %ax, 4(%rdi) -; X64-NEXT: movl %esi, (%rdi) +; X64-NEXT: shrq $32, %rcx +; X64-NEXT: movw %cx, 4(%rdi) +; X64-NEXT: movl %eax, (%rdi) ; X64-NEXT: retq %extbit = zext i1 %bit to i56 %b = load i56, ptr %a, align 1 diff --git a/llvm/test/CodeGen/X86/insertelement-duplicates.ll b/llvm/test/CodeGen/X86/insertelement-duplicates.ll --- a/llvm/test/CodeGen/X86/insertelement-duplicates.ll +++ b/llvm/test/CodeGen/X86/insertelement-duplicates.ll @@ -31,18 +31,18 @@ ; AVX-32: # %bb.0: # %L.entry ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0 -; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-32-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm1 +; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] ; AVX-32-NEXT: vmovups %ymm0, 608(%eax) ; AVX-32-NEXT: vzeroupper ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: PR15298: ; AVX-64: # %bb.0: # %L.entry -; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0 -; AVX-64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm1 +; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] ; AVX-64-NEXT: vmovups %ymm0, 608(%rsi) ; AVX-64-NEXT: vzeroupper ; AVX-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -2270,14 +2270,14 @@ ; SSE: # %bb.0: ; SSE-NEXT: movl (%rdi), %eax ; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; SSE-NEXT: movdqa %xmm0, 96(%rdi) ; SSE-NEXT: movdqa %xmm0, 112(%rdi) -; SSE-NEXT: movdqa %xmm0, 64(%rdi) +; SSE-NEXT: movdqa %xmm0, 96(%rdi) ; SSE-NEXT: movdqa %xmm0, 80(%rdi) -; SSE-NEXT: movdqa %xmm0, 32(%rdi) +; SSE-NEXT: movdqa %xmm0, 64(%rdi) ; SSE-NEXT: movdqa %xmm0, 48(%rdi) -; SSE-NEXT: movdqa %xmm0, (%rdi) +; SSE-NEXT: movdqa %xmm0, 32(%rdi) ; SSE-NEXT: movdqa %xmm0, 16(%rdi) +; SSE-NEXT: movdqa %xmm0, (%rdi) ; SSE-NEXT: leal 2147483647(%rax), %ecx ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: cmovnsl %eax, %ecx @@ -2293,8 +2293,8 @@ ; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 ; AVX1-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) ; AVX1-NEXT: vmovaps %ymm0, 96(%rdi) +; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) ; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) ; AVX1-NEXT: movl (%rdi), %eax ; AVX1-NEXT: vmovaps %ymm1, (%rdi) @@ -2314,8 +2314,8 @@ ; AVX2-NEXT: vpbroadcastq (%rdi), %ymm0 ; AVX2-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovdqa %ymm0, 64(%rdi) ; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi) +; AVX2-NEXT: vmovdqa %ymm0, 64(%rdi) ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi) ; AVX2-NEXT: movl (%rdi), %eax ; AVX2-NEXT: vmovdqa %ymm1, (%rdi) @@ -2357,8 +2357,8 @@ 
; X86AVX2-NEXT: vbroadcastsd (%ecx), %ymm0 ; X86AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] ; X86AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; X86AVX2-NEXT: vmovaps %ymm0, 64(%ecx) ; X86AVX2-NEXT: vmovaps %ymm0, 96(%ecx) +; X86AVX2-NEXT: vmovaps %ymm0, 64(%ecx) ; X86AVX2-NEXT: vmovaps %ymm0, 32(%ecx) ; X86AVX2-NEXT: movl (%ecx), %eax ; X86AVX2-NEXT: vmovaps %ymm1, (%ecx) diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll --- a/llvm/test/CodeGen/X86/insertelement-zero.ll +++ b/llvm/test/CodeGen/X86/insertelement-zero.ll @@ -337,19 +337,22 @@ ; SSE2-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSE2: # %bb.0: ; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: pinsrw $7, %eax, %xmm1 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSE3: # %bb.0: ; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: xorl %eax, %eax +; SSE3-NEXT: pinsrw $7, %eax, %xmm1 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSSE3: # %bb.0: ; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: pinsrw $7, %eax, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v16i16_z12345z789ABCDEz: @@ -359,10 +362,10 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v16i16_z12345z789ABCDEz: -; AVX: # %bb.0: -; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_v16i16_z12345z789ABCDEz: +; AVX1: # %bb.0: +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq %1 = insertelement <16 x i16> %a, i16 0, i32 0 %2 = insertelement <16 x i16> %1, i16 0, i32 6 %3 = insertelement <16 x i16> %2, i16 0, i32 15 diff --git a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll --- a/llvm/test/CodeGen/X86/is_fpclass-fp80.ll +++ b/llvm/test/CodeGen/X86/is_fpclass-fp80.ll @@ -253,7 +253,7 @@ define i1 @is_inf_f80(x86_fp80 %x) { ; CHECK-32-LABEL: is_inf_f80: ; CHECK-32: # %bb.0: # %entry -; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; CHECK-32-NEXT: notl %eax ; CHECK-32-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; CHECK-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx @@ -265,7 +265,7 @@ ; ; CHECK-64-LABEL: is_inf_f80: ; CHECK-64: # %bb.0: # %entry -; CHECK-64-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; CHECK-64-NEXT: notl %eax ; CHECK-64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-64-NEXT: xorq {{[0-9]+}}(%rsp), %rcx @@ -308,9 +308,9 @@ ; CHECK-32-LABEL: is_neginf_f80: ; CHECK-32: # %bb.0: # %entry ; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: xorl $65535, %eax # imm = 0xFFFF ; CHECK-32-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; CHECK-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: xorl $65535, %eax # imm = 0xFFFF ; CHECK-32-NEXT: orl {{[0-9]+}}(%esp), %eax ; CHECK-32-NEXT: orl %ecx, %eax ; CHECK-32-NEXT: sete %al @@ -319,10 +319,10 @@ ; CHECK-64-LABEL: is_neginf_f80: ; CHECK-64: # %bb.0: # %entry ; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-64-NEXT: xorq $65535, %rax # imm = 0xFFFF ; CHECK-64-NEXT: movabsq 
$-9223372036854775808, %rcx # imm = 0x8000000000000000 ; CHECK-64-NEXT: xorq {{[0-9]+}}(%rsp), %rcx -; CHECK-64-NEXT: orq %rax, %rcx +; CHECK-64-NEXT: xorq $65535, %rax # imm = 0xFFFF +; CHECK-64-NEXT: orq %rcx, %rax ; CHECK-64-NEXT: sete %al ; CHECK-64-NEXT: retq entry: @@ -370,22 +370,22 @@ ; CHECK-32-NEXT: pushl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-32-NEXT: .cfi_offset %esi, -8 -; CHECK-32-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; CHECK-32-NEXT: movswl %dx, %ecx -; CHECK-32-NEXT: sarl $15, %ecx +; CHECK-32-NEXT: movswl {{[0-9]+}}(%esp), %ecx +; CHECK-32-NEXT: movl %ecx, %edx +; CHECK-32-NEXT: sarl $31, %edx ; CHECK-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-32-NEXT: andl $32767, %edx # imm = 0x7FFF -; CHECK-32-NEXT: decl %edx -; CHECK-32-NEXT: movzwl %dx, %edx +; CHECK-32-NEXT: andl $32767, %ecx # imm = 0x7FFF +; CHECK-32-NEXT: decl %ecx +; CHECK-32-NEXT: movzwl %cx, %ecx ; CHECK-32-NEXT: xorl %esi, %esi -; CHECK-32-NEXT: cmpl $32766, %edx # imm = 0x7FFE +; CHECK-32-NEXT: cmpl $32766, %ecx # imm = 0x7FFE ; CHECK-32-NEXT: sbbl %esi, %esi -; CHECK-32-NEXT: setb %dl -; CHECK-32-NEXT: testl %ecx, %ecx -; CHECK-32-NEXT: setns %cl +; CHECK-32-NEXT: setb %cl +; CHECK-32-NEXT: testl %edx, %edx +; CHECK-32-NEXT: setns %dl ; CHECK-32-NEXT: shrl $31, %eax -; CHECK-32-NEXT: andb %cl, %al ; CHECK-32-NEXT: andb %dl, %al +; CHECK-32-NEXT: andb %cl, %al ; CHECK-32-NEXT: # kill: def $al killed $al killed $eax ; CHECK-32-NEXT: popl %esi ; CHECK-32-NEXT: .cfi_def_cfa_offset 4 @@ -441,9 +441,10 @@ ; ; CHECK-64-LABEL: is_negnormal_f80: ; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; CHECK-64-NEXT: movswq %cx, %rdx ; CHECK-64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-64-NEXT: movswq {{[0-9]+}}(%rsp), %rcx -; CHECK-64-NEXT: testq %rcx, %rcx +; CHECK-64-NEXT: testq %rdx, %rdx ; CHECK-64-NEXT: sets %dl ; CHECK-64-NEXT: andl $32767, %ecx # imm = 0x7FFF ; CHECK-64-NEXT: decl %ecx diff --git a/llvm/test/CodeGen/X86/ispow2.ll b/llvm/test/CodeGen/X86/ispow2.ll --- a/llvm/test/CodeGen/X86/ispow2.ll +++ b/llvm/test/CodeGen/X86/ispow2.ll @@ -78,22 +78,22 @@ ; CHECK-NOBMI-LABEL: is_pow2_non_zero_4xv64: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm2 = [256,256] -; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: por %xmm2, %xmm1 +; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 -; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3 +; CHECK-NOBMI-NEXT: pand %xmm3, %xmm0 +; CHECK-NOBMI-NEXT: paddq %xmm1, %xmm2 +; CHECK-NOBMI-NEXT: pand %xmm1, %xmm2 ; CHECK-NOBMI-NEXT: pxor %xmm1, %xmm1 -; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm3 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm3, %xmm4 -; CHECK-NOBMI-NEXT: paddq %xmm0, %xmm2 -; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0 +; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm2 +; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; CHECK-NOBMI-NEXT: pand %xmm2, %xmm3 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; CHECK-NOBMI-NEXT: pand %xmm1, %xmm0 -; CHECK-NOBMI-NEXT: packssdw %xmm4, %xmm0 +; CHECK-NOBMI-NEXT: packssdw %xmm3, %xmm0 ; CHECK-NOBMI-NEXT: retq ; ; CHECK-AVX2-LABEL: is_pow2_non_zero_4xv64: @@ -129,9 +129,12 @@ ; CHECK-NOBMI-LABEL: neither_pow2_non_zero_4xv64: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: movdqa {{.*#+}} xmm2 = [256,256] -; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; 
CHECK-NOBMI-NEXT: por %xmm2, %xmm1 +; CHECK-NOBMI-NEXT: por %xmm2, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 +; CHECK-NOBMI-NEXT: pand %xmm3, %xmm0 ; CHECK-NOBMI-NEXT: movdqa %xmm1, %xmm3 ; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 ; CHECK-NOBMI-NEXT: pand %xmm1, %xmm3 @@ -140,9 +143,6 @@ ; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2] ; CHECK-NOBMI-NEXT: pand %xmm3, %xmm4 ; CHECK-NOBMI-NEXT: pxor %xmm2, %xmm4 -; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NOBMI-NEXT: paddq %xmm2, %xmm3 -; CHECK-NOBMI-NEXT: pand %xmm3, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; CHECK-NOBMI-NEXT: pand %xmm1, %xmm0 @@ -189,40 +189,39 @@ ; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm4 ; CHECK-NOBMI-NEXT: pand %xmm1, %xmm4 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm1 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm1, %xmm5 -; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm4 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm4, %xmm1 -; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm1 -; CHECK-NOBMI-NEXT: por %xmm5, %xmm1 -; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm5 +; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm5 +; CHECK-NOBMI-NEXT: movdqa %xmm5, %xmm6 +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm1[1,3] +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] +; CHECK-NOBMI-NEXT: andps %xmm6, %xmm5 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm4 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm4, %xmm5 -; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm4 -; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm4 -; CHECK-NOBMI-NEXT: pand %xmm4, %xmm0 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NOBMI-NEXT: paddq %xmm3, %xmm1 +; CHECK-NOBMI-NEXT: pand %xmm1, %xmm0 ; CHECK-NOBMI-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-NOBMI-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2] -; CHECK-NOBMI-NEXT: pand %xmm2, %xmm0 -; CHECK-NOBMI-NEXT: pxor %xmm3, %xmm0 -; CHECK-NOBMI-NEXT: por %xmm5, %xmm0 -; CHECK-NOBMI-NEXT: packssdw %xmm1, %xmm0 +; CHECK-NOBMI-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3] +; CHECK-NOBMI-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; CHECK-NOBMI-NEXT: andps %xmm1, %xmm0 +; CHECK-NOBMI-NEXT: xorps %xmm3, %xmm0 +; CHECK-NOBMI-NEXT: orps %xmm5, %xmm0 ; CHECK-NOBMI-NEXT: retq ; ; CHECK-AVX2-LABEL: neither_pow2_non_zero_4xv64_x_maybe_z: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2 +; CHECK-AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; CHECK-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 ; CHECK-AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; CHECK-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm4 ; CHECK-AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; CHECK-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; CHECK-AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 -; CHECK-AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; CHECK-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; CHECK-AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 ; CHECK-AVX2-NEXT: vzeroupper ; CHECK-AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll --- a/llvm/test/CodeGen/X86/jump_sign.ll +++ b/llvm/test/CodeGen/X86/jump_sign.ll @@ -228,11 +228,13 @@ ; CHECK-NEXT: jne .LBB12_8 ; CHECK-NEXT: # %bb.4: # %if.end29 ; CHECK-NEXT: movzwl (%eax), %eax -; CHECK-NEXT: imull 
$-13107, %eax, %eax # imm = 0xCCCD -; CHECK-NEXT: rorw %ax ; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: cmpl $6554, %eax # imm = 0x199A -; CHECK-NEXT: jae .LBB12_5 +; CHECK-NEXT: imull $52429, %eax, %ecx # imm = 0xCCCD +; CHECK-NEXT: shrl $18, %ecx +; CHECK-NEXT: andl $-2, %ecx +; CHECK-NEXT: leal (%ecx,%ecx,4), %ecx +; CHECK-NEXT: cmpw %cx, %ax +; CHECK-NEXT: jne .LBB12_5 ; CHECK-NEXT: .LBB12_8: # %if.then44 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al @@ -388,11 +390,10 @@ ; CHECK-LABEL: func_test1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl b, %eax -; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: setb %cl ; CHECK-NEXT: movl a, %eax -; CHECK-NEXT: testl %eax, %ecx +; CHECK-NEXT: testb %al, %cl ; CHECK-NEXT: je .LBB18_2 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: decl %eax diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll --- a/llvm/test/CodeGen/X86/known-bits-vector.ll +++ b/llvm/test/CodeGen/X86/known-bits-vector.ll @@ -349,12 +349,26 @@ define <4 x i32> @knownbits_mask_srem_shuffle_lshr(<4 x i32> %a0) nounwind { ; X86-LABEL: knownbits_mask_srem_shuffle_lshr: ; X86: # %bb.0: -; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; X86-NEXT: vpsrad $31, %xmm0, %xmm0 +; X86-NEXT: vpsrld $28, %xmm0, %xmm0 +; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3] +; X86-NEXT: vpsrld $22, %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: knownbits_mask_srem_shuffle_lshr: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; X64-NEXT: vpsrad $31, %xmm0, %xmm0 +; X64-NEXT: vpsrld $28, %xmm0, %xmm0 +; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3] +; X64-NEXT: vpsrld $22, %xmm0, %xmm0 ; X64-NEXT: retq %1 = and <4 x i32> %a0, %2 = srem <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -157,8 +157,9 @@ ; ; X64-LABEL: signbits_ashr_extract_sitofp_0: ; X64: # %bb.0: -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = extractelement <2 x i64> %1, i32 0 @@ -179,8 +180,9 @@ ; ; X64-LABEL: signbits_ashr_extract_sitofp_1: ; X64: # %bb.0: -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: shrq $32, %rax +; X64-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = extractelement <2 x i64> %1, i32 0 @@ -203,10 +205,10 @@ ; ; X64-LABEL: signbits_ashr_shl_extract_sitofp: ; X64: # %bb.0: -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-NEXT: vpsrad $29, %xmm0, %xmm0 -; X64-NEXT: vpsllq $20, %xmm0, %xmm0 -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: vmovq %xmm0, %rax +; X64-NEXT: sarq $61, %rax +; X64-NEXT: shll $20, %eax +; X64-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, %2 = shl <2 x i64> %1, @@ -220,9 +222,9 @@ ; 
X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrdl $30, %ecx, %eax ; X86-NEXT: sarl $30, %ecx -; X86-NEXT: shll $2, %eax ; X86-NEXT: vmovd %eax, %xmm0 ; X86-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X86-NEXT: vpsrlq $3, %xmm0, %xmm0 @@ -235,9 +237,8 @@ ; X64-LABEL: signbits_ashr_insert_ashr_extract_sitofp: ; X64: # %bb.0: ; X64-NEXT: sarq $30, %rdi -; X64-NEXT: vmovq %rdi, %xmm0 -; X64-NEXT: vpsrlq $3, %xmm0, %xmm0 -; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: shrq $3, %rdi +; X64-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 ; X64-NEXT: retq %1 = ashr i64 %a0, 30 %2 = insertelement <2 x i64> undef, i64 %1, i32 0 @@ -352,7 +353,8 @@ ; X64: # %bb.0: ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-NEXT: vpsrad $29, %xmm0, %xmm0 -; X64-NEXT: vmovd %edi, %xmm1 +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: vmovq %rax, %xmm1 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq @@ -405,24 +407,24 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp -; X86-NEXT: vmovapd 8(%ebp), %xmm3 -; X86-NEXT: vpsrad $31, %xmm2, %xmm4 -; X86-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; X86-NEXT: vpsrad $1, %xmm5, %xmm5 -; X86-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; X86-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-NEXT: vpmovsxdq 8(%ebp), %xmm4 +; X86-NEXT: vpmovsxdq 16(%ebp), %xmm3 ; X86-NEXT: vpsrad $31, %xmm2, %xmm5 +; X86-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] +; X86-NEXT: vpsrad $1, %xmm6, %xmm6 +; X86-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] +; X86-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-NEXT: vpsrad $31, %xmm2, %xmm6 ; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-NEXT: vpsrad $1, %xmm2, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] -; X86-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3] +; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5],xmm6[6,7] ; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 +; X86-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm4 ; X86-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm5, %xmm0 -; X86-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; X86-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] diff --git a/llvm/test/CodeGen/X86/lea-recursion.ll b/llvm/test/CodeGen/X86/lea-recursion.ll --- a/llvm/test/CodeGen/X86/lea-recursion.ll +++ b/llvm/test/CodeGen/X86/lea-recursion.ll @@ -21,27 +21,27 @@ ; CHECK-NEXT: leal 1(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+4(%rip) ; CHECK-NEXT: movl g1+4(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rdx), %ecx +; CHECK-NEXT: leal (%rdx,%rax), %ecx ; CHECK-NEXT: leal 2(%rax,%rdx), %eax ; CHECK-NEXT: movl %eax, g0+8(%rip) ; CHECK-NEXT: movl g1+8(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rcx), %edx -; CHECK-NEXT: leal 2(%rax,%rcx), %eax +; CHECK-NEXT: leal (%rcx,%rax), %edx +; CHECK-NEXT: leal 3(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+12(%rip) ; CHECK-NEXT: movl g1+12(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rdx), %ecx -; CHECK-NEXT: leal 2(%rax,%rdx), %eax +; CHECK-NEXT: leal (%rdx,%rax), %ecx +; CHECK-NEXT: leal 
4(%rax,%rdx), %eax ; CHECK-NEXT: movl %eax, g0+16(%rip) ; CHECK-NEXT: movl g1+16(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rcx), %edx -; CHECK-NEXT: leal 2(%rax,%rcx), %eax +; CHECK-NEXT: leal (%rcx,%rax), %edx +; CHECK-NEXT: leal 5(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+20(%rip) ; CHECK-NEXT: movl g1+20(%rip), %eax -; CHECK-NEXT: leal 1(%rax,%rdx), %ecx -; CHECK-NEXT: leal 2(%rax,%rdx), %eax +; CHECK-NEXT: leal (%rdx,%rax), %ecx +; CHECK-NEXT: leal 6(%rax,%rdx), %eax ; CHECK-NEXT: movl %eax, g0+24(%rip) ; CHECK-NEXT: movl g1+24(%rip), %eax -; CHECK-NEXT: leal 2(%rax,%rcx), %eax +; CHECK-NEXT: leal 7(%rax,%rcx), %eax ; CHECK-NEXT: movl %eax, g0+28(%rip) ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/legalize-shift.ll b/llvm/test/CodeGen/X86/legalize-shift.ll --- a/llvm/test/CodeGen/X86/legalize-shift.ll +++ b/llvm/test/CodeGen/X86/legalize-shift.ll @@ -5,13 +5,17 @@ define void @PR36250() nounwind { ; X86-LABEL: PR36250: ; X86: # %bb.0: -; X86-NEXT: cmpl $0, (%eax) +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: leal (%eax,%eax), %ecx +; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete (%eax) ; X86-NEXT: retl ; ; X64-LABEL: PR36250: ; X64: # %bb.0: -; X64-NEXT: cmpq $0, (%rax) +; X64-NEXT: movq (%rax), %rax +; X64-NEXT: leaq (%rax,%rax), %rcx +; X64-NEXT: orq %rax, %rcx ; X64-NEXT: sete (%rax) ; X64-NEXT: retq %1 = load i448, ptr undef diff --git a/llvm/test/CodeGen/X86/lifetime-alias.ll b/llvm/test/CodeGen/X86/lifetime-alias.ll --- a/llvm/test/CodeGen/X86/lifetime-alias.ll +++ b/llvm/test/CodeGen/X86/lifetime-alias.ll @@ -28,10 +28,10 @@ ; CHECK: # %bb.0: # %_ZNSt3__312basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEED2Ev.exit50 ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97] -; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movabsq $7016996765293437281, %rax # imm = 0x6161616161616161 ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [97,97,97,97,97,97,97,97,97,97,97,97,97,97,97,97] +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw $5632, {{[0-9]+}}(%rsp) # imm = 0x1600 ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) @@ -44,13 +44,11 @@ ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movb $21, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movabsq $7308613581744070988, %rax # imm = 0x656D69547473614C -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movups .L.str.1(%rip), %xmm1 ; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: movabsq $7308613581744070988, %rax # imm = 0x656D69547473614C ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax @@ -61,6 +59,8 @@ ; CHECK-NEXT: movb %al, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax diff --git a/llvm/test/CodeGen/X86/load-chain.ll b/llvm/test/CodeGen/X86/load-chain.ll --- a/llvm/test/CodeGen/X86/load-chain.ll +++ b/llvm/test/CodeGen/X86/load-chain.ll @@ -11,9 +11,9 @@ ; 
CHECK-NEXT: pushq %rbx ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: movl $-32707, %ebp # imm = 0x803D -; CHECK-NEXT: andl (%rdi), %ebp +; CHECK-NEXT: movzwl (%rdi), %ebp ; CHECK-NEXT: callq maybe_mutate@PLT +; CHECK-NEXT: andl $32829, %ebp # imm = 0x803D ; CHECK-NEXT: orl $514, %ebp # imm = 0x202 ; CHECK-NEXT: movw %bp, (%rbx) ; CHECK-NEXT: addq $8, %rsp diff --git a/llvm/test/CodeGen/X86/load-combine.ll b/llvm/test/CodeGen/X86/load-combine.ll --- a/llvm/test/CodeGen/X86/load-combine.ll +++ b/llvm/test/CodeGen/X86/load-combine.ll @@ -894,7 +894,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 12(%eax,%ecx), %eax +; CHECK-NEXT: movl 12(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_base_offset_index: @@ -939,13 +939,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 13(%eax,%ecx), %eax +; CHECK-NEXT: movl 13(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_base_offset_index_2: ; CHECK64: # %bb.0: ; CHECK64-NEXT: movl %esi, %eax -; CHECK64-NEXT: movl 13(%rax,%rdi), %eax +; CHECK64-NEXT: movl 13(%rdi,%rax), %eax ; CHECK64-NEXT: retq %tmp = add nuw nsw i32 %i, 4 %tmp2 = add nuw nsw i32 %i, 3 @@ -995,7 +995,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 12(%eax,%ecx), %eax +; CHECK-NEXT: movl 12(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_zaext_loads: @@ -1051,7 +1051,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 12(%eax,%ecx), %eax +; CHECK-NEXT: movl 12(%ecx,%eax), %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: load_i32_by_i8_zsext_loads: diff --git a/llvm/test/CodeGen/X86/load-local-v3i1.ll b/llvm/test/CodeGen/X86/load-local-v3i1.ll --- a/llvm/test/CodeGen/X86/load-local-v3i1.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i1.ll @@ -19,10 +19,10 @@ ; CHECK-NEXT: andb $1, %dl ; CHECK-NEXT: addb %dl, %dl ; CHECK-NEXT: orb %sil, %dl -; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: shlb $2, %cl ; CHECK-NEXT: orb %dl, %cl -; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: andb $7, %cl +; CHECK-NEXT: testb %sil, %sil ; CHECK-NEXT: # implicit-def: $xmm0 ; CHECK-NEXT: jne .LBB0_1 ; CHECK-NEXT: # %bb.2: # %else @@ -56,10 +56,10 @@ ; CHECK-NEXT: andb $1, %dl ; CHECK-NEXT: addb %dl, %dl ; CHECK-NEXT: orb %sil, %dl -; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: shlb $2, %cl ; CHECK-NEXT: orb %dl, %cl -; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: andb $7, %cl +; CHECK-NEXT: testb %sil, %sil ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: # %bb.2: # %else ; CHECK-NEXT: testb $2, %cl diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll --- a/llvm/test/CodeGen/X86/load-local-v3i129.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll @@ -5,28 +5,35 @@ define void @_start() nounwind { ; FAST-SHLD-LABEL: _start: ; FAST-SHLD: # %bb.0: # %Entry -; FAST-SHLD-NEXT: movq -40(%rsp), %rax -; FAST-SHLD-NEXT: movq -32(%rsp), %rcx -; FAST-SHLD-NEXT: movq %rcx, %rdx -; FAST-SHLD-NEXT: shlq $62, %rdx -; FAST-SHLD-NEXT: shrq $2, %rcx -; FAST-SHLD-NEXT: shldq $2, %rdx, %rcx -; FAST-SHLD-NEXT: andq $-4, %rax -; FAST-SHLD-NEXT: orq $1, %rax -; FAST-SHLD-NEXT: movq %rax, -40(%rsp) -; FAST-SHLD-NEXT: movq %rcx, -32(%rsp) -; FAST-SHLD-NEXT: orq $-2, -56(%rsp) +; FAST-SHLD-NEXT: movl -24(%rsp), %eax +; 
FAST-SHLD-NEXT: movl %eax, %ecx +; FAST-SHLD-NEXT: shrl $2, %ecx +; FAST-SHLD-NEXT: movq -40(%rsp), %rdx +; FAST-SHLD-NEXT: movq -32(%rsp), %rsi +; FAST-SHLD-NEXT: shldq $62, %rsi, %rax +; FAST-SHLD-NEXT: shrdq $2, %rsi, %rdx +; FAST-SHLD-NEXT: leaq 1(,%rdx,4), %rsi +; FAST-SHLD-NEXT: movq %rsi, -40(%rsp) +; FAST-SHLD-NEXT: shrdq $62, %rax, %rdx +; FAST-SHLD-NEXT: movq %rdx, -32(%rsp) +; FAST-SHLD-NEXT: shrdq $62, %rcx, %rax +; FAST-SHLD-NEXT: andl $7, %eax +; FAST-SHLD-NEXT: movb %al, -24(%rsp) ; FAST-SHLD-NEXT: movq $-1, -48(%rsp) +; FAST-SHLD-NEXT: orq $-2, -56(%rsp) ; FAST-SHLD-NEXT: retq ; ; SLOW-SHLD-LABEL: _start: ; SLOW-SHLD: # %bb.0: # %Entry ; SLOW-SHLD-NEXT: movq -40(%rsp), %rax +; SLOW-SHLD-NEXT: movzbl -24(%rsp), %ecx +; SLOW-SHLD-NEXT: andl $7, %ecx +; SLOW-SHLD-NEXT: movb %cl, -24(%rsp) ; SLOW-SHLD-NEXT: andq $-4, %rax ; SLOW-SHLD-NEXT: orq $1, %rax ; SLOW-SHLD-NEXT: movq %rax, -40(%rsp) -; SLOW-SHLD-NEXT: orq $-2, -56(%rsp) ; SLOW-SHLD-NEXT: movq $-1, -48(%rsp) +; SLOW-SHLD-NEXT: orq $-2, -56(%rsp) ; SLOW-SHLD-NEXT: retq Entry: %y = alloca <3 x i129>, align 16 diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -104,14 +104,32 @@ } define <4 x float> @load_float4_float3_as_float2_float(ptr nocapture readonly dereferenceable(16)) nofree nosync { -; SSE-LABEL: load_float4_float3_as_float2_float: -; SSE: # %bb.0: -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: load_float4_float3_as_float2_float: +; SSE2: # %bb.0: +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_float4_float3_as_float2_float: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_float4_float3_as_float2_float: +; SSE41: # %bb.0: +; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: retq ; ; AVX-LABEL: load_float4_float3_as_float2_float: ; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX-NEXT: retq %2 = load <2 x float>, ptr %0, align 4 %3 = extractelement <2 x float> %2, i32 0 @@ -380,40 +398,48 @@ } define dso_local void @PR43227(ptr %explicit_0, ptr %explicit_1) { -; SSE-LABEL: PR43227: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm1, 672(%rsi) -; SSE-NEXT: movdqa %xmm0, 688(%rsi) -; SSE-NEXT: retq +; SSE2-LABEL: PR43227: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm1, 672(%rsi) +; SSE2-NEXT: movdqa %xmm0, 688(%rsi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR43227: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movq {{.*#+}} 
xmm0 = mem[0],zero +; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: psrlq $32, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, 672(%rsi) +; SSSE3-NEXT: movdqa %xmm0, 688(%rsi) +; SSSE3-NEXT: retq ; -; AVX1-LABEL: PR43227: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, 672(%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq +; SSE41-LABEL: PR43227: +; SSE41: # %bb.0: +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, 672(%rsi) +; SSE41-NEXT: movaps %xmm1, 688(%rsi) +; SSE41-NEXT: retq ; -; AVX2-LABEL: PR43227: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 672(%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: PR43227: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vmovaps %ymm0, 672(%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %1 = getelementptr i32, ptr %explicit_0, i64 63 %2 = load <3 x i32>, ptr %1, align 1 %3 = shufflevector <3 x i32> %2, <3 x i32> undef, <2 x i32> @@ -423,3 +449,6 @@ store <8 x i32> %5, ptr %6, align 32 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX1: {{.*}} +; AVX2: {{.*}} diff --git a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll --- a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -365,7 +365,7 @@ ; SSE-NEXT: movzwl %cx, %eax ; SSE-NEXT: movswl %ax, %ecx ; SSE-NEXT: shrl $15, %eax -; SSE-NEXT: sarl $5, %ecx +; SSE-NEXT: shrl $5, %ecx ; SSE-NEXT: addl %eax, %ecx ; SSE-NEXT: movd %ecx, %xmm0 ; SSE-NEXT: retq @@ -379,7 +379,7 @@ ; AVX-NEXT: movzwl %cx, %eax ; AVX-NEXT: movswl %ax, %ecx ; AVX-NEXT: shrl $15, %eax -; AVX-NEXT: sarl $5, %ecx +; AVX-NEXT: shrl $5, %ecx ; AVX-NEXT: addl %eax, %ecx ; AVX-NEXT: vmovd %ecx, %xmm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -40,9 +40,9 @@ ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB0_1: # %vector.body ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm1 +; AVX-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm2 +; AVX-NEXT: vpmulld %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: addq $8, %rcx ; AVX-NEXT: cmpq %rcx, %rax @@ -96,7 +96,16 @@ ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2 ; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3 -; SSE2-NEXT: pmaddwd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pmulhw %xmm2, %xmm4 +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq $8, %rcx ; SSE2-NEXT: cmpq %rcx, %rax @@ -118,8 +127,13 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB1_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rdi,%rcx,2), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rsi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: addq $8, %rcx @@ -144,17 +158,20 @@ ; AVX256-NEXT: .p2align 4, 0x90 ; AVX256-NEXT: .LBB1_1: # %vector.body ; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX256-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1 -; AVX256-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1 +; AVX256-NEXT: vpmovsxwd (%rdi,%rcx,2), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rsi,%rcx,2), %ymm2 +; AVX256-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX256-NEXT: addq $8, %rcx ; AVX256-NEXT: cmpq %rcx, %rax ; AVX256-NEXT: jne .LBB1_1 ; AVX256-NEXT: # 
%bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -203,14 +220,32 @@ ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm3 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm4 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm3, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm3 -; SSE2-NEXT: pmaddwd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm4 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm5 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3 +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: pmulhw %xmm4, %xmm7 +; SSE2-NEXT: pmullw %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pmulhw %xmm5, %xmm7 +; SSE2-NEXT: pmullw %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm5[0,2] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB2_1 @@ -234,14 +269,24 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB2_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2 -; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm3 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2 -; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3 +; AVX1-NEXT: vpmovsxwd 24(%rdi,%rcx,2), %xmm2 +; AVX1-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovsxwd 8(%rdi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxwd 24(%rsi,%rcx,2), %xmm4 +; AVX1-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpmovsxwd 8(%rsi,%rcx,2), %xmm4 +; AVX1-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq 
%rcx, %rax ; AVX1-NEXT: jne .LBB2_1 @@ -268,8 +313,15 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2 -; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2 +; AVX2-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %ymm2 +; AVX2-NEXT: vpmovsxwd (%rdi,%rcx,2), %ymm3 +; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %ymm3 +; AVX2-NEXT: vpmovsxwd (%rsi,%rcx,2), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax @@ -277,9 +329,9 @@ ; AVX2-NEXT: # %bb.2: # %middle.block ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -294,8 +346,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB2_1: # %vector.body ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %ymm1 -; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm1, %ymm1 +; AVX512-NEXT: vpmovsxwd (%rdi,%rcx,2), %zmm1 +; AVX512-NEXT: vpmovsxwd (%rsi,%rcx,2), %zmm2 +; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vphaddd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: addq $16, %rcx ; AVX512-NEXT: cmpq %rcx, %rax @@ -304,9 +360,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -350,7 +406,6 @@ ; SSE2-LABEL: _Z10test_shortPsS_i_1024: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 @@ -359,26 +414,63 @@ ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB3_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm5 -; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm6 -; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm7 -; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm8 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm9 -; SSE2-NEXT: pmaddwd %xmm5, %xmm9 -; SSE2-NEXT: paddd %xmm9, %xmm2 -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm6, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm4 -; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm7, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm5 -; SSE2-NEXT: pmaddwd %xmm8, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm7 +; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm10 +; SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm11 +; SSE2-NEXT: movdqu 
48(%rdi,%rcx,2), %xmm12 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm5 +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm6 +; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm8 +; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm9 +; SSE2-NEXT: movdqa %xmm5, %xmm13 +; SSE2-NEXT: pmulhw %xmm7, %xmm13 +; SSE2-NEXT: pmullw %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm6, %xmm13 +; SSE2-NEXT: pmulhw %xmm10, %xmm13 +; SSE2-NEXT: pmullw %xmm10, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm8, %xmm13 +; SSE2-NEXT: pmulhw %xmm11, %xmm13 +; SSE2-NEXT: pmullw %xmm11, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm9, %xmm13 +; SSE2-NEXT: pmulhw %xmm12, %xmm13 +; SSE2-NEXT: pmullw %xmm12, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; SSE2-NEXT: movdqa %xmm9, %xmm13 +; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm12[0,2] +; SSE2-NEXT: movdqa %xmm8, %xmm14 +; SSE2-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm11[0,2] +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm10[0,2] +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm12[1,3] +; SSE2-NEXT: paddd %xmm13, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm11[1,3] +; SSE2-NEXT: paddd %xmm14, %xmm8 +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm10[1,3] +; SSE2-NEXT: paddd %xmm15, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3] +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %middle.block +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm3 @@ -403,22 +495,42 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB3_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vmovdqu (%rsi,%rcx,2), %xmm3 -; AVX1-NEXT: vmovdqu 16(%rsi,%rcx,2), %xmm4 -; AVX1-NEXT: vmovdqu 32(%rsi,%rcx,2), %xmm5 -; AVX1-NEXT: vmovdqu 48(%rsi,%rcx,2), %xmm6 -; AVX1-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3 -; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4 -; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5 -; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6 +; AVX1-NEXT: vpmovsxwd 56(%rdi,%rcx,2), %xmm3 +; AVX1-NEXT: vpmovsxwd 48(%rdi,%rcx,2), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxwd 40(%rdi,%rcx,2), %xmm4 +; AVX1-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, 
%xmm5, %xmm4 +; AVX1-NEXT: vpmovsxwd 24(%rdi,%rcx,2), %xmm5 +; AVX1-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %xmm6 +; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmovsxwd 8(%rdi,%rcx,2), %xmm6 +; AVX1-NEXT: vpmovsxwd (%rdi,%rcx,2), %xmm7 +; AVX1-NEXT: vpackssdw %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpmovsxwd 56(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd 48(%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpmovsxwd 40(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmovsxwd 24(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmovsxwd 8(%rsi,%rcx,2), %xmm7 +; AVX1-NEXT: vpmovsxwd (%rsi,%rcx,2), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB3_1 @@ -451,11 +563,25 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB3_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3 -; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4 -; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3 +; AVX2-NEXT: vpmovsxwd 48(%rdi,%rcx,2), %ymm3 +; AVX2-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpmovsxwd 16(%rdi,%rcx,2), %ymm4 +; AVX2-NEXT: vpmovsxwd (%rdi,%rcx,2), %ymm5 +; AVX2-NEXT: vpackssdw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmovsxwd 48(%rsi,%rcx,2), %ymm5 +; AVX2-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %ymm6 +; AVX2-NEXT: vpackssdw %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpmovsxwd 16(%rsi,%rcx,2), %ymm6 +; AVX2-NEXT: vpmovsxwd (%rsi,%rcx,2), %ymm7 +; AVX2-NEXT: vpackssdw %ymm6, %ymm7, %ymm6 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,2,1,3] +; AVX2-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: addq $16, %rcx ; AVX2-NEXT: cmpq %rcx, %rax @@ -465,9 +591,9 @@ ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; 
AVX2-NEXT: vmovd %xmm0, %eax @@ -483,10 +609,16 @@ ; AVX512F-NEXT: .p2align 4, 0x90 ; AVX512F-NEXT: .LBB3_1: # %vector.body ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vmovdqu (%rsi,%rcx,2), %ymm2 -; AVX512F-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm3 -; AVX512F-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm3, %ymm3 -; AVX512F-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm2, %ymm2 +; AVX512F-NEXT: vpmovsxwd (%rdi,%rcx,2), %zmm2 +; AVX512F-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %zmm3 +; AVX512F-NEXT: vpmovsxwd (%rsi,%rcx,2), %zmm4 +; AVX512F-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %zmm5 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm5, %ymm5 +; AVX512F-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512F-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: addq $16, %rcx @@ -497,9 +629,9 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, %eax @@ -515,8 +647,17 @@ ; AVX512BW-NEXT: .p2align 4, 0x90 ; AVX512BW-NEXT: .LBB3_1: # %vector.body ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512BW-NEXT: vmovdqu64 (%rsi,%rcx,2), %zmm2 -; AVX512BW-NEXT: vpmaddwd (%rdi,%rcx,2), %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovsxwd 32(%rdi,%rcx,2), %zmm2 +; AVX512BW-NEXT: vpmovsxwd (%rdi,%rcx,2), %zmm3 +; AVX512BW-NEXT: vpmovsxwd 32(%rsi,%rcx,2), %zmm4 +; AVX512BW-NEXT: vpmovsxwd (%rsi,%rcx,2), %zmm5 +; AVX512BW-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm5, %ymm3 +; AVX512BW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: addq $16, %rcx ; AVX512BW-NEXT: cmpq %rcx, %rax @@ -526,9 +667,9 @@ ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, %eax @@ -668,7 +809,15 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm2 ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmaddwd %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq 
$16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax @@ -690,9 +839,13 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB5_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2 -; AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmovsxbd (%rdi,%rcx), %xmm1 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rcx), %xmm2 +; AVX1-NEXT: vpmovsxbd (%rsi,%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: addq $16, %rcx @@ -717,18 +870,20 @@ ; AVX256-NEXT: .p2align 4, 0x90 ; AVX256-NEXT: .LBB5_1: # %vector.body ; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX256-NEXT: vpmovsxbw (%rdi,%rcx), %xmm1 -; AVX256-NEXT: vpmovsxbw (%rsi,%rcx), %xmm2 -; AVX256-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 +; AVX256-NEXT: vpmovsxbd (%rdi,%rcx), %ymm1 +; AVX256-NEXT: vpmovsxbd (%rsi,%rcx), %ymm2 +; AVX256-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX256-NEXT: addq $16, %rcx ; AVX256-NEXT: cmpq %rcx, %rax ; AVX256-NEXT: jne .LBB5_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -783,14 +938,30 @@ ; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: pmaddwd %xmm5, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: pmullw %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm4 -; SSE2-NEXT: pmaddwd %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm3[0,2] +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm4 ; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB6_1 @@ -814,11 +985,19 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB6_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop 
Header: Depth=1 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3 -; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 12(%rdi,%rcx), %xmm2 +; AVX1-NEXT: vpmovsxbd 8(%rdi,%rcx), %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rcx), %xmm3 +; AVX1-NEXT: vpmovsxbd (%rdi,%rcx), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxbd 12(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 8(%rsi,%rcx), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd (%rsi,%rcx), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 @@ -850,8 +1029,14 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB6_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2 -; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 +; AVX2-NEXT: vpmovsxbd 8(%rdi,%rcx), %ymm2 +; AVX2-NEXT: vpmovsxbd (%rdi,%rcx), %ymm3 +; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovsxbd 8(%rsi,%rcx), %ymm3 +; AVX2-NEXT: vpmovsxbd (%rsi,%rcx), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] ; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $16, %rcx @@ -860,9 +1045,9 @@ ; AVX2-NEXT: # %bb.2: # %middle.block ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -877,9 +1062,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB6_1: # %vector.body ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm1 -; AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm2 -; AVX512-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vpmovsxbd (%rdi,%rcx), %zmm1 +; AVX512-NEXT: vpmovsxbd (%rsi,%rcx), %zmm2 +; AVX512-NEXT: vpmulld %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vphaddd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: addq $16, %rcx ; AVX512-NEXT: cmpq %rcx, %rax @@ -888,9 +1076,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax @@ -934,7 +1122,6 @@ ; SSE2-LABEL: _Z9test_charPcS_i_1024: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 @@ -944,37 +1131,70 @@ ; SSE2-NEXT: .LBB7_1: # %vector.body ; SSE2-NEXT: # =>This Inner 
Loop Header: Depth=1 ; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm7 -; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm6 +; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm10 ; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm8 -; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; SSE2-NEXT: psraw $8, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; SSE2-NEXT: psraw $8, %xmm10 -; SSE2-NEXT: pmaddwd %xmm9, %xmm10 -; SSE2-NEXT: paddd %xmm10, %xmm2 +; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; SSE2-NEXT: psraw $8, %xmm11 +; SSE2-NEXT: pmullw %xmm5, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] +; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm8 -; SSE2-NEXT: pmaddwd %xmm7, %xmm8 -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE2-NEXT: psraw $8, %xmm8 -; SSE2-NEXT: pmaddwd %xmm7, %xmm8 -; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: pmaddwd %xmm6, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; SSE2-NEXT: psraw $8, %xmm11 +; SSE2-NEXT: pmullw %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; SSE2-NEXT: psraw $8, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] +; SSE2-NEXT: psraw $8, %xmm13 +; SSE2-NEXT: pmullw %xmm11, %xmm13 +; SSE2-NEXT: 
punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; SSE2-NEXT: psrad $16, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; SSE2-NEXT: psrad $16, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psraw $8, %xmm9 +; SSE2-NEXT: pmullw %xmm10, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm13 +; SSE2-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm10[0,2] +; SSE2-NEXT: movdqa %xmm11, %xmm14 +; SSE2-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm12[0,2] +; SSE2-NEXT: movdqa %xmm7, %xmm15 +; SSE2-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2],xmm8[0,2] +; SSE2-NEXT: movdqa %xmm5, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,3],xmm10[1,3] +; SSE2-NEXT: paddd %xmm13, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,3],xmm12[1,3] +; SSE2-NEXT: paddd %xmm14, %xmm11 +; SSE2-NEXT: paddd %xmm11, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm8[1,3] +; SSE2-NEXT: paddd %xmm15, %xmm7 +; SSE2-NEXT: paddd %xmm7, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: addq $32, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB7_1 ; SSE2-NEXT: # %bb.2: # %middle.block +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm3 @@ -999,17 +1219,33 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB7_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw 24(%rdi,%rcx), %xmm3 -; AVX1-NEXT: vpmovsxbw 16(%rdi,%rcx), %xmm4 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm5 -; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm6 -; AVX1-NEXT: vpmovsxbw 24(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 28(%rdi,%rcx), %xmm3 +; AVX1-NEXT: vpmovsxbd 24(%rdi,%rcx), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxbd 20(%rdi,%rcx), %xmm4 +; AVX1-NEXT: vpmovsxbd 16(%rdi,%rcx), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovsxbd 12(%rdi,%rcx), %xmm5 +; AVX1-NEXT: vpmovsxbd 8(%rdi,%rcx), %xmm6 +; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rcx), %xmm6 +; AVX1-NEXT: vpmovsxbd (%rdi,%rcx), %xmm7 +; AVX1-NEXT: vpackssdw %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpmovsxbd 28(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 24(%rsi,%rcx), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 20(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 16(%rsi,%rcx), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4 -; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 12(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 8(%rsi,%rcx), %xmm8 +; AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rcx), %xmm7 +; AVX1-NEXT: vpmovsxbd (%rsi,%rcx), %xmm8 +; 
AVX1-NEXT: vpackssdw %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
@@ -1051,14 +1287,26 @@
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB7_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
-; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
-; AVX2-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
-; AVX2-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
+; AVX2-NEXT: vpmovsxbd 24(%rdi,%rcx), %ymm3
+; AVX2-NEXT: vpmovsxbd 16(%rdi,%rcx), %ymm4
+; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpmovsxbd 8(%rdi,%rcx), %ymm4
+; AVX2-NEXT: vpmovsxbd (%rdi,%rcx), %ymm5
+; AVX2-NEXT: vpackssdw %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpmovsxbd 24(%rsi,%rcx), %ymm5
+; AVX2-NEXT: vpmovsxbd 16(%rsi,%rcx), %ymm6
+; AVX2-NEXT: vpackssdw %ymm5, %ymm6, %ymm5
+; AVX2-NEXT: vpmovsxbd 8(%rsi,%rcx), %ymm6
+; AVX2-NEXT: vpmovsxbd (%rsi,%rcx), %ymm7
+; AVX2-NEXT: vpackssdw %ymm6, %ymm7, %ymm6
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3]
+; AVX2-NEXT: vpmaddwd %ymm4, %ymm6, %ymm4
+; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,2,1,3]
+; AVX2-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
-; AVX2-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: addq $32, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB7_1
@@ -1067,9 +1315,9 @@
; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -1101,9 +1349,9 @@
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
@@ -1119,8 +1367,12 @@
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB7_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512BW-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
-; AVX512BW-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
+; AVX512BW-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm2
+; AVX512BW-NEXT: vpmovsxbw (%rdi,%rcx), %ymm3
+; AVX512BW-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm4
+; AVX512BW-NEXT: vpmovsxbw (%rsi,%rcx), %ymm5
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm3
; AVX512BW-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $32, %rcx
@@ -1131,9 +1383,9 @@
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
@@ -1335,9 +1587,9 @@
; AVX256-NEXT: jne .LBB9_1
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vmovd %xmm0, %eax
@@ -1490,9 +1742,9 @@
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -1518,9 +1770,9 @@
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
@@ -1743,9 +1995,9 @@
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -1777,9 +2029,9 @@
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
@@ -1824,13 +2076,39 @@
define <4 x i32> @pmaddwd_8(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: pmaddwd_8:
; SSE2: # %bb.0:
-; SSE2-NEXT: pmaddwd %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pmulhw %xmm1, %xmm2
+; SSE2-NEXT: pmullw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
;
-; AVX-LABEL: pmaddwd_8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: pmaddwd_8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddwd_8: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX256-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %a = sext <8 x i16> %A to <8 x i32> %b = sext <8 x i16> %B to <8 x i32> %m = mul nsw <8 x i32> %a, %b @@ -1843,13 +2121,39 @@ define <4 x i32> @pmaddwd_8_swapped(<8 x i16> %A, <8 x i16> %B) { ; SSE2-LABEL: pmaddwd_8_swapped: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX-LABEL: pmaddwd_8_swapped: -; AVX: # %bb.0: -; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddwd_8_swapped: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddwd_8_swapped: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX256-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %a = sext <8 x i16> %A to <8 x i32> %b = sext <8 x i16> %B to <8 x i32> %m = mul nsw <8 x i32> %a, %b @@ -1877,13 +2181,24 @@ ; ; AVX1-LABEL: larger_mul: ; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; 
AVX2-LABEL: larger_mul: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1892,8 +2207,10 @@ ; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [1,3,5,7] +; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a = sext <16 x i16> %A to <16 x i32> @@ -1908,8 +2225,26 @@ define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) { ; SSE2-LABEL: pmaddwd_16: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm2, %xmm0 -; SSE2-NEXT: pmaddwd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhw %xmm2, %xmm4 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pmulhw %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_16: @@ -1921,10 +2256,20 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX256-LABEL: pmaddwd_16: -; AVX256: # %bb.0: -; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: retq +; AVX2-LABEL: pmaddwd_16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: pmaddwd_16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512-NEXT: retq %a = sext <16 x i16> %A to <16 x i32> %b = sext <16 x i16> %B to <16 x i32> %m = mul nsw <16 x i32> %a, %b @@ -1937,10 +2282,46 @@ define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) { ; SSE2-LABEL: pmaddwd_32: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm4, %xmm0 -; SSE2-NEXT: pmaddwd %xmm5, %xmm1 -; SSE2-NEXT: pmaddwd %xmm6, %xmm2 -; SSE2-NEXT: pmaddwd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pmulhw %xmm4, %xmm8 +; SSE2-NEXT: pmullw %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pmulhw %xmm5, %xmm8 
+; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pmulhw %xmm6, %xmm8 +; SSE2-NEXT: pmullw %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: pmulhw %xmm7, %xmm8 +; SSE2-NEXT: pmullw %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm7[0,2] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm6[0,2] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm5[0,2] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm7[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm6[1,3] +; SSE2-NEXT: paddd %xmm9, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm11, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_32: @@ -1988,13 +2369,36 @@ define <4 x i32> @pmaddwd_const(<8 x i16> %A) { ; SSE2-LABEL: pmaddwd_const: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; AVX-LABEL: pmaddwd_const: -; AVX: # %bb.0: -; AVX-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: pmaddwd_const: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpmovsxwd %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX256-LABEL: pmaddwd_const: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX256-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %a = sext <8 x i16> %A to <8 x i32> %m = mul nsw <8 x i32> %a, %odd = shufflevector <8 x i32> %m, <8 x i32> undef, <4 x i32> @@ -2058,9 +2462,9 @@ ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] ; SSE2-NEXT: paddd %xmm2, %xmm1 @@ -2095,13 +2499,41 @@ define <4 x i32> @jumbled_indices4(<8 x i16> %A, <8 x i16> %B) { ; SSE2-LABEL: jumbled_indices4: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: retq ; -; AVX-LABEL: jumbled_indices4: -; AVX: # %bb.0: -; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: jumbled_indices4: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX1-NEXT: retq +; +; AVX256-LABEL: jumbled_indices4: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX256-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %exta = sext <8 x i16> %A to <8 x i32> %extb = sext <8 x i16> %B to <8 x i32> %m = mul <8 x i32> %exta, %extb @@ -2114,8 +2546,26 @@ define <8 x i32> @jumbled_indices8(<16 x i16> %A, <16 x i16> %B) { ; SSE2-LABEL: jumbled_indices8: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm2, %xmm0 -; SSE2-NEXT: pmaddwd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pmulhw %xmm2, %xmm4 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pmulhw %xmm3, %xmm4 +; SSE2-NEXT: pmullw %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm3[3,0] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm3[2,1] +; 
SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[2,1] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: jumbled_indices8: @@ -2127,10 +2577,21 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX256-LABEL: jumbled_indices8: -; AVX256: # %bb.0: -; AVX256-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: retq +; AVX2-LABEL: jumbled_indices8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: jumbled_indices8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,5,4,3,2,7,6] +; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retq %exta = sext <16 x i16> %A to <16 x i32> %extb = sext <16 x i16> %B to <16 x i32> %m = mul <16 x i32> %exta, %extb @@ -2143,10 +2604,46 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) { ; SSE2-LABEL: jumbled_indices16: ; SSE2: # %bb.0: -; SSE2-NEXT: pmaddwd %xmm4, %xmm0 -; SSE2-NEXT: pmaddwd %xmm5, %xmm1 -; SSE2-NEXT: pmaddwd %xmm6, %xmm2 -; SSE2-NEXT: pmaddwd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pmulhw %xmm4, %xmm8 +; SSE2-NEXT: pmullw %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm1, %xmm8 +; SSE2-NEXT: pmulhw %xmm5, %xmm8 +; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pmulhw %xmm6, %xmm8 +; SSE2-NEXT: pmullw %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: pmulhw %xmm7, %xmm8 +; SSE2-NEXT: pmullw %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,0],xmm7[3,1] +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm6[0,3] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm5[3,0] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm4[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,1],xmm7[2,0] +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm6[1,2] +; SSE2-NEXT: paddd %xmm9, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[2,1] +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm4[0,3] +; SSE2-NEXT: paddd %xmm11, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: jumbled_indices16: @@ -2194,16 +2691,99 @@ define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) { ; 
SSE2-LABEL: jumbled_indices32: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm1 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm2 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm3 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm5 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm6 -; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm7 -; SSE2-NEXT: movdqa %xmm7, 112(%rdi) +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 +; SSE2-NEXT: movdqa %xmm0, %xmm15 +; SSE2-NEXT: pmulhw %xmm7, %xmm15 +; SSE2-NEXT: pmullw %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm15[4],xmm7[5],xmm15[5],xmm7[6],xmm15[6],xmm7[7],xmm15[7] +; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm1, %xmm15 +; SSE2-NEXT: pmulhw %xmm9, %xmm15 +; SSE2-NEXT: pmullw %xmm9, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm2, %xmm15 +; SSE2-NEXT: pmulhw %xmm10, %xmm15 +; SSE2-NEXT: pmullw %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm3, %xmm15 +; SSE2-NEXT: pmulhw %xmm11, %xmm15 +; SSE2-NEXT: pmullw %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm4, %xmm15 +; SSE2-NEXT: pmulhw %xmm13, %xmm15 +; SSE2-NEXT: pmullw %xmm13, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm5, %xmm15 +; SSE2-NEXT: pmulhw %xmm14, %xmm15 +; SSE2-NEXT: pmullw %xmm14, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: pmulhw %xmm12, %xmm15 +; SSE2-NEXT: pmullw %xmm12, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: pmulhw %xmm15, %xmm7 +; SSE2-NEXT: 
pmullw %xmm15, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,0],xmm15[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,1],xmm15[0,2] +; SSE2-NEXT: paddd %xmm7, %xmm8 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,1],xmm12[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: movdqa %xmm5, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm14[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm14[0,2] +; SSE2-NEXT: paddd %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm13[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm13[0,3] +; SSE2-NEXT: paddd %xmm7, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm11[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm11[3,1] +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm10[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm10[3,1] +; SSE2-NEXT: paddd %xmm7, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm9[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm9[3,1] +; SSE2-NEXT: paddd %xmm7, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm7 +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm9[2,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm9[3,0] +; SSE2-NEXT: paddd %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm8, 112(%rdi) ; SSE2-NEXT: movdqa %xmm6, 96(%rdi) ; SSE2-NEXT: movdqa %xmm5, 80(%rdi) ; SSE2-NEXT: movdqa %xmm4, 64(%rdi) @@ -2306,10 +2886,44 @@ define <8 x i32> @pmaddwd_256(<16 x i16>* %Aptr, <16 x i16>* %Bptr) { ; SSE2-LABEL: pmaddwd_256: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 -; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: pmulhw %xmm4, %xmm2 +; SSE2-NEXT: pmullw %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = 
xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pmulhw %xmm0, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_256: @@ -2321,11 +2935,52 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX256-LABEL: pmaddwd_256: -; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa (%rdi), %ymm0 -; AVX256-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 -; AVX256-NEXT: retq +; AVX2-LABEL: pmaddwd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] +; AVX2-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm4, %xmm4 +; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpmovsxwd %xmm3, %ymm3 +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: vpmulld %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX2-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: pmaddwd_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm1 +; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm3 +; AVX512-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512-NEXT: vpmovsxwd %xmm3, %ymm3 +; AVX512-NEXT: vpmulld %ymm3, %ymm1, %ymm1 +; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retq %A = load <16 x i16>, <16 x i16>* %Aptr %B = load <16 x i16>, <16 x i16>* %Bptr %A_even = shufflevector <16 x i16> %A, <16 x i16> undef, <8 x i32> @@ -2345,14 +3000,82 @@ define <16 x i32> @pmaddwd_512(<32 x i16>* %Aptr, <32 x i16>* %Bptr) { ; SSE2-LABEL: pmaddwd_512: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm3 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 -; SSE2-NEXT: pmaddwd 16(%rsi), %xmm1 -; SSE2-NEXT: pmaddwd 32(%rsi), %xmm2 -; SSE2-NEXT: pmaddwd 48(%rsi), %xmm3 +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa 32(%rdi), %xmm3 +; SSE2-NEXT: movdqa 48(%rdi), %xmm9 +; SSE2-NEXT: 
movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm7 +; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rsi), %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm8[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm5, %xmm1 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: packssdw %xmm9, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0] +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm7, %xmm0 +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm8, %xmm6 +; SSE2-NEXT: pmulhw %xmm9, %xmm6 +; SSE2-NEXT: pmullw %xmm8, %xmm9 +; SSE2-NEXT: movdqa %xmm9, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: pmulhw %xmm5, %xmm6 +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: pmulhw %xmm2, %xmm6 +; SSE2-NEXT: pmullw %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE2-NEXT: paddd %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pmulhw %xmm0, %xmm6 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_512: @@ -2377,20 +3100,24 @@ ; AVX2-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: pmaddwd_512: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: pmaddwd_512: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmaddwd (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: retq +; AVX512-LABEL: pmaddwd_512: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm1 +; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vmovdqa64 (%rsi), %zmm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm3 +; AVX512-NEXT: vpsrld $16, %zmm2, %zmm2 +; AVX512-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512-NEXT: vpmovsxwd %ymm3, %zmm3 +; AVX512-NEXT: vpmulld %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512-NEXT: vpmulld %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retq %A = load <32 x i16>, <32 x i16>* %Aptr %B = load <32 x i16>, <32 x i16>* %Bptr %A_even = shufflevector <32 x i16> %A, <32 x i16> undef, <16 x i32> @@ -2411,30 +3138,176 @@ ; SSE2-LABEL: pmaddwd_1024: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pmaddwd (%rdx), %xmm0 -; SSE2-NEXT: pmaddwd 16(%rdx), %xmm1 -; SSE2-NEXT: pmaddwd 32(%rdx), %xmm2 -; SSE2-NEXT: pmaddwd 48(%rdx), %xmm3 -; SSE2-NEXT: movdqa 64(%rsi), %xmm4 -; SSE2-NEXT: pmaddwd 64(%rdx), %xmm4 -; SSE2-NEXT: movdqa 80(%rsi), %xmm5 -; SSE2-NEXT: pmaddwd 80(%rdx), %xmm5 -; SSE2-NEXT: movdqa 96(%rsi), %xmm6 -; SSE2-NEXT: pmaddwd 96(%rdx), %xmm6 +; SSE2-NEXT: movdqa 96(%rsi), %xmm9 ; SSE2-NEXT: movdqa 112(%rsi), %xmm7 -; SSE2-NEXT: pmaddwd 112(%rdx), %xmm7 -; SSE2-NEXT: movdqa %xmm7, 112(%rdi) -; SSE2-NEXT: movdqa %xmm6, 96(%rdi) -; SSE2-NEXT: movdqa %xmm5, 80(%rdi) -; SSE2-NEXT: movdqa %xmm4, 64(%rdi) -; SSE2-NEXT: movdqa %xmm3, 48(%rdi) -; SSE2-NEXT: movdqa %xmm2, 32(%rdi) -; SSE2-NEXT: movdqa %xmm1, 16(%rdi) -; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: movdqa 64(%rsi), %xmm12 +; SSE2-NEXT: movdqa 80(%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa 32(%rsi), %xmm10 +; SSE2-NEXT: movdqa 48(%rsi), %xmm8 +; SSE2-NEXT: movdqa 80(%rdx), %xmm11 +; SSE2-NEXT: movdqa (%rdx), %xmm5 +; SSE2-NEXT: movdqa 16(%rdx), %xmm14 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} 
xmm4 = xmm10[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm3[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0] +; SSE2-NEXT: movdqa 32(%rdx), %xmm4 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa 48(%rdx), %xmm3 +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: packssdw %xmm8, %xmm10 +; SSE2-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm12 +; SSE2-NEXT: packssdw %xmm0, %xmm12 +; SSE2-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: psrad $16, %xmm7 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm12 = xmm12[0],xmm0[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa 64(%rdx), %xmm8 +; SSE2-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm10[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] +; SSE2-NEXT: movdqa 112(%rdx), %xmm1 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: movdqa 96(%rdx), %xmm10 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: psrad $16, %xmm14 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: 
packssdw %xmm14, %xmm5 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: packssdw %xmm3, %xmm4 +; SSE2-NEXT: psrad $16, %xmm11 +; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: packssdw %xmm11, %xmm8 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm10 +; SSE2-NEXT: packssdw %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pmulhw %xmm0, %xmm1 +; SSE2-NEXT: pmullw %xmm6, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm13, %xmm1 +; SSE2-NEXT: pmulhw %xmm15, %xmm1 +; SSE2-NEXT: pmullw %xmm13, %xmm15 +; SSE2-NEXT: movdqa %xmm15, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pmulhw %xmm12, %xmm1 +; SSE2-NEXT: pmullw %xmm2, %xmm12 +; SSE2-NEXT: movdqa %xmm12, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pmulhw %xmm7, %xmm1 +; SSE2-NEXT: pmullw %xmm2, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: pmulhw %xmm10, %xmm1 +; SSE2-NEXT: pmullw %xmm9, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: paddd %xmm14, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; SSE2-NEXT: paddd %xmm0, %xmm10 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pmulhw %xmm8, %xmm1 +; SSE2-NEXT: pmullw %xmm0, %xmm8 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: paddd %xmm13, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: paddd %xmm15, %xmm8 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pmulhw %xmm4, %xmm1 +; SSE2-NEXT: pmullw %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: paddd %xmm12, %xmm4 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: pmulhw %xmm5, %xmm1 +; SSE2-NEXT: pmullw %xmm9, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = 
xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE2-NEXT: paddd %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSE2-NEXT: paddd %xmm7, %xmm5 +; SSE2-NEXT: movdqa %xmm3, 112(%rdi) +; SSE2-NEXT: movdqa %xmm10, 96(%rdi) +; SSE2-NEXT: movdqa %xmm0, 80(%rdi) +; SSE2-NEXT: movdqa %xmm8, 64(%rdi) +; SSE2-NEXT: movdqa %xmm6, 48(%rdi) +; SSE2-NEXT: movdqa %xmm4, 32(%rdi) +; SSE2-NEXT: movdqa %xmm9, 16(%rdi) +; SSE2-NEXT: movdqa %xmm5, (%rdi) ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_1024: @@ -2514,13 +3387,26 @@ ; SSE2-LABEL: pmaddwd_commuted_mul: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pandn %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: pmaddwd_commuted_mul: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rsi), %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %A = load <8 x i16>, <8 x i16>* %Aptr %B = load <8 x i16>, <8 x i16>* %Bptr @@ -2541,14 +3427,20 @@ define <4 x i32> @pmaddwd_swapped_indices(<8 x i16>* %Aptr, <8 x i16>* %Bptr) { ; SSE2-LABEL: pmaddwd_swapped_indices: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pmaddwd (%rsi), %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; SSE2-NEXT: pmaddwd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: pmaddwd_swapped_indices: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[1,0,2,3,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = mem[1,0,2,3,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7] +; AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq %A = load <8 x i16>, <8 x i16>* %Aptr %B = load <8 x i16>, <8 x i16>* %Bptr @@ -2604,31 +3496,80 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 ; SSE2-NEXT: movdqu (%rsi), %xmm1 -; SSE2-NEXT: pmaddwd %xmm0, %xmm1 -; SSE2-NEXT: movdqu (%rdx), %xmm0 -; SSE2-NEXT: movdqu (%rcx), %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pmulhw %xmm1, %xmm2 +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: movdqu 
(%rdx), %xmm1 +; SSE2-NEXT: movdqu (%rcx), %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: madd_double_reduction: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%rdx), %xmm1 -; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: madd_double_reduction: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 +; AVX1-NEXT: vpmovsxwd (%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd (%rdx), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rdx), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX256-LABEL: madd_double_reduction: +; AVX256: # %bb.0: +; AVX256-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX256-NEXT: vpmovsxwd (%rsi), %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpmovsxwd (%rdx), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rcx), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vmovd %xmm0, %eax +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %tmp = load <8 x i16>, <8 x i16>* %arg, align 1 %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1 %tmp7 = sext <8 x i16> %tmp to <8 x i32> @@ -2655,49 +3596,140 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu (%rsi), %xmm1 -; SSE2-NEXT: pmaddwd %xmm0, %xmm1 +; SSE2-NEXT: movdqu (%rdi), %xmm1 +; SSE2-NEXT: movdqu (%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pmulhw %xmm0, %xmm2 +; SSE2-NEXT: pmullw %xmm0, %xmm1 +; 
SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: movdqu (%rdx), %xmm0 ; SSE2-NEXT: movdqu (%rcx), %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: movdqu (%r8), %xmm0 -; SSE2-NEXT: movdqu (%r9), %xmm1 -; SSE2-NEXT: pmaddwd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movdqu (%r8), %xmm1 +; SSE2-NEXT: movdqu (%r9), %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movdqu (%r10), %xmm0 ; SSE2-NEXT: movdqu (%rax), %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pmulhw %xmm2, %xmm3 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: madd_quad_reduction: -; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%rdx), %xmm1 -; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqu (%r8), %xmm1 -; AVX-NEXT: vpmaddwd (%r9), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%r10), %xmm1 -; AVX-NEXT: vpmaddwd (%rax), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,2,3] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: madd_quad_reduction: +; AVX1: # %bb.0: +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 +; AVX1-NEXT: vpmovsxwd (%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd 8(%rsi), %xmm2 +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd (%rdx), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rdx), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rcx), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpmovsxwd (%r8), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%r8), %xmm2 +; AVX1-NEXT: vpmovsxwd (%r9), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%r9), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwd (%r10), %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%r10), %xmm2 +; AVX1-NEXT: vpmovsxwd (%rax), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxwd 8(%rax), %xmm3 +; AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX256-LABEL: madd_quad_reduction: +; AVX256: # %bb.0: +; AVX256-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX256-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX256-NEXT: vpmovsxwd (%rdi), %ymm0 +; AVX256-NEXT: vpmovsxwd (%rsi), %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX256-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpmovsxwd (%rdx), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rcx), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX256-NEXT: vpmovsxwd (%r8), %ymm1 +; AVX256-NEXT: vpmovsxwd (%r9), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpmovsxwd (%r10), %ymm1 +; AVX256-NEXT: vpmovsxwd (%rax), %ymm2 +; AVX256-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vmovd %xmm0, %eax +; AVX256-NEXT: vzeroupper +; AVX256-NEXT: retq %tmp = load <8 x i16>, <8 x i16>* %arg, align 1 %tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1 %tmp7 = sext <8 x i16> %tmp to <8 x i32> @@ -2753,8 +3785,15 @@ ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: paddd %xmm5, %xmm3 -; 
SSE2-NEXT: pmaddwd %xmm4, %xmm4 +; SSE2-NEXT: pmullw %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm5[0,2] +; SSE2-NEXT: psrld $16, %xmm4 ; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: paddd %xmm6, %xmm1 ; SSE2-NEXT: addq $8, %rdi ; SSE2-NEXT: addq $-8, %rax ; SSE2-NEXT: jne .LBB33_1 @@ -2835,16 +3874,16 @@ ; AVX256-NEXT: jne .LBB33_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX256-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vmovd %xmm1, %ecx ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -2908,7 +3947,16 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE2-NEXT: psubw %xmm2, %xmm3 -; SSE2-NEXT: pmaddwd %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pmulhw %xmm3, %xmm2 +; SSE2-NEXT: pmullw %xmm3, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: addq $8, %rcx ; SSE2-NEXT: cmpq %rcx, %rax @@ -2930,10 +3978,15 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB34_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; 
AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmulld %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmulld %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vphaddd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-NEXT: addq $8, %rcx @@ -2958,19 +4011,21 @@ ; AVX256-NEXT: .p2align 4, 0x90 ; AVX256-NEXT: .LBB34_1: # %vector.body ; AVX256-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX256-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX256-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX256-NEXT: vpmaddwd %xmm1, %xmm1, %xmm1 +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX256-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX256-NEXT: vpsubd %ymm1, %ymm2, %ymm1 +; AVX256-NEXT: vpmulld %ymm1, %ymm1, %ymm1 +; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX256-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX256-NEXT: addq $8, %rcx ; AVX256-NEXT: cmpq %rcx, %rax ; AVX256-NEXT: jne .LBB34_1 ; AVX256-NEXT: # %bb.2: # %middle.block ; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX256-NEXT: vmovd %xmm0, %eax @@ -3114,14 +4169,30 @@ ; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: pmaddwd %xmm5, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm2 +; SSE2-NEXT: pmullw %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmaddwd %xmm4, %xmm3 +; SSE2-NEXT: pmullw %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm4[0,2] +; SSE2-NEXT: movdqa %xmm6, %xmm8 +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm5[1,3] +; SSE2-NEXT: paddd %xmm8, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm2 ; SSE2-NEXT: addq $16, 
%rax ; SSE2-NEXT: cmpq %r8, %rax ; SSE2-NEXT: jb .LBB38_1 @@ -3146,11 +4217,19 @@ ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB38_1: # %loop ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxbw 8(%rdi,%rax), %xmm2 -; AVX1-NEXT: vpmovsxbw (%rdi,%rax), %xmm3 -; AVX1-NEXT: vpmovsxbw 8(%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd 12(%rdi,%rax), %xmm2 +; AVX1-NEXT: vpmovsxbd 8(%rdi,%rax), %xmm3 +; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovsxbd 4(%rdi,%rax), %xmm3 +; AVX1-NEXT: vpmovsxbd (%rdi,%rax), %xmm4 +; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovsxbd 12(%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd 8(%rsi,%rax), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxbw (%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd 4(%rsi,%rax), %xmm4 +; AVX1-NEXT: vpmovsxbd (%rsi,%rax), %xmm5 +; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaddwd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 @@ -3183,8 +4262,14 @@ ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB38_1: # %loop ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxbw (%rdi,%rax), %ymm2 -; AVX2-NEXT: vpmovsxbw (%rsi,%rax), %ymm3 +; AVX2-NEXT: vpmovsxbd 8(%rdi,%rax), %ymm2 +; AVX2-NEXT: vpmovsxbd (%rdi,%rax), %ymm3 +; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmovsxbd 8(%rsi,%rax), %ymm3 +; AVX2-NEXT: vpmovsxbd (%rsi,%rax), %ymm4 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] ; AVX2-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: addq $16, %rax @@ -3193,9 +4278,9 @@ ; AVX2-NEXT: # %bb.2: # %afterloop ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax @@ -3211,9 +4296,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB38_1: # %loop ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512-NEXT: vpmovsxbw (%rdi,%rax), %ymm1 -; AVX512-NEXT: vpmovsxbw (%rsi,%rax), %ymm2 -; AVX512-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpmovsxbd (%rdi,%rax), %zmm1 +; AVX512-NEXT: vpmovsxbd (%rsi,%rax), %zmm2 +; AVX512-NEXT: vpmulld %zmm2, %zmm1, %zmm1 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vphaddd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] ; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: addq $16, %rax ; AVX512-NEXT: cmpq %r8, %rax @@ -3222,9 +4310,9 @@ ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/mask-negated-bool.ll b/llvm/test/CodeGen/X86/mask-negated-bool.ll --- a/llvm/test/CodeGen/X86/mask-negated-bool.ll +++ 
b/llvm/test/CodeGen/X86/mask-negated-bool.ll @@ -27,7 +27,10 @@ define <4 x i32> @mask_negated_zext_bool_vec(<4 x i1> %x) { ; CHECK-LABEL: mask_negated_zext_bool_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubd %xmm0, %xmm1 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %ext = zext <4 x i1> %x to <4 x i32> %neg = sub <4 x i32> zeroinitializer, %ext @@ -61,7 +64,10 @@ define <4 x i32> @mask_negated_sext_bool_vec(<4 x i1> %x) { ; CHECK-LABEL: mask_negated_sext_bool_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubd %xmm0, %xmm1 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> %neg = sub <4 x i32> zeroinitializer, %ext diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -1290,7 +1290,7 @@ ; SSE2-NEXT: pmovmskb %xmm8, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: jne LBB6_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -1614,7 +1614,7 @@ ; SSE42-NEXT: pmovmskb %xmm8, %eax ; SSE42-NEXT: shll $16, %eax ; SSE42-NEXT: orl %ecx, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: jne LBB6_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %al @@ -1895,7 +1895,7 @@ ; AVX1-NEXT: vpmovmskb %xmm4, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB6_1 ; AVX1-NEXT: ## %bb.2: ## %else ; AVX1-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -1389,7 +1389,7 @@ ; SSE2-NEXT: pmovmskb %xmm8, %ecx ; SSE2-NEXT: shll $16, %ecx ; SSE2-NEXT: orl %edx, %ecx -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: testb $1, %dl ; SSE2-NEXT: jne LBB8_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %cl @@ -1746,7 +1746,7 @@ ; SSE42-NEXT: pmovmskb %xmm8, %ecx ; SSE42-NEXT: shll $16, %ecx ; SSE42-NEXT: orl %edx, %ecx -; SSE42-NEXT: testb $1, %cl +; SSE42-NEXT: testb $1, %dl ; SSE42-NEXT: jne LBB8_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %cl @@ -2043,7 +2043,7 @@ ; AVX1-NEXT: vpmovmskb %xmm4, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB8_1 ; AVX1-NEXT: ## %bb.2: ## %else ; AVX1-NEXT: testb $2, %al @@ -2666,20 +2666,16 @@ ; define <2 x i64> @expandload_v2i64_const(ptr %base, <2 x i64> %src0) { -; SSE2-LABEL: expandload_v2i64_const: -; SSE2: ## %bb.0: -; SSE2-NEXT: movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE42-LABEL: expandload_v2i64_const: -; SSE42: ## %bb.0: -; SSE42-NEXT: pinsrq $1, (%rdi), %xmm0 -; SSE42-NEXT: retq +; SSE-LABEL: expandload_v2i64_const: +; SSE: ## %bb.0: +; SSE-NEXT: movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: expandload_v2i64_const: ; AVX1OR2: ## %bb.0: -; 
AVX1OR2-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0 +; AVX1OR2-NEXT: vmovddup (%rdi), %xmm1 ## xmm1 = mem[0,0] +; AVX1OR2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: expandload_v2i64_const: diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ b/llvm/test/CodeGen/X86/masked_gather.ll @@ -139,8 +139,9 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: je .LBB0_2 ; AVX512F-NEXT: # %bb.1: # %cond.load ; AVX512F-NEXT: vmovq %xmm0, %rcx @@ -338,8 +339,9 @@ ; AVX512F-NEXT: vpsllq $2, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: je .LBB1_2 ; AVX512F-NEXT: # %bb.1: # %cond.load ; AVX512F-NEXT: vmovq %xmm0, %rcx @@ -533,8 +535,9 @@ ; AVX512F-NEXT: vpsllq $2, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: je .LBB2_2 ; AVX512F-NEXT: # %bb.1: # %cond.load ; AVX512F-NEXT: vmovq %xmm0, %rcx diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -867,8 +867,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $xmm0 ; KNL_64-NEXT: je .LBB14_2 ; KNL_64-NEXT: # %bb.1: # %cond.load @@ -908,8 +909,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: # implicit-def: $xmm0 ; KNL_32-NEXT: jne .LBB14_1 ; KNL_32-NEXT: # %bb.2: # %else @@ -981,8 +983,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB15_2 ; KNL_64-NEXT: # %bb.1: # %cond.load ; KNL_64-NEXT: vmovq %xmm0, %rcx @@ -1025,8 +1028,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB15_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1098,8 +1102,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB16_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ 
-1127,8 +1132,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB16_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1157,8 +1163,9 @@ ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB16_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1183,8 +1190,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm1, %k0 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB16_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1220,8 +1228,9 @@ ; KNL_64: # %bb.0: ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB17_2 ; KNL_64-NEXT: # %bb.1: # %cond.store ; KNL_64-NEXT: vmovq %xmm1, %rcx @@ -1257,8 +1266,9 @@ ; KNL_32: # %bb.0: ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB17_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1320,8 +1330,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm2 ; KNL_64-NEXT: vpbroadcastq %xmm2, %ymm2 ; KNL_64-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB18_2 ; KNL_64-NEXT: # %bb.1: # %cond.store ; KNL_64-NEXT: vmovq %xmm1, %rcx @@ -1363,8 +1374,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 ; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: je .LBB18_2 ; KNL_32-NEXT: # %bb.1: # %cond.store ; KNL_32-NEXT: vmovd %xmm1, %ecx @@ -1423,8 +1435,9 @@ ; KNL_64: # %bb.0: ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB19_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1447,8 +1460,9 @@ ; KNL_32: # %bb.0: ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB19_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1471,8 +1485,9 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB19_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1493,8 +1508,9 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0 
+; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB19_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1520,8 +1536,9 @@ ; KNL_64: # %bb.0: ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB20_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1544,8 +1561,9 @@ ; KNL_32: # %bb.0: ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB20_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1568,8 +1586,9 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX-NEXT: vpmovq2m %xmm2, %k0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB20_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1590,8 +1609,9 @@ ; SKX_32: # %bb.0: ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 ; SKX_32-NEXT: vpmovq2m %xmm2, %k0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB20_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1624,8 +1644,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB21_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1654,8 +1675,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB21_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1685,8 +1707,9 @@ ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB21_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1712,8 +1735,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm1, %k0 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB21_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1747,8 +1771,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB22_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1778,8 +1803,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; 
KNL_32-NEXT: jne .LBB22_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1808,8 +1834,9 @@ ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB22_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1836,8 +1863,9 @@ ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB22_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1874,8 +1902,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB23_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -1903,8 +1932,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB23_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -1933,8 +1963,9 @@ ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB23_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -1959,8 +1990,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm1, %k0 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB23_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -1993,8 +2025,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB24_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -2023,8 +2056,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB24_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -2052,8 +2086,9 @@ ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB24_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -2079,8 +2114,9 @@ ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; 
SKX_32-NEXT: jne .LBB24_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -2165,8 +2201,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm1 ; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 ; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB26_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -2194,8 +2231,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 ; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB26_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -2226,8 +2264,9 @@ ; SKX-NEXT: vpbroadcastq %rdi, %xmm1 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0 ; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB26_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -2252,8 +2291,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm1, %k0 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB26_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -3251,7 +3291,8 @@ ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: kmovw %k0, %ecx +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $ymm1 ; KNL_64-NEXT: je .LBB42_2 ; KNL_64-NEXT: # %bb.1: # %cond.load @@ -3281,7 +3322,8 @@ ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; KNL_64-NEXT: .LBB42_8: # %else8 ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: kmovw %k0, %ecx +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $ymm3 ; KNL_64-NEXT: jne .LBB42_9 ; KNL_64-NEXT: # %bb.10: # %else15 @@ -3299,7 +3341,8 @@ ; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; KNL_64-NEXT: .LBB42_16: # %else33 ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: kmovw %k0, %ecx +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: # implicit-def: $ymm4 ; KNL_64-NEXT: jne .LBB42_17 ; KNL_64-NEXT: # %bb.18: # %else40 @@ -3364,16 +3407,19 @@ ; KNL_32-NEXT: movl %esp, %ebp ; KNL_32-NEXT: .cfi_def_cfa_register %ebp ; KNL_32-NEXT: pushl %ebx +; KNL_32-NEXT: pushl %edi ; KNL_32-NEXT: pushl %esi ; KNL_32-NEXT: andl $-32, %esp ; KNL_32-NEXT: subl $32, %esp -; KNL_32-NEXT: .cfi_offset %esi, -16 +; KNL_32-NEXT: .cfi_offset %esi, -20 +; KNL_32-NEXT: .cfi_offset %edi, -16 ; KNL_32-NEXT: .cfi_offset %ebx, -12 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL_32-NEXT: kmovw %k0, %ebx -; KNL_32-NEXT: testb $1, %bl -; KNL_32-NEXT: vmovd %xmm0, %eax +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: vmovd %xmm0, %edi ; KNL_32-NEXT: # implicit-def: $ymm1 ; KNL_32-NEXT: je .LBB42_2 ; KNL_32-NEXT: # %bb.1: # %cond.load @@ -3406,7 +3452,8 @@ ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; KNL_32-NEXT: .LBB42_8: # %else8 ; KNL_32-NEXT: kmovw %k0, %ebx -; KNL_32-NEXT: testb $1, %bl +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al ; KNL_32-NEXT: # 
implicit-def: $ymm0 ; KNL_32-NEXT: jne .LBB42_9 ; KNL_32-NEXT: # %bb.10: # %else15 @@ -3425,7 +3472,8 @@ ; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; KNL_32-NEXT: .LBB42_16: # %else33 ; KNL_32-NEXT: kmovw %k0, %ebx -; KNL_32-NEXT: testb $1, %bl +; KNL_32-NEXT: kmovw %k0, %eax +; KNL_32-NEXT: testb $1, %al ; KNL_32-NEXT: # implicit-def: $ymm2 ; KNL_32-NEXT: jne .LBB42_17 ; KNL_32-NEXT: # %bb.18: # %else40 @@ -3445,8 +3493,9 @@ ; KNL_32-NEXT: .LBB42_24: # %else58 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; KNL_32-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; KNL_32-NEXT: leal -8(%ebp), %esp +; KNL_32-NEXT: leal -12(%ebp), %esp ; KNL_32-NEXT: popl %esi +; KNL_32-NEXT: popl %edi ; KNL_32-NEXT: popl %ebx ; KNL_32-NEXT: popl %ebp ; KNL_32-NEXT: .cfi_def_cfa %esp, 4 @@ -3705,8 +3754,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm2 ; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2 ; KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB47_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -3737,8 +3787,9 @@ ; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 ; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB47_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -3770,8 +3821,9 @@ ; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; SKX-NEXT: vpbroadcastq %rdi, %xmm2 ; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB47_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -3799,8 +3851,9 @@ ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB47_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -4016,8 +4069,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm2 ; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2 ; KNL_64-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB52_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -4043,8 +4097,9 @@ ; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 ; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB52_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4071,8 +4126,9 @@ ; SKX-NEXT: vpmovsxdq %xmm1, %xmm1 ; SKX-NEXT: vpsllq $3, %xmm1, %xmm1 ; SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; SKX-NEXT: kmovw %k0, %ecx ; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: testb $1, %al +; SKX-NEXT: testb $1, %cl ; SKX-NEXT: jne .LBB52_1 ; SKX-NEXT: # %bb.2: # %else ; SKX-NEXT: testb $2, %al @@ -4095,8 +4151,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm2, %k0 ; SKX_32-NEXT: vpslld $3, %xmm1, %xmm1 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm1, %xmm1 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne 
.LBB52_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -4396,9 +4453,10 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB58_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -4423,9 +4481,10 @@ ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 ; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB58_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4451,9 +4510,10 @@ ; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0 ; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; SKX_SMALL-NEXT: kmovw %k0, %ecx ; SKX_SMALL-NEXT: kmovw %k0, %eax ; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; SKX_SMALL-NEXT: testb $1, %al +; SKX_SMALL-NEXT: testb $1, %cl ; SKX_SMALL-NEXT: jne .LBB58_1 ; SKX_SMALL-NEXT: # %bb.2: # %else ; SKX_SMALL-NEXT: testb $2, %al @@ -4477,9 +4537,10 @@ ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; SKX_LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm1 +; SKX_LARGE-NEXT: kmovw %k0, %ecx ; SKX_LARGE-NEXT: kmovw %k0, %eax ; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; SKX_LARGE-NEXT: testb $1, %al +; SKX_LARGE-NEXT: testb $1, %cl ; SKX_LARGE-NEXT: jne .LBB58_1 ; SKX_LARGE-NEXT: # %bb.2: # %else ; SKX_LARGE-NEXT: testb $2, %al @@ -4502,9 +4563,10 @@ ; SKX_32-NEXT: vpmovq2m %xmm0, %k0 ; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 ; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax ; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB58_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -4592,8 +4654,9 @@ ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %xmm0 ; KNL_64-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: jne .LBB60_1 ; KNL_64-NEXT: # %bb.2: # %else ; KNL_64-NEXT: testb $2, %al @@ -4618,8 +4681,9 @@ ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 ; KNL_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB60_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4644,8 +4708,9 @@ ; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0 ; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; SKX_SMALL-NEXT: kmovw %k0, %ecx ; SKX_SMALL-NEXT: kmovw %k0, %eax -; SKX_SMALL-NEXT: testb $1, %al +; SKX_SMALL-NEXT: testb $1, %cl ; SKX_SMALL-NEXT: jne .LBB60_1 ; SKX_SMALL-NEXT: # %bb.2: # %else ; SKX_SMALL-NEXT: testb $2, %al @@ -4669,8 +4734,9 @@ ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; SKX_LARGE-NEXT: 
vpaddq (%rax), %xmm0, %xmm0 +; SKX_LARGE-NEXT: kmovw %k0, %ecx ; SKX_LARGE-NEXT: kmovw %k0, %eax -; SKX_LARGE-NEXT: testb $1, %al +; SKX_LARGE-NEXT: testb $1, %cl ; SKX_LARGE-NEXT: jne .LBB60_1 ; SKX_LARGE-NEXT: # %bb.2: # %else ; SKX_LARGE-NEXT: testb $2, %al @@ -4693,8 +4759,9 @@ ; SKX_32-NEXT: vpmovq2m %xmm0, %k0 ; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 ; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; SKX_32-NEXT: kmovw %k0, %ecx ; SKX_32-NEXT: kmovw %k0, %eax -; SKX_32-NEXT: testb $1, %al +; SKX_32-NEXT: testb $1, %cl ; SKX_32-NEXT: jne .LBB60_1 ; SKX_32-NEXT: # %bb.2: # %else ; SKX_32-NEXT: testb $2, %al @@ -4780,8 +4847,9 @@ ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB62_2 ; KNL_64-NEXT: # %bb.1: # %cond.load ; KNL_64-NEXT: vmovq %xmm0, %rcx @@ -4820,8 +4888,9 @@ ; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB62_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al @@ -4890,8 +4959,9 @@ ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_64-NEXT: vmovq %rdi, %xmm0 ; KNL_64-NEXT: vpbroadcastq %xmm0, %ymm0 +; KNL_64-NEXT: kmovw %k0, %ecx ; KNL_64-NEXT: kmovw %k0, %eax -; KNL_64-NEXT: testb $1, %al +; KNL_64-NEXT: testb $1, %cl ; KNL_64-NEXT: je .LBB63_2 ; KNL_64-NEXT: # %bb.1: # %cond.store ; KNL_64-NEXT: vmovq %xmm0, %rcx @@ -4928,8 +4998,9 @@ ; KNL_32-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 +; KNL_32-NEXT: kmovw %k0, %ecx ; KNL_32-NEXT: kmovw %k0, %eax -; KNL_32-NEXT: testb $1, %al +; KNL_32-NEXT: testb $1, %cl ; KNL_32-NEXT: jne .LBB63_1 ; KNL_32-NEXT: # %bb.2: # %else ; KNL_32-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -12,8 +12,9 @@ ; WIDEN_SKX-NEXT: vpmovsxdq %xmm0, %xmm0 ; WIDEN_SKX-NEXT: vpsllq $3, %xmm0, %xmm0 ; WIDEN_SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB0_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -41,8 +42,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm1 ; WIDEN_KNL-NEXT: vpbroadcastq %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB0_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -83,8 +85,9 @@ ; WIDEN_SKX-NEXT: vpmovsxdq %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpsllq $3, %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB1_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -110,8 +113,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm2 ; WIDEN_KNL-NEXT: vpbroadcastq 
%xmm2, %xmm2 ; WIDEN_KNL-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB1_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -165,8 +169,9 @@ ; WIDEN_SKX: # %bb.0: ; WIDEN_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpmovq2m %xmm1, %k0 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB2_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -189,8 +194,9 @@ ; WIDEN_KNL: # %bb.0: ; WIDEN_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB2_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -227,8 +233,9 @@ ; WIDEN_SKX: # %bb.0: ; WIDEN_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 ; WIDEN_SKX-NEXT: vpmovq2m %xmm2, %k0 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB3_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -249,8 +256,9 @@ ; WIDEN_KNL: # %bb.0: ; WIDEN_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 ; WIDEN_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB3_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -302,8 +310,9 @@ ; WIDEN_SKX-NEXT: vpmovsxdq %xmm0, %xmm0 ; WIDEN_SKX-NEXT: vpsllq $2, %xmm0, %xmm0 ; WIDEN_SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB4_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -331,8 +340,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm1 ; WIDEN_KNL-NEXT: vpbroadcastq %xmm1, %xmm1 ; WIDEN_KNL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB4_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al @@ -374,8 +384,9 @@ ; WIDEN_SKX-NEXT: vpmovsxdq %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpsllq $2, %xmm1, %xmm1 ; WIDEN_SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; WIDEN_SKX-NEXT: kmovw %k0, %ecx ; WIDEN_SKX-NEXT: kmovw %k0, %eax -; WIDEN_SKX-NEXT: testb $1, %al +; WIDEN_SKX-NEXT: testb $1, %cl ; WIDEN_SKX-NEXT: jne .LBB5_1 ; WIDEN_SKX-NEXT: # %bb.2: # %else ; WIDEN_SKX-NEXT: testb $2, %al @@ -401,8 +412,9 @@ ; WIDEN_KNL-NEXT: vmovq %rdi, %xmm2 ; WIDEN_KNL-NEXT: vpbroadcastq %xmm2, %xmm2 ; WIDEN_KNL-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; WIDEN_KNL-NEXT: kmovw %k0, %ecx ; WIDEN_KNL-NEXT: kmovw %k0, %eax -; WIDEN_KNL-NEXT: testb $1, %al +; WIDEN_KNL-NEXT: testb $1, %cl ; WIDEN_KNL-NEXT: jne .LBB5_1 ; WIDEN_KNL-NEXT: # %bb.2: # %else ; WIDEN_KNL-NEXT: testb $2, %al diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -4521,7 +4521,7 @@ ; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al 
+; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: jne LBB24_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %al @@ -4940,7 +4940,7 @@ ; SSE42-NEXT: pmovmskb %xmm1, %eax ; SSE42-NEXT: shll $16, %eax ; SSE42-NEXT: orl %ecx, %eax -; SSE42-NEXT: testb $1, %al +; SSE42-NEXT: testb $1, %cl ; SSE42-NEXT: jne LBB24_1 ; SSE42-NEXT: ## %bb.2: ## %else ; SSE42-NEXT: testb $2, %al @@ -5174,7 +5174,7 @@ ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB24_1 ; AVX1-NEXT: ## %bb.2: ## %else ; AVX1-NEXT: testb $2, %al @@ -6563,20 +6563,13 @@ } define <8 x float> @mload_constmask_v8f32_zero(ptr %addr, <8 x float> %dst) { -; SSE2-LABEL: mload_constmask_v8f32_zero: -; SSE2: ## %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE42-LABEL: mload_constmask_v8f32_zero: -; SSE42: ## %bb.0: -; SSE42-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],zero -; SSE42-NEXT: xorps %xmm1, %xmm1 -; SSE42-NEXT: retq +; SSE-LABEL: mload_constmask_v8f32_zero: +; SSE: ## %bb.0: +; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: retq ; ; AVX1OR2-LABEL: mload_constmask_v8f32_zero: ; AVX1OR2: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -3250,7 +3250,7 @@ ; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm2, %ecx ; SSE2-NEXT: jne LBB16_1 ; SSE2-NEXT: ## %bb.2: ## %else @@ -3458,7 +3458,7 @@ ; SSE4-NEXT: pmovmskb %xmm1, %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne LBB16_1 ; SSE4-NEXT: ## %bb.2: ## %else ; SSE4-NEXT: testb $2, %al @@ -3693,7 +3693,7 @@ ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne LBB16_1 ; AVX1-NEXT: ## %bb.2: ## %else ; AVX1-NEXT: testb $2, %al @@ -5241,10 +5241,10 @@ ; SSE2-NEXT: andb $1, %dl ; SSE2-NEXT: addb %dl, %dl ; SSE2-NEXT: orb %sil, %dl -; SSE2-NEXT: andb $1, %cl ; SSE2-NEXT: shlb $2, %cl ; SSE2-NEXT: orb %dl, %cl -; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: andb $7, %cl +; SSE2-NEXT: testb %sil, %sil ; SSE2-NEXT: jne LBB28_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %cl @@ -5274,10 +5274,10 @@ ; SSE4-NEXT: andb $1, %dl ; SSE4-NEXT: addb %dl, %dl ; SSE4-NEXT: orb %sil, %dl -; SSE4-NEXT: andb $1, %cl ; SSE4-NEXT: shlb $2, %cl ; SSE4-NEXT: orb %dl, %cl -; SSE4-NEXT: testb $1, %cl +; SSE4-NEXT: andb $7, %cl +; SSE4-NEXT: testb %sil, %sil ; SSE4-NEXT: jne LBB28_1 ; SSE4-NEXT: ## %bb.2: ## %else ; SSE4-NEXT: testb $2, %cl @@ -5623,37 +5623,38 @@ ; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pcmpgtd (%rdi), %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} 
xmm9 = xmm6[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = mem[0,2,2,3] +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm10 +; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm10[0,1,0,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = mem[0,2,2,3] +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[0,1,0,2,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSE2-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = mem[0,2,2,3] ; SSE2-NEXT: pxor %xmm9, %xmm9 ; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 ; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,2,2,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,0,2,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = mem[0,2,2,3] -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-NEXT: pshuflw {{.*#+}} xmm9 = xmm10[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,2,2,3] -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = mem[0,2,2,3] -; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: pmovmskb %xmm9, %r11d +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE2-NEXT: pmovmskb %xmm10, %r11d ; SSE2-NEXT: andl $21845, %r11d ## imm = 0x5555 ; SSE2-NEXT: pmovmskb %xmm7, %edi ; SSE2-NEXT: andl $85, %edi ; SSE2-NEXT: shll $16, %edi ; SSE2-NEXT: orl %r11d, %edi -; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: movd %xmm6, %r11d +; SSE2-NEXT: testb $1, %r11b ; SSE2-NEXT: jne LBB31_1 ; SSE2-NEXT: ## %bb.2: ## %else ; SSE2-NEXT: testb $2, %dil @@ -5898,24 +5899,23 @@ ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE4-NEXT: movl 56(%rsi), %eax ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: movl 52(%rsi), %eax -; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpgtd 48(%rdi), %xmm1 -; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] ; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpgtd 32(%rdi), %xmm2 +; SSE4-NEXT: pcmpgtd 48(%rdi), %xmm2 ; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] -; SSE4-NEXT: packusdw %xmm1, %xmm2 ; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpgtd 16(%rdi), %xmm1 +; SSE4-NEXT: pcmpgtd 32(%rdi), %xmm1 ; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] +; SSE4-NEXT: packusdw %xmm2, %xmm1 +; SSE4-NEXT: pxor %xmm2, %xmm2 +; SSE4-NEXT: pcmpgtd 16(%rdi), %xmm2 +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] ; SSE4-NEXT: pxor %xmm3, %xmm3 ; SSE4-NEXT: pcmpgtd (%rdi), %xmm3 +; SSE4-NEXT: movd %xmm3, %eax ; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] -; SSE4-NEXT: packusdw %xmm1, %xmm3 ; 
SSE4-NEXT: packusdw %xmm2, %xmm3 +; SSE4-NEXT: packusdw %xmm1, %xmm3 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpgtd 80(%rdi), %xmm1 ; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] @@ -5924,14 +5924,16 @@ ; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] ; SSE4-NEXT: packusdw %xmm1, %xmm2 ; SSE4-NEXT: packusdw %xmm2, %xmm2 -; SSE4-NEXT: pmovmskb %xmm3, %eax -; SSE4-NEXT: andl $21845, %eax ## imm = 0x5555 +; SSE4-NEXT: pmovmskb %xmm3, %ecx +; SSE4-NEXT: andl $21845, %ecx ## imm = 0x5555 ; SSE4-NEXT: pmovmskb %xmm2, %edi ; SSE4-NEXT: andl $85, %edi ; SSE4-NEXT: shll $16, %edi -; SSE4-NEXT: orl %eax, %edi +; SSE4-NEXT: orl %ecx, %edi +; SSE4-NEXT: movl 52(%rsi), %ecx +; SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: movl 48(%rsi), %r13d -; SSE4-NEXT: testb $1, %dil ; SSE4-NEXT: movl 44(%rsi), %eax ; SSE4-NEXT: movl 40(%rsi), %ecx ; SSE4-NEXT: movl 36(%rsi), %r8d @@ -6176,26 +6178,24 @@ ; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpgtd 32(%rdi), %ymm3, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpgtd (%rdi), %ymm3, %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vpackssdw %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpcmpgtd 64(%rdi), %ymm3, %ymm3 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 ; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpacksswb %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 -; AVX2-NEXT: vpmaskmovd %ymm1, %ymm5, 32(%rdx) -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-NEXT: vpacksswb %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,u],zero,xmm3[u,6,u],zero,xmm3[u,12,u],zero,xmm3[u,14,u],zero,xmm3[u] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX2-NEXT: vpslld $31, %ymm4, %ymm4 +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm4, 32(%rdx) +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,u],zero,xmm3[u,2,u],zero,xmm3[u,8,u],zero,xmm3[u,10,u],zero,xmm3[u] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 ; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdx) -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx) @@ -6442,7 +6442,7 @@ ; ; AVX2-LABEL: undefshuffle: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; 
AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -1307,8 +1307,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB4_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1538,8 +1539,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB5_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1777,8 +1779,9 @@ ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB7_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1896,8 +1899,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB8_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4191,8 +4195,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB13_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4365,8 +4370,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB14_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4435,7 +4441,7 @@ ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else @@ -4652,7 +4658,7 @@ ; SSE4-NEXT: notl %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne .LBB15_1 ; SSE4-NEXT: # %bb.2: # %else ; SSE4-NEXT: testb $2, %al @@ -4897,7 +4903,7 @@ ; AVX1-NEXT: notl %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne .LBB15_1 ; AVX1-NEXT: # %bb.2: # %else ; AVX1-NEXT: testb 
$2, %al diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -16,103 +16,108 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 ; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pandn %xmm9, %xmm6 ; SSE2-NEXT: por %xmm2, %xmm6 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm9, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm11 -; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm1 +; 
SSE2-NEXT: pandn %xmm9, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067968,18446744071562067968] -; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm0, %xmm12 -; SSE2-NEXT: por %xmm11, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pandn %xmm0, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm12[0,2] -; SSE2-NEXT: movdqa %xmm2, %xmm11 -; SSE2-NEXT: pxor %xmm8, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[0,2] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] ; SSE2-NEXT: pxor %xmm6, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm12, %xmm10 +; SSE2-NEXT: movdqa %xmm8, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm11, %xmm11 +; SSE2-NEXT: pxor %xmm11, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 -; SSE2-NEXT: pxor %xmm9, 
%xmm4 +; SSE2-NEXT: pxor %xmm11, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax @@ -121,8 +126,8 @@ ; SSE2-NEXT: # %bb.1: # %cond.store ; SSE2-NEXT: movss %xmm1, (%rdi) ; SSE2-NEXT: .LBB0_2: # %else -; SSE2-NEXT: por %xmm11, %xmm3 -; SSE2-NEXT: por %xmm8, %xmm10 +; SSE2-NEXT: por %xmm10, %xmm3 +; SSE2-NEXT: por %xmm9, %xmm8 ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB0_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 @@ -131,8 +136,8 @@ ; SSE2-NEXT: .LBB0_4: # %else2 ; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: pandn %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm8, %xmm6 +; SSE2-NEXT: pandn %xmm0, %xmm8 ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 @@ -140,7 +145,7 @@ ; SSE2-NEXT: movd %xmm0, 8(%rdi) ; SSE2-NEXT: .LBB0_6: # %else4 ; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: por %xmm10, %xmm6 +; SSE2-NEXT: por %xmm8, %xmm6 ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB0_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 @@ -347,29 +352,30 @@ ; AVX512F-LABEL: truncstore_v8i64_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512VL-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v8i64_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512BW-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -390,110 +396,115 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm8 ; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd 
%xmm10, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: pandn %xmm9, %xmm8 ; SSE2-NEXT: por %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm10 -; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm11 -; SSE2-NEXT: por %xmm1, %xmm11 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] -; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pandn %xmm1, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm3, %xmm0 ; 
SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm12, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm3, %xmm12 -; SSE2-NEXT: por %xmm11, %xmm12 -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: packssdw %xmm12, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: packssdw %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm10 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm12, %xmm10 +; SSE2-NEXT: por %xmm3, %xmm10 ; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm10 +; SSE2-NEXT: pandn %xmm1, %xmm10 ; SSE2-NEXT: por %xmm2, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm8 -; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm8, %xmm2 ; SSE2-NEXT: packssdw %xmm10, %xmm2 ; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 @@ -909,19 +920,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftld $24, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 +; AVX512BW-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -942,111 +953,116 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm2, %xmm8 ; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: pandn %xmm9, %xmm8 ; SSE2-NEXT: por %xmm2, %xmm8 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm12[1,1,3,3] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm2[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm9, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,0,2,2] -; SSE2-NEXT: pand %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm10 -; SSE2-NEXT: pand %xmm10, %xmm0 -; SSE2-NEXT: pandn %xmm9, %xmm10 -; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm13 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm13, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pandn %xmm9, %xmm3 +; 
SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm10, %xmm11 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,2,2] -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm11[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm9, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm12, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm11, %xmm1 -; SSE2-NEXT: pandn %xmm9, %xmm11 -; SSE2-NEXT: por %xmm1, %xmm11 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm11, %xmm0 +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pandn %xmm1, %xmm11 +; SSE2-NEXT: por %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm12 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] +; SSE2-NEXT: movdqa %xmm0, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm12, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm11 -; SSE2-NEXT: pandn %xmm3, %xmm12 -; SSE2-NEXT: por %xmm11, %xmm12 -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,3,3] ; SSE2-NEXT: por %xmm13, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm10, %xmm0 -; SSE2-NEXT: packssdw %xmm12, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm11 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: packssdw %xmm11, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm10 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2] -; SSE2-NEXT: pand %xmm11, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; 
SSE2-NEXT: pand %xmm11, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm12, %xmm10 +; SSE2-NEXT: por %xmm3, %xmm10 ; SSE2-NEXT: pand %xmm10, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm10 +; SSE2-NEXT: pandn %xmm1, %xmm10 ; SSE2-NEXT: por %xmm2, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2] -; SSE2-NEXT: pand %xmm2, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE2-NEXT: por %xmm7, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm8 -; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm8, %xmm2 ; SSE2-NEXT: packssdw %xmm10, %xmm2 ; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 @@ -1458,19 +1474,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1491,25 +1507,26 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = 
xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm1 @@ -1518,32 +1535,33 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm1, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2] +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB3_1 @@ -1687,9 +1705,8 @@ ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovsqd %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1722,25 +1739,26 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147516415,2147516415] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm1 @@ -1749,33 +1767,34 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm7, %xmm6 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm7 +; SSE2-NEXT: pandn %xmm1, %xmm8 +; SSE2-NEXT: por %xmm7, %xmm8 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm5 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: packssdw %xmm6, %xmm0 +; SSE2-NEXT: packssdw %xmm8, %xmm0 ; SSE2-NEXT: packssdw %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %eax ; SSE2-NEXT: xorl $15, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB4_1 @@ -1968,8 +1987,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: 
vpmovsqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB4_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2015,9 +2035,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2038,25 +2057,26 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn %xmm6, %xmm5 ; SSE2-NEXT: por %xmm1, %xmm5 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm8 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,2,2] -; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm0 @@ -2065,37 +2085,38 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] +; SSE2-NEXT: pand %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,3,3] ; SSE2-NEXT: por %xmm10, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm7 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 
= xmm4[0,0,2,2] -; SSE2-NEXT: pand %xmm6, %xmm7 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pandn %xmm1, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm6 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm6, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: movmskps %xmm2, %ecx +; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE2-NEXT: movmskps %xmm3, %ecx ; SSE2-NEXT: xorl $15, %ecx ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax @@ -2294,8 +2315,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB5_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2341,9 +2363,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2363,34 +2384,36 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744069414584320,18446744069414584320] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = 
xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movmskpd %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax ; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB6_1 @@ -2490,9 +2513,8 @@ ; AVX512VL-LABEL: truncstore_v2i64_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovsqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v2i64_v2i32: @@ -2523,35 +2545,37 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147516415,2147516415] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147516415,2147516415] +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562035200,18446744071562035200] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: por %xmm5, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movmskpd %xmm2, %eax +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: movmskpd %xmm1, %eax ; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB7_1 @@ -2668,8 +2692,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; 
AVX512F-NEXT: jne .LBB7_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2701,9 +2726,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp slt <2 x i64> %x, @@ -2722,25 +2746,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483775,2147483775] +; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 ; SSE2-NEXT: por %xmm0, %xmm5 ; SSE2-NEXT: pxor %xmm5, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744071562067840,18446744071562067840] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm5 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2749,9 +2775,9 @@ ; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: packuswb %xmm3, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movmskpd %xmm0, %eax ; SSE2-NEXT: xorl $3, %eax ; SSE2-NEXT: testb $1, %al @@ -2865,8 +2891,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB8_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2898,9 +2925,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: 
retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp slt <2 x i64> %x, @@ -3580,18 +3606,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -4251,18 +4275,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -4640,9 +4662,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -5019,9 +5040,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -5152,8 +5172,9 @@ ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB13_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -5198,9 +5219,8 @@ ; AVX512BWVL-LABEL: 
truncstore_v4i32_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp slt <4 x i32> %x, @@ -5392,8 +5412,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB14_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -5439,9 +5460,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i32_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovsdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp slt <4 x i32> %x, @@ -5466,7 +5486,7 @@ ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else @@ -5678,7 +5698,7 @@ ; SSE4-NEXT: notl %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne .LBB15_1 ; SSE4-NEXT: # %bb.2: # %else ; SSE4-NEXT: testb $2, %al @@ -5920,7 +5940,7 @@ ; AVX1-NEXT: notl %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne .LBB15_1 ; AVX1-NEXT: # %bb.2: # %else ; AVX1-NEXT: testb $2, %al @@ -6621,19 +6641,18 @@ ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kmovd %k0, %k1 +; AVX512BW-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <32 x i8> %mask, zeroinitializer @@ -7259,9 +7278,8 @@ ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} 
+; AVX512BWVL-NEXT: vpmovswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i8> %mask, zeroinitializer @@ -7556,9 +7574,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i16_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i16> %mask, zeroinitializer %b = icmp slt <8 x i16> %x, diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -11,51 +11,51 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-LABEL: truncstore_v8i64_v8i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pxor %xmm7, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm10, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm7, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm12, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm1 -; SSE2-NEXT: pandn %xmm6, %xmm9 -; SSE2-NEXT: por %xmm1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm8, %xmm10 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] +; SSE2-NEXT: pand %xmm11, %xmm10 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa %xmm7, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pand %xmm11, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm6, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,2] ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd 
%xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: pxor %xmm10, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm12, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm10, %xmm10 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm10, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 +; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax @@ -64,8 +64,8 @@ ; SSE2-NEXT: # %bb.1: # %cond.store ; SSE2-NEXT: movss %xmm1, (%rdi) ; SSE2-NEXT: .LBB0_2: # %else -; SSE2-NEXT: pand %xmm11, %xmm0 -; SSE2-NEXT: pand %xmm7, %xmm9 +; SSE2-NEXT: pand %xmm10, %xmm0 +; SSE2-NEXT: pand %xmm11, %xmm8 ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB0_4 ; SSE2-NEXT: # %bb.3: # %cond.store1 @@ -74,8 +74,8 @@ ; SSE2-NEXT: .LBB0_4: # %else2 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm9, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm6, %xmm8 ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB0_6 ; SSE2-NEXT: # %bb.5: # %cond.store3 @@ -83,7 +83,7 @@ ; SSE2-NEXT: movd %xmm4, 8(%rdi) ; SSE2-NEXT: .LBB0_6: # %else4 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: por %xmm9, %xmm2 +; SSE2-NEXT: por %xmm8, %xmm2 ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB0_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 @@ -280,26 +280,30 @@ ; AVX512F-LABEL: truncstore_v8i64_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512F-NEXT: kshiftlw $8, %k0, %k0 +; AVX512F-NEXT: kshiftrw $8, %k0, %k1 +; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512VL-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v8i64_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 +; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -318,49 +322,49 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: 
pxor %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: pand %xmm12, %xmm7 +; SSE2-NEXT: pand %xmm11, %xmm7 ; SSE2-NEXT: pand %xmm7, %xmm2 ; SSE2-NEXT: pandn %xmm8, %xmm7 ; SSE2-NEXT: por %xmm2, %xmm7 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,0,2,2] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pand %xmm12, %xmm2 +; SSE2-NEXT: pand %xmm11, %xmm2 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm8, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm10, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pand %xmm12, %xmm3 +; SSE2-NEXT: pand %xmm11, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn %xmm8, %xmm3 ; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm11, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pand %xmm10, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm10, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] +; SSE2-NEXT: pand %xmm0, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm8, %xmm9 +; SSE2-NEXT: por %xmm1, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] @@ -778,17 +782,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftld $24, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 +; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i16: ; AVX512BWVL: # %bb.0: 
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -807,51 +813,51 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm10 ; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 ; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 -; SSE2-NEXT: pand %xmm12, %xmm10 +; SSE2-NEXT: pand %xmm11, %xmm10 ; SSE2-NEXT: pand %xmm10, %xmm1 ; SSE2-NEXT: pandn %xmm7, %xmm10 ; SSE2-NEXT: por %xmm1, %xmm10 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 -; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pand %xmm11, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm7, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: packuswb %xmm10, %xmm1 ; SSE2-NEXT: movdqa %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] -; SSE2-NEXT: movdqa %xmm9, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: movdqa %xmm9, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm11, %xmm0 +; SSE2-NEXT: pand %xmm10, %xmm0 ; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm7, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm10, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE2-NEXT: pand %xmm9, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm7, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: packuswb %xmm0, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm9, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] +; SSE2-NEXT: pand %xmm3, %xmm8 +; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pandn %xmm7, %xmm8 +; SSE2-NEXT: por %xmm2, %xmm8 +; SSE2-NEXT: packuswb %xmm0, %xmm8 +; SSE2-NEXT: packuswb %xmm8, %xmm1 ; SSE2-NEXT: packuswb %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 @@ -1256,17 +1262,19 @@ ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 
killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 +; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1285,22 +1293,22 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pandn %xmm4, %xmm6 ; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm7, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSE2-NEXT: pand %xmm8, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm1 @@ -1448,8 +1456,8 @@ ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1480,27 +1488,27 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pxor %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm9, %xmm6 +; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm6, %xmm0 ; SSE2-NEXT: pandn %xmm4, %xmm6 ; SSE2-NEXT: por %xmm0, %xmm6 -; 
SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE2-NEXT: pand %xmm0, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] @@ -1696,8 +1704,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB4_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -1743,8 +1752,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -1763,29 +1772,29 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm7, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE2-NEXT: pand %xmm9, %xmm4 +; SSE2-NEXT: pand %xmm8, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn %xmm5, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm5, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm7, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm0, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm5, %xmm6 +; SSE2-NEXT: por %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: packuswb %xmm0, %xmm4 +; SSE2-NEXT: 
packuswb %xmm6, %xmm4 ; SSE2-NEXT: packuswb %xmm4, %xmm4 ; SSE2-NEXT: packuswb %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -1985,8 +1994,9 @@ ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB5_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2032,8 +2042,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i64_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -2050,11 +2060,12 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647] -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2160,8 +2171,8 @@ ; AVX512VL-LABEL: truncstore_v2i64_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v2i64_v2i32: @@ -2189,11 +2200,12 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2317,8 +2329,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB7_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2350,8 +2363,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; 
AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp ult <2 x i64> %x, @@ -2367,11 +2380,12 @@ ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] ; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: pand %xmm5, %xmm3 ; SSE2-NEXT: pand %xmm3, %xmm0 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 @@ -2493,8 +2507,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB8_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -2526,8 +2541,8 @@ ; AVX512BWVL-LABEL: truncstore_v2i64_v2i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp ult <2 x i64> %x, @@ -2541,22 +2556,22 @@ ; SSE2-LABEL: truncstore_v16i32_v16i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535] ; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: pxor %xmm11, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm10, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm9, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: pxor %xmm11, %xmm13 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm9, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm13, %xmm8 ; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm8 +; SSE2-NEXT: pandn %xmm10, %xmm8 ; SSE2-NEXT: por %xmm0, %xmm8 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm13 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm13 ; SSE2-NEXT: pand %xmm13, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm13 +; SSE2-NEXT: pandn %xmm10, %xmm13 ; SSE2-NEXT: por %xmm1, %xmm13 ; SSE2-NEXT: pslld $16, %xmm13 ; SSE2-NEXT: psrad $16, %xmm13 @@ -2564,14 +2579,15 @@ ; SSE2-NEXT: psrad $16, %xmm8 ; SSE2-NEXT: packssdw %xmm13, %xmm8 ; SSE2-NEXT: pcmpeqd %xmm12, %xmm7 -; SSE2-NEXT: pxor %xmm9, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm12, %xmm6 -; SSE2-NEXT: pxor %xmm9, %xmm6 +; SSE2-NEXT: pxor %xmm0, %xmm6 ; SSE2-NEXT: packssdw %xmm7, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm12, %xmm5 -; SSE2-NEXT: pxor %xmm9, %xmm5 +; SSE2-NEXT: pxor %xmm0, %xmm5 ; 
SSE2-NEXT: pcmpeqd %xmm12, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm4 +; SSE2-NEXT: pxor %xmm0, %xmm4 ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm6, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax @@ -2596,9 +2612,9 @@ ; SSE2-NEXT: pextrw $2, %xmm8, %ecx ; SSE2-NEXT: movw %cx, 4(%rdi) ; SSE2-NEXT: .LBB9_6: # %else4 -; SSE2-NEXT: movdqa %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm10 +; SSE2-NEXT: pcmpgtd %xmm11, %xmm9 ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB9_8 ; SSE2-NEXT: # %bb.7: # %cond.store5 @@ -2606,9 +2622,9 @@ ; SSE2-NEXT: movw %cx, 6(%rdi) ; SSE2-NEXT: .LBB9_8: # %else6 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm10, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm0 +; SSE2-NEXT: pand %xmm9, %xmm3 +; SSE2-NEXT: pandn %xmm10, %xmm9 ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB9_10 ; SSE2-NEXT: # %bb.9: # %cond.store7 @@ -3237,16 +3253,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -3946,16 +3962,16 @@ ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -3970,30 +3986,31 @@ ; SSE2-LABEL: truncstore_v8i32_v8i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm7, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: pxor %xmm7, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm9, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm6, %xmm4 ; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm7 -; SSE2-NEXT: 
pcmpgtd %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm8 -; SSE2-NEXT: por %xmm1, %xmm8 -; SSE2-NEXT: pslld $16, %xmm8 -; SSE2-NEXT: psrad $16, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm9 +; SSE2-NEXT: pand %xmm9, %xmm1 +; SSE2-NEXT: pandn %xmm6, %xmm9 +; SSE2-NEXT: por %xmm1, %xmm9 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: packssdw %xmm8, %xmm4 +; SSE2-NEXT: packssdw %xmm9, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax @@ -4357,8 +4374,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -4754,8 +4771,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i32_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -4934,8 +4951,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB13_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -4981,8 +4999,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i32_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdw %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp ult <4 x i32> %x, @@ -5163,8 +5181,9 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: testb $1, %al +; AVX512F-NEXT: testb $1, %cl ; AVX512F-NEXT: jne .LBB14_1 ; AVX512F-NEXT: # %bb.2: # %else ; AVX512F-NEXT: testb $2, %al @@ -5210,8 +5229,8 @@ ; AVX512BWVL-LABEL: truncstore_v4i32_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp ult <4 x i32> %x, @@ -5241,7 +5260,7 @@ ; SSE2-NEXT: notl %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: testb $1, %al +; SSE2-NEXT: 
testb $1, %cl ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB15_1 ; SSE2-NEXT: # %bb.2: # %else @@ -5462,7 +5481,7 @@ ; SSE4-NEXT: notl %eax ; SSE4-NEXT: shll $16, %eax ; SSE4-NEXT: orl %ecx, %eax -; SSE4-NEXT: testb $1, %al +; SSE4-NEXT: testb $1, %cl ; SSE4-NEXT: jne .LBB15_1 ; SSE4-NEXT: # %bb.2: # %else ; SSE4-NEXT: testb $2, %al @@ -5709,7 +5728,7 @@ ; AVX1-NEXT: notl %eax ; AVX1-NEXT: shll $16, %eax ; AVX1-NEXT: orl %ecx, %eax -; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: testb $1, %cl ; AVX1-NEXT: jne .LBB15_1 ; AVX1-NEXT: # %bb.2: # %else ; AVX1-NEXT: testb $2, %al @@ -6419,17 +6438,18 @@ ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: kmovd %k0, %k1 +; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0 +; AVX512BWVL-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <32 x i8> %mask, zeroinitializer @@ -7068,8 +7088,8 @@ ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i8> %mask, zeroinitializer @@ -7369,8 +7389,8 @@ ; AVX512BWVL-LABEL: truncstore_v8i16_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovuswb %xmm0, %xmm0 +; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i16> %mask, zeroinitializer %b = icmp ult <8 x i16> %x, diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -180,24 +180,22 @@ ; SSE-NEXT: movss {{.*#+}} xmm11 = mem[0],zero,zero,zero ; SSE-NEXT: addss %xmm13, %xmm1 ; SSE-NEXT: addss %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: mulss %xmm11, %xmm7 -; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0,0,1,1] +; SSE-NEXT: movaps %xmm2, %xmm12 +; SSE-NEXT: mulss %xmm11, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,0,0,0] ; SSE-NEXT: mulps %xmm0, %xmm11 -; SSE-NEXT: movaps %xmm5, %xmm12 -; SSE-NEXT: mulss %xmm10, %xmm12 -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: mulss %xmm10, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,0,0,0] ; SSE-NEXT: mulps %xmm3, %xmm10 ; SSE-NEXT: addps %xmm11, %xmm10 ; SSE-NEXT: movaps %xmm9, %xmm11 ; SSE-NEXT: mulss %xmm8, %xmm11 -; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0,0,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0,0,0] ; SSE-NEXT: mulps %xmm6, %xmm8 ; SSE-NEXT: addps 
%xmm10, %xmm8 -; SSE-NEXT: addss %xmm7, %xmm12 -; SSE-NEXT: addss %xmm11, %xmm12 -; SSE-NEXT: movaps %xmm8, %xmm7 -; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] +; SSE-NEXT: addss %xmm12, %xmm7 +; SSE-NEXT: addss %xmm11, %xmm7 ; SSE-NEXT: movss {{.*#+}} xmm10 = mem[0],zero,zero,zero ; SSE-NEXT: mulss %xmm10, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1] @@ -212,11 +210,12 @@ ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0,0,1,1] ; SSE-NEXT: mulps %xmm6, %xmm3 ; SSE-NEXT: addps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm3[0,1] ; SSE-NEXT: addss %xmm2, %xmm5 ; SSE-NEXT: addss %xmm9, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] +; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm8[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[0,1] ; SSE-NEXT: movss %xmm5, 32(%rdi) ; SSE-NEXT: movaps %xmm7, 16(%rdi) ; SSE-NEXT: movaps %xmm4, (%rdi) @@ -256,7 +255,6 @@ ; AVX1-NEXT: vaddss %xmm4, %xmm9, %xmm4 ; AVX1-NEXT: vmulss %xmm10, %xmm8, %xmm9 ; AVX1-NEXT: vaddss %xmm4, %xmm9, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3] ; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 ; AVX1-NEXT: vmulps %xmm0, %xmm9, %xmm0 ; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm10 @@ -270,15 +268,13 @@ ; AVX1-NEXT: vaddss %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vmulss %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm7[1,1,3,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,2,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-NEXT: vmovss %xmm2, 32(%rdi) -; AVX1-NEXT: vmovaps %ymm0, (%rdi) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vmovaps %xmm1, (%rdi) +; AVX1-NEXT: vmovaps %xmm0, 16(%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_mul3x3_f32: @@ -315,36 +311,35 @@ ; AVX2-NEXT: vaddss %xmm4, %xmm9, %xmm4 ; AVX2-NEXT: vmulss %xmm10, %xmm8, %xmm9 ; AVX2-NEXT: vaddss %xmm4, %xmm9, %xmm4 -; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3] -; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm7 -; AVX2-NEXT: vmulps %xmm7, %xmm0, %xmm0 ; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 -; AVX2-NEXT: vmulps %xmm3, %xmm9, %xmm3 +; AVX2-NEXT: vmulps %xmm0, %xmm9, %xmm0 +; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm10 +; AVX2-NEXT: vmulps %xmm3, %xmm10, %xmm3 ; AVX2-NEXT: vaddps %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm3 ; AVX2-NEXT: vmulps %xmm3, %xmm6, %xmm6 ; AVX2-NEXT: vaddps %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vmulss %xmm7, %xmm2, %xmm2 -; AVX2-NEXT: vmulss %xmm5, %xmm9, %xmm5 +; AVX2-NEXT: vmulss %xmm2, %xmm9, %xmm2 +; AVX2-NEXT: vmulss %xmm5, %xmm10, %xmm5 ; AVX2-NEXT: vaddss %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vmulss %xmm3, %xmm8, %xmm3 ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,1,2,4,5,6,u,u> -; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = 
xmm7[1,1,3,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0] ; AVX2-NEXT: vmovss %xmm2, 32(%rdi) -; AVX2-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vmovaps %xmm1, (%rdi) +; AVX2-NEXT: vmovaps %xmm0, 16(%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_mul3x3_f32: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] -; AVX512F-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX512F-NEXT: vmulps %xmm3, %xmm0, %xmm3 +; AVX512F-NEXT: vbroadcastss %xmm1, %xmm2 +; AVX512F-NEXT: vmulps %xmm2, %xmm0, %xmm3 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512F-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[2,3] ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] ; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm4 ; AVX512F-NEXT: vaddps %xmm4, %xmm3, %xmm4 @@ -356,105 +351,108 @@ ; AVX512F-NEXT: vaddps %xmm4, %xmm9, %xmm9 ; AVX512F-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX512F-NEXT: vmulss %xmm1, %xmm4, %xmm10 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm5[1,1,3,3] -; AVX512F-NEXT: vmulss %xmm6, %xmm5, %xmm6 -; AVX512F-NEXT: vaddss %xmm6, %xmm10, %xmm6 -; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm10 -; AVX512F-NEXT: vmulss %xmm8, %xmm10, %xmm8 -; AVX512F-NEXT: vaddss %xmm6, %xmm8, %xmm6 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm9[0,1],xmm6[0],xmm9[3] +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm5[1,1,3,3] +; AVX512F-NEXT: vmulss %xmm6, %xmm11, %xmm5 +; AVX512F-NEXT: vaddss %xmm5, %xmm10, %xmm5 +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm6 +; AVX512F-NEXT: vmulss %xmm6, %xmm8, %xmm8 +; AVX512F-NEXT: vaddss %xmm5, %xmm8, %xmm5 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm5[0],xmm9[3] ; AVX512F-NEXT: vmulps %xmm7, %xmm0, %xmm8 ; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm9 -; AVX512F-NEXT: vmovsldup {{.*#+}} xmm11 = xmm9[0,0,2,2] -; AVX512F-NEXT: vmulps %xmm2, %xmm11, %xmm11 -; AVX512F-NEXT: vaddps %xmm11, %xmm8, %xmm8 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm9[1,1,3,3] -; AVX512F-NEXT: vmulps %xmm3, %xmm11, %xmm12 +; AVX512F-NEXT: vmovsldup {{.*#+}} xmm10 = xmm9[0,0,2,2] +; AVX512F-NEXT: vmulps %xmm2, %xmm10, %xmm10 +; AVX512F-NEXT: vaddps %xmm10, %xmm8, %xmm8 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm9[1,1,3,3] +; AVX512F-NEXT: vmulps %xmm3, %xmm10, %xmm12 ; AVX512F-NEXT: vaddps %xmm12, %xmm8, %xmm8 ; AVX512F-NEXT: vmulss %xmm7, %xmm4, %xmm7 -; AVX512F-NEXT: vmulss %xmm5, %xmm9, %xmm12 +; AVX512F-NEXT: vmulss %xmm9, %xmm11, %xmm12 ; AVX512F-NEXT: vaddss %xmm7, %xmm12, %xmm7 -; AVX512F-NEXT: vmulss %xmm11, %xmm10, %xmm11 -; AVX512F-NEXT: vaddss %xmm7, %xmm11, %xmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3] -; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3] -; AVX512F-NEXT: vshufpd {{.*#+}} xmm11 = xmm9[1,0] +; AVX512F-NEXT: vmulss %xmm6, %xmm10, %xmm10 +; AVX512F-NEXT: vaddss %xmm7, %xmm10, %xmm7 +; AVX512F-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3] +; AVX512F-NEXT: vshufpd {{.*#+}} xmm12 = xmm9[1,0] ; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2] ; AVX512F-NEXT: vmulps %xmm0, %xmm9, %xmm0 -; AVX512F-NEXT: vmulps %xmm2, %xmm8, %xmm2 +; AVX512F-NEXT: vmulps %xmm2, %xmm10, %xmm2 ; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm1 ; AVX512F-NEXT: vbroadcastss %xmm1, %xmm2 ; AVX512F-NEXT: vmulps %xmm2, %xmm3, %xmm2 ; AVX512F-NEXT: vaddps 
%xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vmulss %xmm4, %xmm11, %xmm2 -; AVX512F-NEXT: vmulss %xmm5, %xmm8, %xmm3 +; AVX512F-NEXT: vmulss %xmm4, %xmm12, %xmm2 +; AVX512F-NEXT: vmulss %xmm10, %xmm11, %xmm3 ; AVX512F-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vmulss %xmm1, %xmm10, %xmm1 +; AVX512F-NEXT: vmulss %xmm1, %xmm6, %xmm1 ; AVX512F-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm2 -; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm8[1,1,3,3] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[2,3] +; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1,2],xmm8[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_mul3x3_f32: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] -; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm3 -; AVX512VL-NEXT: vmulps %xmm3, %xmm0, %xmm3 -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2 +; AVX512VL-NEXT: vmulps %xmm2, %xmm0, %xmm2 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vmulps %xmm5, %xmm2, %xmm6 -; AVX512VL-NEXT: vaddps %xmm6, %xmm3, %xmm3 -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm4[1,0] +; AVX512VL-NEXT: vmulps %xmm5, %xmm4, %xmm6 +; AVX512VL-NEXT: vaddps %xmm6, %xmm2, %xmm2 +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm3[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3] ; AVX512VL-NEXT: vshufpd {{.*#+}} xmm8 = xmm1[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2] ; AVX512VL-NEXT: vmulps %xmm6, %xmm9, %xmm9 -; AVX512VL-NEXT: vaddps %xmm3, %xmm9, %xmm3 +; AVX512VL-NEXT: vaddps %xmm2, %xmm9, %xmm2 ; AVX512VL-NEXT: vshufpd {{.*#+}} xmm9 = xmm0[1,0] ; AVX512VL-NEXT: vmulss %xmm1, %xmm9, %xmm10 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm4[1,1,3,3] -; AVX512VL-NEXT: vmulss %xmm5, %xmm4, %xmm5 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX512VL-NEXT: vmulss %xmm5, %xmm3, %xmm5 ; AVX512VL-NEXT: vaddss %xmm5, %xmm10, %xmm5 ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm10 ; AVX512VL-NEXT: vmulss %xmm8, %xmm10, %xmm8 ; AVX512VL-NEXT: vaddss %xmm5, %xmm8, %xmm5 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm5[0],xmm2[3] ; AVX512VL-NEXT: vmulps %xmm7, %xmm0, %xmm5 ; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm8 ; AVX512VL-NEXT: vmovsldup {{.*#+}} xmm11 = xmm8[0,0,2,2] -; AVX512VL-NEXT: vmulps %xmm2, %xmm11, %xmm11 +; AVX512VL-NEXT: vmulps %xmm4, %xmm11, %xmm11 ; AVX512VL-NEXT: vaddps %xmm5, %xmm11, %xmm5 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm11 = xmm8[1,1,3,3] ; AVX512VL-NEXT: vmulps %xmm6, %xmm11, %xmm12 ; AVX512VL-NEXT: vaddps %xmm5, %xmm12, %xmm5 ; AVX512VL-NEXT: vmulss %xmm7, %xmm9, %xmm7 -; AVX512VL-NEXT: vmulss %xmm4, %xmm8, %xmm12 +; AVX512VL-NEXT: vmulss %xmm3, %xmm8, %xmm12 ; AVX512VL-NEXT: vaddss %xmm7, %xmm12, %xmm7 ; AVX512VL-NEXT: vmulss %xmm11, %xmm10, %xmm11 ; AVX512VL-NEXT: vaddss %xmm7, %xmm11, %xmm7 -; AVX512VL-NEXT: vinsertps 
{{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] -; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] -; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = xmm8[1,0] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm11 = xmm8[3,3,3,3] +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm12 = xmm8[1,0] ; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2] ; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0 -; AVX512VL-NEXT: vmulps %xmm7, %xmm2, %xmm2 -; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vmulps %xmm4, %xmm11, %xmm4 +; AVX512VL-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX512VL-NEXT: vextractf32x4 $2, %zmm1, %xmm1 -; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2 -; AVX512VL-NEXT: vmulps %xmm2, %xmm6, %xmm2 -; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vmulss %xmm11, %xmm9, %xmm2 -; AVX512VL-NEXT: vmulss %xmm7, %xmm4, %xmm4 -; AVX512VL-NEXT: vaddss %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm4 +; AVX512VL-NEXT: vmulps %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmulss %xmm12, %xmm9, %xmm4 +; AVX512VL-NEXT: vmulss %xmm3, %xmm11, %xmm3 +; AVX512VL-NEXT: vaddss %xmm3, %xmm4, %xmm3 ; AVX512VL-NEXT: vmulss %xmm1, %xmm10, %xmm1 -; AVX512VL-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm3, %zmm2 -; AVX512VL-NEXT: vmovaps {{.*#+}} zmm0 = <0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 +; AVX512VL-NEXT: vaddss %xmm1, %xmm3, %xmm1 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm5[1,1,3,3] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[2,3] +; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq entry: %block = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> @@ -617,7 +615,6 @@ ; AVX1-NEXT: vaddsd %xmm4, %xmm9, %xmm4 ; AVX1-NEXT: vmulsd %xmm7, %xmm8, %xmm7 ; AVX1-NEXT: vaddsd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX1-NEXT: vmulpd %xmm7, %xmm1, %xmm9 ; AVX1-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] @@ -644,15 +641,13 @@ ; AVX1-NEXT: vaddsd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vmulsd %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm9[0] +; AVX1-NEXT: vshufpd {{.*#+}} xmm4 = xmm9[1],xmm7[0] ; AVX1-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX1-NEXT: vmovapd %ymm1, 32(%rdi) -; AVX1-NEXT: vmovapd %ymm0, (%rdi) -; AVX1-NEXT: vzeroupper +; AVX1-NEXT: vmovapd %xmm1, 48(%rdi) +; AVX1-NEXT: vmovapd %xmm0, (%rdi) +; AVX1-NEXT: vmovapd %xmm4, 32(%rdi) +; AVX1-NEXT: vmovapd %xmm3, 16(%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_mul3x3_f64: @@ -675,7 +670,6 @@ ; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4 ; AVX2-NEXT: vmulsd %xmm7, %xmm8, %xmm7 ; AVX2-NEXT: vaddsd %xmm7, %xmm4, %xmm4 -; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX2-NEXT: vmulpd %xmm7, %xmm1, %xmm9 ; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] @@ -702,70 +696,68 @@ ; AVX2-NEXT: vaddsd %xmm5, 
%xmm2, %xmm2 ; AVX2-NEXT: vmulsd %xmm3, %xmm8, %xmm3 ; AVX2-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 -; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3 -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm9[0] +; AVX2-NEXT: vshufpd {{.*#+}} xmm4 = xmm9[1],xmm7[0] ; AVX2-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX2-NEXT: vmovapd %ymm1, 32(%rdi) -; AVX2-NEXT: vmovapd %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vmovapd %xmm1, 48(%rdi) +; AVX2-NEXT: vmovapd %xmm0, (%rdi) +; AVX2-NEXT: vmovapd %xmm4, 32(%rdi) +; AVX2-NEXT: vmovapd %xmm3, 16(%rdi) ; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_mul3x3_f64: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm0, %xmm9, %xmm10 -; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0] -; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm3, %xmm1, %xmm4 -; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 +; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm0 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm4, %xmm3, %xmm10 +; AVX512F-NEXT: vaddpd %xmm0, %xmm10, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] ; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] ; AVX512F-NEXT: vmulpd %xmm7, %xmm6, %xmm10 -; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 +; AVX512F-NEXT: vaddpd %xmm0, %xmm10, %xmm0 ; AVX512F-NEXT: vmulsd %xmm2, %xmm9, %xmm9 -; AVX512F-NEXT: vmulsd %xmm3, %xmm5, %xmm3 -; AVX512F-NEXT: vaddsd %xmm3, %xmm9, %xmm3 -; AVX512F-NEXT: vmulsd %xmm7, %xmm8, %xmm7 -; AVX512F-NEXT: vaddsd %xmm7, %xmm3, %xmm3 -; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm4, %xmm0, %xmm7 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm10 -; AVX512F-NEXT: vaddpd %xmm7, %xmm10, %xmm7 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm6, %xmm10, %xmm11 -; AVX512F-NEXT: vaddpd %xmm7, %xmm11, %xmm7 -; AVX512F-NEXT: vmulsd %xmm4, %xmm2, %xmm4 -; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm9 +; AVX512F-NEXT: vmulsd %xmm4, %xmm5, %xmm4 ; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512F-NEXT: vmulsd %xmm10, %xmm8, %xmm9 -; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX512F-NEXT: vmulsd %xmm7, %xmm8, %xmm7 +; AVX512F-NEXT: vaddsd %xmm7, %xmm4, %xmm4 ; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm7, %xmm0, %xmm0 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm1 -; AVX512F-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] -; AVX512F-NEXT: vmulpd %xmm1, %xmm6, %xmm6 -; AVX512F-NEXT: vaddpd %xmm6, %xmm0, %xmm0 -; AVX512F-NEXT: vmulsd %xmm7, %xmm2, %xmm2 -; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm5 +; AVX512F-NEXT: vmulpd %xmm7, %xmm1, %xmm9 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm3, %xmm10, %xmm11 +; AVX512F-NEXT: vaddpd %xmm11, %xmm9, %xmm9 +; 
AVX512F-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm6, %xmm11, %xmm12 +; AVX512F-NEXT: vaddpd %xmm12, %xmm9, %xmm9 +; AVX512F-NEXT: vmulsd %xmm7, %xmm2, %xmm7 +; AVX512F-NEXT: vmulsd %xmm5, %xmm10, %xmm10 +; AVX512F-NEXT: vaddsd %xmm7, %xmm10, %xmm7 +; AVX512F-NEXT: vmulsd %xmm11, %xmm8, %xmm10 +; AVX512F-NEXT: vaddsd %xmm7, %xmm10, %xmm7 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm1, %xmm10, %xmm1 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm3, %xmm11, %xmm3 +; AVX512F-NEXT: vaddpd %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm3, %xmm6, %xmm6 +; AVX512F-NEXT: vaddpd %xmm6, %xmm1, %xmm1 +; AVX512F-NEXT: vmulsd %xmm2, %xmm10, %xmm2 +; AVX512F-NEXT: vmulsd %xmm5, %xmm11, %xmm5 ; AVX512F-NEXT: vaddsd %xmm5, %xmm2, %xmm2 -; AVX512F-NEXT: vmulsd %xmm1, %xmm8, %xmm1 -; AVX512F-NEXT: vaddsd %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm3, %zmm2 -; AVX512F-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] -; AVX512F-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3 -; AVX512F-NEXT: vmovsd %xmm1, 64(%rdi) -; AVX512F-NEXT: vmovapd %zmm3, (%rdi) +; AVX512F-NEXT: vmulsd %xmm3, %xmm8, %xmm3 +; AVX512F-NEXT: vaddsd %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vshufpd {{.*#+}} xmm3 = xmm9[1],xmm7[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm9[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vmovsd %xmm2, 64(%rdi) +; AVX512F-NEXT: vmovapd %zmm0, (%rdi) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -789,39 +781,39 @@ ; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vmulsd %xmm7, %xmm8, %xmm4 ; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] ; AVX512VL-NEXT: vmulpd %xmm4, %xmm0, %xmm7 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm3, %xmm9, %xmm10 -; AVX512VL-NEXT: vaddpd %xmm7, %xmm10, %xmm7 ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm6, %xmm10, %xmm11 +; AVX512VL-NEXT: vmulpd %xmm3, %xmm10, %xmm11 ; AVX512VL-NEXT: vaddpd %xmm7, %xmm11, %xmm7 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm6, %xmm11, %xmm12 +; AVX512VL-NEXT: vaddpd %xmm7, %xmm12, %xmm7 ; AVX512VL-NEXT: vmulsd %xmm4, %xmm2, %xmm4 -; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm9 -; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512VL-NEXT: vmulsd %xmm10, %xmm8, %xmm9 -; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 -; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm7, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] -; AVX512VL-NEXT: vmulpd %xmm3, %xmm9, %xmm3 +; AVX512VL-NEXT: vmulsd %xmm5, %xmm10, %xmm10 +; AVX512VL-NEXT: vaddsd %xmm4, %xmm10, %xmm4 +; AVX512VL-NEXT: vmulsd %xmm11, %xmm8, %xmm10 +; AVX512VL-NEXT: vaddsd %xmm4, %xmm10, %xmm4 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm0, %xmm10, %xmm0 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm3, %xmm11, %xmm3 ; AVX512VL-NEXT: vaddpd %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] ; AVX512VL-NEXT: vmulpd %xmm3, %xmm6, %xmm6 ; AVX512VL-NEXT: vaddpd %xmm6, %xmm0, %xmm0 -; AVX512VL-NEXT: vmulsd %xmm7, 
%xmm2, %xmm2 -; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm5 +; AVX512VL-NEXT: vmulsd %xmm2, %xmm10, %xmm2 +; AVX512VL-NEXT: vmulsd %xmm5, %xmm11, %xmm5 ; AVX512VL-NEXT: vaddsd %xmm5, %xmm2, %xmm2 ; AVX512VL-NEXT: vmulsd %xmm3, %xmm8, %xmm3 ; AVX512VL-NEXT: vaddsd %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] -; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm3 +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm3 = xmm7[1],xmm4[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512VL-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX512VL-NEXT: vmovapd %zmm3, (%rdi) +; AVX512VL-NEXT: vmovapd %zmm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -178,8 +178,8 @@ ; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorw (%eax), %dx ; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: xorb 2(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orw %dx, %ax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -311,8 +311,8 @@ ; X86-NEXT: movl (%ecx), %edx ; X86-NEXT: xorl (%eax), %edx ; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: xorb 4(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: setne %al ; X86-NEXT: retl @@ -514,8 +514,8 @@ ; X86-NEXT: xorl 4(%eax), %esi ; X86-NEXT: orl %edx, %esi ; X86-NEXT: movzbl 8(%ecx), %ecx -; X86-NEXT: xorb 8(%eax), %cl -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: movzbl 8(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %esi, %eax ; X86-NEXT: sete %al ; X86-NEXT: popl %esi @@ -537,8 +537,8 @@ ; X86-NEXT: xorl 4(%eax), %esi ; X86-NEXT: orl %edx, %esi ; X86-NEXT: movzwl 8(%ecx), %ecx -; X86-NEXT: xorw 8(%eax), %cx -; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: movzwl 8(%eax), %eax +; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: orl %esi, %eax ; X86-NEXT: sete %al ; X86-NEXT: popl %esi @@ -645,8 +645,8 @@ ; X86-NEXT: movl 8(%edx), %esi ; X86-NEXT: xorl 8(%ecx), %esi ; X86-NEXT: movzbl 12(%edx), %edx -; X86-NEXT: xorb 12(%ecx), %dl -; X86-NEXT: movzbl %dl, %ecx +; X86-NEXT: movzbl 12(%ecx), %ecx +; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: orl %esi, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al @@ -671,8 +671,8 @@ ; X86-NEXT: movl 8(%edx), %esi ; X86-NEXT: xorl 8(%ecx), %esi ; X86-NEXT: movzwl 12(%edx), %edx -; X86-NEXT: xorw 12(%ecx), %dx -; X86-NEXT: movzwl %dx, %ecx +; X86-NEXT: movzwl 12(%ecx), %ecx +; X86-NEXT: xorl %edx, %ecx ; X86-NEXT: orl %esi, %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -167,9 +167,9 @@ ; X64-NEXT: movzwl (%rdi), %eax ; X64-NEXT: xorw (%rsi), %ax ; X64-NEXT: movzbl 2(%rdi), %ecx -; X64-NEXT: xorb 2(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orw %ax, %cx +; X64-NEXT: movzbl 2(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orw %ax, %dx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 
@memcmp(ptr %X, ptr %Y, i64 3) nounwind @@ -284,9 +284,9 @@ ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: xorl (%rsi), %eax ; X64-NEXT: movzbl 4(%rdi), %ecx -; X64-NEXT: xorb 4(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movzbl 4(%rsi), %edx +; X64-NEXT: xorl %ecx, %edx +; X64-NEXT: orl %eax, %edx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind @@ -443,9 +443,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movzbl 8(%rdi), %ecx -; X64-NEXT: xorb 8(%rsi), %cl -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movzbl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: sete %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind @@ -459,9 +459,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movzwl 8(%rdi), %ecx -; X64-NEXT: xorw 8(%rsi), %cx -; X64-NEXT: movzwl %cx, %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movzwl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: sete %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind @@ -490,8 +490,9 @@ ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: xorq (%rsi), %rax ; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: xorl 8(%rsi), %ecx -; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %rax, %rdx ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind @@ -1636,10 +1637,96 @@ ; X64-AVX1-LABEL: length48_eq: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX1-NEXT: vmovups 32(%rsi), %xmm2 -; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 +; X64-AVX1-NEXT: movq 32(%rdi), %rax +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vmovd %eax, %xmm1 +; X64-AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 40(%rdi), %rax +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: 
movq 32(%rsi), %rcx +; X64-AVX1-NEXT: movq 40(%rsi), %rax +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $8, %edx +; X64-AVX1-NEXT: vmovd %ecx, %xmm2 +; X64-AVX1-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $16, %edx +; X64-AVX1-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %ecx, %edx +; X64-AVX1-NEXT: shrl $24, %edx +; X64-AVX1-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $32, %rdx +; X64-AVX1-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $40, %rdx +; X64-AVX1-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rcx, %rdx +; X64-AVX1-NEXT: shrq $48, %rdx +; X64-AVX1-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX1-NEXT: shrq $56, %rcx +; X64-AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al @@ -1649,10 +1736,96 @@ ; X64-AVX2-LABEL: length48_eq: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX2-NEXT: vmovdqu 32(%rsi), %xmm2 -; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX2-NEXT: movq 32(%rdi), %rax +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 40(%rdi), %rax +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; 
X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq 32(%rsi), %rcx +; X64-AVX2-NEXT: movq 40(%rsi), %rax +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $8, %edx +; X64-AVX2-NEXT: vmovd %ecx, %xmm2 +; X64-AVX2-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $16, %edx +; X64-AVX2-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %ecx, %edx +; X64-AVX2-NEXT: shrl $24, %edx +; X64-AVX2-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $32, %rdx +; X64-AVX2-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $40, %rdx +; X64-AVX2-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rcx, %rdx +; X64-AVX2-NEXT: shrq $48, %rdx +; X64-AVX2-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: shrq $56, %rcx +; X64-AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al @@ -1662,10 +1835,96 @@ ; X64-AVX512-LABEL: length48_eq: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX512-NEXT: vmovdqu 32(%rsi), %xmm2 -; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 +; X64-AVX512-NEXT: movq 32(%rdi), %rax +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vmovd %eax, %xmm1 +; X64-AVX512-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 40(%rdi), %rax +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; 
X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 32(%rsi), %rcx +; X64-AVX512-NEXT: movq 40(%rsi), %rax +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $8, %edx +; X64-AVX512-NEXT: vmovd %ecx, %xmm2 +; X64-AVX512-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $16, %edx +; X64-AVX512-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %ecx, %edx +; X64-AVX512-NEXT: shrl $24, %edx +; X64-AVX512-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $32, %rdx +; X64-AVX512-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $40, %rdx +; X64-AVX512-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rcx, %rdx +; X64-AVX512-NEXT: shrq $48, %rdx +; X64-AVX512-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; X64-AVX512-NEXT: shrq $56, %rcx +; X64-AVX512-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 ; X64-AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm1 +; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: sete %al @@ -1676,8 +1935,22 @@ ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %ymm0 ; X64-MIC-AVX-NEXT: vmovdqu (%rsi), %ymm1 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rdi), %xmm2 -; X64-MIC-AVX-NEXT: vmovdqu 32(%rsi), %xmm3 +; X64-MIC-AVX-NEXT: movq 32(%rdi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm2 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: movq 40(%rdi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; X64-MIC-AVX-NEXT: movq 32(%rsi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm3 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; X64-MIC-AVX-NEXT: movq 40(%rsi), %rax +; 
X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm3, %zmm2, %k0 ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ; X64-MIC-AVX-NEXT: kortestw %k0, %k1 @@ -1823,9 +2096,52 @@ ; X64-AVX1-LABEL: length48_eq_const: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 -; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX1-NEXT: movq 32(%rdi), %rax +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vmovd %eax, %xmm1 +; X64-AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq 40(%rdi), %rax +; X64-AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $8, %ecx +; X64-AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $16, %ecx +; X64-AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movl %eax, %ecx +; X64-AVX1-NEXT: shrl $24, %ecx +; X64-AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $32, %rcx +; X64-AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $40, %rcx +; X64-AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: movq %rax, %rcx +; X64-AVX1-NEXT: shrq $48, %rcx +; X64-AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX1-NEXT: shrq $56, %rax +; X64-AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al @@ -1835,9 +2151,52 @@ ; X64-AVX2-LABEL: length48_eq_const: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: movq 32(%rdi), %rax +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vmovd %eax, %xmm1 +; X64-AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; 
X64-AVX2-NEXT: movq 40(%rdi), %rax +; X64-AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $8, %ecx +; X64-AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $16, %ecx +; X64-AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movl %eax, %ecx +; X64-AVX2-NEXT: shrl $24, %ecx +; X64-AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $32, %rcx +; X64-AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $40, %rcx +; X64-AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: movq %rax, %rcx +; X64-AVX2-NEXT: shrq $48, %rcx +; X64-AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX2-NEXT: shrq $56, %rax +; X64-AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al @@ -1847,9 +2206,52 @@ ; X64-AVX512-LABEL: length48_eq_const: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0 -; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX512-NEXT: movq 32(%rdi), %rax +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vmovd %eax, %xmm1 +; X64-AVX512-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq 40(%rdi), %rax +; X64-AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $8, %ecx +; X64-AVX512-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $16, %ecx +; X64-AVX512-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movl %eax, %ecx +; X64-AVX512-NEXT: shrl $24, %ecx +; X64-AVX512-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $32, %rcx +; X64-AVX512-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $40, %rcx +; X64-AVX512-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: movq %rax, %rcx +; X64-AVX512-NEXT: shrq $48, %rcx +; X64-AVX512-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512-NEXT: shrq $56, %rax +; X64-AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 ; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vptest %ymm0, %ymm0 ; X64-AVX512-NEXT: setne %al @@ -1859,12 +2261,19 @@ ; X64-MIC-AVX-LABEL: length48_eq_const: ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %ymm0 -; 
X64-MIC-AVX-NEXT: vmovdqu 32(%rdi), %xmm1 -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0] -; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960] -; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 -; X64-MIC-AVX-NEXT: kortestw %k0, %k1 +; X64-MIC-AVX-NEXT: movq 32(%rdi), %rax +; X64-MIC-AVX-NEXT: vmovd %eax, %xmm1 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: movq 40(%rdi), %rax +; X64-MIC-AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: shrq $32, %rax +; X64-MIC-AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960] +; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm0, %k0 +; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [892613426,959985462,858927408,926299444,0,0,0,0] +; X64-MIC-AVX-NEXT: vpcmpneqd %zmm0, %zmm1, %k1 +; X64-MIC-AVX-NEXT: kortestw %k1, %k0 ; X64-MIC-AVX-NEXT: setne %al ; X64-MIC-AVX-NEXT: vzeroupper ; X64-MIC-AVX-NEXT: retq @@ -2388,23 +2797,231 @@ ; X64-AVX512BW-LABEL: length96_eq: ; X64-AVX512BW: # %bb.0: ; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 -; X64-AVX512BW-NEXT: vmovdqu 64(%rdi), %ymm1 -; X64-AVX512BW-NEXT: vmovdqu 64(%rsi), %ymm2 -; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k0 -; X64-AVX512BW-NEXT: vpcmpneqb %zmm2, %zmm1, %k1 -; X64-AVX512BW-NEXT: kortestq %k1, %k0 +; X64-AVX512BW-NEXT: movq 80(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm1 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 88(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; X64-AVX512BW-NEXT: movq 64(%rdi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; 
X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 72(%rdi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; X64-AVX512BW-NEXT: movq 80(%rsi), %rax +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vmovd %eax, %xmm2 +; X64-AVX512BW-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 88(%rsi), %rax +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; 
X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; X64-AVX512BW-NEXT: movq 64(%rsi), %rcx +; X64-AVX512BW-NEXT: movq 72(%rsi), %rax +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $8, %edx +; X64-AVX512BW-NEXT: vmovd %ecx, %xmm3 +; X64-AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $16, %edx +; X64-AVX512BW-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %ecx, %edx +; X64-AVX512BW-NEXT: shrl $24, %edx +; X64-AVX512BW-NEXT: vpinsrb $3, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $32, %rdx +; X64-AVX512BW-NEXT: vpinsrb $4, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $40, %rdx +; X64-AVX512BW-NEXT: vpinsrb $5, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rcx, %rdx +; X64-AVX512BW-NEXT: shrq $48, %rdx +; X64-AVX512BW-NEXT: vpinsrb $6, %edx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: shrq $56, %rcx +; X64-AVX512BW-NEXT: vpinsrb $7, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $8, %ecx +; X64-AVX512BW-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $16, %ecx +; X64-AVX512BW-NEXT: vpinsrb $10, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movl %eax, %ecx +; X64-AVX512BW-NEXT: shrl $24, %ecx +; X64-AVX512BW-NEXT: vpinsrb $11, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $32, %rcx +; X64-AVX512BW-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $40, %rcx +; X64-AVX512BW-NEXT: vpinsrb $13, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: movq %rax, %rcx +; X64-AVX512BW-NEXT: shrq $48, %rcx +; X64-AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm3, %xmm3 +; X64-AVX512BW-NEXT: shrq $56, %rax +; X64-AVX512BW-NEXT: vpinsr