diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -1156,7 +1156,7 @@
 FeatureFP16FML,
 FeatureDotProd]>;

-def : ProcessorModel<"generic", NoSchedModel, [
+def : ProcessorModel<"generic", CortexA55Model, [
 FeatureFPARMv8,
 FeatureFuseAES,
 FeatureNEON,
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
@@ -4,7 +4,7 @@
 ; COST-LABEL: sel.v8i8
 ; COST: Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32>
 ; CODE-LABEL: sel.v8i8
-; CODE: tbl v0.8b, { v0.16b }, v2.8b
+; CODE: tbl v0.8b, { v0.16b }, v1.8b
 define <8 x i8> @sel.v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32>
 ret <8 x i8> %tmp0
diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
--- a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll
@@ -119,15 +119,15 @@
 ; CODE-LABEL: v3i64_select_sle
 ; CODE: bb.0
-; CODE: ldr
 ; CODE: mov
+; CODE: ldr
 ; CODE: mov
 ; CODE: mov
 ; CODE: cmge
 ; CODE: cmge
 ; CODE: bif
-; CODE: ext
 ; CODE: bif
+; CODE: ext
 ; CODE: ret

 define <3 x i64> @v3i64_select_sle(<3 x i64> %a, <3 x i64> %b, <3 x i64> %c) {
diff --git a/llvm/test/CodeGen/AArch64/DAGCombine_vscale.ll b/llvm/test/CodeGen/AArch64/DAGCombine_vscale.ll
--- a/llvm/test/CodeGen/AArch64/DAGCombine_vscale.ll
+++ b/llvm/test/CodeGen/AArch64/DAGCombine_vscale.ll
@@ -51,8 +51,8 @@
 ; CHECK-LABEL: ashr_add_shl_nxv4i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #16777216
-; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: lsl z0.s, z0.s, #24
+; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: add z0.s, z0.s, z1.s
 ; CHECK-NEXT: asr z0.s, z0.s, #24
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll
@@ -705,14 +705,14 @@
 define i8 @atomic_load_relaxed_8(i8* %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-LABEL: atomic_load_relaxed_8:
 ; CHECK-NOLSE-O1: ; %bb.0:
-; CHECK-NOLSE-O1-NEXT: ldrb w8, [x0, #4095]
-; CHECK-NOLSE-O1-NEXT: ldrb w9, [x0, w1, sxtw]
-; CHECK-NOLSE-O1-NEXT: ldurb w10, [x0, #-256]
-; CHECK-NOLSE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936
-; CHECK-NOLSE-O1-NEXT: ldrb w11, [x11]
-; CHECK-NOLSE-O1-NEXT: add w8, w8, w9
-; CHECK-NOLSE-O1-NEXT: add w8, w8, w10
-; CHECK-NOLSE-O1-NEXT: add w0, w8, w11
+; CHECK-NOLSE-O1-NEXT: add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-NOLSE-O1-NEXT: ldrb w9, [x0, #4095]
+; CHECK-NOLSE-O1-NEXT: ldrb w10, [x0, w1, sxtw]
+; CHECK-NOLSE-O1-NEXT: ldurb w11, [x0, #-256]
+; CHECK-NOLSE-O1-NEXT: ldrb w8, [x8]
+; CHECK-NOLSE-O1-NEXT: add w9, w9, w10
+; CHECK-NOLSE-O1-NEXT: add w9, w9, w11
+; CHECK-NOLSE-O1-NEXT: add w0, w9, w8
 ; CHECK-NOLSE-O1-NEXT: ret
 ;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_8:
@@ -775,14 +775,14 @@
 define i16 @atomic_load_relaxed_16(i16* %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-LABEL: atomic_load_relaxed_16:
 ; CHECK-NOLSE-O1: ; %bb.0:
-; CHECK-NOLSE-O1-NEXT: ldrh w8, [x0, #8190]
-; CHECK-NOLSE-O1-NEXT: ldrh w9, [x0, w1, sxtw #1]
-; CHECK-NOLSE-O1-NEXT: ldurh w10, [x0, #-256]
-; CHECK-NOLSE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936
-; CHECK-NOLSE-O1-NEXT: ldrh w11, [x11]
-; CHECK-NOLSE-O1-NEXT: add w8, w8, w9
-; CHECK-NOLSE-O1-NEXT: add w8, w8, w10
-; CHECK-NOLSE-O1-NEXT: add w0, w8, w11
+; CHECK-NOLSE-O1-NEXT: add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-NOLSE-O1-NEXT: ldrh w9, [x0, #8190]
+; CHECK-NOLSE-O1-NEXT: ldrh w10, [x0, w1, sxtw #1]
+; CHECK-NOLSE-O1-NEXT: ldurh w11, [x0, #-256]
+; CHECK-NOLSE-O1-NEXT: ldrh w8, [x8]
+; CHECK-NOLSE-O1-NEXT: add w9, w9, w10
+; CHECK-NOLSE-O1-NEXT: add w9, w9, w11
+; CHECK-NOLSE-O1-NEXT: add w0, w9, w8
 ; CHECK-NOLSE-O1-NEXT: ret
 ;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_16:
@@ -845,14 +845,14 @@
 define i32 @atomic_load_relaxed_32(i32* %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-LABEL: atomic_load_relaxed_32:
 ; CHECK-NOLSE-O1: ; %bb.0:
-; CHECK-NOLSE-O1-NEXT: ldr w8, [x0, #16380]
-; CHECK-NOLSE-O1-NEXT: ldr w9, [x0, w1, sxtw #2]
-; CHECK-NOLSE-O1-NEXT: ldur w10, [x0, #-256]
-; CHECK-NOLSE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936
-; CHECK-NOLSE-O1-NEXT: ldr w11, [x11]
-; CHECK-NOLSE-O1-NEXT: add w8, w8, w9
-; CHECK-NOLSE-O1-NEXT: add w8, w8, w10
-; CHECK-NOLSE-O1-NEXT: add w0, w8, w11
+; CHECK-NOLSE-O1-NEXT: add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-NOLSE-O1-NEXT: ldr w9, [x0, #16380]
+; CHECK-NOLSE-O1-NEXT: ldr w10, [x0, w1, sxtw #2]
+; CHECK-NOLSE-O1-NEXT: ldur w11, [x0, #-256]
+; CHECK-NOLSE-O1-NEXT: ldr w8, [x8]
+; CHECK-NOLSE-O1-NEXT: add w9, w9, w10
+; CHECK-NOLSE-O1-NEXT: add w9, w9, w11
+; CHECK-NOLSE-O1-NEXT: add w0, w9, w8
 ; CHECK-NOLSE-O1-NEXT: ret
 ;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_32:
@@ -911,14 +911,14 @@
 define i64 @atomic_load_relaxed_64(i64* %p, i32 %off32) #0 {
 ; CHECK-NOLSE-O1-LABEL: atomic_load_relaxed_64:
 ; CHECK-NOLSE-O1: ; %bb.0:
-; CHECK-NOLSE-O1-NEXT: ldr x8, [x0, #32760]
-; CHECK-NOLSE-O1-NEXT: ldr x9, [x0, w1, sxtw #3]
-; CHECK-NOLSE-O1-NEXT: ldur x10, [x0, #-256]
-; CHECK-NOLSE-O1-NEXT: add x11, x0, #291, lsl #12 ; =1191936
-; CHECK-NOLSE-O1-NEXT: ldr x11, [x11]
-; CHECK-NOLSE-O1-NEXT: add x8, x8, x9
-; CHECK-NOLSE-O1-NEXT: add x8, x8, x10
-; CHECK-NOLSE-O1-NEXT: add x0, x8, x11
+; CHECK-NOLSE-O1-NEXT: add x8, x0, #291, lsl #12 ; =1191936
+; CHECK-NOLSE-O1-NEXT: ldr x9, [x0, #32760]
+; CHECK-NOLSE-O1-NEXT: ldr x10, [x0, w1, sxtw #3]
+; CHECK-NOLSE-O1-NEXT: ldur x11, [x0, #-256]
+; CHECK-NOLSE-O1-NEXT: ldr x8, [x8]
+; CHECK-NOLSE-O1-NEXT: add x9, x9, x10
+; CHECK-NOLSE-O1-NEXT: add x9, x9, x11
+; CHECK-NOLSE-O1-NEXT: add x0, x9, x8
 ; CHECK-NOLSE-O1-NEXT: ret
 ;
 ; CHECK-NOLSE-O0-LABEL: atomic_load_relaxed_64:
@@ -2717,8 +2717,8 @@
 ; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT: ret
 ; CHECK-NOLSE-O1-NEXT: LBB47_4: ; %cmpxchg.nostore
-; CHECK-NOLSE-O1-NEXT: clrex
 ; CHECK-NOLSE-O1-NEXT: mov w1, wzr
+; CHECK-NOLSE-O1-NEXT: clrex
 ; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT: ret
 ;
@@ -2783,8 +2783,8 @@
 ; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT: ret
 ; CHECK-NOLSE-O1-NEXT: LBB48_4: ; %cmpxchg.nostore
-; CHECK-NOLSE-O1-NEXT: clrex
 ; CHECK-NOLSE-O1-NEXT: mov w1, wzr
+; CHECK-NOLSE-O1-NEXT: clrex
 ; CHECK-NOLSE-O1-NEXT: ; kill: def $w0 killed $w0 killed $x0
 ; CHECK-NOLSE-O1-NEXT: ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/byval-call.ll b/llvm/test/CodeGen/AArch64/GlobalISel/byval-call.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/byval-call.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/byval-call.ll
@@ -27,8 +27,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: sub sp, sp, #288
 ; CHECK-NEXT: stp x29, x30, [sp, #256] // 16-byte Folded Spill
-; CHECK-NEXT: str x28, [sp, #272] // 8-byte Folded Spill
 ; CHECK-NEXT: add x29, sp, #256
+; CHECK-NEXT: str x28, [sp, #272] // 8-byte Folded Spill
 ; CHECK-NEXT: .cfi_def_cfa w29, 32
 ; CHECK-NEXT: .cfi_offset w28, -16
 ; CHECK-NEXT: .cfi_offset w30, -24
@@ -66,8 +66,8 @@
 ; CHECK-NEXT: ldr q0, [x0, #240]
 ; CHECK-NEXT: str q0, [sp, #240]
 ; CHECK-NEXT: bl byval_a64i32
-; CHECK-NEXT: ldr x28, [sp, #272] // 8-byte Folded Reload
 ; CHECK-NEXT: ldp x29, x30, [sp, #256] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x28, [sp, #272] // 8-byte Folded Reload
 ; CHECK-NEXT: add sp, sp, #288
 ; CHECK-NEXT: ret
 call void @byval_a64i32([64 x i32]* byval([64 x i32]) %incoming)
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll b/llvm/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/call-translator-variadic-musttail.ll
@@ -63,15 +63,12 @@
 ; CHECK-NEXT: mov x25, x6
 ; CHECK-NEXT: mov x26, x7
 ; CHECK-NEXT: stp q1, q0, [sp, #96] ; 32-byte Folded Spill
+; CHECK-NEXT: mov x27, x8
 ; CHECK-NEXT: stp q3, q2, [sp, #64] ; 32-byte Folded Spill
 ; CHECK-NEXT: stp q5, q4, [sp, #32] ; 32-byte Folded Spill
 ; CHECK-NEXT: stp q7, q6, [sp] ; 32-byte Folded Spill
-; CHECK-NEXT: mov x27, x8
 ; CHECK-NEXT: bl _puts
 ; CHECK-NEXT: ldp q1, q0, [sp, #96] ; 32-byte Folded Reload
-; CHECK-NEXT: ldp q3, q2, [sp, #64] ; 32-byte Folded Reload
-; CHECK-NEXT: ldp q5, q4, [sp, #32] ; 32-byte Folded Reload
-; CHECK-NEXT: ldp q7, q6, [sp] ; 32-byte Folded Reload
 ; CHECK-NEXT: mov w0, w19
 ; CHECK-NEXT: mov x1, x20
 ; CHECK-NEXT: mov x2, x21
@@ -81,6 +78,9 @@
 ; CHECK-NEXT: mov x6, x25
 ; CHECK-NEXT: mov x7, x26
 ; CHECK-NEXT: mov x8, x27
+; CHECK-NEXT: ldp q3, q2, [sp, #64] ; 32-byte Folded Reload
+; CHECK-NEXT: ldp q5, q4, [sp, #32] ; 32-byte Folded Reload
+; CHECK-NEXT: ldp q7, q6, [sp] ; 32-byte Folded Reload
 ; CHECK-NEXT: ldp x29, x30, [sp, #208] ; 16-byte Folded Reload
 ; CHECK-NEXT: ldp x20, x19, [sp, #192] ; 16-byte Folded Reload
 ; CHECK-NEXT: ldp x22, x21, [sp, #176] ; 16-byte Folded Reload
@@ -122,9 +122,8 @@
 ; CHECK-NEXT: .cfi_offset w26, -80
 ; CHECK-NEXT: .cfi_offset w27, -88
 ; CHECK-NEXT: .cfi_offset w28, -96
-; CHECK-NEXT: mov x27, x8
-; CHECK-NEXT: add x8, sp, #128
-; CHECK-NEXT: add x9, sp, #256
+; CHECK-NEXT: add x9, sp, #128
+; CHECK-NEXT: add x10, sp, #256
 ; CHECK-NEXT: mov x19, x0
 ; CHECK-NEXT: mov x20, x1
 ; CHECK-NEXT: mov x21, x2
@@ -134,16 +133,14 @@
 ; CHECK-NEXT: mov x25, x6
 ; CHECK-NEXT: mov x26, x7
 ; CHECK-NEXT: stp q1, q0, [sp, #96] ; 32-byte Folded Spill
+; CHECK-NEXT: mov x27, x8
 ; CHECK-NEXT: stp q3, q2, [sp, #64] ; 32-byte Folded Spill
 ; CHECK-NEXT: stp q5, q4, [sp, #32] ; 32-byte Folded Spill
 ; CHECK-NEXT: stp q7, q6, [sp] ; 32-byte Folded Spill
-; CHECK-NEXT: str x9, [x8]
+; CHECK-NEXT: str x10, [x9]
 ; CHECK-NEXT: bl _get_f
-; CHECK-NEXT: mov x9, x0
 ; CHECK-NEXT: ldp q1, q0, [sp, #96] ; 32-byte Folded Reload
-; CHECK-NEXT: ldp q3, q2, [sp, #64] ; 32-byte Folded Reload
-; CHECK-NEXT: ldp q5, q4, [sp, #32] ; 32-byte Folded Reload
-; CHECK-NEXT: ldp q7, q6, [sp] ; 32-byte Folded Reload
+; CHECK-NEXT: mov x9, x0
 ; CHECK-NEXT: mov x0, x19
 ; CHECK-NEXT: mov x1, x20
 ; CHECK-NEXT: mov x2, x21
@@ -153,6 +150,9 @@
 ; CHECK-NEXT: mov x6, x25
 ; CHECK-NEXT: mov x7, x26
 ; CHECK-NEXT: mov x8, x27
+; CHECK-NEXT: ldp q3, q2, [sp, #64] ; 32-byte Folded Reload
+; CHECK-NEXT: ldp q5, q4, [sp, #32] ; 32-byte Folded Reload
+; CHECK-NEXT: ldp q7, q6, [sp] ; 32-byte Folded Reload
 ; CHECK-NEXT: ldp x29, x30, [sp, #240] ; 16-byte Folded Reload
 ; CHECK-NEXT: ldp x20, x19, [sp, #224] ; 16-byte Folded Reload
 ; CHECK-NEXT: ldp x22, x21, [sp, #208] ; 16-byte Folded Reload
@@ -195,9 +195,9 @@
 ; CHECK-NEXT: Lloh2:
 ; CHECK-NEXT: adrp x10, _g@GOTPAGE
 ; CHECK-NEXT: ldr x9, [x0, #16]
+; CHECK-NEXT: mov w11, #42
 ; CHECK-NEXT: Lloh3:
 ; CHECK-NEXT: ldr x10, [x10, _g@GOTPAGEOFF]
-; CHECK-NEXT: mov w11, #42
 ; CHECK-NEXT: Lloh4:
 ; CHECK-NEXT: str w11, [x10]
 ; CHECK-NEXT: br x9
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -18,20 +18,20 @@
 ;
 ; GISEL-LABEL: combine_vec_udiv_uniform:
 ; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI0_1
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI0_1]
-; GISEL-NEXT: adrp x8, .LCPI0_0
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
 ; GISEL-NEXT: adrp x8, .LCPI0_2
-; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI0_2]
-; GISEL-NEXT: sub v1.8h, v2.8h, v1.8h
-; GISEL-NEXT: neg v1.8h, v1.8h
-; GISEL-NEXT: umull2 v2.4s, v0.8h, v3.8h
-; GISEL-NEXT: umull v3.4s, v0.4h, v3.4h
-; GISEL-NEXT: uzp2 v2.8h, v3.8h, v2.8h
-; GISEL-NEXT: sub v0.8h, v0.8h, v2.8h
-; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h
-; GISEL-NEXT: add v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: adrp x9, .LCPI0_0
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI0_2]
+; GISEL-NEXT: adrp x8, .LCPI0_1
+; GISEL-NEXT: ldr q4, [x9, :lo12:.LCPI0_0]
+; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI0_1]
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; GISEL-NEXT: sub v2.8h, v4.8h, v3.8h
+; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
+; GISEL-NEXT: neg v2.8h, v2.8h
+; GISEL-NEXT: ushl v0.8h, v0.8h, v2.8h
+; GISEL-NEXT: add v0.8h, v0.8h, v1.8h
 ; GISEL-NEXT: ushr v0.8h, v0.8h, #4
 ; GISEL-NEXT: ret
 %1 = udiv <8 x i16> %x,
@@ -44,53 +44,53 @@
 ; SDAG-NEXT: adrp x8, .LCPI1_0
 ; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
 ; SDAG-NEXT: adrp x8, .LCPI1_1
+; SDAG-NEXT: ushl v1.8h, v0.8h, v1.8h
 ; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
 ; SDAG-NEXT: adrp x8, .LCPI1_2
-; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI1_2]
-; SDAG-NEXT: ushl v1.8h, v0.8h, v1.8h
-; SDAG-NEXT: umull2 v4.4s, v1.8h, v2.8h
+; SDAG-NEXT: umull2 v3.4s, v1.8h, v2.8h
 ; SDAG-NEXT: umull v1.4s, v1.4h, v2.4h
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_2]
 ; SDAG-NEXT: adrp x8, .LCPI1_3
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v4.8h
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_3]
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v3.8h
 ; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: umull2 v4.4s, v0.8h, v3.8h
-; SDAG-NEXT: umull v0.4s, v0.4h, v3.4h
-; SDAG-NEXT: uzp2 v0.8h, v0.8h, v4.8h
+; SDAG-NEXT: umull2 v3.4s, v0.8h, v2.8h
+; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v3.8h
 ; SDAG-NEXT: add v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: ushl v0.8h, v0.8h, v2.8h
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_3]
+; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
 ; SDAG-NEXT: ret
 ;
 ; GISEL-LABEL: combine_vec_udiv_nonuniform:
 ; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI1_5
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_5]
 ; GISEL-NEXT: adrp x8, .LCPI1_4
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_4]
+; GISEL-NEXT: adrp x10, .LCPI1_0
+; GISEL-NEXT: adrp x9, .LCPI1_1
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_4]
 ; GISEL-NEXT: adrp x8, .LCPI1_3
-; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI1_3]
-; GISEL-NEXT: adrp x8, .LCPI1_1
-; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI1_1]
-; GISEL-NEXT: adrp x8, .LCPI1_0
-; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI1_0]
+; GISEL-NEXT: ldr q5, [x10, :lo12:.LCPI1_0]
+; GISEL-NEXT: ldr q6, [x9, :lo12:.LCPI1_1]
+; GISEL-NEXT: neg v1.8h, v1.8h
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_3]
 ; GISEL-NEXT: adrp x8, .LCPI1_2
-; GISEL-NEXT: neg v2.8h, v2.8h
-; GISEL-NEXT: ldr q6, [x8, :lo12:.LCPI1_2]
-; GISEL-NEXT: ushl v2.8h, v0.8h, v2.8h
-; GISEL-NEXT: cmeq v1.8h, v1.8h, v5.8h
-; GISEL-NEXT: umull2 v5.4s, v2.8h, v3.8h
+; GISEL-NEXT: ushl v1.8h, v0.8h, v1.8h
+; GISEL-NEXT: umull2 v3.4s, v1.8h, v2.8h
+; GISEL-NEXT: umull v1.4s, v1.4h, v2.4h
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI1_2]
+; GISEL-NEXT: adrp x8, .LCPI1_5
+; GISEL-NEXT: sub v2.8h, v0.8h, v1.8h
+; GISEL-NEXT: umull2 v4.4s, v2.8h, v3.8h
 ; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h
-; GISEL-NEXT: uzp2 v2.8h, v2.8h, v5.8h
-; GISEL-NEXT: sub v3.8h, v0.8h, v2.8h
-; GISEL-NEXT: umull2 v5.4s, v3.8h, v6.8h
-; GISEL-NEXT: umull v3.4s, v3.4h, v6.4h
-; GISEL-NEXT: uzp2 v3.8h, v3.8h, v5.8h
-; GISEL-NEXT: neg v4.8h, v4.8h
-; GISEL-NEXT: shl v1.8h, v1.8h, #15
-; GISEL-NEXT: add v2.8h, v3.8h, v2.8h
-; GISEL-NEXT: ushl v2.8h, v2.8h, v4.8h
-; GISEL-NEXT: sshr v1.8h, v1.8h, #15
-; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI1_5]
+; GISEL-NEXT: cmeq v3.8h, v3.8h, v5.8h
+; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h
+; GISEL-NEXT: neg v4.8h, v6.8h
+; GISEL-NEXT: add v1.8h, v2.8h, v1.8h
+; GISEL-NEXT: shl v2.8h, v3.8h, #15
+; GISEL-NEXT: ushl v1.8h, v1.8h, v4.8h
+; GISEL-NEXT: sshr v2.8h, v2.8h, #15
+; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
 ; GISEL-NEXT: ret
 %1 = udiv <8 x i16> %x,
 ret <8 x i16> %1
@@ -100,41 +100,41 @@
 ; SDAG-LABEL: combine_vec_udiv_nonuniform2:
 ; SDAG: // %bb.0:
 ; SDAG-NEXT: adrp x8, .LCPI2_0
-; SDAG-NEXT: adrp x9, .LCPI2_1
 ; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
-; SDAG-NEXT: ldr q2, [x9, :lo12:.LCPI2_1]
+; SDAG-NEXT: adrp x8, .LCPI2_1
+; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_1]
 ; SDAG-NEXT: adrp x8, .LCPI2_2
-; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI2_2]
+; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; SDAG-NEXT: umull v0.4s, v0.4h, v1.4h
+; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_2]
+; SDAG-NEXT: uzp2 v0.8h, v0.8h, v2.8h
 ; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: umull2 v1.4s, v0.8h, v2.8h
-; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
-; SDAG-NEXT: uzp2 v0.8h, v0.8h, v1.8h
-; SDAG-NEXT: ushl v0.8h, v0.8h, v3.8h
 ; SDAG-NEXT: ret
 ;
 ; GISEL-LABEL: combine_vec_udiv_nonuniform2:
 ; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI2_4
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_4]
 ; GISEL-NEXT: adrp x8, .LCPI2_3
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_3]
-; GISEL-NEXT: adrp x8, .LCPI2_1
-; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI2_1]
-; GISEL-NEXT: adrp x8, .LCPI2_0
-; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI2_0]
+; GISEL-NEXT: adrp x9, .LCPI2_4
+; GISEL-NEXT: adrp x10, .LCPI2_0
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_3]
 ; GISEL-NEXT: adrp x8, .LCPI2_2
-; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI2_2]
+; GISEL-NEXT: ldr q3, [x9, :lo12:.LCPI2_4]
+; GISEL-NEXT: ldr q4, [x10, :lo12:.LCPI2_0]
+; GISEL-NEXT: neg v1.8h, v1.8h
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_2]
+; GISEL-NEXT: adrp x8, .LCPI2_1
+; GISEL-NEXT: cmeq v3.8h, v3.8h, v4.8h
+; GISEL-NEXT: ushl v1.8h, v0.8h, v1.8h
+; GISEL-NEXT: shl v3.8h, v3.8h, #15
+; GISEL-NEXT: umull2 v5.4s, v1.8h, v2.8h
+; GISEL-NEXT: umull v1.4s, v1.4h, v2.4h
+; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_1]
 ; GISEL-NEXT: neg v2.8h, v2.8h
-; GISEL-NEXT: ushl v2.8h, v0.8h, v2.8h
-; GISEL-NEXT: cmeq v1.8h, v1.8h, v4.8h
-; GISEL-NEXT: umull2 v4.4s, v2.8h, v5.8h
-; GISEL-NEXT: umull v2.4s, v2.4h, v5.4h
-; GISEL-NEXT: neg v3.8h, v3.8h
-; GISEL-NEXT: shl v1.8h, v1.8h, #15
-; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h
-; GISEL-NEXT: ushl v2.8h, v2.8h, v3.8h
-; GISEL-NEXT: sshr v1.8h, v1.8h, #15
-; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v5.8h
+; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h
+; GISEL-NEXT: sshr v2.8h, v3.8h, #15
+; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
 ; GISEL-NEXT: ret
 %1 = udiv <8 x i16> %x,
 ret <8 x i16> %1
@@ -146,43 +146,43 @@
 ; SDAG-NEXT: adrp x8, .LCPI3_0
 ; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
 ; SDAG-NEXT: adrp x8, .LCPI3_1
-; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI3_1]
 ; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
 ; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
 ; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
 ; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
 ; SDAG-NEXT: usra v1.8h, v0.8h, #1
-; SDAG-NEXT: ushl v0.8h, v1.8h, v3.8h
+; SDAG-NEXT: ldr q0, [x8, :lo12:.LCPI3_1]
+; SDAG-NEXT: ushl v0.8h, v1.8h, v0.8h
 ; SDAG-NEXT: ret
 ;
 ; GISEL-LABEL: combine_vec_udiv_nonuniform3:
 ; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI3_5
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_5]
 ; GISEL-NEXT: adrp x8, .LCPI3_4
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_4]
-; GISEL-NEXT: adrp x8, .LCPI3_2
-; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI3_2]
-; GISEL-NEXT: adrp x8, .LCPI3_1
-; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI3_1]
-; GISEL-NEXT: adrp x8, .LCPI3_3
-; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI3_3]
-; GISEL-NEXT: adrp x8, .LCPI3_0
-; GISEL-NEXT: ldr q6, [x8, :lo12:.LCPI3_0]
-; GISEL-NEXT: sub v3.8h, v4.8h, v3.8h
-; GISEL-NEXT: umull2 v4.4s, v0.8h, v2.8h
-; GISEL-NEXT: umull v2.4s, v0.4h, v2.4h
-; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h
-; GISEL-NEXT: neg v3.8h, v3.8h
-; GISEL-NEXT: sub v4.8h, v0.8h, v2.8h
-; GISEL-NEXT: cmeq v1.8h, v1.8h, v6.8h
-; GISEL-NEXT: ushl v3.8h, v4.8h, v3.8h
-; GISEL-NEXT: neg v5.8h, v5.8h
-; GISEL-NEXT: shl v1.8h, v1.8h, #15
-; GISEL-NEXT: add v2.8h, v3.8h, v2.8h
-; GISEL-NEXT: ushl v2.8h, v2.8h, v5.8h
-; GISEL-NEXT: sshr v1.8h, v1.8h, #15
-; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
+; GISEL-NEXT: adrp x9, .LCPI3_2
+; GISEL-NEXT: adrp x10, .LCPI3_1
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_4]
+; GISEL-NEXT: adrp x8, .LCPI3_5
+; GISEL-NEXT: ldr q2, [x9, :lo12:.LCPI3_2]
+; GISEL-NEXT: adrp x9, .LCPI3_3
+; GISEL-NEXT: ldr q3, [x10, :lo12:.LCPI3_1]
+; GISEL-NEXT: adrp x10, .LCPI3_0
+; GISEL-NEXT: umull2 v4.4s, v0.8h, v1.8h
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
+; GISEL-NEXT: ldr q6, [x9, :lo12:.LCPI3_3]
+; GISEL-NEXT: sub v2.8h, v3.8h, v2.8h
+; GISEL-NEXT: ldr q5, [x10, :lo12:.LCPI3_0]
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v4.8h
+; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI3_5]
+; GISEL-NEXT: neg v2.8h, v2.8h
+; GISEL-NEXT: sub v3.8h, v0.8h, v1.8h
+; GISEL-NEXT: ushl v2.8h, v3.8h, v2.8h
+; GISEL-NEXT: cmeq v3.8h, v4.8h, v5.8h
+; GISEL-NEXT: neg v4.8h, v6.8h
+; GISEL-NEXT: add v1.8h, v2.8h, v1.8h
+; GISEL-NEXT: shl v2.8h, v3.8h, #15
+; GISEL-NEXT: ushl v1.8h, v1.8h, v4.8h
+; GISEL-NEXT: sshr v2.8h, v2.8h, #15
+; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
 ; GISEL-NEXT: ret
 %1 = udiv <8 x i16> %x,
 ret <8 x i16> %1
@@ -192,39 +192,39 @@
 ; SDAG-LABEL: combine_vec_udiv_nonuniform4:
 ; SDAG: // %bb.0:
 ; SDAG-NEXT: adrp x8, .LCPI4_0
+; SDAG-NEXT: adrp x9, .LCPI4_3
 ; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
 ; SDAG-NEXT: adrp x8, .LCPI4_1
+; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI4_3]
+; SDAG-NEXT: umull2 v2.8h, v0.16b, v1.16b
+; SDAG-NEXT: umull v1.8h, v0.8b, v1.8b
+; SDAG-NEXT: and v0.16b, v0.16b, v3.16b
+; SDAG-NEXT: uzp2 v1.16b, v1.16b, v2.16b
 ; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
 ; SDAG-NEXT: adrp x8, .LCPI4_2
-; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
-; SDAG-NEXT: adrp x8, .LCPI4_3
-; SDAG-NEXT: ldr q4, [x8, :lo12:.LCPI4_3]
-; SDAG-NEXT: umull2 v5.8h, v0.16b, v1.16b
-; SDAG-NEXT: umull v1.8h, v0.8b, v1.8b
-; SDAG-NEXT: uzp2 v1.16b, v1.16b, v5.16b
 ; SDAG-NEXT: ushl v1.16b, v1.16b, v2.16b
-; SDAG-NEXT: and v1.16b, v1.16b, v3.16b
-; SDAG-NEXT: and v0.16b, v0.16b, v4.16b
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_2]
+; SDAG-NEXT: and v1.16b, v1.16b, v2.16b
 ; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
 ; SDAG-NEXT: ret
 ;
 ; GISEL-LABEL: combine_vec_udiv_nonuniform4:
 ; GISEL: // %bb.0:
 ; GISEL-NEXT: adrp x8, .LCPI4_3
+; GISEL-NEXT: adrp x9, .LCPI4_2
+; GISEL-NEXT: adrp x10, .LCPI4_1
 ; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI4_3]
 ; GISEL-NEXT: adrp x8, .LCPI4_0
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
-; GISEL-NEXT: adrp x8, .LCPI4_2
-; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
-; GISEL-NEXT: adrp x8, .LCPI4_1
-; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI4_1]
-; GISEL-NEXT: cmeq v1.16b, v1.16b, v2.16b
-; GISEL-NEXT: umull2 v2.8h, v0.16b, v3.16b
-; GISEL-NEXT: umull v3.8h, v0.8b, v3.8b
-; GISEL-NEXT: neg v4.16b, v4.16b
-; GISEL-NEXT: uzp2 v2.16b, v3.16b, v2.16b
+; GISEL-NEXT: ldr q2, [x9, :lo12:.LCPI4_2]
+; GISEL-NEXT: ldr q3, [x10, :lo12:.LCPI4_1]
+; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI4_0]
+; GISEL-NEXT: umull2 v5.8h, v0.16b, v2.16b
+; GISEL-NEXT: umull v2.8h, v0.8b, v2.8b
+; GISEL-NEXT: cmeq v1.16b, v1.16b, v4.16b
+; GISEL-NEXT: neg v3.16b, v3.16b
+; GISEL-NEXT: uzp2 v2.16b, v2.16b, v5.16b
 ; GISEL-NEXT: shl v1.16b, v1.16b, #7
-; GISEL-NEXT: ushl v2.16b, v2.16b, v4.16b
+; GISEL-NEXT: ushl v2.16b, v2.16b, v3.16b
 ; GISEL-NEXT: sshr v1.16b, v1.16b, #7
 ; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
 ; GISEL-NEXT: ret
@@ -236,55 +236,55 @@
 ; SDAG-LABEL: pr38477:
 ; SDAG: // %bb.0:
 ; SDAG-NEXT: adrp x8, .LCPI5_0
+; SDAG-NEXT: adrp x9, .LCPI5_4
 ; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
 ; SDAG-NEXT: adrp x8, .LCPI5_1
-; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_1]
-; SDAG-NEXT: adrp x8, .LCPI5_2
-; SDAG-NEXT: umull2 v4.4s, v0.8h, v1.8h
+; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
+; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI5_1]
 ; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
-; SDAG-NEXT: uzp2 v1.8h, v1.8h, v4.8h
-; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
-; SDAG-NEXT: adrp x8, .LCPI5_3
-; SDAG-NEXT: sub v4.8h, v0.8h, v1.8h
-; SDAG-NEXT: umull2 v5.4s, v4.8h, v2.8h
-; SDAG-NEXT: umull v2.4s, v4.4h, v2.4h
-; SDAG-NEXT: ldr q4, [x8, :lo12:.LCPI5_3]
-; SDAG-NEXT: adrp x8, .LCPI5_4
-; SDAG-NEXT: uzp2 v2.8h, v2.8h, v5.8h
-; SDAG-NEXT: ldr q5, [x8, :lo12:.LCPI5_4]
+; SDAG-NEXT: adrp x8, .LCPI5_2
+; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; SDAG-NEXT: sub v2.8h, v0.8h, v1.8h
+; SDAG-NEXT: umull2 v4.4s, v2.8h, v3.8h
+; SDAG-NEXT: umull v2.4s, v2.4h, v3.4h
+; SDAG-NEXT: ldr q3, [x9, :lo12:.LCPI5_4]
+; SDAG-NEXT: and v0.16b, v0.16b, v3.16b
+; SDAG-NEXT: uzp2 v2.8h, v2.8h, v4.8h
 ; SDAG-NEXT: add v1.8h, v2.8h, v1.8h
-; SDAG-NEXT: ushl v1.8h, v1.8h, v3.8h
-; SDAG-NEXT: and v1.16b, v1.16b, v4.16b
-; SDAG-NEXT: and v0.16b, v0.16b, v5.16b
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_2]
+; SDAG-NEXT: adrp x8, .LCPI5_3
+; SDAG-NEXT: ushl v1.8h, v1.8h, v2.8h
+; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_3]
+; SDAG-NEXT: and v1.16b, v1.16b, v2.16b
 ; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
 ; SDAG-NEXT: ret
 ;
 ; GISEL-LABEL: pr38477:
 ; GISEL: // %bb.0:
-; GISEL-NEXT: adrp x8, .LCPI5_4
-; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_4]
 ; GISEL-NEXT: adrp x8, .LCPI5_3
-; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI5_3]
+; GISEL-NEXT: adrp x10, .LCPI5_0
+; GISEL-NEXT: adrp x9, .LCPI5_1
+; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_3]
 ; GISEL-NEXT: adrp x8, .LCPI5_2
+; GISEL-NEXT: ldr q5, [x10, :lo12:.LCPI5_0]
+; GISEL-NEXT: ldr q6, [x9, :lo12:.LCPI5_1]
+; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h
 ; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
-; GISEL-NEXT: adrp x8, .LCPI5_1
-; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI5_1]
-; GISEL-NEXT: adrp x8, .LCPI5_0
-; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI5_0]
-; GISEL-NEXT: umull2 v6.4s, v0.8h, v2.8h
-; GISEL-NEXT: umull v2.4s, v0.4h, v2.4h
-; GISEL-NEXT: uzp2 v2.8h, v2.8h, v6.8h
-; GISEL-NEXT: cmeq v1.8h, v1.8h, v5.8h
-; GISEL-NEXT: sub v5.8h, v0.8h, v2.8h
-; GISEL-NEXT: umull2 v6.4s, v5.8h, v3.8h
-; GISEL-NEXT: umull v3.4s, v5.4h, v3.4h
-; GISEL-NEXT: uzp2 v3.8h, v3.8h, v6.8h
-; GISEL-NEXT: neg v4.8h, v4.8h
-; GISEL-NEXT: shl v1.8h, v1.8h, #15
-; GISEL-NEXT: add v2.8h, v3.8h, v2.8h
-; GISEL-NEXT: ushl v2.8h, v2.8h, v4.8h
-; GISEL-NEXT: sshr v1.8h, v1.8h, #15
-; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
+; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
+; GISEL-NEXT: adrp x8, .LCPI5_4
+; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h
+; GISEL-NEXT: sub v2.8h, v0.8h, v1.8h
+; GISEL-NEXT: umull2 v4.4s, v2.8h, v3.8h
+; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h
+; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_4]
+; GISEL-NEXT: cmeq v3.8h, v3.8h, v5.8h
+; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h
+; GISEL-NEXT: neg v4.8h, v6.8h
+; GISEL-NEXT: add v1.8h, v2.8h, v1.8h
+; GISEL-NEXT: shl v2.8h, v3.8h, #15
+; GISEL-NEXT: ushl v1.8h, v1.8h, v4.8h
+; GISEL-NEXT: sshr v2.8h, v2.8h, #15
+; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b
 ; GISEL-NEXT: ret
 %1 = udiv <8 x i16> %a0,
 ret <8 x i16> %1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll b/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll
@@ -207,8 +207,8 @@
 ; CHECK-LABEL: merge_hole2:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: lsr w8, w0, #16
-; CHECK-NEXT: strh w8, [x1, #2]
 ; CHECK-NEXT: strb w0, [x1]
+; CHECK-NEXT: strh w8, [x1, #2]
 ; CHECK-NEXT: ret
 %pcast = bitcast i8* %p to i16*
 %p2 = getelementptr inbounds i16, i16* %pcast, i64 1
@@ -259,8 +259,8 @@
 ; CHECK-LABEL: load_between_stores:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: strh w0, [x1]
-; CHECK-NEXT: ldr w8, [x2]
 ; CHECK-NEXT: lsr w9, w0, #16
+; CHECK-NEXT: ldr w8, [x2]
 ; CHECK-NEXT: strh w9, [x1, #2]
 ; CHECK-NEXT: mov w0, w8
 ; CHECK-NEXT: ret
@@ -278,8 +278,8 @@
 ; CHECK-LABEL: invalid_shift:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: lsr w8, w8, #4
 ; CHECK-NEXT: strb w0, [x1]
+; CHECK-NEXT: lsr w8, w8, #4
 ; CHECK-NEXT: strb w8, [x1, #1]
 ; CHECK-NEXT: ret
 %t1 = trunc i16 %x to i8
@@ -317,8 +317,8 @@
 ; CHECK-LABEL: different_base_reg:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: lsr w8, w8, #8
 ; CHECK-NEXT: strb w0, [x1]
+; CHECK-NEXT: lsr w8, w8, #8
 ; CHECK-NEXT: strb w8, [x2, #1]
 ; CHECK-NEXT: ret
 %t1 = trunc i16 %x to i8
@@ -334,8 +334,8 @@
 ; CHECK-LABEL: second_store_is_volatile:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: lsr w8, w8, #8
 ; CHECK-NEXT: strb w0, [x1]
+; CHECK-NEXT: lsr w8, w8, #8
 ; CHECK-NEXT: strb w8, [x1, #1]
 ; CHECK-NEXT: ret
 %t1 = trunc i16 %x to i8
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll b/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -frame-pointer=all -global-isel < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck %s
+; RUN: llc -verify-machineinstrs -frame-pointer=all -global-isel < %s -mtriple=aarch64-apple-ios | FileCheck %s

 declare i8* @malloc(i64)
 declare void @free(i8*)
@@ -11,8 +11,8 @@
 ; CHECK: mov w0, #16
 ; CHECK: malloc
 ; CHECK: mov [[ID:w[0-9]+]], #1
-; CHECK: strb [[ID]], [x0, #8]
 ; CHECK: mov x21, x0
+; CHECK: strb [[ID]], [x0, #8]
 ; CHECK-NOT: x21

 entry:
@@ -100,8 +100,8 @@
 ; CHECK: cbz w0
 ; CHECK: mov w0, #16
 ; CHECK: malloc
-; CHECK: mov x21, x0
-; CHECK: mov [[ID:w[0-9]+]], #1
+; CHECK-DAG: mov x21, x0
+; CHECK-DAG: mov [[ID:w[0-9]+]], #1
 ; CHECK: strb [[ID]], [x0, #8]
 ; CHECK-NOT: x21
 ; CHECK: ret
@@ -161,13 +161,13 @@
 ; parameter.
 define void @foo_sret(%struct.S* sret(%struct.S) %agg.result, i32 %val1, %swift_error** swifterror %error_ptr_ref) {
 ; CHECK-LABEL: foo_sret:
-; CHECK: mov [[SRET:x[0-9]+]], x8
-; CHECK: mov w0, #16
+; CHECK-DAG: mov [[SRET:x[0-9]+]], x8
+; CHECK-DAG: mov w0, #16
 ; CHECK: malloc
 ; CHECK: mov [[ID:w[0-9]+]], #1
+; CHECK: mov x21, x0
 ; CHECK: strb [[ID]], [x0, #8]
 ; CHECK: str w{{.*}}, [{{.*}}[[SRET]], #4]
-; CHECK: mov x21, x0
 ; CHECK-NOT: x21

 entry:
@@ -220,17 +220,20 @@
 ; CHECK-LABEL: foo_vararg:
 ; CHECK: mov w0, #16
 ; CHECK: malloc
-; CHECK-DAG: mov [[ID:w[0-9]+]], #1
-; CHECK-DAG: strb [[ID]], [x0, #8]
+; CHECK: mov [[ID:w[0-9]+]], #1
+; CHECK: mov x21, x0
+; CHECK-NOT: x21
+; CHECK: strb [[ID]], [x0, #8]
+; CHECK-NOT: x21

 ; First vararg
-; CHECK: ldr {{w[0-9]+}}, [x[[ARG1:[0-9]+]]], #8
+; CHECK: ldr {{w[0-9]+}}, [x[[ARG1:[0-9]+]]]
+; CHECK-NOT: x21
 ; Second vararg
-; CHECK: ldr {{w[0-9]+}}, [x[[ARG1]]], #8
+; CHECK: ldr {{w[0-9]+}}, [x[[ARG1]]]
+; CHECK-NOT: x21
 ; Third vararg
-; CHECK: ldr {{w[0-9]+}}, [x[[ARG1]]], #8
-
-; CHECK: mov x21, x0
+; CHECK: ldr {{w[0-9]+}}, [x[[ARG1]]]
 ; CHECK-NOT: x21
 entry:
 %call = call i8* @malloc(i64 16)
@@ -259,10 +262,10 @@
 define float @caller4(i8* %error_ref) {
 ; CHECK-LABEL: caller4:
+; CHECK: mov x21, xzr
 ; CHECK: mov [[ID:x[0-9]+]], x0
 ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp]
 ; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK: mov x21, xzr
 ; CHECK: bl {{.*}}foo_vararg
 ; CHECK: mov x0, x21
@@ -315,7 +318,7 @@
 ; CHECK-LABEL: params_in_reg
 ; Save callee saved registers and swifterror since it will be clobbered by the first call to params_in_reg2.
-; CHECK: stp x28, x0, [sp
+; CHECK: str x28, [sp
 ; CHECK: stp x27, x26, [sp
 ; CHECK: stp x25, x24, [sp
 ; CHECK: stp x23, x22, [sp
@@ -339,11 +342,10 @@
 ; CHECK: mov w5, #6
 ; CHECK: mov w6, #7
 ; CHECK: mov w7, #8
-; CHECK: str xzr, [sp]
 ; CHECK: mov x21, xzr
+; CHECK: str xzr, [sp]
 ; CHECK: bl _params_in_reg2
 ; Restore original arguments for next call.
-; CHECK: ldr x0, [sp
 ; CHECK: mov x1, x20
 ; CHECK: mov x2, x22
 ; CHECK: mov x3, x23
@@ -353,6 +355,7 @@
 ; CHECK: mov x7, x27
 ; Restore original swiftself argument and swifterror %err.
 ; CHECK: mov x21, x28
+; CHECK: ldr x8, [sp
 ; CHECK: bl _params_in_reg2
 ; Restore calle save registers but don't clober swifterror x21.
 ; CHECK-NOT: x21
@@ -380,7 +383,7 @@
 ; CHECK-LABEL: params_and_return_in_reg
 ; Store callee saved registers.
-; CHECK: stp x28, x0, [sp, #16
+; CHECK: stp x28, x21, [sp, #16
 ; CHECK: stp x27, x26, [sp
 ; CHECK: stp x25, x24, [sp
 ; CHECK: stp x23, x22, [sp
@@ -394,7 +397,6 @@
 ; CHECK: mov x25, x5
 ; CHECK: mov x26, x6
 ; CHECK: mov x27, x7
-; CHECK: mov x28, x21
 ; Setup call arguments.
 ; CHECK: mov w0, #1
 ; CHECK: mov w1, #2
@@ -409,7 +411,7 @@
 ; Store swifterror %error_ptr_ref.
 ; CHECK: stp {{x[0-9]+}}, x21, [sp]
 ; Setup call arguments from original arguments.
-; CHECK: ldr x0, [sp, #24
+; CHECK: mov x0, x19
 ; CHECK: mov x1, x20
 ; CHECK: mov x2, x22
 ; CHECK: mov x3, x23
@@ -417,19 +419,19 @@
 ; CHECK: mov x5, x25
 ; CHECK: mov x6, x26
 ; CHECK: mov x7, x27
-; CHECK: mov x21, x28
+; CHECK: ldr x21, [sp, #24
 ; CHECK: bl _params_and_return_in_reg2
 ; Store return values.
-; CHECK: mov x20, x0
-; CHECK: mov x22, x1
-; CHECK: mov x23, x2
-; CHECK: mov x24, x3
-; CHECK: mov x25, x4
-; CHECK: mov x26, x5
-; CHECK: mov x27, x6
-; CHECK: mov x28, x7
+; CHECK: mov x19, x0
+; CHECK: mov x20, x1
+; CHECK: mov x22, x2
+; CHECK: mov x23, x3
+; CHECK: mov x24, x4
+; CHECK: mov x25, x5
+; CHECK: mov x26, x6
+; CHECK: mov x27, x7
 ; Save swifterror %err.
-; CHECK: mov x19, x21
+; CHECK: mov x28, x21
 ; Setup call.
 ; CHECK: mov w0, #1
 ; CHECK: mov w1, #2
@@ -443,17 +445,17 @@
 ; CHECK: ldr x21, [sp, #8]
 ; CHECK: bl _params_in_reg2
 ; Restore return values for return from this function.
-; CHECK: mov x0, x20
-; CHECK: mov x1, x22
-; CHECK: mov x2, x23
-; CHECK: mov x3, x24
-; CHECK: mov x4, x25
-; CHECK: mov x5, x26
-; CHECK: mov x6, x27
-; CHECK: mov x7, x28
-; CHECK: mov x21, x19
-; Restore callee save registers.
+; CHECK: mov x0, x19
+; CHECK: mov x1, x20
+; CHECK: mov x2, x22
+; CHECK: mov x3, x23
+; CHECK: mov x4, x24
+; CHECK: mov x5, x25
+; CHECK: mov x6, x26
+; CHECK: mov x7, x27
 ; CHECK: ldp x29, x30, [sp
+; CHECK: mov x21, x28
+; Restore callee save registers.
 ; CHECK: ldp x20, x19, [sp
 ; CHECK: ldp x23, x22, [sp
 ; CHECK: ldp x25, x24, [sp
@@ -475,7 +477,7 @@
 ; Make sure we don't tail call if the caller returns a swifterror value. We
 ; would have to move into the swifterror register before the tail call.
-; CHECK: tailcall_from_swifterror:
+; CHECK-LABEL: tailcall_from_swifterror:
 ; CHECK-NOT: b _acallee
 ; CHECK: bl _acallee
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -92,8 +92,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldp q0, q1, [x0, #32]
 ; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
 ; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: addv s0, v0.4s
 ; CHECK-NEXT: fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll b/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
@@ -8,8 +8,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -24,8 +24,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: movi v1.4s, #1, lsl #8
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -40,8 +40,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: movi v1.4s, #1, lsl #16
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -56,8 +56,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: movi v1.4s, #1, lsl #24
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -72,8 +72,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: movi v1.8h, #1
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -88,8 +88,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: movi v1.8h, #1, lsl #8
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -104,8 +104,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: movi v1.4s, #1, msl #8
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -120,8 +120,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: movi v1.4s, #1, msl #16
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -136,8 +136,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: movi v1.16b, #1
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -152,8 +152,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -168,8 +168,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: fmov v1.4s, #3.00000000
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -184,8 +184,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: fmov v1.2d, #0.17968750
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -200,8 +200,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: mvni v1.4s, #1
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -216,8 +216,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: mvni v1.4s, #1, lsl #8
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -232,8 +232,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: mvni v1.4s, #1, lsl #16
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -248,8 +248,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: mvni v1.4s, #1, lsl #24
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -264,8 +264,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: mvni v1.8h, #1
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -280,8 +280,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: mvni v1.8h, #1, lsl #8
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -296,8 +296,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: mvni v1.4s, #1, msl #8
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
@@ -312,8 +312,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, vec_v8i16
 ; CHECK-NEXT: add x8, x8, :lo12:vec_v8i16
-; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: mvni v1.4s, #1, msl #16
+; CHECK-NEXT: ld1 { v0.8h }, [x8]
 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: st1 { v0.8h }, [x8]
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll
@@ -5,8 +5,8 @@
 ; CHECK-LABEL: dupsext_v2i8_v2i16:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
@@ -21,8 +21,8 @@
 ; CHECK-LABEL: dupsext_v4i8_v4i16:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -37,8 +37,8 @@
 ; CHECK-LABEL: dupsext_v8i8_v8i16:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: mov z1.h, w8
 ; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z1.h, w8
 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
 entry:
@@ -53,8 +53,8 @@
 ; CHECK-LABEL: dupsext_v2i8_v2i32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
@@ -69,8 +69,8 @@
 ; CHECK-LABEL: dupsext_v4i8_v4i32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -86,8 +86,8 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: sxtb x8, w0
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
@@ -102,8 +102,8 @@
 ; CHECK-LABEL: dupsext_v2i16_v2i32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
@@ -118,8 +118,8 @@
 ; CHECK-LABEL: dupsext_v4i16_v4i32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -135,8 +135,8 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: sxth x8, w0
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
@@ -152,8 +152,8 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: sxtw x8, w0
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
@@ -168,8 +168,8 @@
 ; CHECK-LABEL: dupzext_v2i8_v2i16:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
@@ -184,8 +184,8 @@
 ; CHECK-LABEL: dupzext_v4i8_v4i16:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -200,8 +200,8 @@
 ; CHECK-LABEL: dupzext_v8i8_v8i16:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: mov z1.h, w8
 ; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z1.h, w8
 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
 entry:
@@ -216,8 +216,8 @@
 ; CHECK-LABEL: dupzext_v2i8_v2i32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
@@ -232,8 +232,8 @@
 ; CHECK-LABEL: dupzext_v4i8_v4i32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -249,8 +249,8 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: and x8, x0, #0xff
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
@@ -265,8 +265,8 @@
 ; CHECK-LABEL: dupzext_v2i16_v2i32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
@@ -281,8 +281,8 @@
 ; CHECK-LABEL: dupzext_v4i16_v4i32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -298,8 +298,8 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: and x8, x0, #0xffff
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
@@ -314,8 +314,8 @@
 ; CHECK-LABEL: dupzext_v2i32_v2i64:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: mov w8, w0
-; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -98,10 +98,10 @@
 define <2 x i16> @dupsext_v2i8_v2i16(i8 %src, <2 x i8> %b) {
 ; CHECK-LABEL: dupsext_v2i8_v2i16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sxtb w8, w0
 ; CHECK-NEXT: shl v0.2s, v0.2s, #24
-; CHECK-NEXT: sshr v0.2s, v0.2s, #24
+; CHECK-NEXT: sxtb w8, w0
 ; CHECK-NEXT: dup v1.2s, w8
+; CHECK-NEXT: sshr v0.2s, v0.2s, #24
 ; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s
 ; CHECK-NEXT: ret
 entry:
@@ -117,15 +117,15 @@
 ; CHECK-LABEL: dupzext_v2i16_v2i64:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: movi d1, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: and x8, x0, #0xffff
-; CHECK-NEXT: fmov x10, d0
-; CHECK-NEXT: mov x9, v0.d[1]
-; CHECK-NEXT: mul x10, x8, x10
-; CHECK-NEXT: mul x8, x8, x9
-; CHECK-NEXT: fmov d0, x10
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: mul x9, x8, x9
+; CHECK-NEXT: mul x8, x8, x10
+; CHECK-NEXT: fmov d0, x9
 ; CHECK-NEXT: mov v0.d[1], x8
 ; CHECK-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -13,11 +13,11 @@
 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT: ubfx x21, x1, #9, #8
 ; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ldrh w20, [x0, x21, lsl #1]
 ; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: ldrh w20, [x0, x21, lsl #1]
 ; CHECK-NEXT: bl foo
-; CHECK-NEXT: strh w20, [x19, x21, lsl #1]
 ; CHECK-NEXT: mov w0, w20
+; CHECK-NEXT: strh w20, [x19, x21, lsl #1]
 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -38,11 +38,11 @@
 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT: ubfx x21, x1, #9, #8
 ; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ldr w20, [x0, x21, lsl #2]
 ; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: ldr w20, [x0, x21, lsl #2]
 ; CHECK-NEXT: bl foo
-; CHECK-NEXT: str w20, [x19, x21, lsl #2]
 ; CHECK-NEXT: mov w0, w20
+; CHECK-NEXT: str w20, [x19, x21, lsl #2]
 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -63,11 +63,11 @@
 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
 ; CHECK-NEXT: ubfx x21, x1, #9, #8
 ; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: ldr x20, [x0, x21, lsl #3]
 ; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: ldr x20, [x0, x21, lsl #3]
 ; CHECK-NEXT: bl foo
-; CHECK-NEXT: str x20, [x19, x21, lsl #3]
 ; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: str x20, [x19, x21, lsl #3]
 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll
@@ -106,18 +106,18 @@
 ; CHECK-LE-LABEL: fsext_v2i32:
 ; CHECK-LE: // %bb.0:
 ; CHECK-LE-NEXT: ldrsb w8, [x0]
-; CHECK-LE-NEXT: ldrsb w9, [x0, #1]
 ; CHECK-LE-NEXT: fmov s0, w8
-; CHECK-LE-NEXT: mov v0.s[1], w9
+; CHECK-LE-NEXT: ldrsb w8, [x0, #1]
+; CHECK-LE-NEXT: mov v0.s[1], w8
 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-LE-NEXT: ret
 ;
 ; CHECK-BE-LABEL: fsext_v2i32:
 ; CHECK-BE: // %bb.0:
 ; CHECK-BE-NEXT: ldrsb w8, [x0]
-; CHECK-BE-NEXT: ldrsb w9, [x0, #1]
 ; CHECK-BE-NEXT: fmov s0, w8
-; CHECK-BE-NEXT: mov v0.s[1], w9
+; CHECK-BE-NEXT: ldrsb w8, [x0, #1]
+; CHECK-BE-NEXT: mov v0.s[1], w8
 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
 ; CHECK-BE-NEXT: ret
 %x = load <2 x i8>, <2 x i8>* %a
@@ -187,12 +187,12 @@
 ; CHECK-BE: // %bb.0:
 ; CHECK-BE-NEXT: ld1 { v0.8b }, [x0]
 ; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-BE-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-BE-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-BE-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: rev64 v1.4s, v1.4s
 ; CHECK-BE-NEXT: rev64 v0.4s, v0.4s
-; CHECK-BE-NEXT: rev64 v2.4s, v1.4s
-; CHECK-BE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT: ext v0.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-BE-NEXT: ret
 %x = load <8 x i8>, <8 x i8>* %a
 %y = sext <8 x i8> %x to <8 x i32>
@@ -251,18 +251,18 @@
 ; CHECK-LE-LABEL: fsext_v2i16:
 ; CHECK-LE: // %bb.0:
 ; CHECK-LE-NEXT: ldrsb w8, [x0]
-; CHECK-LE-NEXT: ldrsb w9, [x0, #1]
 ; CHECK-LE-NEXT: fmov s0, w8
-; CHECK-LE-NEXT: mov v0.s[1], w9
+; CHECK-LE-NEXT: ldrsb w8, [x0, #1]
+; CHECK-LE-NEXT: mov v0.s[1], w8
 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-LE-NEXT: ret
 ;
 ; CHECK-BE-LABEL: fsext_v2i16:
 ; CHECK-BE: // %bb.0:
 ; CHECK-BE-NEXT: ldrsb w8, [x0]
-; CHECK-BE-NEXT: ldrsb w9, [x0, #1]
 ; CHECK-BE-NEXT: fmov s0, w8
-; CHECK-BE-NEXT: mov v0.s[1], w9
+; CHECK-BE-NEXT: ldrsb w8, [x0, #1]
+; CHECK-BE-NEXT: mov v0.s[1], w8
 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s
 ; CHECK-BE-NEXT: ret
 %x = load <2 x i8>, <2 x i8>* %a
@@ -344,12 +344,12 @@
 ; CHECK-BE-LABEL: fsext_v16i16:
 ; CHECK-BE: // %bb.0:
 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x0]
-; CHECK-BE-NEXT: sshll v1.8h, v0.8b, #0
-; CHECK-BE-NEXT: sshll2 v0.8h, v0.16b, #0
+; CHECK-BE-NEXT: sshll2 v1.8h, v0.16b, #0
+; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: rev64 v1.8h, v1.8h
 ; CHECK-BE-NEXT: rev64 v0.8h, v0.8h
-; CHECK-BE-NEXT: rev64 v2.8h, v1.8h
-; CHECK-BE-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT: ext v0.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-BE-NEXT: ret
 %x = load <16 x i8>, <16 x i8>* %a
 %y = sext <16 x i8> %x to <16 x i16>
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -4,16 +4,16 @@
 define void @matrix_mul_unsigned(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
 ; CHECK-LABEL: matrix_mul_unsigned:
 ; CHECK: // %bb.0: // %vector.header
-; CHECK-NEXT: and w9, w3, #0xffff
+; CHECK-NEXT: and w8, w3, #0xffff
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: dup v0.4h, w8
 ; CHECK-NEXT: and x8, x0, #0xfffffff8
-; CHECK-NEXT: dup v0.4h, w9
 ; CHECK-NEXT: .LBB0_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: add x9, x2, w0, uxtw #1
+; CHECK-NEXT: subs x8, x8, #8
 ; CHECK-NEXT: ldp d1, d2, [x9]
 ; CHECK-NEXT: add x9, x1, w0, uxtw #2
-; CHECK-NEXT: subs x8, x8, #8
 ; CHECK-NEXT: add w0, w0, #8
 ; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
@@ -68,16 +68,16 @@
 define void @matrix_mul_signed(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
 ; CHECK-LABEL: matrix_mul_signed:
 ; CHECK: // %bb.0: // %vector.header
-; CHECK-NEXT: sxth w9, w3
+; CHECK-NEXT: sxth w8, w3
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: dup v0.4h, w8
 ; CHECK-NEXT: and x8, x0, #0xfffffff8
-; CHECK-NEXT: dup v0.4h, w9
 ; CHECK-NEXT: .LBB1_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: add x9, x2, w0, sxtw #1
+; CHECK-NEXT: subs x8, x8, #8
 ; CHECK-NEXT: ldp d1, d2, [x9]
 ; CHECK-NEXT: add x9, x1, w0, sxtw #2
-; CHECK-NEXT: subs x8, x8, #8
 ; CHECK-NEXT: add w0, w0, #8
 ; CHECK-NEXT: smull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT: smull v2.4s, v0.4h, v2.4h
@@ -133,20 +133,20 @@
 define void @matrix_mul_double_shuffle(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
 ; CHECK-LABEL: matrix_mul_double_shuffle:
 ; CHECK: // %bb.0: // %vector.header
-; CHECK-NEXT: and w9, w3, #0xffff
+; CHECK-NEXT: and w8, w3, #0xffff
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: dup v0.4h, w8
 ; CHECK-NEXT: and x8, x0, #0xfffffff8
-; CHECK-NEXT: dup v0.4h, w9
 ; CHECK-NEXT: .LBB2_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ldrh w9, [x2], #16
-; CHECK-NEXT: mov w10, w0
 ; CHECK-NEXT: subs x8, x8, #8
-; CHECK-NEXT: lsl x10, x10, #2
 ; CHECK-NEXT: dup v1.4h, w9
-; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT: mov w9, w0
+; CHECK-NEXT: lsl x9, x9, #2
 ; CHECK-NEXT: add w0, w0, #8
-; CHECK-NEXT: str q1, [x1, x10]
+; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
+; CHECK-NEXT: str q1, [x1, x9]
 ; CHECK-NEXT: b.ne .LBB2_1
 ; CHECK-NEXT: // %bb.2: // %for.end12
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -111,11 +111,11 @@
 define <4 x i32> @amull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: amull_v4i16_v4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: ldr d2, [x1]
+; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
+; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h
+; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -129,11 +129,11 @@
 define <2 x i64> @amull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ; CHECK-LABEL: amull_v2i32_v2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: ldr d2, [x1]
+; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-NEXT: smull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
@@ -147,8 +147,8 @@
 define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
 ; CHECK-LABEL: smlal_v8i8_v8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT: ret
@@ -165,8 +165,8 @@
 define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
 ; CHECK-LABEL: smlal_v4i16_v4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
 ; CHECK-NEXT: ret
@@ -183,8 +183,8 @@
 define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
 ; CHECK-LABEL: smlal_v2i32_v2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT: ret
@@ -201,8 +201,8 @@
 define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
 ; CHECK-LABEL: umlal_v8i8_v8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: umlal v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT: ret
@@ -219,8 +219,8 @@
 define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
 ; CHECK-LABEL: umlal_v4i16_v4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h
 ; CHECK-NEXT: ret
@@ -237,8 +237,8 @@
 define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
 ; CHECK-LABEL: umlal_v2i32_v2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT: ret
@@ -255,8 +255,8 @@
 define <8 x i16> @amlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
 ; CHECK-LABEL: amlal_v8i8_v8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: smlal v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
@@ -275,12 +275,12 @@
 define <4 x i32> @amlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
 ; CHECK-LABEL: amlal_v4i16_v4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h
-; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ldr q2, [x0]
+; CHECK-NEXT: ldr d3, [x2]
+; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
+; CHECK-NEXT: smlal v2.4s, v1.4h, v3.4h
+; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -296,12 +296,12 @@
 define <2 x i64> @amlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
 ; CHECK-LABEL: amlal_v2i32_v2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ldr q2, [x0]
+; CHECK-NEXT: ldr d3, [x2]
+; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-NEXT: smlal v2.2d, v1.2s, v3.2s
+; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, <2 x i64>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
@@ -317,8 +317,8 @@
 define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
 ; CHECK-LABEL: smlsl_v8i8_v8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT: ret
@@ -335,8 +335,8 @@
 define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
 ; CHECK-LABEL: smlsl_v4i16_v4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h
 ; CHECK-NEXT: ret
@@ -353,8 +353,8 @@
 define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
 ; CHECK-LABEL: smlsl_v2i32_v2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT: ret
@@ -371,8 +371,8 @@
 define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
 ; CHECK-LABEL: umlsl_v8i8_v8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: umlsl v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT: ret
@@ -389,8 +389,8 @@
 define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
 ; CHECK-LABEL: umlsl_v4i16_v4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.4h
 ; CHECK-NEXT: ret
@@ -407,8 +407,8 @@
 define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
 ; CHECK-LABEL: umlsl_v2i32_v2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT: ret
@@ -425,8 +425,8 @@
 define <8 x i16> @amlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
 ; CHECK-LABEL: amlsl_v8i8_v8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d2, [x2]
 ; CHECK-NEXT: smlsl v0.8h, v1.8b, v2.8b
 ; CHECK-NEXT: bic v0.8h, #255, lsl #8
@@ -445,12 +445,12 @@
 define <4 x i32> @amlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
 ; CHECK-LABEL: amlsl_v4i16_v4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.4h
-; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ldr q2, [x0]
+; CHECK-NEXT: ldr d3, [x2]
+; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff
+; CHECK-NEXT: smlsl v2.4s, v1.4h, v3.4h
+; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -466,12 +466,12 @@
 define <2 x i64> @amlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
 ; CHECK-LABEL: amlsl_v2i32_v2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: ldr d2, [x2]
-; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.2s
-; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ldr q2, [x0]
+; CHECK-NEXT: ldr d3, [x2]
+; CHECK-NEXT: movi v0.2d, #0x000000ffffffff
+; CHECK-NEXT: smlsl v2.2d, v1.2s, v3.2s
+; CHECK-NEXT: and v0.16b, v2.16b, v0.16b
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i64>, <2 x i64>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
@@ -599,9 +599,9 @@
 ; CHECK-LABEL: amull_extvec_v4i16_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #1234
-; CHECK-NEXT: dup v1.4h, w8
-; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
 ; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEXT: dup v2.4h, w8
+; CHECK-NEXT: smull v0.4s, v0.4h, v2.4h
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp3 = zext <4 x i16> %arg to <4 x i32>
@@ -614,9 +614,9 @@
 ; CHECK-LABEL: amull_extvec_v2i32_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #1234
-; CHECK-NEXT: dup v1.2s, w8
-; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEXT: movi v1.2d, #0x000000ffffffff
+; CHECK-NEXT: dup v2.2s, w8
+; CHECK-NEXT: smull v0.2d, v0.2s, v2.2s
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %tmp3 = zext <2 x i32> %arg to <2 x i64>
@@ -752,11 +752,11 @@
 define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
 ; CHECK-LABEL: amull2_i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: smull v2.8h, v0.8b, v1.8b
-; CHECK-NEXT: smull2 v1.8h, v0.16b, v1.16b
+; CHECK-NEXT: smull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
 ; CHECK-NEXT: bic v2.8h, #255, lsl #8
-; CHECK-NEXT: bic v1.8h, #255, lsl #8
-; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: bic v0.8h, #255, lsl #8
+; CHECK-NEXT: mov v1.16b, v2.16b
 ; CHECK-NEXT: ret
 %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
 %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
@@ -768,11 +768,11 @@
 define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
 ; CHECK-LABEL: amull2_i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: smull v2.4s, v0.4h, v1.4h
-; CHECK-NEXT: smull2 v0.4s, v0.8h, v1.8h
-; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff
-; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
+; CHECK-NEXT: smull2 v3.4s, v0.8h, v1.8h
+; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT: and v1.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
%arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
@@ -784,11 +784,11 @@
define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; CHECK-LABEL: amull2_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: smull v2.2d, v0.2s, v1.2s
-; CHECK-NEXT: smull2 v0.2d, v0.4s, v1.4s
-; CHECK-NEXT: movi v3.2d, #0x000000ffffffff
-; CHECK-NEXT: and v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: and v0.16b, v2.16b, v3.16b
+; CHECK-NEXT: movi v2.2d, #0x000000ffffffff
+; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT: and v1.16b, v3.16b, v2.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
%arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
diff --git a/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll b/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-tail-dup-size.ll
@@ -28,9 +28,9 @@
; CHECK-O2-NEXT: mov x8, xzr
; CHECK-O2-NEXT: .LBB0_3: // %if.end
; CHECK-O2-NEXT: adrp x9, global_int
-; CHECK-O2-NEXT: ldr w1, [x9, :lo12:global_int]
; CHECK-O2-NEXT: add x2, x8, #16
; CHECK-O2-NEXT: mov w0, #10
+; CHECK-O2-NEXT: ldr w1, [x9, :lo12:global_int]
; CHECK-O2-NEXT: b externalfunc
;
; CHECK-O3-LABEL: testcase:
@@ -43,16 +43,16 @@
; CHECK-O3-NEXT: str x9, [x0]
; CHECK-O3-NEXT: ldr x8, [x8, :lo12:global_ptr]
; CHECK-O3-NEXT: adrp x9, global_int
-; CHECK-O3-NEXT: ldr w1, [x9, :lo12:global_int]
; CHECK-O3-NEXT: add x2, x8, #16
; CHECK-O3-NEXT: mov w0, #10
+; CHECK-O3-NEXT: ldr w1, [x9, :lo12:global_int]
; CHECK-O3-NEXT: b externalfunc
; CHECK-O3-NEXT: .LBB0_2:
; CHECK-O3-NEXT: mov x8, xzr
; CHECK-O3-NEXT: adrp x9, global_int
-; CHECK-O3-NEXT: ldr w1, [x9, :lo12:global_int]
; CHECK-O3-NEXT: add x2, x8, #16
; CHECK-O3-NEXT: mov w0, #10
+; CHECK-O3-NEXT: ldr w1, [x9, :lo12:global_int]
; CHECK-O3-NEXT: b externalfunc
entry:
%0 = load %a*, %a** @global_ptr, align 8
diff --git a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
--- a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
@@ -8,8 +8,8 @@
; CHECK-NEXT: add x8, sp, #40
; CHECK-NEXT: add x0, sp, #40
; CHECK-NEXT: stp x30, x18, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp x1, x2, [sp, #40]
; CHECK-NEXT: stp x3, x4, [sp, #56]
+; CHECK-NEXT: stp x1, x2, [sp, #40]
; CHECK-NEXT: stp x5, x6, [sp, #72]
; CHECK-NEXT: str x7, [sp, #88]
; CHECK-NEXT: str x8, [sp, #8]
@@ -70,8 +70,8 @@
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x18, [sp, #-32]! // 8-byte Folded Spill
; CHECK-NEXT: add x8, sp, #24
-; CHECK-NEXT: str x7, [sp, #24]
; CHECK-NEXT: add x0, sp, #24
+; CHECK-NEXT: str x7, [sp, #24]
; CHECK-NEXT: str x8, [sp, #8]
; CHECK-NEXT: ldr x18, [sp], #32 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/addimm-mulimm.ll b/llvm/test/CodeGen/AArch64/addimm-mulimm.ll
--- a/llvm/test/CodeGen/AArch64/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/AArch64/addimm-mulimm.ll
@@ -52,8 +52,8 @@
define i64 @addimm_mulimm_accept_10(i64 %a) {
; CHECK-LABEL: addimm_mulimm_accept_10:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #32888
; CHECK-NEXT: mov w8, #37
+; CHECK-NEXT: mov w9, #32888
; CHECK-NEXT: movk w9, #17, lsl #16
; CHECK-NEXT: madd x0, x0, x8, x9
; CHECK-NEXT: ret
@@ -65,8 +65,8 @@
define i64 @addimm_mulimm_accept_11(i64 %a) {
; CHECK-LABEL: addimm_mulimm_accept_11:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, #-32888
; CHECK-NEXT: mov w8, #37
+; CHECK-NEXT: mov x9, #-32888
; CHECK-NEXT: movk x9, #65518, lsl #16
; CHECK-NEXT: madd x0, x0, x8, x9
; CHECK-NEXT: ret
@@ -78,8 +78,8 @@
define signext i32 @addimm_mulimm_accept_12(i32 signext %a) {
; CHECK-LABEL: addimm_mulimm_accept_12:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #32888
; CHECK-NEXT: mov w8, #37
+; CHECK-NEXT: mov w9, #32888
; CHECK-NEXT: movk w9, #17, lsl #16
; CHECK-NEXT: madd w0, w0, w8, w9
; CHECK-NEXT: ret
@@ -91,8 +91,8 @@
define signext i32 @addimm_mulimm_accept_13(i32 signext %a) {
; CHECK-LABEL: addimm_mulimm_accept_13:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #32648
; CHECK-NEXT: mov w8, #37
+; CHECK-NEXT: mov w9, #32648
; CHECK-NEXT: movk w9, #65518, lsl #16
; CHECK-NEXT: madd w0, w0, w8, w9
; CHECK-NEXT: ret
@@ -104,9 +104,9 @@
define i64 @addimm_mulimm_reject_00(i64 %a) {
; CHECK-LABEL: addimm_mulimm_reject_00:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #3100
-; CHECK-NEXT: mov w9, #3700
-; CHECK-NEXT: mul x0, x8, x9
+; CHECK-NEXT: mov w8, #3700
+; CHECK-NEXT: add x9, x0, #3100
+; CHECK-NEXT: mul x0, x9, x8
; CHECK-NEXT: ret
%tmp0 = add i64 %a, 3100
%tmp1 = mul i64 %tmp0, 3700
@@ -116,9 +116,9 @@
define i64 @addimm_mulimm_reject_01(i64 %a) {
; CHECK-LABEL: addimm_mulimm_reject_01:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub x8, x0, #3100
-; CHECK-NEXT: mov w9, #3700
-; CHECK-NEXT: mul x0, x8, x9
+; CHECK-NEXT: mov w8, #3700
+; CHECK-NEXT: sub x9, x0, #3100
+; CHECK-NEXT: mul x0, x9, x8
; CHECK-NEXT: ret
%tmp0 = add i64 %a, -3100
%tmp1 = mul i64 %tmp0, 3700
@@ -128,9 +128,9 @@
define signext i32 @addimm_mulimm_reject_02(i32 signext %a) {
; CHECK-LABEL: addimm_mulimm_reject_02:
; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, #3100
-; CHECK-NEXT: mov w9, #3700
-; CHECK-NEXT: mul w0, w8, w9
+; CHECK-NEXT: mov w8, #3700
+; CHECK-NEXT: add w9, w0, #3100
+; CHECK-NEXT: mul w0, w9, w8
; CHECK-NEXT: ret
%tmp0 = add i32 %a, 3100
%tmp1 = mul i32 %tmp0, 3700
@@ -140,9 +140,9 @@
define signext i32 @addimm_mulimm_reject_03(i32 signext %a) {
; CHECK-LABEL: addimm_mulimm_reject_03:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub w8, w0, #3100
-; CHECK-NEXT: mov w9, #3700
-; CHECK-NEXT: mul w0, w8, w9
+; CHECK-NEXT: mov w8, #3700
+; CHECK-NEXT: sub w9, w0, #3100
+; CHECK-NEXT: mul w0, w9, w8
; CHECK-NEXT: ret
%tmp0 = add i32 %a, -3100
%tmp1 = mul i32 %tmp0, 3700
diff --git a/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll b/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
--- a/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
+++ b/llvm/test/CodeGen/AArch64/addsub-constant-folding.ll
@@ -57,9 +57,9 @@
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: bl vec_use
+; CHECK-NEXT: movi v0.4s, #10
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: movi v0.4s, #10
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -134,9 +134,9 @@
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: bl vec_use
+; CHECK-NEXT: movi v0.4s, #6
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: movi v0.4s, #6
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -213,9 +213,9 @@
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: bl vec_use
+; CHECK-NEXT: mvni v0.4s, #5
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: mvni v0.4s, #5
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -290,9 +290,9 @@
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: bl vec_use
+; CHECK-NEXT: mvni v0.4s, #5
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: mvni v0.4s, #5
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -367,9 +367,9 @@
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: bl vec_use
+; CHECK-NEXT: movi v0.4s, #10
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: movi v0.4s, #10
; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -446,9 +446,9 @@
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: bl vec_use
+; CHECK-NEXT: movi v0.4s, #2
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: movi v0.4s, #2
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -526,9 +526,9 @@
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
; CHECK-NEXT: bl vec_use
+; CHECK-NEXT: movi v0.4s, #10
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: movi v0.4s, #10
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -606,9 +606,9 @@
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
; CHECK-NEXT: bl vec_use
+; CHECK-NEXT: movi v0.4s, #6
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: movi v0.4s, #6
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
@@ -685,9 +685,9 @@
; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: bl vec_use
+; CHECK-NEXT: movi v0.4s, #2
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: movi v0.4s, #2
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll
--- a/llvm/test/CodeGen/AArch64/addsub.ll
+++ b/llvm/test/CodeGen/AArch64/addsub.ll
@@ -247,8 +247,8 @@
; CHECK-NEXT: b.ne .LBB13_6
; CHECK-NEXT: // %bb.1: // %test2
; CHECK-NEXT: adrp x10, :got:var2_i32
-; CHECK-NEXT: ldr x10, [x10, :got_lo12:var2_i32]
; CHECK-NEXT: add w11, w9, #1
+; CHECK-NEXT: ldr x10, [x10, :got_lo12:var2_i32]
; CHECK-NEXT: str w11, [x8]
; CHECK-NEXT: ldr w10, [x10]
; CHECK-NEXT: cmp w10, #3567, lsl #12 // =14610432
diff --git a/llvm/test/CodeGen/AArch64/align-down.ll b/llvm/test/CodeGen/AArch64/align-down.ll
--- a/llvm/test/CodeGen/AArch64/align-down.ll
+++ b/llvm/test/CodeGen/AArch64/align-down.ll
@@ -54,9 +54,9 @@
define i32 @t3_extrause0(i32 %ptr, i32 %alignment, i32* %mask_storage) nounwind {
; CHECK-LABEL: t3_extrause0:
; CHECK: // %bb.0:
-; CHECK-NEXT: neg w9, w1
+; CHECK-NEXT: neg w8, w1
+; CHECK-NEXT: and w0, w0, w8
; CHECK-NEXT: sub w8, w1, #1
-; CHECK-NEXT: and w0, w0, w9
; CHECK-NEXT: str w8, [x2]
; CHECK-NEXT: ret
%mask = add i32 %alignment, -1
@@ -83,10 +83,10 @@
; CHECK-LABEL: n5_extrause2:
; CHECK: // %bb.0:
; CHECK-NEXT: sub w8, w1, #1
+; CHECK-NEXT: and w9, w0, w8
+; CHECK-NEXT: sub w0, w0, w9
; CHECK-NEXT: str w8, [x2]
-; CHECK-NEXT: and w8, w0, w8
-; CHECK-NEXT: sub w0, w0, w8
-; CHECK-NEXT: str w8, [x3]
+; CHECK-NEXT: str w9, [x3]
; CHECK-NEXT: ret
%mask = add i32 %alignment, -1
store i32 %mask, i32* %mask_storage
diff --git a/llvm/test/CodeGen/AArch64/and-mask-removal.ll b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
--- a/llvm/test/CodeGen/AArch64/and-mask-removal.ll
+++ b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
@@ -10,17 +10,17 @@
; CHECK-LABEL: new_position:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: adrp x9, _board@GOTPAGE
-; CHECK-NEXT: ldr x9, [x9, _board@GOTPAGEOFF]
; CHECK-NEXT: ; kill: def $w0 killed $w0 def $x0
; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: ldr x9, [x9, _board@GOTPAGEOFF]
; CHECK-NEXT: ldrb w9, [x9, x8]
; CHECK-NEXT: sub w9, w9, #1
; CHECK-NEXT: cmp w9, #1
; CHECK-NEXT: b.hi LBB0_2
; CHECK-NEXT: ; %bb.1: ; %if.then
; CHECK-NEXT: adrp x9, _next_string@GOTPAGE
-; CHECK-NEXT: ldr x9, [x9, _next_string@GOTPAGEOFF]
; CHECK-NEXT: adrp x10, _string_number@GOTPAGE
+; CHECK-NEXT: ldr x9, [x9, _next_string@GOTPAGEOFF]
; CHECK-NEXT: ldr w9, [x9]
; CHECK-NEXT: ldr x10, [x10, _string_number@GOTPAGEOFF]
; CHECK-NEXT: str w9, [x10, x8, lsl #2]
@@ -218,8 +218,8 @@
; CHECK-LABEL: test16_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: mov w8, #16882
-; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: mov w9, #40700
+; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: cmp w9, w8, uxth
; CHECK-NEXT: cset w0, hi
; CHECK-NEXT: ret
@@ -254,8 +254,8 @@
; CHECK-LABEL: test16_4:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: mov w8, #29985
-; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: mov w9, #15676
+; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: cmp w9, w8, uxth
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
@@ -290,8 +290,8 @@
; CHECK-LABEL: test16_6:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: mov w8, #-32194
-; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: mov w9, #24320
+; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: cset w0, hi
; CHECK-NEXT: ret
@@ -309,8 +309,8 @@
; CHECK-LABEL: test16_7:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: mov w8, #9272
-; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: mov w9, #22619
+; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: cmp w9, w8, uxth
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
--- a/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
+++ b/llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
@@ -385,9 +385,12 @@
; CHECK-NEXT: bl return_in_block
; CHECK-NEXT: adrp x8, in_block_store
; CHECK-NEXT: add x8, x8, :lo12:in_block_store
-; CHECK-NEXT: stp d0, d1, [x8]
-; CHECK-NEXT: stp d2, d3, [x8, #16]
-; CHECK-NEXT: stp d4, d5, [x8, #32]
+; CHECK-NEXT: str d0, [x8]
+; CHECK-NEXT: str d1, [x8, #8]
+; CHECK-NEXT: str d2, [x8, #16]
+; CHECK-NEXT: str d3, [x8, #24]
+; CHECK-NEXT: str d4, [x8, #32]
+; CHECK-NEXT: str d5, [x8, #40]
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%1 = call %T_IN_BLOCK @return_in_block()
@@ -400,9 +403,12 @@
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, in_block_store
; CHECK-NEXT: add x8, x8, :lo12:in_block_store
-; CHECK-NEXT: stp d4, d5, [x8, #32]
-; CHECK-NEXT: stp d2, d3, [x8, #16]
-; CHECK-NEXT: stp d0, d1, [x8]
+; CHECK-NEXT: str d5, [x8, #40]
+; CHECK-NEXT: str d4, [x8, #32]
+; CHECK-NEXT: str d3, [x8, #24]
+; CHECK-NEXT: str d2, [x8, #16]
+; CHECK-NEXT: str d1, [x8, #8]
+; CHECK-NEXT: str d0, [x8]
; CHECK-NEXT: ret
store %T_IN_BLOCK %a, %T_IN_BLOCK* @in_block_store
ret void
@@ -451,17 +457,17 @@
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: add x8, sp, #8
; CHECK-NEXT: bl return_in_memory
-; CHECK-NEXT: ldr d0, [sp, #72]
-; CHECK-NEXT: ldur q1, [sp, #24]
-; CHECK-NEXT: ldur q2, [sp, #8]
-; CHECK-NEXT: ldur q3, [sp, #56]
-; CHECK-NEXT: ldur q4, [sp, #40]
-; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: adrp x8, in_memory_store
+; CHECK-NEXT: ldur q0, [sp, #24]
+; CHECK-NEXT: ldur q1, [sp, #8]
; CHECK-NEXT: add x8, x8, :lo12:in_memory_store
-; CHECK-NEXT: stp q2, q1, [x8]
-; CHECK-NEXT: stp q4, q3, [x8, #32]
-; CHECK-NEXT: str d0, [x8, #64]
+; CHECK-NEXT: ldur q2, [sp, #56]
+; CHECK-NEXT: ldur q3, [sp, #40]
+; CHECK-NEXT: ldr d4, [sp, #72]
+; CHECK-NEXT: stp q1, q0, [x8]
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: stp q3, q2, [x8, #32]
+; CHECK-NEXT: str d4, [x8, #64]
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
%1 = call %T_IN_MEMORY @return_in_memory()
@@ -472,14 +478,15 @@
define void @callee_in_memory(%T_IN_MEMORY %a) {
; CHECK-LABEL: callee_in_memory:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [sp, #32]
-; CHECK-NEXT: ldr d2, [sp, #64]
-; CHECK-NEXT: ldp q3, q4, [sp]
; CHECK-NEXT: adrp x8, in_memory_store
+; CHECK-NEXT: ldr d0, [sp, #64]
+; CHECK-NEXT: ldp q1, q2, [sp, #32]
; CHECK-NEXT: add x8, x8, :lo12:in_memory_store
-; CHECK-NEXT: str d2, [x8, #64]
-; CHECK-NEXT: stp q0, q1, [x8, #32]
-; CHECK-NEXT: stp q3, q4, [x8]
+; CHECK-NEXT: str d0, [x8, #64]
+; CHECK-NEXT: ldr q3, [sp, #16]
+; CHECK-NEXT: stp q1, q2, [x8, #32]
+; CHECK-NEXT: ldr q0, [sp]
+; CHECK-NEXT: stp q0, q3, [x8]
; CHECK-NEXT: ret
store %T_IN_MEMORY %a, %T_IN_MEMORY* @in_memory_store
ret void
@@ -496,8 +503,8 @@
; CHECK-NEXT: add x8, x8, :lo12:in_memory_store
; CHECK-NEXT: ldp q0, q1, [x8]
; CHECK-NEXT: ldp q2, q3, [x8, #32]
-; CHECK-NEXT: ldr d4, [x8, #64]
; CHECK-NEXT: stp q0, q1, [sp]
+; CHECK-NEXT: ldr d4, [x8, #64]
; CHECK-NEXT: stp q2, q3, [sp, #32]
; CHECK-NEXT: str d4, [sp, #64]
; CHECK-NEXT: bl callee_in_memory
diff --git a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
--- a/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -5,19 +5,19 @@
define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK: // %bb.0:
-; CHECK-NEXT: add.2d v2, v0, v1
-; CHECK-NEXT: add d0, d2, d1
-; CHECK-NEXT: sub d1, d2, d1
-; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: add.2d v0, v0, v1
+; CHECK-NEXT: sub d2, d0, d1
+; CHECK-NEXT: add d0, d0, d1
+; CHECK-NEXT: fmov x8, d2
; CHECK-NEXT: mov.d v0[1], x8
; CHECK-NEXT: ret
;
; GENERIC-LABEL: bar:
; GENERIC: // %bb.0:
-; GENERIC-NEXT: add v2.2d, v0.2d, v1.2d
-; GENERIC-NEXT: add d0, d2, d1
-; GENERIC-NEXT: sub d1, d2, d1
-; GENERIC-NEXT: fmov x8, d1
+; GENERIC-NEXT: add v0.2d, v0.2d, v1.2d
+; GENERIC-NEXT: sub d2, d0, d1
+; GENERIC-NEXT: add d0, d0, d1
+; GENERIC-NEXT: fmov x8, d2
; GENERIC-NEXT: mov v0.d[1], x8
; GENERIC-NEXT: ret
%add = add <2 x i64> %a, %b
@@ -68,16 +68,16 @@
define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: add_sub_su64:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d2, xzr
; CHECK-NEXT: add d0, d1, d0
-; CHECK-NEXT: sub d0, d2, d0
+; CHECK-NEXT: fmov d1, xzr
+; CHECK-NEXT: sub d0, d1, d0
; CHECK-NEXT: ret
;
; GENERIC-LABEL: add_sub_su64:
; GENERIC: // %bb.0:
-; GENERIC-NEXT: fmov d2, xzr
; GENERIC-NEXT: add d0, d1, d0
-; GENERIC-NEXT: sub d0, d2, d0
+; GENERIC-NEXT: fmov d1, xzr
+; GENERIC-NEXT: sub d0, d1, d0
; GENERIC-NEXT: ret
%vecext = extractelement <2 x i64> %a, i32 0
%vecext1 = extractelement <2 x i64> %b, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
--- a/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
@@ -10,33 +10,32 @@
define zeroext i8 @fullGtU(i32 %i1, i32 %i2) {
; CHECK-LABEL: fullGtU:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: adrp x8, _block@GOTPAGE
-; CHECK-NEXT: ldr x8, [x8, _block@GOTPAGEOFF]
+; CHECK-NEXT: adrp x10, _block@GOTPAGE
; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: ; kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: sxtw x9, w0
-; CHECK-NEXT: sxtw x10, w1
-; CHECK-NEXT: ldr x8, [x8]
-; CHECK-NEXT: ldrb w11, [x8, x9]
-; CHECK-NEXT: ldrb w12, [x8, x10]
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: sxtw x9, w1
+; CHECK-NEXT: ldr x10, [x10, _block@GOTPAGEOFF]
+; CHECK-NEXT: ldr x10, [x10]
+; CHECK-NEXT: ldrb w11, [x10, x8]
+; CHECK-NEXT: ldrb w12, [x10, x9]
; CHECK-NEXT: cmp w11, w12
-; CHECK-NEXT: b.ne LBB0_4
+; CHECK-NEXT: b.ne LBB0_3
; CHECK-NEXT: ; %bb.1: ; %if.end
-; CHECK-NEXT: add x9, x9, x8
-; CHECK-NEXT: add x8, x10, x8
-; CHECK-NEXT: ldrb w10, [x9, #1]
-; CHECK-NEXT: ldrb w11, [x8, #1]
+; CHECK-NEXT: add x8, x8, x10
+; CHECK-NEXT: add x9, x9, x10
+; CHECK-NEXT: ldrb w10, [x8, #1]
+; CHECK-NEXT: ldrb w11, [x9, #1]
; CHECK-NEXT: cmp w10, w11
-; CHECK-NEXT: b.ne LBB0_4
+; CHECK-NEXT: b.ne LBB0_3
; CHECK-NEXT: ; %bb.2: ; %if.end25
-; CHECK-NEXT: ldrb w9, [x9, #2]
; CHECK-NEXT: ldrb w8, [x8, #2]
-; CHECK-NEXT: cmp w9, w8
-; CHECK-NEXT: b.ne LBB0_4
-; CHECK-NEXT: ; %bb.3:
-; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ldrb w9, [x9, #2]
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: cset w8, hi
+; CHECK-NEXT: csel w0, wzr, w8, eq
; CHECK-NEXT: ret
-; CHECK-NEXT: LBB0_4: ; %if.then36
+; CHECK-NEXT: LBB0_3: ; %if.then19
; CHECK-NEXT: cset w0, hi
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -67,9 +67,9 @@
define void @t6(i64 %a, i64* %object) {
; CHECK-LABEL: t6:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x1, x0, lsl #3
-; CHECK-NEXT: mov w9, #32768
-; CHECK-NEXT: ldr xzr, [x8, x9]
+; CHECK-NEXT: mov w8, #32768
+; CHECK-NEXT: add x9, x1, x0, lsl #3
+; CHECK-NEXT: ldr xzr, [x9, x8]
; CHECK-NEXT: ret
%tmp1 = getelementptr inbounds i64, i64* %object, i64 %a
%incdec.ptr = getelementptr inbounds i64, i64* %tmp1, i64 4096
diff --git a/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll b/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
--- a/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
@@ -896,18 +896,18 @@
; LLC-LABEL: fct20:
; LLC: // %bb.0: // %entry
; LLC-NEXT: mov x12, #11776
+; LLC-NEXT: extr x9, x1, x0, #18
; LLC-NEXT: movk x12, #25856, lsl #16
+; LLC-NEXT: lsr x8, x1, #18
; LLC-NEXT: movk x12, #11077, lsl #32
-; LLC-NEXT: extr x8, x1, x0, #18
-; LLC-NEXT: lsr x9, x1, #18
; LLC-NEXT: orr x10, x2, x3
; LLC-NEXT: mov w11, #26220
; LLC-NEXT: movk x12, #45, lsl #48
-; LLC-NEXT: and x11, x9, x11
-; LLC-NEXT: and x12, x8, x12
+; LLC-NEXT: and x11, x8, x11
+; LLC-NEXT: and x12, x9, x12
; LLC-NEXT: cmp x10, #0
-; LLC-NEXT: csel x0, x12, x8, eq
-; LLC-NEXT: csel x1, x11, x9, eq
+; LLC-NEXT: csel x0, x12, x9, eq
+; LLC-NEXT: csel x1, x11, x8, eq
; LLC-NEXT: ret
; OPT-LABEL: @fct20(
; OPT-NEXT: entry:
@@ -941,8 +941,8 @@
define i64 @fct21(i64 %x) {
; LLC-LABEL: fct21:
; LLC: // %bb.0: // %entry
-; LLC-NEXT: adrp x9, arr
; LLC-NEXT: ubfx x8, x0, #4, #4
+; LLC-NEXT: adrp x9, arr
; LLC-NEXT: add x9, x9, :lo12:arr
; LLC-NEXT: ldr x0, [x9, x8, lsl #3]
; LLC-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -612,9 +612,9 @@
; CHECK-LABEL: _setL
; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE
+; CHECK-NEXT: ; kill
; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF]
-; CHECK-NEXT: ; kill
; Ultimately we should generate str b0, but right now, we match the vector
; variant which does not allow to fold the immediate into the store.
; CHECK-NEXT: st1.b { v0 }[0], [x[[LDRGOT_REG]]]
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -5,12 +5,12 @@
define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
; CHECK-LABEL: fptosi_v4f64_to_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
%tmp1 = load <4 x double>, <4 x double>* %ptr
%tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
@@ -20,18 +20,18 @@
define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
; CHECK-LABEL: fptosi_v4f64_to_v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q1, q0, [x0, #32]
-; CHECK-NEXT: ldp q3, q2, [x0]
+; CHECK-NEXT: ldp q0, q1, [x0, #32]
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-NEXT: ldp q2, q3, [x0]
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
-; CHECK-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-NEXT: fcvtzs v3.2d, v3.2d
; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtzs v2.2d, v2.2d
; CHECK-NEXT: xtn v1.2s, v1.2d
+; CHECK-NEXT: fcvtzs v3.2d, v3.2d
+; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
; CHECK-NEXT: xtn v2.2s, v2.2d
; CHECK-NEXT: xtn v3.2s, v3.2d
-; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: uzp1 v1.4h, v3.4h, v2.4h
+; CHECK-NEXT: uzp1 v1.4h, v2.4h, v3.4h
; CHECK-NEXT: uzp1 v0.8b, v1.8b, v0.8b
; CHECK-NEXT: ret
%tmp1 = load <8 x double>, <8 x double>* %ptr
@@ -44,8 +44,8 @@
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
; CHECK-NEXT: fcvtn v0.4h, v0.4s
; CHECK-NEXT: ret
@@ -70,12 +70,12 @@
define <4 x i16> @fptoui_v4f64_to_v4i16(<4 x double>* %ptr) {
; CHECK-LABEL: fptoui_v4f64_to_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: uzp1 v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
%tmp1 = load <4 x double>, <4 x double>* %ptr
%tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>
diff --git a/llvm/test/CodeGen/AArch64/arm64-csel.ll b/llvm/test/CodeGen/AArch64/arm64-csel.ll
--- a/llvm/test/CodeGen/AArch64/arm64-csel.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-csel.ll
@@ -138,8 +138,8 @@
define i32 @foo9(i32 %v) nounwind readnone optsize ssp {
; CHECK-LABEL: foo9:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: mov w8, #4
+; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: cinv w0, w8, eq
; CHECK-NEXT: ret
entry:
@@ -151,8 +151,8 @@
define i64 @foo10(i64 %v) nounwind readnone optsize ssp {
; CHECK-LABEL: foo10:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: mov w8, #4
+; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cinv x0, x8, eq
; CHECK-NEXT: ret
entry:
@@ -164,8 +164,8 @@
define i32 @foo11(i32 %v) nounwind readnone optsize ssp {
; CHECK-LABEL: foo11:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: mov w8, #4
+; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: cneg w0, w8, eq
; CHECK-NEXT: ret
entry:
@@ -177,8 +177,8 @@
define i64 @foo12(i64 %v) nounwind readnone optsize ssp {
; CHECK-LABEL: foo12:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: mov w8, #4
+; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cneg x0, x8, eq
; CHECK-NEXT: ret
entry:
@@ -281,8 +281,8 @@
define i32 @foo20(i32 %x) {
; CHECK-LABEL: foo20:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp w0, #5
; CHECK-NEXT: mov w8, #6
+; CHECK-NEXT: cmp w0, #5
; CHECK-NEXT: csinc w0, w8, wzr, eq
; CHECK-NEXT: ret
%cmp = icmp eq i32 %x, 5
@@ -293,8 +293,8 @@
define i64 @foo21(i64 %x) {
; CHECK-LABEL: foo21:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x0, #5
; CHECK-NEXT: mov w8, #6
+; CHECK-NEXT: cmp x0, #5
; CHECK-NEXT: csinc x0, x8, xzr, eq
; CHECK-NEXT: ret
%cmp = icmp eq i64 %x, 5
@@ -305,8 +305,8 @@
define i32 @foo22(i32 %x) {
; CHECK-LABEL: foo22:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp w0, #5
; CHECK-NEXT: mov w8, #6
+; CHECK-NEXT: cmp w0, #5
; CHECK-NEXT: csinc w0, w8, wzr, ne
; CHECK-NEXT: ret
%cmp = icmp eq i32 %x, 5
@@ -317,8 +317,8 @@
define i64 @foo23(i64 %x) {
; CHECK-LABEL: foo23:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x0, #5
; CHECK-NEXT: mov w8, #6
+; CHECK-NEXT: cmp x0, #5
; CHECK-NEXT: csinc x0, x8, xzr, ne
; CHECK-NEXT: ret
%cmp = icmp eq i64 %x, 5
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -446,11 +446,11 @@
define void @disguised_dup(<4 x float> %x, <4 x float>* %p1, <4 x float>* %p2) {
; CHECK-LABEL: disguised_dup:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup.4s v1, v0[0]
-; CHECK-NEXT: ext.16b v0, v0, v0, #12
-; CHECK-NEXT: ext.16b v0, v0, v1, #8
-; CHECK-NEXT: str q0, [x0]
-; CHECK-NEXT: str q1, [x1]
+; CHECK-NEXT: ext.16b v1, v0, v0, #12
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: ext.16b v1, v1, v0, #8
+; CHECK-NEXT: str q1, [x0]
+; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
%shuf = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
%dup = shufflevector <4 x float> %shuf, <4 x float> undef, <4 x i32>
diff --git a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
--- a/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
@@ -6,8 +6,8 @@
define float @test1(float %x, float %y) nounwind {
; CHECK-LABEL: test1:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: ; kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: movi.4s v2, #128, lsl #24
+; CHECK-NEXT: ; kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: ; kill: def $s1 killed $s1 def $q1
; CHECK-NEXT: bit.16b v0, v1, v2
; CHECK-NEXT: ; kill: def $s0 killed $s0 killed $q0
@@ -22,8 +22,8 @@
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fneg.2d v2, v2
; CHECK-NEXT: ; kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fneg.2d v2, v2
; CHECK-NEXT: bit.16b v0, v1, v2
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -36,11 +36,11 @@
define double @test3(double %a, float %b, float %c) nounwind {
; CHECK-LABEL: test3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: fadd s1, s1, s2
-; CHECK-NEXT: movi.2d v2, #0000000000000000
+; CHECK-NEXT: movi.2d v3, #0000000000000000
; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fadd s1, s1, s2
+; CHECK-NEXT: fneg.2d v2, v3
; CHECK-NEXT: fcvt d1, s1
-; CHECK-NEXT: fneg.2d v2, v2
; CHECK-NEXT: bit.16b v0, v1, v2
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -55,11 +55,11 @@
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
; CHECK-NEXT: bl _bar
+; CHECK-NEXT: movi.4s v1, #128, lsl #24
; CHECK-NEXT: fcvt s0, d0
-; CHECK-NEXT: fmov s1, #0.50000000
-; CHECK-NEXT: movi.4s v2, #128, lsl #24
-; CHECK-NEXT: bit.16b v1, v0, v2
-; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fmov s2, #0.50000000
+; CHECK-NEXT: bit.16b v2, v0, v1
+; CHECK-NEXT: fadd s0, s0, s2
; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/arm64-fmadd.ll b/llvm/test/CodeGen/AArch64/arm64-fmadd.ll
--- a/llvm/test/CodeGen/AArch64/arm64-fmadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fmadd.ll
@@ -115,10 +115,10 @@
; CHECK-LABEL: negated_constant:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-1037565952
-; CHECK-NEXT: mov w9, #1109917696
; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #1109917696
; CHECK-NEXT: fmul s1, s0, s1
-; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: fmadd s0, s0, s2, s1
; CHECK-NEXT: ret
%m = fmul float %x, 42.0
diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll
--- a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll
@@ -20,23 +20,23 @@
; CHECK-NEXT: .cfi_offset b10, -56
; CHECK-NEXT: .cfi_offset b11, -64
; CHECK-NEXT: fmov s3, #1.00000000
+; CHECK-NEXT: scvtf s4, w0
+; CHECK-NEXT: sub w19, w0, #1
; CHECK-NEXT: fadd s8, s0, s3
; CHECK-NEXT: fadd s0, s8, s1
-; CHECK-NEXT: scvtf s4, w0
; CHECK-NEXT: fadd s0, s0, s2
; CHECK-NEXT: fsub s9, s0, s4
; CHECK-NEXT: fmov s0, s8
-; CHECK-NEXT: sub w19, w0, #1
; CHECK-NEXT: bl __Z3goof
; CHECK-NEXT: fmov s10, s0
; CHECK-NEXT: fmov s0, s9
; CHECK-NEXT: bl __Z3goof
; CHECK-NEXT: fadd s0, s10, s0
-; CHECK-NEXT: fmul s0, s8, s0
-; CHECK-NEXT: fadd s0, s9, s0
; CHECK-NEXT: scvtf s1, w19
; CHECK-NEXT: ldp x29, x30, [sp, #48]
; CHECK-NEXT: ldp x20, x19, [sp, #32]
+; CHECK-NEXT: fmul s0, s8, s0
+; CHECK-NEXT: fadd s0, s9, s0
; CHECK-NEXT: ldp d9, d8, [sp, #16]
; CHECK-NEXT: fsub s0, s0, s1
; CHECK-NEXT: ldp d11, d10, [sp], #64
@@ -47,8 +47,8 @@
; CHECK-LINUX-NEXT: stp d11, d10, [sp, #-64]!
; CHECK-LINUX-NEXT: stp d9, d8, [sp, #16]
; CHECK-LINUX-NEXT: stp x29, x30, [sp, #32]
-; CHECK-LINUX-NEXT: stp x20, x19, [sp, #48]
; CHECK-LINUX-NEXT: add x29, sp, #32
+; CHECK-LINUX-NEXT: stp x20, x19, [sp, #48]
; CHECK-LINUX-NEXT: .cfi_def_cfa w29, 32
; CHECK-LINUX-NEXT: .cfi_offset w19, -8
; CHECK-LINUX-NEXT: .cfi_offset w20, -16
@@ -59,23 +59,23 @@
; CHECK-LINUX-NEXT: .cfi_offset b10, -56
; CHECK-LINUX-NEXT: .cfi_offset b11, -64
; CHECK-LINUX-NEXT: fmov s3, #1.00000000
+; CHECK-LINUX-NEXT: scvtf s4, w0
+; CHECK-LINUX-NEXT: sub w19, w0, #1
; CHECK-LINUX-NEXT: fadd s8, s0, s3
; CHECK-LINUX-NEXT: fadd s0, s8, s1
-; CHECK-LINUX-NEXT: scvtf s4, w0
; CHECK-LINUX-NEXT: fadd s0, s0, s2
; CHECK-LINUX-NEXT: fsub s9, s0, s4
; CHECK-LINUX-NEXT: fmov s0, s8
-; CHECK-LINUX-NEXT: sub w19, w0, #1
; CHECK-LINUX-NEXT: bl _Z3goof
; CHECK-LINUX-NEXT: fmov s10, s0
; CHECK-LINUX-NEXT: fmov s0, s9
; CHECK-LINUX-NEXT: bl _Z3goof
; CHECK-LINUX-NEXT: fadd s0, s10, s0
-; CHECK-LINUX-NEXT: fmul s0, s8, s0
-; CHECK-LINUX-NEXT: fadd s0, s9, s0
; CHECK-LINUX-NEXT: scvtf s1, w19
; CHECK-LINUX-NEXT: ldp x20, x19, [sp, #48]
; CHECK-LINUX-NEXT: ldp x29, x30, [sp, #32]
+; CHECK-LINUX-NEXT: fmul s0, s8, s0
+; CHECK-LINUX-NEXT: fadd s0, s9, s0
; CHECK-LINUX-NEXT: ldp d9, d8, [sp, #16]
; CHECK-LINUX-NEXT: fsub s0, s0, s1
; CHECK-LINUX-NEXT: ldp d11, d10, [sp], #64
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -15,9 +15,9 @@
define i64* @store64idxpos256(i64* %ptr, i64 %index, i64 %spacing) {
; CHECK-LABEL: store64idxpos256:
; CHECK: ; %bb.0:
-; CHECK-NEXT: add x8, x0, #256
-; CHECK-NEXT: str x2, [x0]
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: add x0, x0, #256
+; CHECK-NEXT: str x2, [x8]
; CHECK-NEXT: ret
%incdec.ptr = getelementptr inbounds i64, i64* %ptr, i64 32
store i64 %spacing, i64* %ptr, align 4
@@ -47,9 +47,9 @@
define i32* @store32idxpos256(i32* %ptr, i32 %index, i32 %spacing) {
; CHECK-LABEL: store32idxpos256:
; CHECK: ; %bb.0:
-; CHECK-NEXT: add x8, x0, #256
-; CHECK-NEXT: str w2, [x0]
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: add x0, x0, #256
+; CHECK-NEXT: str w2, [x8]
; CHECK-NEXT: ret
%incdec.ptr = getelementptr inbounds i32, i32* %ptr, i64 64
store i32 %spacing, i32* %ptr, align 4
@@ -79,9 +79,9 @@
define i16* @store16idxpos256(i16* %ptr, i16 %index, i16 %spacing) {
; CHECK-LABEL: store16idxpos256:
; CHECK: ; %bb.0:
-; CHECK-NEXT: add x8, x0, #256
-; CHECK-NEXT: strh w2, [x0]
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: add x0, x0, #256
+; CHECK-NEXT: strh w2, [x8]
; CHECK-NEXT: ret
%incdec.ptr = getelementptr inbounds i16, i16* %ptr, i64 128
store i16 %spacing, i16* %ptr, align 4
@@ -111,9 +111,9 @@
define i8* @store8idxpos256(i8* %ptr, i8 %index, i8 %spacing) {
; CHECK-LABEL: store8idxpos256:
; CHECK: ; %bb.0:
-; CHECK-NEXT: add x8, x0, #256
-; CHECK-NEXT: strb w2, [x0]
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: add x0, x0, #256
+; CHECK-NEXT: strb w2, [x8]
; CHECK-NEXT: ret
%incdec.ptr = getelementptr inbounds i8, i8* %ptr, i64 256
store i8 %spacing, i8* %ptr, align 4
@@ -238,9 +238,9 @@
define i64* @pre64idxpos256(i64* %ptr, i64 %spacing) {
; CHECK-LABEL: pre64idxpos256:
; CHECK: ; %bb.0:
-; CHECK-NEXT: add x8, x0, #256
-; CHECK-NEXT: str x1, [x0, #256]
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: add x0, x0, #256
+; CHECK-NEXT: str x1, [x8, #256]
; CHECK-NEXT: ret
%incdec.ptr = getelementptr inbounds i64, i64* %ptr, i64 32
store i64 %spacing, i64* %incdec.ptr, align 4
@@ -270,9 +270,9 @@
define i32* @pre32idxpos256(i32* %ptr, i32 %spacing) {
; CHECK-LABEL: pre32idxpos256:
; CHECK: ; %bb.0:
-; CHECK-NEXT: add x8, x0, #256
-; CHECK-NEXT: str w1, [x0, #256]
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: add x0, x0, #256
+; CHECK-NEXT: str w1, [x8, #256]
; CHECK-NEXT: ret
%incdec.ptr = getelementptr inbounds i32, i32* %ptr, i64 64
store i32 %spacing, i32* %incdec.ptr, align 4
@@ -302,9 +302,9 @@
define i16* @pre16idxpos256(i16* %ptr, i16 %spacing) {
; CHECK-LABEL: pre16idxpos256:
; CHECK: ; %bb.0:
-; CHECK-NEXT: add x8, x0, #256
-; CHECK-NEXT: strh w1, [x0, #256]
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: add x0, x0, #256
+; CHECK-NEXT: strh w1, [x8, #256]
; CHECK-NEXT: ret
%incdec.ptr = getelementptr inbounds i16, i16* %ptr, i64 128
store i16 %spacing, i16* %incdec.ptr, align 4
@@ -334,9 +334,9 @@
define i8* @pre8idxpos256(i8* %ptr, i8 %spacing) {
; CHECK-LABEL: pre8idxpos256:
; CHECK: ; %bb.0:
-; CHECK-NEXT: add x8, x0, #256
-; CHECK-NEXT: strb w1, [x0, #256]
-; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: add x0, x0, #256
+; CHECK-NEXT: strb w1, [x8, #256]
; CHECK-NEXT: ret
%incdec.ptr = getelementptr inbounds i8, i8* %ptr, i64 256
store i8 %spacing, i8* %incdec.ptr, align 4
@@ -578,9 +578,9 @@
;
; CHECK32-LABEL: postidx_clobber:
; CHECK32: ; %bb.0:
-; CHECK32-NEXT: add w8, w0, #8
-; CHECK32-NEXT: str w0, [x0]
-; CHECK32-NEXT: mov x0, x8
+; CHECK32-NEXT: mov x8, x0
+; CHECK32-NEXT: add w0, w8, #8
+; CHECK32-NEXT: str w8, [x8]
; CHECK32-NEXT: ret
; ret
%paddr = bitcast i64* %addr to i64**
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -3909,8 +3909,8 @@
define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C) nounwind {
; CHECK-LABEL: test_v8i16_post_reg_ld2lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #1
+; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ld2.h { v0, v1 }[0], [x0], x8
; CHECK-NEXT: str x0, [x1]
@@ -3941,8 +3941,8 @@
define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: test_v4i16_post_reg_ld2lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #1
+; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ld2.h { v0, v1 }[0], [x0], x8
; CHECK-NEXT: str x0, [x1]
@@ -3973,8 +3973,8 @@
define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: test_v4i32_post_reg_ld2lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ld2.s { v0, v1 }[0], [x0], x8
; CHECK-NEXT: str x0, [x1]
@@ -4005,8 +4005,8 @@
define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: test_v2i32_post_reg_ld2lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ld2.s { v0, v1 }[0], [x0], x8
; CHECK-NEXT: str x0, [x1]
@@ -4037,8 +4037,8 @@
define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: test_v2i64_post_reg_ld2lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ld2.d { v0, v1 }[0], [x0], x8
; CHECK-NEXT: str x0, [x1]
@@ -4069,8 +4069,8 @@
define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C) nounwind {
; CHECK-LABEL: test_v1i64_post_reg_ld2lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ld2.d { v0, v1 }[0], [x0], x8
; CHECK-NEXT: str x0, [x1]
@@ -4101,8 +4101,8 @@
define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C) nounwind {
; CHECK-LABEL: test_v4f32_post_reg_ld2lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ld2.s { v0, v1 }[0], [x0], x8
; CHECK-NEXT: str x0, [x1]
@@ -4133,8 +4133,8 @@
define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C) nounwind {
; CHECK-LABEL: test_v2f32_post_reg_ld2lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ld2.s { v0, v1 }[0], [x0], x8
; CHECK-NEXT: str x0, [x1]
@@ -4165,8 +4165,8 @@
define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C) nounwind {
; CHECK-LABEL: test_v2f64_post_reg_ld2lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ld2.d { v0, v1 }[0], [x0], x8
; CHECK-NEXT: str x0, [x1]
@@ -4197,8 +4197,8 @@
define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C) nounwind {
; CHECK-LABEL: test_v1f64_post_reg_ld2lane:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ld2.d { v0, v1 }[0], [x0], x8
; CHECK-NEXT: str x0, [x1]
@@ -5117,8 +5117,8 @@
define i16* @test_v8i16_post_reg_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind {
; CHECK-LABEL: test_v8i16_post_reg_st2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #1
+; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: st2.8h { v0, v1 }, [x0], x8
; CHECK-NEXT: ret
@@ -5145,8 +5145,8 @@
define i16* @test_v4i16_post_reg_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind {
; CHECK-LABEL: test_v4i16_post_reg_st2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
; CHECK-NEXT: lsl x8, x2, #1
+; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1
; CHECK-NEXT: st2.4h { v0, v1 }, [x0], x8
; CHECK-NEXT: ret
@@ -5173,8 +5173,8 @@
define i32* @test_v4i32_post_reg_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind {
; CHECK-LABEL: test_v4i32_post_reg_st2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: st2.4s { v0, v1 }, [x0], x8
; CHECK-NEXT: ret
@@ -5201,8 +5201,8 @@
define i32* @test_v2i32_post_reg_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind {
; CHECK-LABEL: test_v2i32_post_reg_st2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1
; CHECK-NEXT: st2.2s { v0, v1 }, [x0], x8
; CHECK-NEXT: ret
@@ -5229,8 +5229,8 @@
define i64* @test_v2i64_post_reg_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind {
; CHECK-LABEL: test_v2i64_post_reg_st2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: st2.2d { v0, v1 }, [x0], x8
; CHECK-NEXT: ret
@@ -5257,8 +5257,8 @@
define i64* @test_v1i64_post_reg_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind {
; CHECK-LABEL: test_v1i64_post_reg_st2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1
; CHECK-NEXT: st1.1d { v0, v1 }, [x0], x8
; CHECK-NEXT: ret
@@ -5285,8 +5285,8 @@
define float* @test_v4f32_post_reg_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind {
; CHECK-LABEL: test_v4f32_post_reg_st2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: st2.4s { v0, v1 }, [x0], x8
; CHECK-NEXT: ret
@@ -5313,8 +5313,8 @@
define float* @test_v2f32_post_reg_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind {
; CHECK-LABEL: test_v2f32_post_reg_st2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1
; CHECK-NEXT: st2.2s { v0, v1 }, [x0], x8
; CHECK-NEXT: ret
@@ -5341,8 +5341,8 @@
define double* @test_v2f64_post_reg_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind {
; CHECK-LABEL: test_v2f64_post_reg_st2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT: st2.2d { v0, v1 }, [x0], x8
; CHECK-NEXT: ret
@@ -5369,8 +5369,8 @@
define double* @test_v1f64_post_reg_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind {
; CHECK-LABEL: test_v1f64_post_reg_st2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1
; CHECK-NEXT: st1.1d { v0, v1 }, [x0], x8
; CHECK-NEXT: ret
@@ -5456,8 +5456,8 @@
define i16* @test_v8i16_post_reg_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
; CHECK-LABEL: test_v8i16_post_reg_st3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: lsl x8, x2, #1
+; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: st3.8h { v0, v1, v2 }, [x0], x8
@@ -5486,8 +5486,8 @@
define i16* @test_v4i16_post_reg_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
; CHECK-LABEL: test_v4i16_post_reg_st3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: lsl x8, x2, #1
+; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: st3.4h { v0, v1, v2 }, [x0], x8
@@ -5516,8 +5516,8 @@
define i32* @test_v4i32_post_reg_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
; CHECK-LABEL: test_v4i32_post_reg_st3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: st3.4s { v0, v1, v2 }, [x0], x8
@@ -5546,8 +5546,8 @@
define i32* @test_v2i32_post_reg_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
; CHECK-LABEL: test_v2i32_post_reg_st3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: st3.2s { v0, v1, v2 }, [x0], x8
@@ -5576,8 +5576,8 @@
define i64* @test_v2i64_post_reg_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
; CHECK-LABEL: test_v2i64_post_reg_st3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: st3.2d { v0, v1, v2 }, [x0], x8
@@ -5606,8 +5606,8 @@
define i64* @test_v1i64_post_reg_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
; CHECK-LABEL: test_v1i64_post_reg_st3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: st1.1d { v0, v1, v2 }, [x0], x8
@@ -5636,8 +5636,8 @@
define float* @test_v4f32_post_reg_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
; CHECK-LABEL: test_v4f32_post_reg_st3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: st3.4s { v0, v1, v2 }, [x0], x8
@@ -5666,8 +5666,8 @@
define float* @test_v2f32_post_reg_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
; CHECK-LABEL: test_v2f32_post_reg_st3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: lsl x8, x2, #2
+; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: st3.2s { v0, v1, v2 }, [x0], x8
@@ -5696,8 +5696,8 @@
define double* @test_v2f64_post_reg_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
; CHECK-LABEL: test_v2f64_post_reg_st3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
; CHECK-NEXT: st3.2d { v0, v1, v2 }, [x0], x8
@@ -5726,8 +5726,8 @@
define double* @test_v1f64_post_reg_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
; CHECK-LABEL: test_v1f64_post_reg_st3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: lsl x8, x2, #3
+; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
; CHECK-NEXT: st1.1d { v0, v1, v2 }, [x0], x8
@@ -6191,8
+6191,8 @@ define i16* @test_v8i16_post_reg_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v8i16_post_reg_st1x2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #1 +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st1.8h { v0, v1 }, [x0], x8 ; CHECK-NEXT: ret @@ -6219,8 +6219,8 @@ define i16* @test_v4i16_post_reg_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v4i16_post_reg_st1x2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: lsl x8, x2, #1 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: st1.4h { v0, v1 }, [x0], x8 ; CHECK-NEXT: ret @@ -6247,8 +6247,8 @@ define i32* @test_v4i32_post_reg_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v4i32_post_reg_st1x2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st1.4s { v0, v1 }, [x0], x8 ; CHECK-NEXT: ret @@ -6275,8 +6275,8 @@ define i32* @test_v2i32_post_reg_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v2i32_post_reg_st1x2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: st1.2s { v0, v1 }, [x0], x8 ; CHECK-NEXT: ret @@ -6303,8 +6303,8 @@ define i64* @test_v2i64_post_reg_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v2i64_post_reg_st1x2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st1.2d { v0, v1 }, [x0], x8 ; CHECK-NEXT: ret @@ -6331,8 +6331,8 @@ define i64* @test_v1i64_post_reg_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v1i64_post_reg_st1x2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: st1.1d { v0, v1 }, [x0], x8 ; CHECK-NEXT: ret @@ -6359,8 +6359,8 @@ define float* @test_v4f32_post_reg_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v4f32_post_reg_st1x2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st1.4s { v0, v1 }, [x0], x8 ; CHECK-NEXT: ret @@ -6387,8 +6387,8 @@ define float* @test_v2f32_post_reg_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v2f32_post_reg_st1x2: ; CHECK: ; 
%bb.0: -; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: st1.2s { v0, v1 }, [x0], x8 ; CHECK-NEXT: ret @@ -6415,8 +6415,8 @@ define double* @test_v2f64_post_reg_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v2f64_post_reg_st1x2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st1.2d { v0, v1 }, [x0], x8 ; CHECK-NEXT: ret @@ -6443,8 +6443,8 @@ define double* @test_v1f64_post_reg_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v1f64_post_reg_st1x2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1 ; CHECK-NEXT: st1.1d { v0, v1 }, [x0], x8 ; CHECK-NEXT: ret @@ -6530,8 +6530,8 @@ define i16* @test_v8i16_post_reg_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v8i16_post_reg_st1x3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #1 +; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st1.8h { v0, v1, v2 }, [x0], x8 @@ -6560,8 +6560,8 @@ define i16* @test_v4i16_post_reg_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v4i16_post_reg_st1x3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: lsl x8, x2, #1 +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: st1.4h { v0, v1, v2 }, [x0], x8 @@ -6590,8 +6590,8 @@ define i32* @test_v4i32_post_reg_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v4i32_post_reg_st1x3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st1.4s { v0, v1, v2 }, [x0], x8 @@ -6620,8 +6620,8 @@ define i32* @test_v2i32_post_reg_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v2i32_post_reg_st1x3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: st1.2s { v0, v1, v2 
}, [x0], x8 @@ -6650,8 +6650,8 @@ define i64* @test_v2i64_post_reg_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v2i64_post_reg_st1x3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st1.2d { v0, v1, v2 }, [x0], x8 @@ -6680,8 +6680,8 @@ define i64* @test_v1i64_post_reg_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v1i64_post_reg_st1x3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: st1.1d { v0, v1, v2 }, [x0], x8 @@ -6710,8 +6710,8 @@ define float* @test_v4f32_post_reg_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v4f32_post_reg_st1x3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st1.4s { v0, v1, v2 }, [x0], x8 @@ -6740,8 +6740,8 @@ define float* @test_v2f32_post_reg_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v2f32_post_reg_st1x3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: st1.2s { v0, v1, v2 }, [x0], x8 @@ -6770,8 +6770,8 @@ define double* @test_v2f64_post_reg_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v2f64_post_reg_st1x3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st1.2d { v0, v1, v2 }, [x0], x8 @@ -6800,8 +6800,8 @@ define double* @test_v1f64_post_reg_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v1f64_post_reg_st1x3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2 ; CHECK-NEXT: st1.1d { v0, v1, v2 }, [x0], x8 @@ -7264,8 +7264,8 @@ define i16* 
@test_v8i16_post_reg_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v8i16_post_reg_st2lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #1 +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st2.h { v0, v1 }[0], [x0], x8 ; CHECK-NEXT: ret @@ -7292,8 +7292,8 @@ define i16* @test_v4i16_post_reg_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v4i16_post_reg_st2lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #1 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st2.h { v0, v1 }[0], [x0], x8 ; CHECK-NEXT: ret @@ -7320,8 +7320,8 @@ define i32* @test_v4i32_post_reg_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v4i32_post_reg_st2lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st2.s { v0, v1 }[0], [x0], x8 ; CHECK-NEXT: ret @@ -7348,8 +7348,8 @@ define i32* @test_v2i32_post_reg_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v2i32_post_reg_st2lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st2.s { v0, v1 }[0], [x0], x8 ; CHECK-NEXT: ret @@ -7376,8 +7376,8 @@ define i64* @test_v2i64_post_reg_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v2i64_post_reg_st2lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st2.d { v0, v1 }[0], [x0], x8 ; CHECK-NEXT: ret @@ -7404,8 +7404,8 @@ define i64* @test_v1i64_post_reg_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v1i64_post_reg_st2lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st2.d { v0, v1 }[0], [x0], x8 ; CHECK-NEXT: ret @@ -7432,8 +7432,8 @@ define float* @test_v4f32_post_reg_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v4f32_post_reg_st2lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st2.s { v0, v1 }[0], [x0], x8 ; CHECK-NEXT: ret @@ -7460,8 +7460,8 @@ define float* @test_v2f32_post_reg_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind { ; CHECK-LABEL: 
test_v2f32_post_reg_st2lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st2.s { v0, v1 }[0], [x0], x8 ; CHECK-NEXT: ret @@ -7488,8 +7488,8 @@ define double* @test_v2f64_post_reg_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v2f64_post_reg_st2lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st2.d { v0, v1 }[0], [x0], x8 ; CHECK-NEXT: ret @@ -7516,8 +7516,8 @@ define double* @test_v1f64_post_reg_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind { ; CHECK-LABEL: test_v1f64_post_reg_st2lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: st2.d { v0, v1 }[0], [x0], x8 ; CHECK-NEXT: ret @@ -7603,8 +7603,8 @@ define i16* @test_v8i16_post_reg_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v8i16_post_reg_st3lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #1 +; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st3.h { v0, v1, v2 }[0], [x0], x8 @@ -7633,8 +7633,8 @@ define i16* @test_v4i16_post_reg_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v4i16_post_reg_st3lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #1 +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st3.h { v0, v1, v2 }[0], [x0], x8 @@ -7663,8 +7663,8 @@ define i32* @test_v4i32_post_reg_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v4i32_post_reg_st3lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st3.s { v0, v1, v2 }[0], [x0], x8 @@ -7693,8 +7693,8 @@ define i32* @test_v2i32_post_reg_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v2i32_post_reg_st3lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $d0 
killed $d0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st3.s { v0, v1, v2 }[0], [x0], x8 @@ -7723,8 +7723,8 @@ define i64* @test_v2i64_post_reg_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v2i64_post_reg_st3lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st3.d { v0, v1, v2 }[0], [x0], x8 @@ -7753,8 +7753,8 @@ define i64* @test_v1i64_post_reg_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v1i64_post_reg_st3lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st3.d { v0, v1, v2 }[0], [x0], x8 @@ -7783,8 +7783,8 @@ define float* @test_v4f32_post_reg_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v4f32_post_reg_st3lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st3.s { v0, v1, v2 }[0], [x0], x8 @@ -7813,8 +7813,8 @@ define float* @test_v2f32_post_reg_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v2f32_post_reg_st3lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st3.s { v0, v1, v2 }[0], [x0], x8 @@ -7843,8 +7843,8 @@ define double* @test_v2f64_post_reg_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v2f64_post_reg_st3lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: st3.d { v0, v1, v2 }[0], [x0], x8 @@ -7873,8 +7873,8 @@ define double* @test_v1f64_post_reg_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind { ; CHECK-LABEL: test_v1f64_post_reg_st3lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: lsl x8, x2, #3 +; CHECK-NEXT: ; kill: def $d2 killed $d2 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $d1 killed $d1 killed $q0_q1_q2 def $q0_q1_q2 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0_q1_q2 def 
$q0_q1_q2 ; CHECK-NEXT: st3.d { v0, v1, v2 }[0], [x0], x8 @@ -8721,8 +8721,8 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane(i16* %bar, i16** %ptr, i64 %inc, <4 x i16> %A) { ; CHECK-LABEL: test_v4i16_post_reg_ld1lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: lsl x8, x2, #1 +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: ld1.h { v0 }[1], [x0], x8 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: str x0, [x1] @@ -8779,8 +8779,8 @@ define <2 x i32> @test_v2i32_post_reg_ld1lane(i32* %bar, i32** %ptr, i64 %inc, <2 x i32> %A) { ; CHECK-LABEL: test_v2i32_post_reg_ld1lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: ld1.s { v0 }[1], [x0], x8 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: str x0, [x1] @@ -8864,8 +8864,8 @@ define <2 x float> @test_v2f32_post_reg_ld1lane(float* %bar, float** %ptr, i64 %inc, <2 x float> %A) { ; CHECK-LABEL: test_v2f32_post_reg_ld1lane: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: lsl x8, x2, #2 +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: ld1.s { v0 }[1], [x0], x8 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: str x0, [x1] @@ -8912,8 +8912,8 @@ ; CHECK-NEXT: str q0, [x3] ; CHECK-NEXT: ldr q0, [x4] ; CHECK-NEXT: add x8, x0, x2, lsl #2 -; CHECK-NEXT: str x8, [x1] ; CHECK-NEXT: mov.s v0[1], v1[0] +; CHECK-NEXT: str x8, [x1] ; CHECK-NEXT: ret %tmp1 = load float, float* %bar store <4 x float> %vec, <4 x float>* %dep_ptr_1, align 16 @@ -8934,8 +8934,8 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane_forced_narrow(i16* %bar, i16** %ptr, i64 %inc, <4 x i16> %A, <2 x i32>* %d) { ; CHECK-LABEL: test_v4i16_post_reg_ld1lane_forced_narrow: ; CHECK: ; %bb.0: -; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: lsl x8, x2, #1 +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: ld1.h { v0 }[1], [x0], x8 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: str x0, [x1] @@ -8960,11 +8960,11 @@ define void @test_ld1lane_build(i32* %ptr0, i32* %ptr1, i32* %ptr2, i32* %ptr3, <2 x i32>* %out) { ; CHECK-LABEL: test_ld1lane_build: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ldr s1, [x2] -; CHECK-NEXT: ld1.s { v0 }[1], [x1] -; CHECK-NEXT: ld1.s { v1 }[1], [x3] -; CHECK-NEXT: sub.2s v0, v0, v1 +; CHECK-NEXT: ldr s0, [x2] +; CHECK-NEXT: ldr s1, [x0] +; CHECK-NEXT: ld1.s { v0 }[1], [x3] +; CHECK-NEXT: ld1.s { v1 }[1], [x1] +; CHECK-NEXT: sub.2s v0, v1, v0 ; CHECK-NEXT: str d0, [x4] ; CHECK-NEXT: ret %load0 = load i32, i32* %ptr0, align 4 @@ -9096,8 +9096,8 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: bfxil x8, x1, #0, #4 ; CHECK-NEXT: str q0, [sp] @@ -9114,9 +9114,9 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: and x8, x1, #0x7 +; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: bfi x9, x8, #1, #3 ; CHECK-NEXT: str q0, [sp] @@ -9144,10 +9144,10 @@ ; CHECK-LABEL: load_single_extract_variable_index_v3i32_small_align: ; CHECK: ; %bb.0: ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: cmp x8, #2 
-; CHECK-NEXT: mov w9, #2 -; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: cmp x9, #2 +; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: ldr w0, [x0, x8, lsl #2] ; CHECK-NEXT: ret %lv = load <3 x i32>, <3 x i32>* %A, align 2 @@ -9159,10 +9159,10 @@ ; CHECK-LABEL: load_single_extract_variable_index_v3i32_default_align: ; CHECK: ; %bb.0: ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: mov w9, #2 -; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: cmp x9, #2 +; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: ldr w0, [x0, x8, lsl #2] ; CHECK-NEXT: ret %lv = load <3 x i32>, <3 x i32>* %A diff --git a/llvm/test/CodeGen/AArch64/arm64-inline-asm.ll b/llvm/test/CodeGen/AArch64/arm64-inline-asm.ll --- a/llvm/test/CodeGen/AArch64/arm64-inline-asm.ll +++ b/llvm/test/CodeGen/AArch64/arm64-inline-asm.ll @@ -87,8 +87,8 @@ ; CHECK-LABEL: t7: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: str x0, [sp, #8] ; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: str x0, [sp, #8] ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: str w1, [x8] ; CHECK-NEXT: ; InlineAsm End @@ -368,13 +368,13 @@ define void @test_zero_reg(i32* %addr) { ; CHECK-LABEL: test_zero_reg: ; CHECK: ; %bb.0: +; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: USE(xzr) ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: USE(wzr) ; CHECK-NEXT: ; InlineAsm End -; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: USE(w8) ; CHECK-NEXT: ; InlineAsm End @@ -485,11 +485,11 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: sub sp, sp, #64 ; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: ldr s0, [x0, #32] ; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str s0, [sp, #32] +; CHECK-NEXT: ldr s0, [x0, #32] ; CHECK-NEXT: stp q2, q1, [sp] +; CHECK-NEXT: str s0, [sp, #32] ; CHECK-NEXT: ; InlineAsm Start ; CHECK-NEXT: ; InlineAsm End ; CHECK-NEXT: add sp, sp, #64 diff --git a/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll b/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll --- a/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll +++ b/llvm/test/CodeGen/AArch64/arm64-instruction-mix-remarks.ll @@ -27,19 +27,19 @@ define i32 @foo(i32* %ptr, i32 %x, i64 %y) !dbg !3 { ; CHECK-LABEL: foo: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: ldr w10, [x0] ; CHECK-NEXT: mov x8, x0 -; CHECK-NEXT: mov w10, #16959 -; CHECK-NEXT: movk w10, #15, lsl #16 -; CHECK-NEXT: add w0, w9, w1 -; CHECK-NEXT: add x9, x0, x2 -; CHECK-NEXT: cmp x9, x10 +; CHECK-NEXT: mov w9, #16959 +; CHECK-NEXT: movk w9, #15, lsl #16 +; CHECK-NEXT: add w0, w10, w1 +; CHECK-NEXT: add x10, x0, x2 +; CHECK-NEXT: cmp x10, x9 ; CHECK-NEXT: b.eq LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %else -; CHECK-NEXT: mul w10, w0, w1 -; CHECK-NEXT: mov w9, #10 -; CHECK-NEXT: mul w0, w10, w1 -; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: mul w9, w0, w1 +; CHECK-NEXT: mov w10, #10 +; CHECK-NEXT: mul w0, w9, w1 +; CHECK-NEXT: str w10, [x8] ; CHECK-NEXT: LBB0_2: ; %common.ret ; CHECK-NEXT: ; kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp.ll b/llvm/test/CodeGen/AArch64/arm64-ldp.ll --- a/llvm/test/CodeGen/AArch64/arm64-ldp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ldp.ll @@ -312,10 +312,10 @@ ; are used---just check that there isn't an ldp before the add ; CHECK-LABEL: 
pairUpBarelyOut: ; CHECK: // %bb.0: -; CHECK-NEXT: sub x9, x0, #264 -; CHECK-NEXT: ldur x8, [x0, #-256] -; CHECK-NEXT: ldr x9, [x9] -; CHECK-NEXT: add x0, x8, x9 +; CHECK-NEXT: sub x8, x0, #264 +; CHECK-NEXT: ldur x9, [x0, #-256] +; CHECK-NEXT: ldr x8, [x8] +; CHECK-NEXT: add x0, x9, x8 ; CHECK-NEXT: ret %p1 = getelementptr inbounds i64, i64* %a, i64 -32 %tmp1 = load i64, i64* %p1, align 2 @@ -330,10 +330,10 @@ ; are used---just check that there isn't an ldp before the add ; CHECK-LABEL: pairUpBarelyOutSext: ; CHECK: // %bb.0: -; CHECK-NEXT: sub x9, x0, #260 -; CHECK-NEXT: ldursw x8, [x0, #-256] -; CHECK-NEXT: ldrsw x9, [x9] -; CHECK-NEXT: add x0, x8, x9 +; CHECK-NEXT: sub x8, x0, #260 +; CHECK-NEXT: ldursw x9, [x0, #-256] +; CHECK-NEXT: ldrsw x8, [x8] +; CHECK-NEXT: add x0, x9, x8 ; CHECK-NEXT: ret %p1 = getelementptr inbounds i32, i32* %a, i64 -64 %tmp1 = load i32, i32* %p1, align 2 @@ -422,8 +422,10 @@ ; CHECK-LABEL: ldp_sext_int_post: ; CHECK: // %bb.0: ; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: add x8, x0, #8 ; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldpsw x19, x20, [x0], #8 +; CHECK-NEXT: ldpsw x19, x20, [x0] +; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: bl "use-ptr" ; CHECK-NEXT: add x0, x20, x19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll --- a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll +++ b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll @@ -472,8 +472,8 @@ ; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov x8, #-6148914691236517206 ; CHECK-NEXT: movi v0.16b, #170 +; CHECK-NEXT: mov x8, #-6148914691236517206 ; CHECK-NEXT: mov x0, sp ; CHECK-NEXT: str x8, [sp, #32] ; CHECK-NEXT: stp q0, q0, [sp] @@ -517,8 +517,8 @@ ; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 96 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov x8, #-6148914691236517206 ; CHECK-NEXT: movi v0.16b, #170 +; CHECK-NEXT: mov x8, #-6148914691236517206 ; CHECK-NEXT: mov x0, sp ; CHECK-NEXT: str x8, [sp, #64] ; CHECK-NEXT: stp q0, q0, [sp, #32] diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll @@ -117,9 +117,9 @@ define <4 x i32> @test_vaddl_a16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: test_vaddl_a16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff ; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h -; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vmovl.i.i = zext <4 x i16> %a to <4 x i32> @@ -132,9 +132,9 @@ define <2 x i64> @test_vaddl_a32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vaddl_a32: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vmovl.i.i = zext <2 x i32> %a to <2 x i64> @@ -247,9 +247,9 @@ define <4 x i32> @test_vaddl_high_a16(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_vaddl_high_a16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff 
; CHECK-NEXT: uaddl2 v0.4s, v0.8h, v1.8h -; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -264,9 +264,9 @@ define <2 x i64> @test_vaddl_high_a32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_vaddl_high_a32: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEXT: uaddl2 v0.2d, v0.4s, v1.4s -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> @@ -360,9 +360,9 @@ define <4 x i32> @test_vaddw_a16(<4 x i32> %a, <4 x i16> %b) { ; CHECK-LABEL: test_vaddw_a16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vmovl.i.i = zext <4 x i16> %b to <4 x i32> @@ -374,9 +374,9 @@ define <2 x i64> @test_vaddw_a32(<2 x i64> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vaddw_a32: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vmovl.i.i = zext <2 x i32> %b to <2 x i64> @@ -474,9 +474,9 @@ define <4 x i32> @test_vaddw_high_a16(<4 x i32> %a, <8 x i16> %b) { ; CHECK-LABEL: test_vaddw_high_a16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff ; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h -; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> @@ -489,9 +489,9 @@ define <2 x i64> @test_vaddw_high_a32(<2 x i64> %a, <4 x i32> %b) { ; CHECK-LABEL: test_vaddw_high_a32: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEXT: uaddw2 v0.2d, v0.2d, v1.4s -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> @@ -590,9 +590,9 @@ define <4 x i32> @test_vsubl_a16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: test_vsubl_a16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff ; CHECK-NEXT: usubl v0.4s, v0.4h, v1.4h -; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vmovl.i.i = zext <4 x i16> %a to <4 x i32> @@ -605,9 +605,9 @@ define <2 x i64> @test_vsubl_a32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vsubl_a32: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEXT: usubl v0.2d, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vmovl.i.i = zext <2 x i32> %a to <2 x i64> @@ -720,9 +720,9 @@ define <4 x i32> @test_vsubl_high_a16(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_vsubl_high_a16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, 
#0x00ffff0000ffff ; CHECK-NEXT: usubl2 v0.4s, v0.8h, v1.8h -; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -737,9 +737,9 @@ define <2 x i64> @test_vsubl_high_a32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_vsubl_high_a32: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEXT: usubl2 v0.2d, v0.4s, v1.4s -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> @@ -833,9 +833,9 @@ define <4 x i32> @test_vsubw_a16(<4 x i32> %a, <4 x i16> %b) { ; CHECK-LABEL: test_vsubw_a16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff ; CHECK-NEXT: usubw v0.4s, v0.4s, v1.4h -; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vmovl.i.i = zext <4 x i16> %b to <4 x i32> @@ -847,9 +847,9 @@ define <2 x i64> @test_vsubw_a32(<2 x i64> %a, <2 x i32> %b) { ; CHECK-LABEL: test_vsubw_a32: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEXT: usubw v0.2d, v0.2d, v1.2s -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %vmovl.i.i = zext <2 x i32> %b to <2 x i64> @@ -947,9 +947,9 @@ define <4 x i32> @test_vsubw_high_a16(<4 x i32> %a, <8 x i16> %b) { ; CHECK-LABEL: test_vsubw_high_a16: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff ; CHECK-NEXT: usubw2 v0.4s, v0.4s, v1.8h -; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> @@ -962,9 +962,9 @@ define <2 x i64> @test_vsubw_high_a32(<2 x i64> %a, <4 x i32> %b) { ; CHECK-LABEL: test_vsubw_high_a32: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEXT: usubw2 v0.2d, v0.2d, v1.4s -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret entry: %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll @@ -200,9 +200,9 @@ ; CHECK-LABEL: test_sabd_v2i32_const: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: movi d1, #0x00ffffffff0000 -; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: movi d0, #0x00ffffffff0000 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: sabd v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ret %1 = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32( <2 x i32> , diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1226,11 +1226,12 @@ define <4 x i16> @test_extracts_inserts_varidx_extract(<8 x i16> %x, i32 %idx) { ; CHECK-LABEL: 
test_extracts_inserts_varidx_extract: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: str q0, [sp, #-16]! ; CHECK-NEXT: and x8, x0, #0x7 ; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str q0, [sp] ; CHECK-NEXT: bfi x9, x8, #1, #3 ; CHECK-NEXT: ldr h1, [x9] ; CHECK-NEXT: mov v1.h[1], v0.h[1] @@ -1491,11 +1492,11 @@ define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) { ; CHECK-LABEL: test_concat_diff_v1i32_v1i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s1, w1 -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: sqabs s1, s1 -; CHECK-NEXT: sqabs s0, s0 -; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: fmov s0, w1 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqabs s2, s0 +; CHECK-NEXT: sqabs s0, s1 +; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: mov v0.s[1], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll @@ -73,13 +73,13 @@ define <2 x i64> @mul2xi64(<2 x i64> %A, <2 x i64> %B) { ; CHECK-LABEL: mul2xi64: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov x10, d1 -; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x10, d0 ; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: mov x9, v0.d[1] -; CHECK-NEXT: mul x10, x11, x10 -; CHECK-NEXT: mul x8, x9, x8 -; CHECK-NEXT: fmov d0, x10 +; CHECK-NEXT: mov x11, v0.d[1] +; CHECK-NEXT: mul x9, x10, x9 +; CHECK-NEXT: mul x8, x11, x8 +; CHECK-NEXT: fmov d0, x9 ; CHECK-NEXT: mov v0.d[1], x8 ; CHECK-NEXT: ret %tmp3 = mul <2 x i64> %A, %B; @@ -158,37 +158,37 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w10, v1.b[0] -; CHECK-NEXT: smov w11, v0.b[0] ; CHECK-NEXT: smov w8, v1.b[1] ; CHECK-NEXT: smov w9, v0.b[1] -; CHECK-NEXT: sdiv w10, w11, w10 -; CHECK-NEXT: smov w12, v1.b[2] -; CHECK-NEXT: smov w13, v0.b[2] +; CHECK-NEXT: smov w10, v0.b[0] +; CHECK-NEXT: smov w11, v0.b[2] +; CHECK-NEXT: smov w12, v0.b[3] +; CHECK-NEXT: smov w13, v0.b[4] ; CHECK-NEXT: sdiv w8, w9, w8 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: smov w14, v1.b[3] -; CHECK-NEXT: smov w15, v0.b[3] -; CHECK-NEXT: sdiv w12, w13, w12 +; CHECK-NEXT: smov w9, v1.b[0] +; CHECK-NEXT: sdiv w9, w10, w9 +; CHECK-NEXT: smov w10, v1.b[2] +; CHECK-NEXT: sdiv w10, w11, w10 +; CHECK-NEXT: smov w11, v1.b[3] +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: smov w9, v1.b[5] ; CHECK-NEXT: mov v2.b[1], w8 -; CHECK-NEXT: smov w9, v1.b[4] -; CHECK-NEXT: smov w11, v0.b[4] -; CHECK-NEXT: sdiv w14, w15, w14 -; CHECK-NEXT: mov v2.b[2], w12 -; CHECK-NEXT: smov w13, v1.b[5] -; CHECK-NEXT: smov w15, v0.b[5] -; CHECK-NEXT: sdiv w9, w11, w9 -; CHECK-NEXT: mov v2.b[3], w14 -; CHECK-NEXT: smov w11, v1.b[6] -; CHECK-NEXT: sdiv w13, w15, w13 -; CHECK-NEXT: smov w15, v0.b[6] -; CHECK-NEXT: mov v2.b[4], w9 -; CHECK-NEXT: sdiv w11, w15, w11 -; CHECK-NEXT: smov w8, v1.b[7] -; CHECK-NEXT: mov v2.b[5], w13 -; CHECK-NEXT: smov w9, v0.b[7] -; CHECK-NEXT: mov v2.b[6], w11 -; CHECK-NEXT: sdiv w8, w9, w8 +; CHECK-NEXT: sdiv w11, w12, w11 +; CHECK-NEXT: smov w12, v1.b[4] +; CHECK-NEXT: mov v2.b[2], w10 +; CHECK-NEXT: smov w10, v0.b[6] +; CHECK-NEXT: sdiv w12, w13, w12 +; CHECK-NEXT: smov w13, v0.b[5] +; CHECK-NEXT: mov v2.b[3], w11 +; CHECK-NEXT: smov w11, v0.b[7] +; CHECK-NEXT: sdiv w8, w13, w9 +; CHECK-NEXT: smov w9, v1.b[6] 
+; CHECK-NEXT: mov v2.b[4], w12 +; CHECK-NEXT: sdiv w9, w10, w9 +; CHECK-NEXT: smov w10, v1.b[7] +; CHECK-NEXT: mov v2.b[5], w8 +; CHECK-NEXT: sdiv w8, w11, w10 +; CHECK-NEXT: mov v2.b[6], w9 ; CHECK-NEXT: mov v2.b[7], w8 ; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret @@ -199,69 +199,69 @@ define <16 x i8> @sdiv16x8(<16 x i8> %A, <16 x i8> %B) { ; CHECK-LABEL: sdiv16x8: ; CHECK: // %bb.0: -; CHECK-NEXT: smov w10, v1.b[0] -; CHECK-NEXT: smov w11, v0.b[0] ; CHECK-NEXT: smov w8, v1.b[1] ; CHECK-NEXT: smov w9, v0.b[1] -; CHECK-NEXT: sdiv w10, w11, w10 -; CHECK-NEXT: smov w12, v1.b[2] -; CHECK-NEXT: smov w13, v0.b[2] +; CHECK-NEXT: smov w10, v0.b[0] +; CHECK-NEXT: smov w11, v0.b[2] +; CHECK-NEXT: smov w12, v0.b[3] +; CHECK-NEXT: smov w13, v0.b[4] +; CHECK-NEXT: smov w14, v0.b[5] +; CHECK-NEXT: smov w15, v0.b[6] ; CHECK-NEXT: sdiv w8, w9, w8 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: smov w14, v1.b[3] -; CHECK-NEXT: smov w15, v0.b[3] -; CHECK-NEXT: sdiv w12, w13, w12 +; CHECK-NEXT: smov w9, v1.b[0] +; CHECK-NEXT: smov w16, v0.b[7] +; CHECK-NEXT: smov w17, v0.b[8] +; CHECK-NEXT: sdiv w9, w10, w9 +; CHECK-NEXT: smov w10, v1.b[2] +; CHECK-NEXT: sdiv w10, w11, w10 +; CHECK-NEXT: smov w11, v1.b[3] +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: smov w9, v1.b[9] ; CHECK-NEXT: mov v2.b[1], w8 -; CHECK-NEXT: smov w16, v1.b[4] -; CHECK-NEXT: smov w17, v0.b[4] +; CHECK-NEXT: sdiv w11, w12, w11 +; CHECK-NEXT: smov w12, v1.b[4] +; CHECK-NEXT: mov v2.b[2], w10 +; CHECK-NEXT: smov w10, v0.b[10] +; CHECK-NEXT: sdiv w12, w13, w12 +; CHECK-NEXT: smov w13, v1.b[5] +; CHECK-NEXT: mov v2.b[3], w11 +; CHECK-NEXT: smov w11, v0.b[11] +; CHECK-NEXT: sdiv w13, w14, w13 +; CHECK-NEXT: smov w14, v1.b[6] +; CHECK-NEXT: mov v2.b[4], w12 +; CHECK-NEXT: smov w12, v0.b[12] ; CHECK-NEXT: sdiv w14, w15, w14 -; CHECK-NEXT: mov v2.b[2], w12 -; CHECK-NEXT: smov w18, v1.b[5] -; CHECK-NEXT: smov w0, v0.b[5] +; CHECK-NEXT: smov w15, v1.b[7] +; CHECK-NEXT: mov v2.b[5], w13 +; CHECK-NEXT: smov w13, v0.b[13] +; CHECK-NEXT: sdiv w15, w16, w15 +; CHECK-NEXT: smov w16, v1.b[8] +; CHECK-NEXT: mov v2.b[6], w14 ; CHECK-NEXT: sdiv w16, w17, w16 -; CHECK-NEXT: mov v2.b[3], w14 -; CHECK-NEXT: smov w1, v1.b[6] -; CHECK-NEXT: smov w2, v0.b[6] -; CHECK-NEXT: sdiv w18, w0, w18 -; CHECK-NEXT: mov v2.b[4], w16 -; CHECK-NEXT: smov w3, v1.b[7] -; CHECK-NEXT: smov w4, v0.b[7] -; CHECK-NEXT: sdiv w1, w2, w1 -; CHECK-NEXT: mov v2.b[5], w18 -; CHECK-NEXT: smov w9, v1.b[8] -; CHECK-NEXT: smov w11, v0.b[8] -; CHECK-NEXT: sdiv w3, w4, w3 -; CHECK-NEXT: mov v2.b[6], w1 -; CHECK-NEXT: smov w13, v1.b[9] -; CHECK-NEXT: smov w15, v0.b[9] -; CHECK-NEXT: sdiv w9, w11, w9 -; CHECK-NEXT: mov v2.b[7], w3 -; CHECK-NEXT: smov w17, v1.b[10] -; CHECK-NEXT: smov w0, v0.b[10] -; CHECK-NEXT: sdiv w13, w15, w13 -; CHECK-NEXT: mov v2.b[8], w9 -; CHECK-NEXT: smov w2, v1.b[11] -; CHECK-NEXT: smov w4, v0.b[11] -; CHECK-NEXT: sdiv w17, w0, w17 -; CHECK-NEXT: mov v2.b[9], w13 +; CHECK-NEXT: smov w17, v0.b[9] +; CHECK-NEXT: mov v2.b[7], w15 +; CHECK-NEXT: sdiv w8, w17, w9 +; CHECK-NEXT: smov w9, v1.b[10] +; CHECK-NEXT: mov v2.b[8], w16 +; CHECK-NEXT: sdiv w9, w10, w9 +; CHECK-NEXT: smov w10, v1.b[11] +; CHECK-NEXT: mov v2.b[9], w8 +; CHECK-NEXT: sdiv w10, w11, w10 ; CHECK-NEXT: smov w11, v1.b[12] -; CHECK-NEXT: smov w15, v0.b[12] -; CHECK-NEXT: sdiv w2, w4, w2 -; CHECK-NEXT: mov v2.b[10], w17 -; CHECK-NEXT: smov w0, v1.b[13] -; CHECK-NEXT: smov w4, v0.b[13] -; CHECK-NEXT: sdiv w11, w15, w11 -; CHECK-NEXT: mov v2.b[11], w2 -; CHECK-NEXT: smov w15, v1.b[14] -; CHECK-NEXT: sdiv w0, w4, w0 -; 
CHECK-NEXT: smov w4, v0.b[14] +; CHECK-NEXT: mov v2.b[10], w9 +; CHECK-NEXT: smov w9, v1.b[14] +; CHECK-NEXT: sdiv w11, w12, w11 +; CHECK-NEXT: smov w12, v1.b[13] +; CHECK-NEXT: mov v2.b[11], w10 +; CHECK-NEXT: smov w10, v1.b[15] +; CHECK-NEXT: sdiv w8, w13, w12 +; CHECK-NEXT: smov w12, v0.b[14] ; CHECK-NEXT: mov v2.b[12], w11 -; CHECK-NEXT: sdiv w15, w4, w15 -; CHECK-NEXT: smov w8, v1.b[15] -; CHECK-NEXT: mov v2.b[13], w0 -; CHECK-NEXT: smov w9, v0.b[15] -; CHECK-NEXT: mov v2.b[14], w15 -; CHECK-NEXT: sdiv w8, w9, w8 +; CHECK-NEXT: smov w11, v0.b[15] +; CHECK-NEXT: sdiv w9, w12, w9 +; CHECK-NEXT: mov v2.b[13], w8 +; CHECK-NEXT: sdiv w8, w11, w10 +; CHECK-NEXT: mov v2.b[14], w9 ; CHECK-NEXT: mov v2.b[15], w8 ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret @@ -288,23 +288,23 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w10, v1.h[0] -; CHECK-NEXT: smov w11, v0.h[0] ; CHECK-NEXT: smov w8, v1.h[1] ; CHECK-NEXT: smov w9, v0.h[1] -; CHECK-NEXT: sdiv w10, w11, w10 -; CHECK-NEXT: sdiv w8, w9, w8 -; CHECK-NEXT: smov w9, v1.h[2] +; CHECK-NEXT: smov w10, v0.h[0] ; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: sdiv w9, w11, w9 -; CHECK-NEXT: smov w10, v1.h[3] -; CHECK-NEXT: mov v2.h[1], w8 -; CHECK-NEXT: smov w8, v0.h[3] -; CHECK-NEXT: mov v2.h[2], w9 -; CHECK-NEXT: sdiv w8, w8, w10 -; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: sdiv w8, w9, w8 +; CHECK-NEXT: smov w9, v1.h[0] +; CHECK-NEXT: sdiv w9, w10, w9 +; CHECK-NEXT: smov w10, v1.h[2] +; CHECK-NEXT: sdiv w10, w11, w10 +; CHECK-NEXT: smov w11, v1.h[3] +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: sdiv w8, w12, w11 +; CHECK-NEXT: mov v0.h[2], w10 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp3 = sdiv <4 x i16> %A, %B; ret <4 x i16> %tmp3 @@ -313,37 +313,37 @@ define <8 x i16> @sdiv8x16(<8 x i16> %A, <8 x i16> %B) { ; CHECK-LABEL: sdiv8x16: ; CHECK: // %bb.0: -; CHECK-NEXT: smov w10, v1.h[0] -; CHECK-NEXT: smov w11, v0.h[0] ; CHECK-NEXT: smov w8, v1.h[1] ; CHECK-NEXT: smov w9, v0.h[1] -; CHECK-NEXT: sdiv w10, w11, w10 -; CHECK-NEXT: smov w12, v1.h[2] -; CHECK-NEXT: smov w13, v0.h[2] +; CHECK-NEXT: smov w10, v0.h[0] +; CHECK-NEXT: smov w11, v0.h[2] +; CHECK-NEXT: smov w12, v0.h[3] +; CHECK-NEXT: smov w13, v0.h[4] ; CHECK-NEXT: sdiv w8, w9, w8 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: smov w14, v1.h[3] -; CHECK-NEXT: smov w15, v0.h[3] -; CHECK-NEXT: sdiv w12, w13, w12 +; CHECK-NEXT: smov w9, v1.h[0] +; CHECK-NEXT: sdiv w9, w10, w9 +; CHECK-NEXT: smov w10, v1.h[2] +; CHECK-NEXT: sdiv w10, w11, w10 +; CHECK-NEXT: smov w11, v1.h[3] +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: smov w9, v1.h[5] ; CHECK-NEXT: mov v2.h[1], w8 -; CHECK-NEXT: smov w9, v1.h[4] -; CHECK-NEXT: smov w11, v0.h[4] -; CHECK-NEXT: sdiv w14, w15, w14 -; CHECK-NEXT: mov v2.h[2], w12 -; CHECK-NEXT: smov w13, v1.h[5] -; CHECK-NEXT: smov w15, v0.h[5] -; CHECK-NEXT: sdiv w9, w11, w9 -; CHECK-NEXT: mov v2.h[3], w14 -; CHECK-NEXT: smov w11, v1.h[6] -; CHECK-NEXT: sdiv w13, w15, w13 -; CHECK-NEXT: smov w15, v0.h[6] -; CHECK-NEXT: mov v2.h[4], w9 -; CHECK-NEXT: sdiv w11, w15, w11 -; CHECK-NEXT: smov w8, v1.h[7] -; CHECK-NEXT: mov v2.h[5], w13 -; CHECK-NEXT: smov w9, v0.h[7] -; CHECK-NEXT: mov v2.h[6], w11 -; CHECK-NEXT: sdiv w8, w9, w8 +; CHECK-NEXT: sdiv w11, w12, w11 +; CHECK-NEXT: smov w12, v1.h[4] +; CHECK-NEXT: mov 
v2.h[2], w10 +; CHECK-NEXT: smov w10, v0.h[6] +; CHECK-NEXT: sdiv w12, w13, w12 +; CHECK-NEXT: smov w13, v0.h[5] +; CHECK-NEXT: mov v2.h[3], w11 +; CHECK-NEXT: smov w11, v0.h[7] +; CHECK-NEXT: sdiv w8, w13, w9 +; CHECK-NEXT: smov w9, v1.h[6] +; CHECK-NEXT: mov v2.h[4], w12 +; CHECK-NEXT: sdiv w9, w10, w9 +; CHECK-NEXT: smov w10, v1.h[7] +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: sdiv w8, w11, w10 +; CHECK-NEXT: mov v2.h[6], w9 ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret @@ -370,14 +370,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: sdiv w10, w11, w10 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov w10, v0.s[1] ; CHECK-NEXT: sdiv w8, w9, w8 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: mov w9, v1.s[1] +; CHECK-NEXT: sdiv w9, w10, w9 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp3 = sdiv <2 x i32> %A, %B; @@ -387,21 +387,21 @@ define <4 x i32> @sdiv4x32(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: sdiv4x32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov w12, s1 -; CHECK-NEXT: fmov w15, s0 ; CHECK-NEXT: mov w8, v1.s[1] ; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: sdiv w12, w15, w12 -; CHECK-NEXT: mov w10, v1.s[2] -; CHECK-NEXT: mov w13, v0.s[2] -; CHECK-NEXT: mov w14, v0.s[3] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: mov w11, v0.s[2] +; CHECK-NEXT: mov w12, v0.s[3] ; CHECK-NEXT: sdiv w8, w9, w8 -; CHECK-NEXT: fmov s0, w12 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: sdiv w9, w10, w9 +; CHECK-NEXT: mov w10, v1.s[2] +; CHECK-NEXT: sdiv w10, w11, w10 ; CHECK-NEXT: mov w11, v1.s[3] -; CHECK-NEXT: sdiv w9, w13, w10 +; CHECK-NEXT: fmov s0, w9 ; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: mov v0.s[2], w9 -; CHECK-NEXT: sdiv w8, w14, w11 +; CHECK-NEXT: sdiv w8, w12, w11 +; CHECK-NEXT: mov v0.s[2], w10 ; CHECK-NEXT: mov v0.s[3], w8 ; CHECK-NEXT: ret %tmp3 = sdiv <4 x i32> %A, %B; @@ -425,14 +425,14 @@ define <2 x i64> @sdiv2x64(<2 x i64> %A, <2 x i64> %B) { ; CHECK-LABEL: sdiv2x64: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov x10, d1 -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: mov x9, v0.d[1] -; CHECK-NEXT: sdiv x10, x11, x10 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: mov x10, v0.d[1] ; CHECK-NEXT: sdiv x8, x9, x8 -; CHECK-NEXT: fmov d0, x10 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov x9, v1.d[1] +; CHECK-NEXT: sdiv x9, x10, x9 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: mov v0.d[1], x9 ; CHECK-NEXT: ret %tmp3 = sdiv <2 x i64> %A, %B; ret <2 x i64> %tmp3 @@ -457,37 +457,37 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w10, v1.b[0] -; CHECK-NEXT: umov w11, v0.b[0] ; CHECK-NEXT: umov w8, v1.b[1] ; CHECK-NEXT: umov w9, v0.b[1] -; CHECK-NEXT: udiv w10, w11, w10 -; CHECK-NEXT: umov w12, v1.b[2] -; CHECK-NEXT: umov w13, v0.b[2] +; CHECK-NEXT: umov w10, v0.b[0] +; CHECK-NEXT: umov w11, v0.b[2] +; CHECK-NEXT: umov w12, v0.b[3] +; CHECK-NEXT: umov w13, v0.b[4] ; CHECK-NEXT: udiv w8, w9, w8 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: umov w14, v1.b[3] -; CHECK-NEXT: umov w15, v0.b[3] -; CHECK-NEXT: udiv w12, w13, w12 +; CHECK-NEXT: umov w9, v1.b[0] +; CHECK-NEXT: udiv w9, w10, w9 +; 
CHECK-NEXT: umov w10, v1.b[2] +; CHECK-NEXT: udiv w10, w11, w10 +; CHECK-NEXT: umov w11, v1.b[3] +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: umov w9, v1.b[5] ; CHECK-NEXT: mov v2.b[1], w8 -; CHECK-NEXT: umov w9, v1.b[4] -; CHECK-NEXT: umov w11, v0.b[4] -; CHECK-NEXT: udiv w14, w15, w14 -; CHECK-NEXT: mov v2.b[2], w12 -; CHECK-NEXT: umov w13, v1.b[5] -; CHECK-NEXT: umov w15, v0.b[5] -; CHECK-NEXT: udiv w9, w11, w9 -; CHECK-NEXT: mov v2.b[3], w14 -; CHECK-NEXT: umov w11, v1.b[6] -; CHECK-NEXT: udiv w13, w15, w13 -; CHECK-NEXT: umov w15, v0.b[6] -; CHECK-NEXT: mov v2.b[4], w9 -; CHECK-NEXT: udiv w11, w15, w11 -; CHECK-NEXT: umov w8, v1.b[7] -; CHECK-NEXT: mov v2.b[5], w13 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: mov v2.b[6], w11 -; CHECK-NEXT: udiv w8, w9, w8 +; CHECK-NEXT: udiv w11, w12, w11 +; CHECK-NEXT: umov w12, v1.b[4] +; CHECK-NEXT: mov v2.b[2], w10 +; CHECK-NEXT: umov w10, v0.b[6] +; CHECK-NEXT: udiv w12, w13, w12 +; CHECK-NEXT: umov w13, v0.b[5] +; CHECK-NEXT: mov v2.b[3], w11 +; CHECK-NEXT: umov w11, v0.b[7] +; CHECK-NEXT: udiv w8, w13, w9 +; CHECK-NEXT: umov w9, v1.b[6] +; CHECK-NEXT: mov v2.b[4], w12 +; CHECK-NEXT: udiv w9, w10, w9 +; CHECK-NEXT: umov w10, v1.b[7] +; CHECK-NEXT: mov v2.b[5], w8 +; CHECK-NEXT: udiv w8, w11, w10 +; CHECK-NEXT: mov v2.b[6], w9 ; CHECK-NEXT: mov v2.b[7], w8 ; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret @@ -498,69 +498,69 @@ define <16 x i8> @udiv16x8(<16 x i8> %A, <16 x i8> %B) { ; CHECK-LABEL: udiv16x8: ; CHECK: // %bb.0: -; CHECK-NEXT: umov w10, v1.b[0] -; CHECK-NEXT: umov w11, v0.b[0] ; CHECK-NEXT: umov w8, v1.b[1] ; CHECK-NEXT: umov w9, v0.b[1] -; CHECK-NEXT: udiv w10, w11, w10 -; CHECK-NEXT: umov w12, v1.b[2] -; CHECK-NEXT: umov w13, v0.b[2] +; CHECK-NEXT: umov w10, v0.b[0] +; CHECK-NEXT: umov w11, v0.b[2] +; CHECK-NEXT: umov w12, v0.b[3] +; CHECK-NEXT: umov w13, v0.b[4] +; CHECK-NEXT: umov w14, v0.b[5] +; CHECK-NEXT: umov w15, v0.b[6] ; CHECK-NEXT: udiv w8, w9, w8 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: umov w14, v1.b[3] -; CHECK-NEXT: umov w15, v0.b[3] -; CHECK-NEXT: udiv w12, w13, w12 +; CHECK-NEXT: umov w9, v1.b[0] +; CHECK-NEXT: umov w16, v0.b[7] +; CHECK-NEXT: umov w17, v0.b[8] +; CHECK-NEXT: udiv w9, w10, w9 +; CHECK-NEXT: umov w10, v1.b[2] +; CHECK-NEXT: udiv w10, w11, w10 +; CHECK-NEXT: umov w11, v1.b[3] +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: umov w9, v1.b[9] ; CHECK-NEXT: mov v2.b[1], w8 -; CHECK-NEXT: umov w16, v1.b[4] -; CHECK-NEXT: umov w17, v0.b[4] +; CHECK-NEXT: udiv w11, w12, w11 +; CHECK-NEXT: umov w12, v1.b[4] +; CHECK-NEXT: mov v2.b[2], w10 +; CHECK-NEXT: umov w10, v0.b[10] +; CHECK-NEXT: udiv w12, w13, w12 +; CHECK-NEXT: umov w13, v1.b[5] +; CHECK-NEXT: mov v2.b[3], w11 +; CHECK-NEXT: umov w11, v0.b[11] +; CHECK-NEXT: udiv w13, w14, w13 +; CHECK-NEXT: umov w14, v1.b[6] +; CHECK-NEXT: mov v2.b[4], w12 +; CHECK-NEXT: umov w12, v0.b[12] ; CHECK-NEXT: udiv w14, w15, w14 -; CHECK-NEXT: mov v2.b[2], w12 -; CHECK-NEXT: umov w18, v1.b[5] -; CHECK-NEXT: umov w0, v0.b[5] +; CHECK-NEXT: umov w15, v1.b[7] +; CHECK-NEXT: mov v2.b[5], w13 +; CHECK-NEXT: umov w13, v0.b[13] +; CHECK-NEXT: udiv w15, w16, w15 +; CHECK-NEXT: umov w16, v1.b[8] +; CHECK-NEXT: mov v2.b[6], w14 ; CHECK-NEXT: udiv w16, w17, w16 -; CHECK-NEXT: mov v2.b[3], w14 -; CHECK-NEXT: umov w1, v1.b[6] -; CHECK-NEXT: umov w2, v0.b[6] -; CHECK-NEXT: udiv w18, w0, w18 -; CHECK-NEXT: mov v2.b[4], w16 -; CHECK-NEXT: umov w3, v1.b[7] -; CHECK-NEXT: umov w4, v0.b[7] -; CHECK-NEXT: udiv w1, w2, w1 -; CHECK-NEXT: mov v2.b[5], w18 -; CHECK-NEXT: umov w9, v1.b[8] -; CHECK-NEXT: 
umov w11, v0.b[8] -; CHECK-NEXT: udiv w3, w4, w3 -; CHECK-NEXT: mov v2.b[6], w1 -; CHECK-NEXT: umov w13, v1.b[9] -; CHECK-NEXT: umov w15, v0.b[9] -; CHECK-NEXT: udiv w9, w11, w9 -; CHECK-NEXT: mov v2.b[7], w3 -; CHECK-NEXT: umov w17, v1.b[10] -; CHECK-NEXT: umov w0, v0.b[10] -; CHECK-NEXT: udiv w13, w15, w13 -; CHECK-NEXT: mov v2.b[8], w9 -; CHECK-NEXT: umov w2, v1.b[11] -; CHECK-NEXT: umov w4, v0.b[11] -; CHECK-NEXT: udiv w17, w0, w17 -; CHECK-NEXT: mov v2.b[9], w13 +; CHECK-NEXT: umov w17, v0.b[9] +; CHECK-NEXT: mov v2.b[7], w15 +; CHECK-NEXT: udiv w8, w17, w9 +; CHECK-NEXT: umov w9, v1.b[10] +; CHECK-NEXT: mov v2.b[8], w16 +; CHECK-NEXT: udiv w9, w10, w9 +; CHECK-NEXT: umov w10, v1.b[11] +; CHECK-NEXT: mov v2.b[9], w8 +; CHECK-NEXT: udiv w10, w11, w10 ; CHECK-NEXT: umov w11, v1.b[12] -; CHECK-NEXT: umov w15, v0.b[12] -; CHECK-NEXT: udiv w2, w4, w2 -; CHECK-NEXT: mov v2.b[10], w17 -; CHECK-NEXT: umov w0, v1.b[13] -; CHECK-NEXT: umov w4, v0.b[13] -; CHECK-NEXT: udiv w11, w15, w11 -; CHECK-NEXT: mov v2.b[11], w2 -; CHECK-NEXT: umov w15, v1.b[14] -; CHECK-NEXT: udiv w0, w4, w0 -; CHECK-NEXT: umov w4, v0.b[14] +; CHECK-NEXT: mov v2.b[10], w9 +; CHECK-NEXT: umov w9, v1.b[14] +; CHECK-NEXT: udiv w11, w12, w11 +; CHECK-NEXT: umov w12, v1.b[13] +; CHECK-NEXT: mov v2.b[11], w10 +; CHECK-NEXT: umov w10, v1.b[15] +; CHECK-NEXT: udiv w8, w13, w12 +; CHECK-NEXT: umov w12, v0.b[14] ; CHECK-NEXT: mov v2.b[12], w11 -; CHECK-NEXT: udiv w15, w4, w15 -; CHECK-NEXT: umov w8, v1.b[15] -; CHECK-NEXT: mov v2.b[13], w0 -; CHECK-NEXT: umov w9, v0.b[15] -; CHECK-NEXT: mov v2.b[14], w15 -; CHECK-NEXT: udiv w8, w9, w8 +; CHECK-NEXT: umov w11, v0.b[15] +; CHECK-NEXT: udiv w9, w12, w9 +; CHECK-NEXT: mov v2.b[13], w8 +; CHECK-NEXT: udiv w8, w11, w10 +; CHECK-NEXT: mov v2.b[14], w9 ; CHECK-NEXT: mov v2.b[15], w8 ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret @@ -587,23 +587,23 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w10, v1.h[0] -; CHECK-NEXT: umov w11, v0.h[0] ; CHECK-NEXT: umov w8, v1.h[1] ; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: udiv w10, w11, w10 -; CHECK-NEXT: udiv w8, w9, w8 -; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: umov w10, v0.h[0] ; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: udiv w9, w11, w9 -; CHECK-NEXT: umov w10, v1.h[3] -; CHECK-NEXT: mov v2.h[1], w8 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: mov v2.h[2], w9 -; CHECK-NEXT: udiv w8, w8, w10 -; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: umov w12, v0.h[3] +; CHECK-NEXT: udiv w8, w9, w8 +; CHECK-NEXT: umov w9, v1.h[0] +; CHECK-NEXT: udiv w9, w10, w9 +; CHECK-NEXT: umov w10, v1.h[2] +; CHECK-NEXT: udiv w10, w11, w10 +; CHECK-NEXT: umov w11, v1.h[3] +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: udiv w8, w12, w11 +; CHECK-NEXT: mov v0.h[2], w10 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp3 = udiv <4 x i16> %A, %B; ret <4 x i16> %tmp3 @@ -612,37 +612,37 @@ define <8 x i16> @udiv8x16(<8 x i16> %A, <8 x i16> %B) { ; CHECK-LABEL: udiv8x16: ; CHECK: // %bb.0: -; CHECK-NEXT: umov w10, v1.h[0] -; CHECK-NEXT: umov w11, v0.h[0] ; CHECK-NEXT: umov w8, v1.h[1] ; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: udiv w10, w11, w10 -; CHECK-NEXT: umov w12, v1.h[2] -; CHECK-NEXT: umov w13, v0.h[2] +; CHECK-NEXT: umov w10, v0.h[0] +; CHECK-NEXT: umov w11, v0.h[2] +; CHECK-NEXT: umov w12, v0.h[3] +; CHECK-NEXT: umov w13, 
v0.h[4] ; CHECK-NEXT: udiv w8, w9, w8 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: umov w14, v1.h[3] -; CHECK-NEXT: umov w15, v0.h[3] -; CHECK-NEXT: udiv w12, w13, w12 +; CHECK-NEXT: umov w9, v1.h[0] +; CHECK-NEXT: udiv w9, w10, w9 +; CHECK-NEXT: umov w10, v1.h[2] +; CHECK-NEXT: udiv w10, w11, w10 +; CHECK-NEXT: umov w11, v1.h[3] +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: umov w9, v1.h[5] ; CHECK-NEXT: mov v2.h[1], w8 -; CHECK-NEXT: umov w9, v1.h[4] -; CHECK-NEXT: umov w11, v0.h[4] -; CHECK-NEXT: udiv w14, w15, w14 -; CHECK-NEXT: mov v2.h[2], w12 -; CHECK-NEXT: umov w13, v1.h[5] -; CHECK-NEXT: umov w15, v0.h[5] -; CHECK-NEXT: udiv w9, w11, w9 -; CHECK-NEXT: mov v2.h[3], w14 -; CHECK-NEXT: umov w11, v1.h[6] -; CHECK-NEXT: udiv w13, w15, w13 -; CHECK-NEXT: umov w15, v0.h[6] -; CHECK-NEXT: mov v2.h[4], w9 -; CHECK-NEXT: udiv w11, w15, w11 -; CHECK-NEXT: umov w8, v1.h[7] -; CHECK-NEXT: mov v2.h[5], w13 -; CHECK-NEXT: umov w9, v0.h[7] -; CHECK-NEXT: mov v2.h[6], w11 -; CHECK-NEXT: udiv w8, w9, w8 +; CHECK-NEXT: udiv w11, w12, w11 +; CHECK-NEXT: umov w12, v1.h[4] +; CHECK-NEXT: mov v2.h[2], w10 +; CHECK-NEXT: umov w10, v0.h[6] +; CHECK-NEXT: udiv w12, w13, w12 +; CHECK-NEXT: umov w13, v0.h[5] +; CHECK-NEXT: mov v2.h[3], w11 +; CHECK-NEXT: umov w11, v0.h[7] +; CHECK-NEXT: udiv w8, w13, w9 +; CHECK-NEXT: umov w9, v1.h[6] +; CHECK-NEXT: mov v2.h[4], w12 +; CHECK-NEXT: udiv w9, w10, w9 +; CHECK-NEXT: umov w10, v1.h[7] +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: udiv w8, w11, w10 +; CHECK-NEXT: mov v2.h[6], w9 ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret @@ -669,14 +669,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: udiv w10, w11, w10 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov w10, v0.s[1] ; CHECK-NEXT: udiv w8, w9, w8 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: mov w9, v1.s[1] +; CHECK-NEXT: udiv w9, w10, w9 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp3 = udiv <2 x i32> %A, %B; @@ -686,21 +686,21 @@ define <4 x i32> @udiv4x32(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: udiv4x32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov w12, s1 -; CHECK-NEXT: fmov w15, s0 ; CHECK-NEXT: mov w8, v1.s[1] ; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: udiv w12, w15, w12 -; CHECK-NEXT: mov w10, v1.s[2] -; CHECK-NEXT: mov w13, v0.s[2] -; CHECK-NEXT: mov w14, v0.s[3] +; CHECK-NEXT: fmov w10, s0 +; CHECK-NEXT: mov w11, v0.s[2] +; CHECK-NEXT: mov w12, v0.s[3] ; CHECK-NEXT: udiv w8, w9, w8 -; CHECK-NEXT: fmov s0, w12 +; CHECK-NEXT: fmov w9, s1 +; CHECK-NEXT: udiv w9, w10, w9 +; CHECK-NEXT: mov w10, v1.s[2] +; CHECK-NEXT: udiv w10, w11, w10 ; CHECK-NEXT: mov w11, v1.s[3] -; CHECK-NEXT: udiv w9, w13, w10 +; CHECK-NEXT: fmov s0, w9 ; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: mov v0.s[2], w9 -; CHECK-NEXT: udiv w8, w14, w11 +; CHECK-NEXT: udiv w8, w12, w11 +; CHECK-NEXT: mov v0.s[2], w10 ; CHECK-NEXT: mov v0.s[3], w8 ; CHECK-NEXT: ret %tmp3 = udiv <4 x i32> %A, %B; @@ -724,14 +724,14 @@ define <2 x i64> @udiv2x64(<2 x i64> %A, <2 x i64> %B) { ; CHECK-LABEL: udiv2x64: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov x10, d1 -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: mov x9, v0.d[1] -; CHECK-NEXT: udiv x10, x11, x10 +; 
CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: mov x10, v0.d[1] ; CHECK-NEXT: udiv x8, x9, x8 -; CHECK-NEXT: fmov d0, x10 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov x9, v1.d[1] +; CHECK-NEXT: udiv x9, x10, x9 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: mov v0.d[1], x9 ; CHECK-NEXT: ret %tmp3 = udiv <2 x i64> %A, %B; ret <2 x i64> %tmp3 @@ -757,45 +757,45 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w11, v1.b[0] +; CHECK-NEXT: smov w12, v0.b[0] ; CHECK-NEXT: smov w8, v1.b[1] ; CHECK-NEXT: smov w9, v0.b[1] -; CHECK-NEXT: smov w10, v1.b[0] -; CHECK-NEXT: smov w11, v0.b[0] -; CHECK-NEXT: sdiv w16, w9, w8 -; CHECK-NEXT: smov w12, v1.b[2] -; CHECK-NEXT: smov w13, v0.b[2] -; CHECK-NEXT: msub w8, w16, w8, w9 -; CHECK-NEXT: sdiv w16, w11, w10 -; CHECK-NEXT: smov w14, v1.b[3] -; CHECK-NEXT: smov w15, v0.b[3] -; CHECK-NEXT: msub w10, w16, w10, w11 -; CHECK-NEXT: sdiv w16, w13, w12 -; CHECK-NEXT: smov w9, v1.b[4] -; CHECK-NEXT: smov w11, v0.b[4] -; CHECK-NEXT: msub w12, w16, w12, w13 +; CHECK-NEXT: smov w14, v1.b[2] +; CHECK-NEXT: smov w15, v0.b[2] +; CHECK-NEXT: smov w17, v1.b[3] +; CHECK-NEXT: smov w18, v0.b[3] +; CHECK-NEXT: sdiv w13, w12, w11 +; CHECK-NEXT: smov w1, v1.b[4] +; CHECK-NEXT: smov w2, v0.b[4] +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: smov w12, v1.b[5] +; CHECK-NEXT: sdiv w10, w9, w8 +; CHECK-NEXT: smov w13, v0.b[5] +; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: smov w11, v0.b[6] +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: smov w10, v1.b[6] ; CHECK-NEXT: sdiv w16, w15, w14 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: smov w13, v1.b[5] -; CHECK-NEXT: msub w14, w16, w14, w15 -; CHECK-NEXT: smov w15, v0.b[5] -; CHECK-NEXT: sdiv w16, w11, w9 ; CHECK-NEXT: mov v2.b[1], w8 -; CHECK-NEXT: msub w9, w16, w9, w11 -; CHECK-NEXT: sdiv w16, w15, w13 -; CHECK-NEXT: mov v2.b[2], w12 -; CHECK-NEXT: smov w11, v1.b[6] -; CHECK-NEXT: msub w13, w16, w13, w15 -; CHECK-NEXT: smov w15, v0.b[6] +; CHECK-NEXT: msub w8, w16, w14, w15 +; CHECK-NEXT: smov w15, v1.b[7] +; CHECK-NEXT: sdiv w0, w18, w17 +; CHECK-NEXT: smov w16, v0.b[7] +; CHECK-NEXT: mov v2.b[2], w8 +; CHECK-NEXT: msub w14, w0, w17, w18 +; CHECK-NEXT: sdiv w3, w2, w1 ; CHECK-NEXT: mov v2.b[3], w14 -; CHECK-NEXT: sdiv w16, w15, w11 -; CHECK-NEXT: smov w8, v1.b[7] -; CHECK-NEXT: mov v2.b[4], w9 -; CHECK-NEXT: smov w9, v0.b[7] -; CHECK-NEXT: msub w11, w16, w11, w15 -; CHECK-NEXT: mov v2.b[5], w13 -; CHECK-NEXT: sdiv w10, w9, w8 -; CHECK-NEXT: mov v2.b[6], w11 -; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: msub w14, w3, w1, w2 +; CHECK-NEXT: sdiv w9, w13, w12 +; CHECK-NEXT: mov v2.b[4], w14 +; CHECK-NEXT: msub w9, w9, w12, w13 +; CHECK-NEXT: sdiv w8, w11, w10 +; CHECK-NEXT: mov v2.b[5], w9 +; CHECK-NEXT: msub w8, w8, w10, w11 +; CHECK-NEXT: sdiv w12, w16, w15 +; CHECK-NEXT: mov v2.b[6], w8 +; CHECK-NEXT: msub w8, w12, w15, w16 ; CHECK-NEXT: mov v2.b[7], w8 ; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret @@ -806,87 +806,104 @@ define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) { ; CHECK-LABEL: srem16x8: ; CHECK: // %bb.0: +; CHECK-NEXT: stp x26, x25, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w25, -56 +; CHECK-NEXT: .cfi_offset w26, -64 +; CHECK-NEXT: smov w11, v1.b[0] +; CHECK-NEXT: smov w12, v0.b[0] ; CHECK-NEXT: smov w8, v1.b[1] ; CHECK-NEXT: smov w9, v0.b[1] -; CHECK-NEXT: smov w10, v1.b[0] -; CHECK-NEXT: smov w11, v0.b[0] -; CHECK-NEXT: sdiv w5, w9, w8 -; CHECK-NEXT: smov w12, v1.b[2] -; CHECK-NEXT: smov w13, v0.b[2] -; CHECK-NEXT: msub w8, w5, w8, w9 -; CHECK-NEXT: sdiv w5, w11, w10 -; CHECK-NEXT: smov w14, v1.b[3] -; CHECK-NEXT: smov w15, v0.b[3] -; CHECK-NEXT: msub w10, w5, w10, w11 -; CHECK-NEXT: sdiv w5, w13, w12 -; CHECK-NEXT: smov w16, v1.b[4] -; CHECK-NEXT: smov w17, v0.b[4] -; CHECK-NEXT: msub w12, w5, w12, w13 -; CHECK-NEXT: sdiv w5, w15, w14 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: smov w18, v1.b[5] -; CHECK-NEXT: smov w0, v0.b[5] -; CHECK-NEXT: msub w14, w5, w14, w15 -; CHECK-NEXT: sdiv w5, w17, w16 +; CHECK-NEXT: smov w14, v1.b[2] +; CHECK-NEXT: smov w15, v0.b[2] +; CHECK-NEXT: smov w17, v1.b[3] +; CHECK-NEXT: smov w18, v0.b[3] +; CHECK-NEXT: sdiv w13, w12, w11 +; CHECK-NEXT: smov w1, v1.b[4] +; CHECK-NEXT: smov w2, v0.b[4] +; CHECK-NEXT: smov w4, v1.b[5] +; CHECK-NEXT: smov w5, v0.b[5] +; CHECK-NEXT: smov w7, v1.b[6] +; CHECK-NEXT: smov w19, v0.b[6] +; CHECK-NEXT: smov w21, v1.b[7] +; CHECK-NEXT: smov w22, v0.b[7] +; CHECK-NEXT: smov w24, v1.b[8] +; CHECK-NEXT: smov w25, v0.b[8] +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: smov w12, v1.b[9] +; CHECK-NEXT: sdiv w10, w9, w8 +; CHECK-NEXT: smov w13, v0.b[9] +; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: smov w11, v0.b[10] +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: smov w10, v1.b[10] +; CHECK-NEXT: sdiv w16, w15, w14 ; CHECK-NEXT: mov v2.b[1], w8 -; CHECK-NEXT: smov w1, v1.b[6] -; CHECK-NEXT: smov w2, v0.b[6] -; CHECK-NEXT: msub w16, w5, w16, w17 -; CHECK-NEXT: sdiv w5, w0, w18 -; CHECK-NEXT: mov v2.b[2], w12 -; CHECK-NEXT: smov w3, v1.b[7] -; CHECK-NEXT: smov w4, v0.b[7] -; CHECK-NEXT: msub w18, w5, w18, w0 -; CHECK-NEXT: sdiv w5, w2, w1 +; CHECK-NEXT: msub w8, w16, w14, w15 +; CHECK-NEXT: smov w15, v1.b[11] +; CHECK-NEXT: sdiv w0, w18, w17 +; CHECK-NEXT: smov w16, v0.b[11] +; CHECK-NEXT: mov v2.b[2], w8 +; CHECK-NEXT: msub w14, w0, w17, w18 +; CHECK-NEXT: smov w18, v1.b[12] +; CHECK-NEXT: sdiv w3, w2, w1 +; CHECK-NEXT: smov w0, v0.b[12] ; CHECK-NEXT: mov v2.b[3], w14 -; CHECK-NEXT: smov w9, v1.b[8] -; CHECK-NEXT: smov w11, v0.b[8] -; CHECK-NEXT: msub w1, w5, w1, w2 -; CHECK-NEXT: sdiv w5, w4, w3 -; CHECK-NEXT: mov v2.b[4], w16 -; CHECK-NEXT: smov w13, v1.b[9] -; CHECK-NEXT: smov w15, v0.b[9] -; CHECK-NEXT: msub w3, w5, w3, w4 -; CHECK-NEXT: sdiv w5, w11, w9 -; CHECK-NEXT: mov v2.b[5], w18 -; CHECK-NEXT: smov w17, v1.b[10] -; CHECK-NEXT: smov w0, v0.b[10] -; CHECK-NEXT: msub w9, w5, w9, w11 -; CHECK-NEXT: sdiv w5, w15, w13 -; CHECK-NEXT: mov v2.b[6], w1 -; CHECK-NEXT: smov w2, v1.b[11] -; CHECK-NEXT: smov w4, v0.b[11] -; CHECK-NEXT: msub w13, w5, w13, w15 -; CHECK-NEXT: sdiv w5, w0, w17 -; CHECK-NEXT: mov v2.b[7], w3 -; CHECK-NEXT: smov w11, v1.b[12] -; CHECK-NEXT: smov w15, v0.b[12] -; CHECK-NEXT: msub w17, w5, 
w17, w0 -; CHECK-NEXT: sdiv w5, w4, w2 -; CHECK-NEXT: mov v2.b[8], w9 -; CHECK-NEXT: smov w0, v1.b[13] -; CHECK-NEXT: msub w2, w5, w2, w4 -; CHECK-NEXT: smov w4, v0.b[13] -; CHECK-NEXT: sdiv w5, w15, w11 -; CHECK-NEXT: mov v2.b[9], w13 -; CHECK-NEXT: msub w11, w5, w11, w15 -; CHECK-NEXT: sdiv w5, w4, w0 -; CHECK-NEXT: mov v2.b[10], w17 -; CHECK-NEXT: smov w15, v1.b[14] -; CHECK-NEXT: msub w0, w5, w0, w4 -; CHECK-NEXT: smov w4, v0.b[14] -; CHECK-NEXT: mov v2.b[11], w2 -; CHECK-NEXT: sdiv w5, w4, w15 -; CHECK-NEXT: smov w8, v1.b[15] +; CHECK-NEXT: msub w14, w3, w1, w2 +; CHECK-NEXT: smov w2, v1.b[13] +; CHECK-NEXT: sdiv w6, w5, w4 +; CHECK-NEXT: smov w3, v0.b[13] +; CHECK-NEXT: mov v2.b[4], w14 +; CHECK-NEXT: msub w17, w6, w4, w5 +; CHECK-NEXT: sdiv w20, w19, w7 +; CHECK-NEXT: mov v2.b[5], w17 +; CHECK-NEXT: msub w17, w20, w7, w19 +; CHECK-NEXT: sdiv w23, w22, w21 +; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: mov v2.b[6], w17 +; CHECK-NEXT: msub w1, w23, w21, w22 +; CHECK-NEXT: sdiv w26, w25, w24 +; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: mov v2.b[7], w1 +; CHECK-NEXT: msub w1, w26, w24, w25 +; CHECK-NEXT: sdiv w9, w13, w12 +; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: mov v2.b[8], w1 +; CHECK-NEXT: msub w9, w9, w12, w13 +; CHECK-NEXT: smov w13, v1.b[15] +; CHECK-NEXT: sdiv w8, w11, w10 +; CHECK-NEXT: mov v2.b[9], w9 +; CHECK-NEXT: smov w9, v1.b[14] +; CHECK-NEXT: msub w8, w8, w10, w11 +; CHECK-NEXT: smov w10, v0.b[14] +; CHECK-NEXT: sdiv w14, w16, w15 +; CHECK-NEXT: mov v2.b[10], w8 +; CHECK-NEXT: msub w11, w14, w15, w16 +; CHECK-NEXT: smov w14, v0.b[15] +; CHECK-NEXT: sdiv w17, w0, w18 +; CHECK-NEXT: mov v2.b[11], w11 +; CHECK-NEXT: msub w11, w17, w18, w0 +; CHECK-NEXT: sdiv w12, w3, w2 ; CHECK-NEXT: mov v2.b[12], w11 -; CHECK-NEXT: smov w9, v0.b[15] -; CHECK-NEXT: msub w15, w5, w15, w4 -; CHECK-NEXT: mov v2.b[13], w0 -; CHECK-NEXT: sdiv w10, w9, w8 -; CHECK-NEXT: mov v2.b[14], w15 -; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: msub w12, w12, w2, w3 +; CHECK-NEXT: sdiv w8, w10, w9 +; CHECK-NEXT: mov v2.b[13], w12 +; CHECK-NEXT: msub w8, w8, w9, w10 +; CHECK-NEXT: sdiv w11, w14, w13 +; CHECK-NEXT: mov v2.b[14], w8 +; CHECK-NEXT: msub w8, w11, w13, w14 ; CHECK-NEXT: mov v2.b[15], w8 ; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ldp x26, x25, [sp], #64 // 16-byte Folded Reload ; CHECK-NEXT: ret %tmp3 = srem <16 x i8> %A, %B; ret <16 x i8> %tmp3 @@ -912,27 +929,27 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: smov w11, v1.h[0] +; CHECK-NEXT: smov w12, v0.h[0] ; CHECK-NEXT: smov w8, v1.h[1] ; CHECK-NEXT: smov w9, v0.h[1] -; CHECK-NEXT: smov w10, v1.h[0] -; CHECK-NEXT: smov w11, v0.h[0] -; CHECK-NEXT: sdiv w12, w9, w8 -; CHECK-NEXT: msub w8, w12, w8, w9 -; CHECK-NEXT: sdiv w12, w11, w10 -; CHECK-NEXT: smov w9, v1.h[2] -; CHECK-NEXT: msub w10, w12, w10, w11 -; CHECK-NEXT: smov w11, v0.h[2] -; CHECK-NEXT: sdiv w12, w11, w9 -; CHECK-NEXT: msub w9, w12, w9, w11 -; CHECK-NEXT: smov w11, v1.h[3] -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: smov w10, v0.h[3] -; CHECK-NEXT: mov v1.h[1], w8 -; CHECK-NEXT: sdiv w8, w10, w11 -; CHECK-NEXT: mov v1.h[2], w9 -; CHECK-NEXT: msub w8, w8, w11, w10 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: smov w14, v1.h[2] +; CHECK-NEXT: smov w15, v0.h[2] +; CHECK-NEXT: sdiv w13, w12, w11 +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: smov w12, 
v1.h[3] +; CHECK-NEXT: sdiv w10, w9, w8 +; CHECK-NEXT: smov w13, v0.h[3] +; CHECK-NEXT: fmov s0, w11 +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: sdiv w16, w15, w14 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: msub w10, w16, w14, w15 +; CHECK-NEXT: sdiv w9, w13, w12 +; CHECK-NEXT: mov v0.h[2], w10 +; CHECK-NEXT: msub w8, w9, w12, w13 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp3 = srem <4 x i16> %A, %B; ret <4 x i16> %tmp3 @@ -941,45 +958,45 @@ define <8 x i16> @srem8x16(<8 x i16> %A, <8 x i16> %B) { ; CHECK-LABEL: srem8x16: ; CHECK: // %bb.0: +; CHECK-NEXT: smov w11, v1.h[0] +; CHECK-NEXT: smov w12, v0.h[0] ; CHECK-NEXT: smov w8, v1.h[1] ; CHECK-NEXT: smov w9, v0.h[1] -; CHECK-NEXT: smov w10, v1.h[0] -; CHECK-NEXT: smov w11, v0.h[0] -; CHECK-NEXT: sdiv w16, w9, w8 -; CHECK-NEXT: smov w12, v1.h[2] -; CHECK-NEXT: smov w13, v0.h[2] -; CHECK-NEXT: msub w8, w16, w8, w9 -; CHECK-NEXT: sdiv w16, w11, w10 -; CHECK-NEXT: smov w14, v1.h[3] -; CHECK-NEXT: smov w15, v0.h[3] -; CHECK-NEXT: msub w10, w16, w10, w11 -; CHECK-NEXT: sdiv w16, w13, w12 -; CHECK-NEXT: smov w9, v1.h[4] -; CHECK-NEXT: smov w11, v0.h[4] -; CHECK-NEXT: msub w12, w16, w12, w13 +; CHECK-NEXT: smov w14, v1.h[2] +; CHECK-NEXT: smov w15, v0.h[2] +; CHECK-NEXT: smov w17, v1.h[3] +; CHECK-NEXT: smov w18, v0.h[3] +; CHECK-NEXT: sdiv w13, w12, w11 +; CHECK-NEXT: smov w1, v1.h[4] +; CHECK-NEXT: smov w2, v0.h[4] +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: smov w12, v1.h[5] +; CHECK-NEXT: sdiv w10, w9, w8 +; CHECK-NEXT: smov w13, v0.h[5] +; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: smov w11, v0.h[6] +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: smov w10, v1.h[6] ; CHECK-NEXT: sdiv w16, w15, w14 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: smov w13, v1.h[5] -; CHECK-NEXT: msub w14, w16, w14, w15 -; CHECK-NEXT: smov w15, v0.h[5] -; CHECK-NEXT: sdiv w16, w11, w9 ; CHECK-NEXT: mov v2.h[1], w8 -; CHECK-NEXT: msub w9, w16, w9, w11 -; CHECK-NEXT: sdiv w16, w15, w13 -; CHECK-NEXT: mov v2.h[2], w12 -; CHECK-NEXT: smov w11, v1.h[6] -; CHECK-NEXT: msub w13, w16, w13, w15 -; CHECK-NEXT: smov w15, v0.h[6] +; CHECK-NEXT: msub w8, w16, w14, w15 +; CHECK-NEXT: smov w15, v1.h[7] +; CHECK-NEXT: sdiv w0, w18, w17 +; CHECK-NEXT: smov w16, v0.h[7] +; CHECK-NEXT: mov v2.h[2], w8 +; CHECK-NEXT: msub w14, w0, w17, w18 +; CHECK-NEXT: sdiv w3, w2, w1 ; CHECK-NEXT: mov v2.h[3], w14 -; CHECK-NEXT: sdiv w16, w15, w11 -; CHECK-NEXT: smov w8, v1.h[7] -; CHECK-NEXT: mov v2.h[4], w9 -; CHECK-NEXT: smov w9, v0.h[7] -; CHECK-NEXT: msub w11, w16, w11, w15 -; CHECK-NEXT: mov v2.h[5], w13 -; CHECK-NEXT: sdiv w10, w9, w8 -; CHECK-NEXT: mov v2.h[6], w11 -; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: msub w14, w3, w1, w2 +; CHECK-NEXT: sdiv w9, w13, w12 +; CHECK-NEXT: mov v2.h[4], w14 +; CHECK-NEXT: msub w9, w9, w12, w13 +; CHECK-NEXT: sdiv w8, w11, w10 +; CHECK-NEXT: mov v2.h[5], w9 +; CHECK-NEXT: msub w8, w8, w10, w11 +; CHECK-NEXT: sdiv w12, w16, w15 +; CHECK-NEXT: mov v2.h[6], w8 +; CHECK-NEXT: msub w8, w12, w15, w16 ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret @@ -1007,16 +1024,16 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: sdiv w12, w11, w10 -; CHECK-NEXT: sdiv w13, w9, w8 -; CHECK-NEXT: msub w10, w12, w10, w11 -; CHECK-NEXT: msub w8, w13, w8, w9 -; 
CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov w11, v1.s[1] +; CHECK-NEXT: mov w12, v0.s[1] +; CHECK-NEXT: sdiv w10, w9, w8 +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: sdiv w13, w12, w11 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: msub w9, w13, w11, w12 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp3 = srem <2 x i32> %A, %B; @@ -1026,25 +1043,25 @@ define <4 x i32> @srem4x32(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: srem4x32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov w12, s1 -; CHECK-NEXT: fmov w15, s0 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: fmov w12, s0 ; CHECK-NEXT: mov w8, v1.s[1] ; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: sdiv w16, w15, w12 -; CHECK-NEXT: mov w10, v1.s[2] -; CHECK-NEXT: mov w13, v0.s[2] -; CHECK-NEXT: msub w12, w16, w12, w15 -; CHECK-NEXT: sdiv w15, w9, w8 -; CHECK-NEXT: mov w11, v1.s[3] -; CHECK-NEXT: mov w14, v0.s[3] -; CHECK-NEXT: msub w8, w15, w8, w9 -; CHECK-NEXT: sdiv w9, w13, w10 -; CHECK-NEXT: fmov s0, w12 -; CHECK-NEXT: msub w9, w9, w10, w13 +; CHECK-NEXT: mov w14, v1.s[2] +; CHECK-NEXT: mov w15, v0.s[2] +; CHECK-NEXT: sdiv w13, w12, w11 +; CHECK-NEXT: mov w17, v1.s[3] +; CHECK-NEXT: mov w18, v0.s[3] +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: sdiv w10, w9, w8 +; CHECK-NEXT: fmov s0, w11 +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: sdiv w16, w15, w14 ; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: sdiv w8, w14, w11 -; CHECK-NEXT: mov v0.s[2], w9 -; CHECK-NEXT: msub w8, w8, w11, w14 +; CHECK-NEXT: msub w10, w16, w14, w15 +; CHECK-NEXT: sdiv w9, w18, w17 +; CHECK-NEXT: mov v0.s[2], w10 +; CHECK-NEXT: msub w8, w9, w17, w18 ; CHECK-NEXT: mov v0.s[3], w8 ; CHECK-NEXT: ret %tmp3 = srem <4 x i32> %A, %B; @@ -1069,16 +1086,16 @@ define <2 x i64> @srem2x64(<2 x i64> %A, <2 x i64> %B) { ; CHECK-LABEL: srem2x64: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov x10, d1 -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: mov x9, v0.d[1] -; CHECK-NEXT: sdiv x12, x11, x10 -; CHECK-NEXT: sdiv x13, x9, x8 -; CHECK-NEXT: msub x10, x12, x10, x11 -; CHECK-NEXT: msub x8, x13, x8, x9 -; CHECK-NEXT: fmov d0, x10 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: mov x11, v1.d[1] +; CHECK-NEXT: mov x12, v0.d[1] +; CHECK-NEXT: sdiv x10, x9, x8 +; CHECK-NEXT: msub x8, x10, x8, x9 +; CHECK-NEXT: sdiv x13, x12, x11 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: msub x9, x13, x11, x12 +; CHECK-NEXT: mov v0.d[1], x9 ; CHECK-NEXT: ret %tmp3 = srem <2 x i64> %A, %B; ret <2 x i64> %tmp3 @@ -1104,45 +1121,45 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w11, v1.b[0] +; CHECK-NEXT: umov w12, v0.b[0] ; CHECK-NEXT: umov w8, v1.b[1] ; CHECK-NEXT: umov w9, v0.b[1] -; CHECK-NEXT: umov w10, v1.b[0] -; CHECK-NEXT: umov w11, v0.b[0] -; CHECK-NEXT: udiv w16, w9, w8 -; CHECK-NEXT: umov w12, v1.b[2] -; CHECK-NEXT: umov w13, v0.b[2] -; CHECK-NEXT: msub w8, w16, w8, w9 -; CHECK-NEXT: udiv w16, w11, w10 -; CHECK-NEXT: umov w14, v1.b[3] -; CHECK-NEXT: umov w15, v0.b[3] -; CHECK-NEXT: msub w10, w16, w10, w11 -; CHECK-NEXT: udiv w16, w13, w12 -; CHECK-NEXT: umov w9, v1.b[4] -; CHECK-NEXT: umov w11, v0.b[4] -; CHECK-NEXT: msub w12, w16, w12, w13 +; CHECK-NEXT: umov w14, v1.b[2] +; CHECK-NEXT: umov w15, v0.b[2] +; CHECK-NEXT: umov w17, v1.b[3] +; CHECK-NEXT: umov w18, v0.b[3] +; CHECK-NEXT: udiv w13, 
w12, w11 +; CHECK-NEXT: umov w1, v1.b[4] +; CHECK-NEXT: umov w2, v0.b[4] +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: umov w12, v1.b[5] +; CHECK-NEXT: udiv w10, w9, w8 +; CHECK-NEXT: umov w13, v0.b[5] +; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: umov w11, v0.b[6] +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: umov w10, v1.b[6] ; CHECK-NEXT: udiv w16, w15, w14 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: umov w13, v1.b[5] -; CHECK-NEXT: msub w14, w16, w14, w15 -; CHECK-NEXT: umov w15, v0.b[5] -; CHECK-NEXT: udiv w16, w11, w9 ; CHECK-NEXT: mov v2.b[1], w8 -; CHECK-NEXT: msub w9, w16, w9, w11 -; CHECK-NEXT: udiv w16, w15, w13 -; CHECK-NEXT: mov v2.b[2], w12 -; CHECK-NEXT: umov w11, v1.b[6] -; CHECK-NEXT: msub w13, w16, w13, w15 -; CHECK-NEXT: umov w15, v0.b[6] +; CHECK-NEXT: msub w8, w16, w14, w15 +; CHECK-NEXT: umov w15, v1.b[7] +; CHECK-NEXT: udiv w0, w18, w17 +; CHECK-NEXT: umov w16, v0.b[7] +; CHECK-NEXT: mov v2.b[2], w8 +; CHECK-NEXT: msub w14, w0, w17, w18 +; CHECK-NEXT: udiv w3, w2, w1 ; CHECK-NEXT: mov v2.b[3], w14 -; CHECK-NEXT: udiv w16, w15, w11 -; CHECK-NEXT: umov w8, v1.b[7] -; CHECK-NEXT: mov v2.b[4], w9 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: msub w11, w16, w11, w15 -; CHECK-NEXT: mov v2.b[5], w13 -; CHECK-NEXT: udiv w10, w9, w8 -; CHECK-NEXT: mov v2.b[6], w11 -; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: msub w14, w3, w1, w2 +; CHECK-NEXT: udiv w9, w13, w12 +; CHECK-NEXT: mov v2.b[4], w14 +; CHECK-NEXT: msub w9, w9, w12, w13 +; CHECK-NEXT: udiv w8, w11, w10 +; CHECK-NEXT: mov v2.b[5], w9 +; CHECK-NEXT: msub w8, w8, w10, w11 +; CHECK-NEXT: udiv w12, w16, w15 +; CHECK-NEXT: mov v2.b[6], w8 +; CHECK-NEXT: msub w8, w12, w15, w16 ; CHECK-NEXT: mov v2.b[7], w8 ; CHECK-NEXT: fmov d0, d2 ; CHECK-NEXT: ret @@ -1153,87 +1170,104 @@ define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) { ; CHECK-LABEL: urem16x8: ; CHECK: // %bb.0: +; CHECK-NEXT: stp x26, x25, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w25, -56 +; CHECK-NEXT: .cfi_offset w26, -64 +; CHECK-NEXT: umov w11, v1.b[0] +; CHECK-NEXT: umov w12, v0.b[0] ; CHECK-NEXT: umov w8, v1.b[1] ; CHECK-NEXT: umov w9, v0.b[1] -; CHECK-NEXT: umov w10, v1.b[0] -; CHECK-NEXT: umov w11, v0.b[0] -; CHECK-NEXT: udiv w5, w9, w8 -; CHECK-NEXT: umov w12, v1.b[2] -; CHECK-NEXT: umov w13, v0.b[2] -; CHECK-NEXT: msub w8, w5, w8, w9 -; CHECK-NEXT: udiv w5, w11, w10 -; CHECK-NEXT: umov w14, v1.b[3] -; CHECK-NEXT: umov w15, v0.b[3] -; CHECK-NEXT: msub w10, w5, w10, w11 -; CHECK-NEXT: udiv w5, w13, w12 -; CHECK-NEXT: umov w16, v1.b[4] -; CHECK-NEXT: umov w17, v0.b[4] -; CHECK-NEXT: msub w12, w5, w12, w13 -; CHECK-NEXT: udiv w5, w15, w14 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: umov w18, v1.b[5] -; CHECK-NEXT: umov w0, v0.b[5] -; CHECK-NEXT: msub w14, w5, w14, w15 -; CHECK-NEXT: udiv w5, w17, w16 +; CHECK-NEXT: umov w14, v1.b[2] +; CHECK-NEXT: umov w15, v0.b[2] +; CHECK-NEXT: umov w17, v1.b[3] +; CHECK-NEXT: umov w18, v0.b[3] +; CHECK-NEXT: udiv w13, w12, w11 +; CHECK-NEXT: umov w1, v1.b[4] +; CHECK-NEXT: umov w2, v0.b[4] +; CHECK-NEXT: umov w4, v1.b[5] +; CHECK-NEXT: umov w5, v0.b[5] +; CHECK-NEXT: umov w7, v1.b[6] +; CHECK-NEXT: umov w19, v0.b[6] +; CHECK-NEXT: umov w21, v1.b[7] +; CHECK-NEXT: umov w22, v0.b[7] +; CHECK-NEXT: umov w24, v1.b[8] +; CHECK-NEXT: umov w25, v0.b[8] +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: umov w12, v1.b[9] +; CHECK-NEXT: udiv w10, w9, w8 +; CHECK-NEXT: umov w13, v0.b[9] +; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: umov w11, v0.b[10] +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: umov w10, v1.b[10] +; CHECK-NEXT: udiv w16, w15, w14 ; CHECK-NEXT: mov v2.b[1], w8 -; CHECK-NEXT: umov w1, v1.b[6] -; CHECK-NEXT: umov w2, v0.b[6] -; CHECK-NEXT: msub w16, w5, w16, w17 -; CHECK-NEXT: udiv w5, w0, w18 -; CHECK-NEXT: mov v2.b[2], w12 -; CHECK-NEXT: umov w3, v1.b[7] -; CHECK-NEXT: umov w4, v0.b[7] -; CHECK-NEXT: msub w18, w5, w18, w0 -; CHECK-NEXT: udiv w5, w2, w1 +; CHECK-NEXT: msub w8, w16, w14, w15 +; CHECK-NEXT: umov w15, v1.b[11] +; CHECK-NEXT: udiv w0, w18, w17 +; CHECK-NEXT: umov w16, v0.b[11] +; CHECK-NEXT: mov v2.b[2], w8 +; CHECK-NEXT: msub w14, w0, w17, w18 +; CHECK-NEXT: umov w18, v1.b[12] +; CHECK-NEXT: udiv w3, w2, w1 +; CHECK-NEXT: umov w0, v0.b[12] ; CHECK-NEXT: mov v2.b[3], w14 -; CHECK-NEXT: umov w9, v1.b[8] -; CHECK-NEXT: umov w11, v0.b[8] -; CHECK-NEXT: msub w1, w5, w1, w2 -; CHECK-NEXT: udiv w5, w4, w3 -; CHECK-NEXT: mov v2.b[4], w16 -; CHECK-NEXT: umov w13, v1.b[9] -; CHECK-NEXT: umov w15, v0.b[9] -; CHECK-NEXT: msub w3, w5, w3, w4 -; CHECK-NEXT: udiv w5, w11, w9 -; CHECK-NEXT: mov v2.b[5], w18 -; CHECK-NEXT: umov w17, v1.b[10] -; CHECK-NEXT: umov w0, v0.b[10] -; CHECK-NEXT: msub w9, w5, w9, w11 -; CHECK-NEXT: udiv w5, w15, w13 -; CHECK-NEXT: mov v2.b[6], w1 -; CHECK-NEXT: umov w2, v1.b[11] -; CHECK-NEXT: umov w4, v0.b[11] -; CHECK-NEXT: msub w13, w5, w13, w15 -; CHECK-NEXT: udiv w5, w0, w17 -; CHECK-NEXT: mov v2.b[7], w3 -; CHECK-NEXT: umov w11, v1.b[12] -; CHECK-NEXT: umov w15, v0.b[12] -; CHECK-NEXT: msub w17, w5, 
w17, w0 -; CHECK-NEXT: udiv w5, w4, w2 -; CHECK-NEXT: mov v2.b[8], w9 -; CHECK-NEXT: umov w0, v1.b[13] -; CHECK-NEXT: msub w2, w5, w2, w4 -; CHECK-NEXT: umov w4, v0.b[13] -; CHECK-NEXT: udiv w5, w15, w11 -; CHECK-NEXT: mov v2.b[9], w13 -; CHECK-NEXT: msub w11, w5, w11, w15 -; CHECK-NEXT: udiv w5, w4, w0 -; CHECK-NEXT: mov v2.b[10], w17 -; CHECK-NEXT: umov w15, v1.b[14] -; CHECK-NEXT: msub w0, w5, w0, w4 -; CHECK-NEXT: umov w4, v0.b[14] -; CHECK-NEXT: mov v2.b[11], w2 -; CHECK-NEXT: udiv w5, w4, w15 -; CHECK-NEXT: umov w8, v1.b[15] +; CHECK-NEXT: msub w14, w3, w1, w2 +; CHECK-NEXT: umov w2, v1.b[13] +; CHECK-NEXT: udiv w6, w5, w4 +; CHECK-NEXT: umov w3, v0.b[13] +; CHECK-NEXT: mov v2.b[4], w14 +; CHECK-NEXT: msub w17, w6, w4, w5 +; CHECK-NEXT: udiv w20, w19, w7 +; CHECK-NEXT: mov v2.b[5], w17 +; CHECK-NEXT: msub w17, w20, w7, w19 +; CHECK-NEXT: udiv w23, w22, w21 +; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: mov v2.b[6], w17 +; CHECK-NEXT: msub w1, w23, w21, w22 +; CHECK-NEXT: udiv w26, w25, w24 +; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: mov v2.b[7], w1 +; CHECK-NEXT: msub w1, w26, w24, w25 +; CHECK-NEXT: udiv w9, w13, w12 +; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: mov v2.b[8], w1 +; CHECK-NEXT: msub w9, w9, w12, w13 +; CHECK-NEXT: umov w13, v1.b[15] +; CHECK-NEXT: udiv w8, w11, w10 +; CHECK-NEXT: mov v2.b[9], w9 +; CHECK-NEXT: umov w9, v1.b[14] +; CHECK-NEXT: msub w8, w8, w10, w11 +; CHECK-NEXT: umov w10, v0.b[14] +; CHECK-NEXT: udiv w14, w16, w15 +; CHECK-NEXT: mov v2.b[10], w8 +; CHECK-NEXT: msub w11, w14, w15, w16 +; CHECK-NEXT: umov w14, v0.b[15] +; CHECK-NEXT: udiv w17, w0, w18 +; CHECK-NEXT: mov v2.b[11], w11 +; CHECK-NEXT: msub w11, w17, w18, w0 +; CHECK-NEXT: udiv w12, w3, w2 ; CHECK-NEXT: mov v2.b[12], w11 -; CHECK-NEXT: umov w9, v0.b[15] -; CHECK-NEXT: msub w15, w5, w15, w4 -; CHECK-NEXT: mov v2.b[13], w0 -; CHECK-NEXT: udiv w10, w9, w8 -; CHECK-NEXT: mov v2.b[14], w15 -; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: msub w12, w12, w2, w3 +; CHECK-NEXT: udiv w8, w10, w9 +; CHECK-NEXT: mov v2.b[13], w12 +; CHECK-NEXT: msub w8, w8, w9, w10 +; CHECK-NEXT: udiv w11, w14, w13 +; CHECK-NEXT: mov v2.b[14], w8 +; CHECK-NEXT: msub w8, w11, w13, w14 ; CHECK-NEXT: mov v2.b[15], w8 ; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ldp x26, x25, [sp], #64 // 16-byte Folded Reload ; CHECK-NEXT: ret %tmp3 = urem <16 x i8> %A, %B; ret <16 x i8> %tmp3 @@ -1259,27 +1293,27 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w11, v1.h[0] +; CHECK-NEXT: umov w12, v0.h[0] ; CHECK-NEXT: umov w8, v1.h[1] ; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v1.h[0] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: udiv w12, w9, w8 -; CHECK-NEXT: msub w8, w12, w8, w9 -; CHECK-NEXT: udiv w12, w11, w10 -; CHECK-NEXT: umov w9, v1.h[2] -; CHECK-NEXT: msub w10, w12, w10, w11 -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: udiv w12, w11, w9 -; CHECK-NEXT: msub w9, w12, w9, w11 -; CHECK-NEXT: umov w11, v1.h[3] -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: umov w10, v0.h[3] -; CHECK-NEXT: mov v1.h[1], w8 -; CHECK-NEXT: udiv w8, w10, w11 -; CHECK-NEXT: mov v1.h[2], w9 -; CHECK-NEXT: msub w8, w8, w11, w10 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: umov w14, v1.h[2] +; CHECK-NEXT: umov w15, v0.h[2] +; CHECK-NEXT: udiv w13, w12, w11 +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: umov w12, 
v1.h[3] +; CHECK-NEXT: udiv w10, w9, w8 +; CHECK-NEXT: umov w13, v0.h[3] +; CHECK-NEXT: fmov s0, w11 +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: udiv w16, w15, w14 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: msub w10, w16, w14, w15 +; CHECK-NEXT: udiv w9, w13, w12 +; CHECK-NEXT: mov v0.h[2], w10 +; CHECK-NEXT: msub w8, w9, w12, w13 +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp3 = urem <4 x i16> %A, %B; ret <4 x i16> %tmp3 @@ -1288,45 +1322,45 @@ define <8 x i16> @urem8x16(<8 x i16> %A, <8 x i16> %B) { ; CHECK-LABEL: urem8x16: ; CHECK: // %bb.0: +; CHECK-NEXT: umov w11, v1.h[0] +; CHECK-NEXT: umov w12, v0.h[0] ; CHECK-NEXT: umov w8, v1.h[1] ; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v1.h[0] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: udiv w16, w9, w8 -; CHECK-NEXT: umov w12, v1.h[2] -; CHECK-NEXT: umov w13, v0.h[2] -; CHECK-NEXT: msub w8, w16, w8, w9 -; CHECK-NEXT: udiv w16, w11, w10 -; CHECK-NEXT: umov w14, v1.h[3] -; CHECK-NEXT: umov w15, v0.h[3] -; CHECK-NEXT: msub w10, w16, w10, w11 -; CHECK-NEXT: udiv w16, w13, w12 -; CHECK-NEXT: umov w9, v1.h[4] -; CHECK-NEXT: umov w11, v0.h[4] -; CHECK-NEXT: msub w12, w16, w12, w13 +; CHECK-NEXT: umov w14, v1.h[2] +; CHECK-NEXT: umov w15, v0.h[2] +; CHECK-NEXT: umov w17, v1.h[3] +; CHECK-NEXT: umov w18, v0.h[3] +; CHECK-NEXT: udiv w13, w12, w11 +; CHECK-NEXT: umov w1, v1.h[4] +; CHECK-NEXT: umov w2, v0.h[4] +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: umov w12, v1.h[5] +; CHECK-NEXT: udiv w10, w9, w8 +; CHECK-NEXT: umov w13, v0.h[5] +; CHECK-NEXT: fmov s2, w11 +; CHECK-NEXT: umov w11, v0.h[6] +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: umov w10, v1.h[6] ; CHECK-NEXT: udiv w16, w15, w14 -; CHECK-NEXT: fmov s2, w10 -; CHECK-NEXT: umov w13, v1.h[5] -; CHECK-NEXT: msub w14, w16, w14, w15 -; CHECK-NEXT: umov w15, v0.h[5] -; CHECK-NEXT: udiv w16, w11, w9 ; CHECK-NEXT: mov v2.h[1], w8 -; CHECK-NEXT: msub w9, w16, w9, w11 -; CHECK-NEXT: udiv w16, w15, w13 -; CHECK-NEXT: mov v2.h[2], w12 -; CHECK-NEXT: umov w11, v1.h[6] -; CHECK-NEXT: msub w13, w16, w13, w15 -; CHECK-NEXT: umov w15, v0.h[6] +; CHECK-NEXT: msub w8, w16, w14, w15 +; CHECK-NEXT: umov w15, v1.h[7] +; CHECK-NEXT: udiv w0, w18, w17 +; CHECK-NEXT: umov w16, v0.h[7] +; CHECK-NEXT: mov v2.h[2], w8 +; CHECK-NEXT: msub w14, w0, w17, w18 +; CHECK-NEXT: udiv w3, w2, w1 ; CHECK-NEXT: mov v2.h[3], w14 -; CHECK-NEXT: udiv w16, w15, w11 -; CHECK-NEXT: umov w8, v1.h[7] -; CHECK-NEXT: mov v2.h[4], w9 -; CHECK-NEXT: umov w9, v0.h[7] -; CHECK-NEXT: msub w11, w16, w11, w15 -; CHECK-NEXT: mov v2.h[5], w13 -; CHECK-NEXT: udiv w10, w9, w8 -; CHECK-NEXT: mov v2.h[6], w11 -; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: msub w14, w3, w1, w2 +; CHECK-NEXT: udiv w9, w13, w12 +; CHECK-NEXT: mov v2.h[4], w14 +; CHECK-NEXT: msub w9, w9, w12, w13 +; CHECK-NEXT: udiv w8, w11, w10 +; CHECK-NEXT: mov v2.h[5], w9 +; CHECK-NEXT: msub w8, w8, w10, w11 +; CHECK-NEXT: udiv w12, w16, w15 +; CHECK-NEXT: mov v2.h[6], w8 +; CHECK-NEXT: msub w8, w12, w15, w16 ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret @@ -1354,16 +1388,16 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: udiv w12, w11, w10 -; CHECK-NEXT: udiv w13, w9, w8 -; CHECK-NEXT: msub w10, w12, w10, w11 -; CHECK-NEXT: msub w8, w13, w8, w9 
-; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov w11, v1.s[1] +; CHECK-NEXT: mov w12, v0.s[1] +; CHECK-NEXT: udiv w10, w9, w8 +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: udiv w13, w12, w11 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: msub w9, w13, w11, w12 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %tmp3 = urem <2 x i32> %A, %B; @@ -1373,25 +1407,25 @@ define <4 x i32> @urem4x32(<4 x i32> %A, <4 x i32> %B) { ; CHECK-LABEL: urem4x32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov w12, s1 -; CHECK-NEXT: fmov w15, s0 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: fmov w12, s0 ; CHECK-NEXT: mov w8, v1.s[1] ; CHECK-NEXT: mov w9, v0.s[1] -; CHECK-NEXT: udiv w16, w15, w12 -; CHECK-NEXT: mov w10, v1.s[2] -; CHECK-NEXT: mov w13, v0.s[2] -; CHECK-NEXT: msub w12, w16, w12, w15 -; CHECK-NEXT: udiv w15, w9, w8 -; CHECK-NEXT: mov w11, v1.s[3] -; CHECK-NEXT: mov w14, v0.s[3] -; CHECK-NEXT: msub w8, w15, w8, w9 -; CHECK-NEXT: udiv w9, w13, w10 -; CHECK-NEXT: fmov s0, w12 -; CHECK-NEXT: msub w9, w9, w10, w13 +; CHECK-NEXT: mov w14, v1.s[2] +; CHECK-NEXT: mov w15, v0.s[2] +; CHECK-NEXT: udiv w13, w12, w11 +; CHECK-NEXT: mov w17, v1.s[3] +; CHECK-NEXT: mov w18, v0.s[3] +; CHECK-NEXT: msub w11, w13, w11, w12 +; CHECK-NEXT: udiv w10, w9, w8 +; CHECK-NEXT: fmov s0, w11 +; CHECK-NEXT: msub w8, w10, w8, w9 +; CHECK-NEXT: udiv w16, w15, w14 ; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: udiv w8, w14, w11 -; CHECK-NEXT: mov v0.s[2], w9 -; CHECK-NEXT: msub w8, w8, w11, w14 +; CHECK-NEXT: msub w10, w16, w14, w15 +; CHECK-NEXT: udiv w9, w18, w17 +; CHECK-NEXT: mov v0.s[2], w10 +; CHECK-NEXT: msub w8, w9, w17, w18 ; CHECK-NEXT: mov v0.s[3], w8 ; CHECK-NEXT: ret %tmp3 = urem <4 x i32> %A, %B; @@ -1416,16 +1450,16 @@ define <2 x i64> @urem2x64(<2 x i64> %A, <2 x i64> %B) { ; CHECK-LABEL: urem2x64: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov x10, d1 -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: mov x9, v0.d[1] -; CHECK-NEXT: udiv x12, x11, x10 -; CHECK-NEXT: udiv x13, x9, x8 -; CHECK-NEXT: msub x10, x12, x10, x11 -; CHECK-NEXT: msub x8, x13, x8, x9 -; CHECK-NEXT: fmov d0, x10 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: mov x11, v1.d[1] +; CHECK-NEXT: mov x12, v0.d[1] +; CHECK-NEXT: udiv x10, x9, x8 +; CHECK-NEXT: msub x8, x10, x8, x9 +; CHECK-NEXT: udiv x13, x12, x11 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: msub x9, x13, x11, x12 +; CHECK-NEXT: mov v0.d[1], x9 ; CHECK-NEXT: ret %tmp3 = urem <2 x i64> %A, %B; ret <2 x i64> %tmp3 @@ -1450,8 +1484,8 @@ ; CHECK-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-NEXT: mov v0.s[1], v1.s[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: add sp, sp, #64 @@ -1493,8 +1527,8 @@ ; CHECK-NEXT: mov s1, v1.s[3] ; CHECK-NEXT: bl fmodf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-NEXT: mov v1.s[3], v0.s[0] ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: add sp, sp, #64 @@ -1533,8 +1567,8 @@ ; CHECK-NEXT: // kill: def $d1 killed $d1 killed 
$q1 ; CHECK-NEXT: bl fmod ; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll --- a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll +++ b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll @@ -7,11 +7,11 @@ ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: fmov.2d v0, #2.00000000 ; CHECK-NEXT: and x8, x1, #0x3 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: fmov.2d v0, #2.00000000 ; CHECK-NEXT: bfi x9, x8, #2, #2 +; CHECK-NEXT: str q0, [sp] ; CHECK-NEXT: ldr s0, [x9] ; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: add sp, sp, #16 @@ -28,11 +28,11 @@ ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: movi.16b v0, #63 ; CHECK-NEXT: and x8, x1, #0x3 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: movi.16b v0, #63 ; CHECK-NEXT: bfi x9, x8, #2, #2 +; CHECK-NEXT: str q0, [sp] ; CHECK-NEXT: ldr s0, [x9] ; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: add sp, sp, #16 @@ -54,8 +54,8 @@ ; CHECK-LABEL: nvcast_f32_v8i8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: adrp x8, __gv@GOTPAGE -; CHECK-NEXT: ldr x8, [x8, __gv@GOTPAGEOFF] ; CHECK-NEXT: movi.8b v0, #254 +; CHECK-NEXT: ldr x8, [x8, __gv@GOTPAGEOFF] ; CHECK-NEXT: str d0, [x8] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll --- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll @@ -14,17 +14,17 @@ ; ; CHECK-NONEON-LABEL: cnt32_advsimd: ; CHECK-NONEON: // %bb.0: -; CHECK-NONEON-NEXT: lsr w8, w0, #1 -; CHECK-NONEON-NEXT: and w8, w8, #0x55555555 -; CHECK-NONEON-NEXT: sub w8, w0, w8 -; CHECK-NONEON-NEXT: and w9, w8, #0x33333333 -; CHECK-NONEON-NEXT: lsr w8, w8, #2 -; CHECK-NONEON-NEXT: and w8, w8, #0x33333333 -; CHECK-NONEON-NEXT: add w8, w9, w8 -; CHECK-NONEON-NEXT: add w8, w8, w8, lsr #4 -; CHECK-NONEON-NEXT: and w8, w8, #0xf0f0f0f -; CHECK-NONEON-NEXT: mov w9, #16843009 -; CHECK-NONEON-NEXT: mul w8, w8, w9 +; CHECK-NONEON-NEXT: lsr w9, w0, #1 +; CHECK-NONEON-NEXT: mov w8, #16843009 +; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 +; CHECK-NONEON-NEXT: sub w9, w0, w9 +; CHECK-NONEON-NEXT: lsr w10, w9, #2 +; CHECK-NONEON-NEXT: and w9, w9, #0x33333333 +; CHECK-NONEON-NEXT: and w10, w10, #0x33333333 +; CHECK-NONEON-NEXT: add w9, w9, w10 +; CHECK-NONEON-NEXT: add w9, w9, w9, lsr #4 +; CHECK-NONEON-NEXT: and w9, w9, #0xf0f0f0f +; CHECK-NONEON-NEXT: mul w8, w9, w8 ; CHECK-NONEON-NEXT: lsr w0, w8, #24 ; CHECK-NONEON-NEXT: ret %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -44,17 +44,17 @@ ; ; CHECK-NONEON-LABEL: cnt32_advsimd_2: ; CHECK-NONEON: // %bb.0: -; CHECK-NONEON-NEXT: lsr w8, w0, #1 -; CHECK-NONEON-NEXT: and w8, w8, #0x55555555 -; CHECK-NONEON-NEXT: sub w8, w0, w8 -; CHECK-NONEON-NEXT: and w9, w8, #0x33333333 -; CHECK-NONEON-NEXT: lsr w8, w8, #2 -; CHECK-NONEON-NEXT: and w8, w8, #0x33333333 -; CHECK-NONEON-NEXT: add w8, w9, w8 -; CHECK-NONEON-NEXT: add w8, w8, w8, lsr #4 -; CHECK-NONEON-NEXT: and w8, w8, #0xf0f0f0f -; CHECK-NONEON-NEXT: mov w9, #16843009 -; CHECK-NONEON-NEXT: mul w8, w8, w9 +; CHECK-NONEON-NEXT: lsr w9, w0, 
#1 +; CHECK-NONEON-NEXT: mov w8, #16843009 +; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 +; CHECK-NONEON-NEXT: sub w9, w0, w9 +; CHECK-NONEON-NEXT: lsr w10, w9, #2 +; CHECK-NONEON-NEXT: and w9, w9, #0x33333333 +; CHECK-NONEON-NEXT: and w10, w10, #0x33333333 +; CHECK-NONEON-NEXT: add w9, w9, w10 +; CHECK-NONEON-NEXT: add w9, w9, w9, lsr #4 +; CHECK-NONEON-NEXT: and w9, w9, #0xf0f0f0f +; CHECK-NONEON-NEXT: mul w8, w9, w8 ; CHECK-NONEON-NEXT: lsr w0, w8, #24 ; CHECK-NONEON-NEXT: ret %1 = extractelement <2 x i32> %x, i64 0 @@ -73,17 +73,17 @@ ; ; CHECK-NONEON-LABEL: cnt64_advsimd: ; CHECK-NONEON: // %bb.0: -; CHECK-NONEON-NEXT: lsr x8, x0, #1 -; CHECK-NONEON-NEXT: and x8, x8, #0x5555555555555555 -; CHECK-NONEON-NEXT: sub x8, x0, x8 -; CHECK-NONEON-NEXT: and x9, x8, #0x3333333333333333 -; CHECK-NONEON-NEXT: lsr x8, x8, #2 -; CHECK-NONEON-NEXT: and x8, x8, #0x3333333333333333 -; CHECK-NONEON-NEXT: add x8, x9, x8 -; CHECK-NONEON-NEXT: add x8, x8, x8, lsr #4 -; CHECK-NONEON-NEXT: and x8, x8, #0xf0f0f0f0f0f0f0f -; CHECK-NONEON-NEXT: mov x9, #72340172838076673 -; CHECK-NONEON-NEXT: mul x8, x8, x9 +; CHECK-NONEON-NEXT: lsr x9, x0, #1 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 +; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 +; CHECK-NONEON-NEXT: sub x9, x0, x9 +; CHECK-NONEON-NEXT: lsr x10, x9, #2 +; CHECK-NONEON-NEXT: and x9, x9, #0x3333333333333333 +; CHECK-NONEON-NEXT: and x10, x10, #0x3333333333333333 +; CHECK-NONEON-NEXT: add x9, x9, x10 +; CHECK-NONEON-NEXT: add x9, x9, x9, lsr #4 +; CHECK-NONEON-NEXT: and x9, x9, #0xf0f0f0f0f0f0f0f +; CHECK-NONEON-NEXT: mul x8, x9, x8 ; CHECK-NONEON-NEXT: lsr x0, x8, #56 ; CHECK-NONEON-NEXT: ret %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) @@ -96,33 +96,33 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat { ; CHECK-LABEL: cnt32: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #1 -; CHECK-NEXT: and w8, w8, #0x55555555 -; CHECK-NEXT: sub w8, w0, w8 -; CHECK-NEXT: and w9, w8, #0x33333333 -; CHECK-NEXT: lsr w8, w8, #2 -; CHECK-NEXT: and w8, w8, #0x33333333 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: add w8, w8, w8, lsr #4 -; CHECK-NEXT: and w8, w8, #0xf0f0f0f -; CHECK-NEXT: mov w9, #16843009 -; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: lsr w9, w0, #1 +; CHECK-NEXT: mov w8, #16843009 +; CHECK-NEXT: and w9, w9, #0x55555555 +; CHECK-NEXT: sub w9, w0, w9 +; CHECK-NEXT: lsr w10, w9, #2 +; CHECK-NEXT: and w9, w9, #0x33333333 +; CHECK-NEXT: and w10, w10, #0x33333333 +; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: add w9, w9, w9, lsr #4 +; CHECK-NEXT: and w9, w9, #0xf0f0f0f +; CHECK-NEXT: mul w8, w9, w8 ; CHECK-NEXT: lsr w0, w8, #24 ; CHECK-NEXT: ret ; ; CHECK-NONEON-LABEL: cnt32: ; CHECK-NONEON: // %bb.0: -; CHECK-NONEON-NEXT: lsr w8, w0, #1 -; CHECK-NONEON-NEXT: and w8, w8, #0x55555555 -; CHECK-NONEON-NEXT: sub w8, w0, w8 -; CHECK-NONEON-NEXT: and w9, w8, #0x33333333 -; CHECK-NONEON-NEXT: lsr w8, w8, #2 -; CHECK-NONEON-NEXT: and w8, w8, #0x33333333 -; CHECK-NONEON-NEXT: add w8, w9, w8 -; CHECK-NONEON-NEXT: add w8, w8, w8, lsr #4 -; CHECK-NONEON-NEXT: and w8, w8, #0xf0f0f0f -; CHECK-NONEON-NEXT: mov w9, #16843009 -; CHECK-NONEON-NEXT: mul w8, w8, w9 +; CHECK-NONEON-NEXT: lsr w9, w0, #1 +; CHECK-NONEON-NEXT: mov w8, #16843009 +; CHECK-NONEON-NEXT: and w9, w9, #0x55555555 +; CHECK-NONEON-NEXT: sub w9, w0, w9 +; CHECK-NONEON-NEXT: lsr w10, w9, #2 +; CHECK-NONEON-NEXT: and w9, w9, #0x33333333 +; CHECK-NONEON-NEXT: and w10, w10, #0x33333333 +; CHECK-NONEON-NEXT: add w9, w9, w10 +; CHECK-NONEON-NEXT: add w9, w9, w9, lsr #4 +; CHECK-NONEON-NEXT: and w9, w9, 
#0xf0f0f0f +; CHECK-NONEON-NEXT: mul w8, w9, w8 ; CHECK-NONEON-NEXT: lsr w0, w8, #24 ; CHECK-NONEON-NEXT: ret %cnt = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -132,33 +132,33 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat { ; CHECK-LABEL: cnt64: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #1 -; CHECK-NEXT: and x8, x8, #0x5555555555555555 -; CHECK-NEXT: sub x8, x0, x8 -; CHECK-NEXT: and x9, x8, #0x3333333333333333 -; CHECK-NEXT: lsr x8, x8, #2 -; CHECK-NEXT: and x8, x8, #0x3333333333333333 -; CHECK-NEXT: add x8, x9, x8 -; CHECK-NEXT: add x8, x8, x8, lsr #4 -; CHECK-NEXT: and x8, x8, #0xf0f0f0f0f0f0f0f -; CHECK-NEXT: mov x9, #72340172838076673 -; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: lsr x9, x0, #1 +; CHECK-NEXT: mov x8, #72340172838076673 +; CHECK-NEXT: and x9, x9, #0x5555555555555555 +; CHECK-NEXT: sub x9, x0, x9 +; CHECK-NEXT: lsr x10, x9, #2 +; CHECK-NEXT: and x9, x9, #0x3333333333333333 +; CHECK-NEXT: and x10, x10, #0x3333333333333333 +; CHECK-NEXT: add x9, x9, x10 +; CHECK-NEXT: add x9, x9, x9, lsr #4 +; CHECK-NEXT: and x9, x9, #0xf0f0f0f0f0f0f0f +; CHECK-NEXT: mul x8, x9, x8 ; CHECK-NEXT: lsr x0, x8, #56 ; CHECK-NEXT: ret ; ; CHECK-NONEON-LABEL: cnt64: ; CHECK-NONEON: // %bb.0: -; CHECK-NONEON-NEXT: lsr x8, x0, #1 -; CHECK-NONEON-NEXT: and x8, x8, #0x5555555555555555 -; CHECK-NONEON-NEXT: sub x8, x0, x8 -; CHECK-NONEON-NEXT: and x9, x8, #0x3333333333333333 -; CHECK-NONEON-NEXT: lsr x8, x8, #2 -; CHECK-NONEON-NEXT: and x8, x8, #0x3333333333333333 -; CHECK-NONEON-NEXT: add x8, x9, x8 -; CHECK-NONEON-NEXT: add x8, x8, x8, lsr #4 -; CHECK-NONEON-NEXT: and x8, x8, #0xf0f0f0f0f0f0f0f -; CHECK-NONEON-NEXT: mov x9, #72340172838076673 -; CHECK-NONEON-NEXT: mul x8, x8, x9 +; CHECK-NONEON-NEXT: lsr x9, x0, #1 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 +; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 +; CHECK-NONEON-NEXT: sub x9, x0, x9 +; CHECK-NONEON-NEXT: lsr x10, x9, #2 +; CHECK-NONEON-NEXT: and x9, x9, #0x3333333333333333 +; CHECK-NONEON-NEXT: and x10, x10, #0x3333333333333333 +; CHECK-NONEON-NEXT: add x9, x9, x10 +; CHECK-NONEON-NEXT: add x9, x9, x9, lsr #4 +; CHECK-NONEON-NEXT: and x9, x9, #0xf0f0f0f0f0f0f0f +; CHECK-NONEON-NEXT: mul x8, x9, x8 ; CHECK-NONEON-NEXT: lsr x0, x8, #56 ; CHECK-NONEON-NEXT: ret %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) @@ -178,17 +178,17 @@ ; ; CHECK-NONEON-LABEL: ctpop_eq_one: ; CHECK-NONEON: // %bb.0: -; CHECK-NONEON-NEXT: lsr x8, x0, #1 -; CHECK-NONEON-NEXT: and x8, x8, #0x5555555555555555 -; CHECK-NONEON-NEXT: sub x8, x0, x8 -; CHECK-NONEON-NEXT: and x9, x8, #0x3333333333333333 -; CHECK-NONEON-NEXT: lsr x8, x8, #2 -; CHECK-NONEON-NEXT: and x8, x8, #0x3333333333333333 -; CHECK-NONEON-NEXT: add x8, x9, x8 -; CHECK-NONEON-NEXT: add x8, x8, x8, lsr #4 -; CHECK-NONEON-NEXT: and x8, x8, #0xf0f0f0f0f0f0f0f -; CHECK-NONEON-NEXT: mov x9, #72340172838076673 -; CHECK-NONEON-NEXT: mul x8, x8, x9 +; CHECK-NONEON-NEXT: lsr x9, x0, #1 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 +; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 +; CHECK-NONEON-NEXT: sub x9, x0, x9 +; CHECK-NONEON-NEXT: lsr x10, x9, #2 +; CHECK-NONEON-NEXT: and x9, x9, #0x3333333333333333 +; CHECK-NONEON-NEXT: and x10, x10, #0x3333333333333333 +; CHECK-NONEON-NEXT: add x9, x9, x10 +; CHECK-NONEON-NEXT: add x9, x9, x9, lsr #4 +; CHECK-NONEON-NEXT: and x9, x9, #0xf0f0f0f0f0f0f0f +; CHECK-NONEON-NEXT: mul x8, x9, x8 ; CHECK-NONEON-NEXT: lsr x8, x8, #56 ; CHECK-NONEON-NEXT: cmp x8, #1 ; CHECK-NONEON-NEXT: cset w0, eq @@ -212,17 +212,17 @@ ; ; CHECK-NONEON-LABEL: 
ctpop_ne_one: ; CHECK-NONEON: // %bb.0: -; CHECK-NONEON-NEXT: lsr x8, x0, #1 -; CHECK-NONEON-NEXT: and x8, x8, #0x5555555555555555 -; CHECK-NONEON-NEXT: sub x8, x0, x8 -; CHECK-NONEON-NEXT: and x9, x8, #0x3333333333333333 -; CHECK-NONEON-NEXT: lsr x8, x8, #2 -; CHECK-NONEON-NEXT: and x8, x8, #0x3333333333333333 -; CHECK-NONEON-NEXT: add x8, x9, x8 -; CHECK-NONEON-NEXT: add x8, x8, x8, lsr #4 -; CHECK-NONEON-NEXT: and x8, x8, #0xf0f0f0f0f0f0f0f -; CHECK-NONEON-NEXT: mov x9, #72340172838076673 -; CHECK-NONEON-NEXT: mul x8, x8, x9 +; CHECK-NONEON-NEXT: lsr x9, x0, #1 +; CHECK-NONEON-NEXT: mov x8, #72340172838076673 +; CHECK-NONEON-NEXT: and x9, x9, #0x5555555555555555 +; CHECK-NONEON-NEXT: sub x9, x0, x9 +; CHECK-NONEON-NEXT: lsr x10, x9, #2 +; CHECK-NONEON-NEXT: and x9, x9, #0x3333333333333333 +; CHECK-NONEON-NEXT: and x10, x10, #0x3333333333333333 +; CHECK-NONEON-NEXT: add x9, x9, x10 +; CHECK-NONEON-NEXT: add x9, x9, x9, lsr #4 +; CHECK-NONEON-NEXT: and x9, x9, #0xf0f0f0f0f0f0f0f +; CHECK-NONEON-NEXT: mul x8, x9, x8 ; CHECK-NONEON-NEXT: lsr x8, x8, #56 ; CHECK-NONEON-NEXT: cmp x8, #1 ; CHECK-NONEON-NEXT: cset w0, ne diff --git a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll --- a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll +++ b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll @@ -47,12 +47,12 @@ ; CHECK-NEXT: mov s2, v1[1] ; CHECK-NEXT: fneg s0, s1 ; CHECK-NEXT: mov s3, v1[2] -; CHECK-NEXT: fneg s2, s2 ; CHECK-NEXT: mov s1, v1[3] -; CHECK-NEXT: fneg s3, s3 -; CHECK-NEXT: mov.s v0[1], v2[0] -; CHECK-NEXT: mov.s v0[2], v3[0] +; CHECK-NEXT: fneg s2, s2 ; CHECK-NEXT: fneg s1, s1 +; CHECK-NEXT: mov.s v0[1], v2[0] +; CHECK-NEXT: fneg s2, s3 +; CHECK-NEXT: mov.s v0[2], v2[0] ; CHECK-NEXT: mov.s v0[3], v1[0] ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/arm64-register-pairing.ll b/llvm/test/CodeGen/AArch64/arm64-register-pairing.ll --- a/llvm/test/CodeGen/AArch64/arm64-register-pairing.ll +++ b/llvm/test/CodeGen/AArch64/arm64-register-pairing.ll @@ -42,8 +42,8 @@ ; CHECK-NOTMACHO-NEXT: //NO_APP ; CHECK-NOTMACHO-NEXT: ldp x21, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NOTMACHO-NEXT: ldp x25, x23, [sp, #48] // 16-byte Folded Reload -; CHECK-NOTMACHO-NEXT: ldr x27, [sp, #32] // 8-byte Folded Reload ; CHECK-NOTMACHO-NEXT: ldp d10, d8, [sp, #16] // 16-byte Folded Reload +; CHECK-NOTMACHO-NEXT: ldr x27, [sp, #32] // 8-byte Folded Reload ; CHECK-NOTMACHO-NEXT: ldp d14, d12, [sp], #80 // 16-byte Folded Reload ; CHECK-NOTMACHO-NEXT: ret @@ -89,8 +89,8 @@ ; CHECK-NOTMACHO-NEXT: //NO_APP ; CHECK-NOTMACHO-NEXT: ldp x22, x20, [sp, #64] // 16-byte Folded Reload ; CHECK-NOTMACHO-NEXT: ldp x26, x24, [sp, #48] // 16-byte Folded Reload -; CHECK-NOTMACHO-NEXT: ldr x28, [sp, #32] // 8-byte Folded Reload ; CHECK-NOTMACHO-NEXT: ldp d11, d9, [sp, #16] // 16-byte Folded Reload +; CHECK-NOTMACHO-NEXT: ldr x28, [sp, #32] // 8-byte Folded Reload ; CHECK-NOTMACHO-NEXT: ldp d15, d13, [sp], #80 // 16-byte Folded Reload ; CHECK-NOTMACHO-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll --- a/llvm/test/CodeGen/AArch64/arm64-rev.ll +++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll @@ -533,16 +533,16 @@ define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp { ; CHECK-LABEL: test_vrev64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: add x8, x1, #2 +; 
CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: st1.h { v0 }[5], [x8]
 ; CHECK-NEXT: st1.h { v0 }[6], [x1]
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_vrev64:
 ; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: ldr q0, [x0]
 ; GISEL-NEXT: add x8, x1, #2
+; GISEL-NEXT: ldr q0, [x0]
 ; GISEL-NEXT: st1.h { v0 }[6], [x1]
 ; GISEL-NEXT: st1.h { v0 }[5], [x8]
 ; GISEL-NEXT: ret
@@ -561,18 +561,18 @@
 define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
 ; CHECK-LABEL: float_vrev64:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: movi.2d v1, #0000000000000000
-; CHECK-NEXT: dup.4s v1, v1[0]
-; CHECK-NEXT: ext.16b v0, v0, v1, #12
+; CHECK-NEXT: movi.2d v0, #0000000000000000
+; CHECK-NEXT: ldr q1, [x0]
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: ext.16b v0, v1, v0, #12
 ; CHECK-NEXT: rev64.4s v0, v0
 ; CHECK-NEXT: str q0, [x1, #176]
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: float_vrev64:
 ; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: movi d0, #0000000000000000
 ; GISEL-NEXT: adrp x8, .LCPI28_0
+; GISEL-NEXT: movi d0, #0000000000000000
 ; GISEL-NEXT: ldr q1, [x0]
 ; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI28_0]
 ; GISEL-NEXT: tbl.16b v0, { v0, v1 }, v2
diff --git a/llvm/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll b/llvm/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
--- a/llvm/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-setcc-int-to-fp-combine.ll
@@ -4,9 +4,9 @@
 define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK: ; %bb.0:
+; CHECK-NEXT: fmov.4s v2, #1.00000000
 ; CHECK-NEXT: fcmeq.4s v0, v0, v1
-; CHECK-NEXT: fmov.4s v1, #1.00000000
-; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: and.16b v0, v0, v2
 ; CHECK-NEXT: ret
 %cmp = fcmp oeq <4 x float> %val, %test
 %ext = zext <4 x i1> %cmp to <4 x i32>
@@ -19,14 +19,14 @@
 define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwind {
 ; CHECK-LABEL: foo1:
 ; CHECK: ; %bb.0:
+; CHECK-NEXT: movi.4s v2, #1
 ; CHECK-NEXT: fcmeq.4s v0, v0, v1
-; CHECK-NEXT: movi.4s v1, #1
-; CHECK-NEXT: and.16b v0, v0, v1
-; CHECK-NEXT: ushll.2d v1, v0, #0
-; CHECK-NEXT: ushll2.2d v0, v0, #0
+; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
 ; CHECK-NEXT: scvtf.2d v1, v1
 ; CHECK-NEXT: scvtf.2d v0, v0
-; CHECK-NEXT: stp q1, q0, [x0]
+; CHECK-NEXT: stp q0, q1, [x0]
 ; CHECK-NEXT: ret
 %cmp = fcmp oeq <4 x float> %val, %test
 %ext = zext <4 x i1> %cmp to <4 x i32>
@@ -42,10 +42,10 @@
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: Lloh0:
 ; CHECK-NEXT: adrp x8, lCPI2_0@PAGE
-; CHECK-NEXT: Lloh1:
-; CHECK-NEXT: ldr q2, [x8, lCPI2_0@PAGEOFF]
 ; CHECK-NEXT: fcmeq.4s v0, v0, v1
-; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: Lloh1:
+; CHECK-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF]
+; CHECK-NEXT: and.16b v0, v0, v1
 ; CHECK-NEXT: ret
 ; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
 %cmp = fcmp oeq <4 x float> %val, %test
diff --git a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
--- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -427,8 +427,8 @@
 ; ENABLE-NEXT: add x9, x8, #8
 ; ENABLE-NEXT: str x9, [sp, #8]
 ; ENABLE-NEXT: ldr w8, [x8]
-; ENABLE-NEXT: subs w1, w1, #1
 ; ENABLE-NEXT: add w0, w0, w8
+; ENABLE-NEXT: subs w1, w1, #1
 ; ENABLE-NEXT: b.ne LBB6_2
 ; ENABLE-NEXT: LBB6_3: ; %for.end
 ; ENABLE-NEXT: add sp, sp, #16
@@ -453,8 +453,8 @@
 ; DISABLE-NEXT: add x9, x8, #8
 ; DISABLE-NEXT: str x9, [sp, #8]
 ; DISABLE-NEXT: ldr w8, [x8]
-; DISABLE-NEXT: subs w1, w1, #1
 ; DISABLE-NEXT: add w0, w0, w8
+; DISABLE-NEXT: subs w1, w1, #1
 ; DISABLE-NEXT: b.ne LBB6_2
 ; DISABLE-NEXT: LBB6_3: ; %if.end
 ; DISABLE-NEXT: add sp, sp, #16
@@ -586,8 +586,8 @@
 ; ENABLE-NEXT: .cfi_offset w29, -16
 ; ENABLE-NEXT: stp x1, x1, [sp, #32]
 ; ENABLE-NEXT: stp x1, x1, [sp, #16]
-; ENABLE-NEXT: stp x1, x1, [sp]
 ; ENABLE-NEXT: mov w0, w1
+; ENABLE-NEXT: stp x1, x1, [sp]
 ; ENABLE-NEXT: bl _someVariadicFunc
 ; ENABLE-NEXT: lsl w0, w0, #3
 ; ENABLE-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
@@ -610,8 +610,8 @@
 ; DISABLE-NEXT: ; %bb.1: ; %if.then
 ; DISABLE-NEXT: stp x1, x1, [sp, #32]
 ; DISABLE-NEXT: stp x1, x1, [sp, #16]
-; DISABLE-NEXT: stp x1, x1, [sp]
 ; DISABLE-NEXT: mov w0, w1
+; DISABLE-NEXT: stp x1, x1, [sp]
 ; DISABLE-NEXT: bl _someVariadicFunc
 ; DISABLE-NEXT: lsl w0, w0, #3
 ; DISABLE-NEXT: b LBB8_3
@@ -787,17 +787,17 @@
 ; ENABLE-NEXT: sub x8, sp, #16
 ; ENABLE-NEXT: mov sp, x8
 ; ENABLE-NEXT: mov w9, wzr
+; ENABLE-NEXT: LBB11_2: ; %for.body
+; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; ENABLE-NEXT: ; InlineAsm Start
 ; ENABLE-NEXT: mov x10, #0
 ; ENABLE-NEXT: ; InlineAsm End
-; ENABLE-NEXT: LBB11_2: ; %for.body
-; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1
-; ENABLE-NEXT: add w9, w10, w9
-; ENABLE-NEXT: str w9, [x8]
+; ENABLE-NEXT: add w10, w10, w9
+; ENABLE-NEXT: mov w9, #1
+; ENABLE-NEXT: str w10, [x8]
 ; ENABLE-NEXT: ; InlineAsm Start
 ; ENABLE-NEXT: nop
 ; ENABLE-NEXT: ; InlineAsm End
-; ENABLE-NEXT: mov w9, #1
 ; ENABLE-NEXT: b LBB11_2
 ; ENABLE-NEXT: LBB11_3: ; %if.end
 ; ENABLE-NEXT: sub sp, x29, #16
@@ -820,17 +820,17 @@
 ; DISABLE-NEXT: sub x8, sp, #16
 ; DISABLE-NEXT: mov sp, x8
 ; DISABLE-NEXT: mov w9, wzr
+; DISABLE-NEXT: LBB11_2: ; %for.body
+; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1
 ; DISABLE-NEXT: ; InlineAsm Start
 ; DISABLE-NEXT: mov x10, #0
 ; DISABLE-NEXT: ; InlineAsm End
-; DISABLE-NEXT: LBB11_2: ; %for.body
-; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1
-; DISABLE-NEXT: add w9, w10, w9
-; DISABLE-NEXT: str w9, [x8]
+; DISABLE-NEXT: add w10, w10, w9
+; DISABLE-NEXT: mov w9, #1
+; DISABLE-NEXT: str w10, [x8]
 ; DISABLE-NEXT: ; InlineAsm Start
 ; DISABLE-NEXT: nop
 ; DISABLE-NEXT: ; InlineAsm End
-; DISABLE-NEXT: mov w9, #1
 ; DISABLE-NEXT: b LBB11_2
 ; DISABLE-NEXT: LBB11_3: ; %if.end
 ; DISABLE-NEXT: sub sp, x29, #16
@@ -953,8 +953,8 @@
 ; ENABLE-NEXT: .cfi_offset w30, -8
 ; ENABLE-NEXT: .cfi_offset w29, -16
 ; ENABLE-NEXT: lsl w8, w0, w1
-; ENABLE-NEXT: cmp w0, w1
 ; ENABLE-NEXT: lsl w9, w1, w0
+; ENABLE-NEXT: cmp w0, w1
 ; ENABLE-NEXT: b.ge LBB13_2
 ; ENABLE-NEXT: ; %bb.1: ; %true
 ; ENABLE-NEXT: str w0, [sp]
@@ -975,8 +975,8 @@
 ; DISABLE-NEXT: .cfi_offset w30, -8
 ; DISABLE-NEXT: .cfi_offset w29, -16
 ; DISABLE-NEXT: lsl w8, w0, w1
-; DISABLE-NEXT: cmp w0, w1
 ; DISABLE-NEXT: lsl w9, w1, w0
+; DISABLE-NEXT: cmp w0, w1
 ; DISABLE-NEXT: b.ge LBB13_2
 ; DISABLE-NEXT: ; %bb.1: ; %true
 ; DISABLE-NEXT: str w0, [sp]
@@ -1034,16 +1034,16 @@
 ; ENABLE-NEXT: .cfi_offset w26, -80
 ; ENABLE-NEXT: .cfi_offset w27, -88
 ; ENABLE-NEXT: .cfi_offset w28, -96
-; ENABLE-NEXT: lsl w8, w0, w1
-; ENABLE-NEXT: lsl w9, w1, w0
-; ENABLE-NEXT: lsr w10, w0, w1
-; ENABLE-NEXT: lsr w12, w1, w0
-; ENABLE-NEXT: add w15, w1, w0
+; ENABLE-NEXT: add w8, w1, w0
+; ENABLE-NEXT: lsl w9, w0, w1
+; ENABLE-NEXT: lsl w10, w1, w0
+; ENABLE-NEXT: lsr w12, w0, w1
+; ENABLE-NEXT: lsr w13, w1, w0
+; ENABLE-NEXT: sub w11, w10, w12
 ; ENABLE-NEXT: subs w17, w1, w0
-; ENABLE-NEXT: sub w11, w9, w10
-; ENABLE-NEXT: add w16, w8, w9
-; ENABLE-NEXT: add w13, w10, w12
-; ENABLE-NEXT: add w14, w12, w15
+; ENABLE-NEXT: add w16, w9, w10
+; ENABLE-NEXT: add w14, w12, w13
+; ENABLE-NEXT: add w15, w13, w8
 ; ENABLE-NEXT: b.le LBB14_2
 ; ENABLE-NEXT: ; %bb.1: ; %true
 ; ENABLE-NEXT: str w0, [sp]
@@ -1051,15 +1051,15 @@
 ; ENABLE-NEXT: nop
 ; ENABLE-NEXT: ; InlineAsm End
 ; ENABLE-NEXT: LBB14_2: ; %false
-; ENABLE-NEXT: str w8, [x2]
-; ENABLE-NEXT: str w9, [x3]
-; ENABLE-NEXT: str w10, [x4]
-; ENABLE-NEXT: str w12, [x5]
-; ENABLE-NEXT: str w15, [x6]
+; ENABLE-NEXT: str w9, [x2]
+; ENABLE-NEXT: str w10, [x3]
+; ENABLE-NEXT: str w12, [x4]
+; ENABLE-NEXT: str w13, [x5]
+; ENABLE-NEXT: str w8, [x6]
 ; ENABLE-NEXT: str w17, [x7]
 ; ENABLE-NEXT: stp w0, w1, [x2, #4]
 ; ENABLE-NEXT: stp w16, w11, [x2, #12]
-; ENABLE-NEXT: stp w13, w14, [x2, #20]
+; ENABLE-NEXT: stp w14, w15, [x2, #20]
 ; ENABLE-NEXT: sub sp, x29, #80
 ; ENABLE-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
 ; ENABLE-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
@@ -1093,16 +1093,16 @@
 ; DISABLE-NEXT: .cfi_offset w26, -80
 ; DISABLE-NEXT: .cfi_offset w27, -88
 ; DISABLE-NEXT: .cfi_offset w28, -96
-; DISABLE-NEXT: lsl w8, w0, w1
-; DISABLE-NEXT: lsl w9, w1, w0
-; DISABLE-NEXT: lsr w10, w0, w1
-; DISABLE-NEXT: lsr w12, w1, w0
-; DISABLE-NEXT: add w15, w1, w0
+; DISABLE-NEXT: add w8, w1, w0
+; DISABLE-NEXT: lsl w9, w0, w1
+; DISABLE-NEXT: lsl w10, w1, w0
+; DISABLE-NEXT: lsr w12, w0, w1
+; DISABLE-NEXT: lsr w13, w1, w0
+; DISABLE-NEXT: sub w11, w10, w12
 ; DISABLE-NEXT: subs w17, w1, w0
-; DISABLE-NEXT: sub w11, w9, w10
-; DISABLE-NEXT: add w16, w8, w9
-; DISABLE-NEXT: add w13, w10, w12
-; DISABLE-NEXT: add w14, w12, w15
+; DISABLE-NEXT: add w16, w9, w10
+; DISABLE-NEXT: add w14, w12, w13
+; DISABLE-NEXT: add w15, w13, w8
 ; DISABLE-NEXT: b.le LBB14_2
 ; DISABLE-NEXT: ; %bb.1: ; %true
 ; DISABLE-NEXT: str w0, [sp]
@@ -1110,15 +1110,15 @@
 ; DISABLE-NEXT: nop
 ; DISABLE-NEXT: ; InlineAsm End
 ; DISABLE-NEXT: LBB14_2: ; %false
-; DISABLE-NEXT: str w8, [x2]
-; DISABLE-NEXT: str w9, [x3]
-; DISABLE-NEXT: str w10, [x4]
-; DISABLE-NEXT: str w12, [x5]
-; DISABLE-NEXT: str w15, [x6]
+; DISABLE-NEXT: str w9, [x2]
+; DISABLE-NEXT: str w10, [x3]
+; DISABLE-NEXT: str w12, [x4]
+; DISABLE-NEXT: str w13, [x5]
+; DISABLE-NEXT: str w8, [x6]
 ; DISABLE-NEXT: str w17, [x7]
 ; DISABLE-NEXT: stp w0, w1, [x2, #4]
 ; DISABLE-NEXT: stp w16, w11, [x2, #12]
-; DISABLE-NEXT: stp w13, w14, [x2, #20]
+; DISABLE-NEXT: stp w14, w15, [x2, #20]
 ; DISABLE-NEXT: sub sp, x29, #80
 ; DISABLE-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
 ; DISABLE-NEXT: ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
--- a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
@@ -18,8 +18,8 @@
 ; CHECK-LABEL: testLeftBad8x8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi.8b v2, #165
-; CHECK-NEXT: and.8b v0, v0, v2
 ; CHECK-NEXT: shl.8b v1, v1, #1
+; CHECK-NEXT: and.8b v0, v0, v2
 ; CHECK-NEXT: orr.8b v0, v0, v1
 ; CHECK-NEXT: str d0, [x0]
 ; CHECK-NEXT: ret
@@ -47,8 +47,8 @@
 ; CHECK-LABEL: testRightBad8x8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi.8b v2, #165
-; CHECK-NEXT: and.8b v0, v0, v2
 ; CHECK-NEXT: ushr.8b v1, v1, #1
+; CHECK-NEXT: and.8b v0, v0, v2
 ; CHECK-NEXT: orr.8b v0, v0, v1
 ; CHECK-NEXT: str d0, [x0]
 ; CHECK-NEXT: ret
@@ -76,8 +76,8 @@
 ; CHECK-LABEL: testLeftBad16x8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi.16b v2, #165
-; CHECK-NEXT: and.16b v0, v0, v2
 ; CHECK-NEXT: shl.16b v1, v1, #1
+; CHECK-NEXT: and.16b v0, v0, v2
 ; CHECK-NEXT: orr.16b v0, v0, v1
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
@@ -105,8 +105,8 @@
 ; CHECK-LABEL: testRightBad16x8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi.16b v2, #165
-; CHECK-NEXT: and.16b v0, v0, v2
 ; CHECK-NEXT: ushr.16b v1, v1, #1
+; CHECK-NEXT: and.16b v0, v0, v2
 ; CHECK-NEXT: orr.16b v0, v0, v1
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
@@ -134,9 +134,9 @@
 ; CHECK-LABEL: testLeftBad4x16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #16500
+; CHECK-NEXT: shl.4h v1, v1, #14
 ; CHECK-NEXT: dup.4h v2, w8
 ; CHECK-NEXT: and.8b v0, v0, v2
-; CHECK-NEXT: shl.4h v1, v1, #14
 ; CHECK-NEXT: orr.8b v0, v0, v1
 ; CHECK-NEXT: str d0, [x0]
 ; CHECK-NEXT: ret
@@ -164,9 +164,9 @@
 ; CHECK-LABEL: testRightBad4x16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #16500
+; CHECK-NEXT: ushr.4h v1, v1, #14
 ; CHECK-NEXT: dup.4h v2, w8
 ; CHECK-NEXT: and.8b v0, v0, v2
-; CHECK-NEXT: ushr.4h v1, v1, #14
 ; CHECK-NEXT: orr.8b v0, v0, v1
 ; CHECK-NEXT: str d0, [x0]
 ; CHECK-NEXT: ret
@@ -194,9 +194,9 @@
 ; CHECK-LABEL: testLeftBad8x16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #16500
+; CHECK-NEXT: shl.8h v1, v1, #14
 ; CHECK-NEXT: dup.8h v2, w8
 ; CHECK-NEXT: and.16b v0, v0, v2
-; CHECK-NEXT: shl.8h v1, v1, #14
 ; CHECK-NEXT: orr.16b v0, v0, v1
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
@@ -224,9 +224,9 @@
 ; CHECK-LABEL: testRightBad8x16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #16500
+; CHECK-NEXT: ushr.8h v1, v1, #14
 ; CHECK-NEXT: dup.8h v2, w8
 ; CHECK-NEXT: and.16b v0, v0, v2
-; CHECK-NEXT: ushr.8h v1, v1, #14
 ; CHECK-NEXT: orr.16b v0, v0, v1
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
@@ -254,9 +254,9 @@
 ; CHECK-LABEL: testLeftBad2x32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #4194300
+; CHECK-NEXT: shl.2s v1, v1, #22
 ; CHECK-NEXT: dup.2s v2, w8
 ; CHECK-NEXT: and.8b v0, v0, v2
-; CHECK-NEXT: shl.2s v1, v1, #22
 ; CHECK-NEXT: orr.8b v0, v0, v1
 ; CHECK-NEXT: str d0, [x0]
 ; CHECK-NEXT: ret
@@ -284,9 +284,9 @@
 ; CHECK-LABEL: testRightBad2x32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #4194300
+; CHECK-NEXT: ushr.2s v1, v1, #22
 ; CHECK-NEXT: dup.2s v2, w8
 ; CHECK-NEXT: and.8b v0, v0, v2
-; CHECK-NEXT: ushr.2s v1, v1, #22
 ; CHECK-NEXT: orr.8b v0, v0, v1
 ; CHECK-NEXT: str d0, [x0]
 ; CHECK-NEXT: ret
@@ -314,9 +314,9 @@
 ; CHECK-LABEL: testLeftBad4x32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #4194300
+; CHECK-NEXT: shl.4s v1, v1, #22
 ; CHECK-NEXT: dup.4s v2, w8
 ; CHECK-NEXT: and.16b v0, v0, v2
-; CHECK-NEXT: shl.4s v1, v1, #22
 ; CHECK-NEXT: orr.16b v0, v0, v1
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
@@ -344,9 +344,9 @@
 ; CHECK-LABEL: testRightBad4x32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #4194300
+; CHECK-NEXT: ushr.4s v1, v1, #22
 ; CHECK-NEXT: dup.4s v2, w8
 ; CHECK-NEXT: and.16b v0, v0, v2
-; CHECK-NEXT: ushr.4s v1, v1, #22
 ; CHECK-NEXT: orr.16b v0, v0, v1
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
@@ -375,9 +375,9 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov x8, #10
 ; CHECK-NEXT: movk x8, #1, lsl #48
+; CHECK-NEXT: shl.2d v1, v1, #48
 ; CHECK-NEXT: dup.2d v2, x8
 ; CHECK-NEXT: and.16b v0, v0, v2
-; CHECK-NEXT: shl.2d v1, v1, #48
 ; CHECK-NEXT: orr.16b v0, v0, v1
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
@@ -406,9 +406,9 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov x8, #10
 ; CHECK-NEXT: movk x8, #1, lsl #48
+; CHECK-NEXT: ushr.2d v1, v1, #48
 ; CHECK-NEXT: dup.2d v2, x8
 ; CHECK-NEXT: and.16b v0, v0, v2
-; CHECK-NEXT: ushr.2d v1, v1, #48
 ; CHECK-NEXT: orr.16b v0, v0, v1
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: ret
@@ -436,8 +436,8 @@
 ; CHECK-LABEL: testLeftNotAllConstantBuildVec8x8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, .LCPI29_0
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI29_0]
 ; CHECK-NEXT: shl.8b v1, v1, #3
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI29_0]
 ; CHECK-NEXT: and.8b v0, v0, v2
 ; CHECK-NEXT: orr.8b v0, v0, v1
 ; CHECK-NEXT: str d0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
--- a/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-srl-and.ll
@@ -9,8 +9,8 @@
 ; CHECK-LABEL: srl_and:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: adrp x8, :got:g
-; CHECK-NEXT: ldr x8, [x8, :got_lo12:g]
 ; CHECK-NEXT: mov w9, #50
+; CHECK-NEXT: ldr x8, [x8, :got_lo12:g]
 ; CHECK-NEXT: ldrh w8, [x8]
 ; CHECK-NEXT: eor w8, w8, w9
 ; CHECK-NEXT: mov w9, #65535
diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
--- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -177,12 +177,12 @@
 ; CHECK-LABEL: sext_v4i8_to_v4i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ushll.4s v0, v0, #0
-; CHECK-NEXT: ushll.2d v1, v0, #0
-; CHECK-NEXT: ushll2.2d v0, v0, #0
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: shl.2d v1, v1, #56
 ; CHECK-NEXT: shl.2d v0, v0, #56
-; CHECK-NEXT: shl.2d v2, v1, #56
-; CHECK-NEXT: sshr.2d v1, v0, #56
-; CHECK-NEXT: sshr.2d v0, v2, #56
+; CHECK-NEXT: sshr.2d v1, v1, #56
+; CHECK-NEXT: sshr.2d v0, v0, #56
 ; CHECK-NEXT: ret
 %r = sext <4 x i8> %v0 to <4 x i64>
 ret <4 x i64> %r
@@ -192,12 +192,12 @@
 ; CHECK-LABEL: zext_v8i8_to_v8i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ushll.8h v0, v0, #0
-; CHECK-NEXT: ushll2.4s v2, v0, #0
-; CHECK-NEXT: ushll.4s v0, v0, #0
-; CHECK-NEXT: ushll2.2d v3, v2, #0
-; CHECK-NEXT: ushll2.2d v1, v0, #0
-; CHECK-NEXT: ushll.2d v2, v2, #0
-; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ushll.4s v2, v0, #0
+; CHECK-NEXT: ushll2.4s v4, v0, #0
+; CHECK-NEXT: ushll2.2d v1, v2, #0
+; CHECK-NEXT: ushll.2d v0, v2, #0
+; CHECK-NEXT: ushll2.2d v3, v4, #0
+; CHECK-NEXT: ushll.2d v2, v4, #0
 ; CHECK-NEXT: ret
 %r = zext <8 x i8> %v0 to <8 x i64>
 ret <8 x i64> %r
@@ -207,12 +207,12 @@
 ; CHECK-LABEL: sext_v8i8_to_v8i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: sshll.8h v0, v0, #0
-; CHECK-NEXT: sshll2.4s v2, v0, #0
-; CHECK-NEXT: sshll.4s v0, v0, #0
-; CHECK-NEXT: sshll2.2d v3, v2, #0
-; CHECK-NEXT: sshll2.2d v1, v0, #0
-; CHECK-NEXT: sshll.2d v2, v2, #0
-; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: sshll.4s v2, v0, #0
+; CHECK-NEXT: sshll2.4s v4, v0, #0
+; CHECK-NEXT: sshll2.2d v1, v2, #0
+; CHECK-NEXT: sshll.2d v0, v2, #0
+; CHECK-NEXT: sshll2.2d v3, v4, #0
+; CHECK-NEXT: sshll.2d v2, v4, #0
 ; CHECK-NEXT: ret
 %r = sext <8 x i8> %v0 to <8 x i64>
 ret <8 x i64> %r
@@ -224,62 +224,62 @@
 ; CHECK-LABEL: zext_v32i1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr w8, [sp, #64]
-; CHECK-NEXT: ldr w9, [sp, #72]
-; CHECK-NEXT: ldr w10, [sp, #80]
 ; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: ldr w11, [sp, #88]
-; CHECK-NEXT: mov.b v0[1], w1
-; CHECK-NEXT: ldr w12, [sp, #96]
+; CHECK-NEXT: ldr w9, [sp]
+; CHECK-NEXT: ldr w10, [sp, #8]
 ; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: ldr w8, [sp, #72]
+; CHECK-NEXT: mov.b v0[1], w1
+; CHECK-NEXT: movi.16b v2, #1
+; CHECK-NEXT: mov.b v1[1], w8
+; CHECK-NEXT: ldr w8, [sp, #80]
 ; CHECK-NEXT: mov.b v0[2], w2
-; CHECK-NEXT: ldr w8, [sp, #104]
-; CHECK-NEXT: mov.b v1[1], w9
+; CHECK-NEXT: mov.b v1[2], w8
+; CHECK-NEXT: ldr w8, [sp, #88]
 ; CHECK-NEXT: mov.b v0[3], w3
-; CHECK-NEXT: ldr w13, [sp, #112]
-; CHECK-NEXT: mov.b v1[2], w10
+; CHECK-NEXT: mov.b v1[3], w8
+; CHECK-NEXT: ldr w8, [sp, #96]
 ; CHECK-NEXT: mov.b v0[4], w4
-; CHECK-NEXT: ldr w9, [sp, #120]
-; CHECK-NEXT: mov.b v1[3], w11
-; CHECK-NEXT: ldr w11, [sp]
+; CHECK-NEXT: mov.b v1[4], w8
+; CHECK-NEXT: ldr w8, [sp, #104]
 ; CHECK-NEXT: mov.b v0[5], w5
-; CHECK-NEXT: mov.b v1[4], w12
-; CHECK-NEXT: ldr w12, [sp, #8]
-; CHECK-NEXT: ldr w14, [sp, #128]
-; CHECK-NEXT: mov.b v0[6], w6
 ; CHECK-NEXT: mov.b v1[5], w8
-; CHECK-NEXT: ldr w8, [sp, #16]
-; CHECK-NEXT: ldr w10, [sp, #136]
+; CHECK-NEXT: ldr w8, [sp, #112]
+; CHECK-NEXT: mov.b v0[6], w6
+; CHECK-NEXT: mov.b v1[6], w8
+; CHECK-NEXT: ldr w8, [sp, #120]
 ; CHECK-NEXT: mov.b v0[7], w7
-; CHECK-NEXT: mov.b v1[6], w13
-; CHECK-NEXT: ldr w13, [sp, #24]
-; CHECK-NEXT: ldr w15, [sp, #144]
-; CHECK-NEXT: mov.b v0[8], w11
-; CHECK-NEXT: mov.b v1[7], w9
+; CHECK-NEXT: mov.b v1[7], w8
+; CHECK-NEXT: ldr w8, [sp, #128]
+; CHECK-NEXT: mov.b v0[8], w9
+; CHECK-NEXT: ldr w9, [sp, #16]
+; CHECK-NEXT: mov.b v1[8], w8
+; CHECK-NEXT: ldr w8, [sp, #136]
+; CHECK-NEXT: mov.b v0[9], w10
+; CHECK-NEXT: ldr w10, [sp, #24]
+; CHECK-NEXT: mov.b v1[9], w8
+; CHECK-NEXT: ldr w8, [sp, #144]
+; CHECK-NEXT: mov.b v0[10], w9
 ; CHECK-NEXT: ldr w9, [sp, #32]
-; CHECK-NEXT: ldr w16, [sp, #152]
-; CHECK-NEXT: mov.b v0[9], w12
-; CHECK-NEXT: ldr w11, [sp, #160]
-; CHECK-NEXT: mov.b v0[10], w8
-; CHECK-NEXT: mov.b v1[8], w14
-; CHECK-NEXT: ldr w12, [sp, #168]
-; CHECK-NEXT: mov.b v0[11], w13
-; CHECK-NEXT: ldr w14, [sp, #40]
-; CHECK-NEXT: mov.b v1[9], w10
-; CHECK-NEXT: ldr w8, [sp, #176]
+; CHECK-NEXT: mov.b v1[10], w8
+; CHECK-NEXT: ldr w8, [sp, #152]
+; CHECK-NEXT: mov.b v0[11], w10
+; CHECK-NEXT: ldr w10, [sp, #40]
+; CHECK-NEXT: mov.b v1[11], w8
+; CHECK-NEXT: ldr w8, [sp, #160]
 ; CHECK-NEXT: mov.b v0[12], w9
 ; CHECK-NEXT: ldr w9, [sp, #48]
-; CHECK-NEXT: mov.b v1[10], w15
-; CHECK-NEXT: ldr w13, [sp, #184]
+; CHECK-NEXT: mov.b v1[12], w8
+; CHECK-NEXT: ldr w8, [sp, #168]
+; CHECK-NEXT: mov.b v0[13], w10
 ; CHECK-NEXT: ldr w10, [sp, #56]
-; CHECK-NEXT: mov.b v1[11], w16
-; CHECK-NEXT: mov.b v1[12], w11
-; CHECK-NEXT: mov.b v0[13], w14
-; CHECK-NEXT: mov.b v1[13], w12
+; CHECK-NEXT: mov.b v1[13], w8
+; CHECK-NEXT: ldr w8, [sp, #176]
 ; CHECK-NEXT: mov.b v0[14], w9
 ; CHECK-NEXT: mov.b v1[14], w8
-; CHECK-NEXT: movi.16b v2, #1
+; CHECK-NEXT: ldr w8, [sp, #184]
 ; CHECK-NEXT: mov.b v0[15], w10
-; CHECK-NEXT: mov.b v1[15], w13
+; CHECK-NEXT: mov.b v1[15], w8
 ; CHECK-NEXT: and.16b v0, v0, v2
 ; CHECK-NEXT: and.16b v1, v1, v2
 ; CHECK-NEXT: ret
@@ -291,61 +291,61 @@
 ; CHECK-LABEL: sext_v32i1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr w8, [sp, #64]
-; CHECK-NEXT: ldr w9, [sp, #72]
-; CHECK-NEXT: ldr w10, [sp, #80]
 ; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: ldr w11, [sp, #88]
-; CHECK-NEXT: mov.b v0[1], w1
-; CHECK-NEXT: ldr w12, [sp, #96]
+; CHECK-NEXT: ldr w9, [sp]
+; CHECK-NEXT: ldr w10, [sp, #8]
 ; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: ldr w8, [sp, #72]
+; CHECK-NEXT: mov.b v0[1], w1
+; CHECK-NEXT: mov.b v1[1], w8
+; CHECK-NEXT: ldr w8, [sp, #80]
 ; CHECK-NEXT: mov.b v0[2], w2
-; CHECK-NEXT: ldr w8, [sp, #104]
-; CHECK-NEXT: mov.b v1[1], w9
+; CHECK-NEXT: mov.b v1[2], w8
+; CHECK-NEXT: ldr w8, [sp, #88]
 ; CHECK-NEXT: mov.b v0[3], w3
-; CHECK-NEXT: ldr w13, [sp, #112]
-; CHECK-NEXT: mov.b v1[2], w10
+; CHECK-NEXT: mov.b v1[3], w8
+; CHECK-NEXT: ldr w8, [sp, #96]
 ; CHECK-NEXT: mov.b v0[4], w4
-; CHECK-NEXT: ldr w9, [sp, #120]
-; CHECK-NEXT: mov.b v1[3], w11
-; CHECK-NEXT: ldr w11, [sp]
 ; CHECK-NEXT: mov.b v1[4], w8
+; CHECK-NEXT: ldr w8, [sp, #104]
 ; CHECK-NEXT: mov.b v0[5], w5
-; CHECK-NEXT: mov.b v1[4], w12
-; CHECK-NEXT: ldr w12, [sp, #8]
-; CHECK-NEXT: ldr w14, [sp, #128]
-; CHECK-NEXT: mov.b v0[6], w6
 ; CHECK-NEXT: mov.b v1[5], w8
-; CHECK-NEXT: ldr w8, [sp, #16]
-; CHECK-NEXT: ldr w10, [sp, #136]
+; CHECK-NEXT: ldr w8, [sp, #112]
+; CHECK-NEXT: mov.b v0[6], w6
+; CHECK-NEXT: mov.b v1[6], w8
+; CHECK-NEXT: ldr w8, [sp, #120]
 ; CHECK-NEXT: mov.b v0[7], w7
-; CHECK-NEXT: mov.b v1[6], w13
-; CHECK-NEXT: ldr w13, [sp, #24]
-; CHECK-NEXT: ldr w15, [sp, #144]
-; CHECK-NEXT: mov.b v0[8], w11
-; CHECK-NEXT: mov.b v1[7], w9
+; CHECK-NEXT: mov.b v1[7], w8
+; CHECK-NEXT: ldr w8, [sp, #128]
+; CHECK-NEXT: mov.b v0[8], w9
+; CHECK-NEXT: ldr w9, [sp, #16]
+; CHECK-NEXT: mov.b v1[8], w8
+; CHECK-NEXT: ldr w8, [sp, #136]
+; CHECK-NEXT: mov.b v0[9], w10
+; CHECK-NEXT: ldr w10, [sp, #24]
+; CHECK-NEXT: mov.b v1[9], w8
+; CHECK-NEXT: ldr w8, [sp, #144]
+; CHECK-NEXT: mov.b v0[10], w9
 ; CHECK-NEXT: ldr w9, [sp, #32]
-; CHECK-NEXT: ldr w16, [sp, #152]
-; CHECK-NEXT: mov.b v0[9], w12
-; CHECK-NEXT: ldr w11, [sp, #160]
-; CHECK-NEXT: mov.b v0[10], w8
-; CHECK-NEXT: mov.b v1[8], w14
-; CHECK-NEXT: ldr w12, [sp, #168]
-; CHECK-NEXT: mov.b v0[11], w13
-; CHECK-NEXT: ldr w14, [sp, #40]
-; CHECK-NEXT: mov.b v1[9], w10
-; CHECK-NEXT: ldr w8, [sp, #176]
+; CHECK-NEXT: mov.b v1[10], w8
+; CHECK-NEXT: ldr w8, [sp, #152]
+; CHECK-NEXT: mov.b v0[11], w10
+; CHECK-NEXT: ldr w10, [sp, #40]
+; CHECK-NEXT: mov.b v1[11], w8
+; CHECK-NEXT: ldr w8, [sp, #160]
 ; CHECK-NEXT: mov.b v0[12], w9
 ; CHECK-NEXT: ldr w9, [sp, #48]
-; CHECK-NEXT: mov.b v1[10], w15
-; CHECK-NEXT: ldr w13, [sp, #184]
+; CHECK-NEXT: mov.b v1[12], w8
+; CHECK-NEXT: ldr w8, [sp, #168]
+; CHECK-NEXT: mov.b v0[13], w10
 ; CHECK-NEXT: ldr w10, [sp, #56]
-; CHECK-NEXT: mov.b v1[11], w16
-; CHECK-NEXT: mov.b v1[12], w11
-; CHECK-NEXT: mov.b v0[13], w14
-; CHECK-NEXT: mov.b v1[13], w12
+; CHECK-NEXT: mov.b v1[13], w8
+; CHECK-NEXT: ldr w8, [sp, #176]
 ; CHECK-NEXT: mov.b v0[14], w9
 ; CHECK-NEXT: mov.b v1[14], w8
+; CHECK-NEXT: ldr w8, [sp, #184]
 ; CHECK-NEXT: mov.b v0[15], w10
-; CHECK-NEXT: mov.b v1[15], w13
+; CHECK-NEXT: mov.b v1[15], w8
 ; CHECK-NEXT: shl.16b v0, v0, #7
 ; CHECK-NEXT: shl.16b v1, v1, #7
 ; CHECK-NEXT: sshr.16b v0, v0, #7
@@ -358,127 +358,127 @@
 define <64 x i8> @zext_v64i1(<64 x i1> %arg) {
 ; CHECK-LABEL: zext_v64i1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w11, [sp, #64]
 ; CHECK-NEXT: ldr w8, [sp, #320]
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ldr w9, [sp, #64]
 ; CHECK-NEXT: ldr w10, [sp, #192]
-; CHECK-NEXT: ldr w9, [sp, #328]
-; CHECK-NEXT: ldr w12, [sp, #336]
-; CHECK-NEXT: fmov s1, w11
-; CHECK-NEXT: ldr w11, [sp, #200]
 ; CHECK-NEXT: fmov s3, w8
-; CHECK-NEXT: ldr w8, [sp, #344]
-; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ldr w8, [sp, #328]
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: ldr w9, [sp, #200]
 ; CHECK-NEXT: fmov s2, w10
-; CHECK-NEXT: ldr w10, [sp, #352]
+; CHECK-NEXT: ldr w10, [sp, #336]
+; CHECK-NEXT: mov.b v3[1], w8
+; CHECK-NEXT: ldr w8, [sp, #72]
 ; CHECK-NEXT: mov.b v0[1], w1
-; CHECK-NEXT: mov.b v3[1], w9
-; CHECK-NEXT: ldr w9, [sp, #360]
+; CHECK-NEXT: ldr w11, [sp, #352]
+; CHECK-NEXT: mov.b v2[1], w9
+; CHECK-NEXT: ldr w9, [sp, #80]
+; CHECK-NEXT: mov.b v1[1], w8
+; CHECK-NEXT: ldr w8, [sp, #344]
+; CHECK-NEXT: mov.b v3[2], w10
+; CHECK-NEXT: ldr w10, [sp, #208]
 ; CHECK-NEXT: mov.b v0[2], w2
-; CHECK-NEXT: ldr w13, [sp, #72]
-; CHECK-NEXT: mov.b v2[1], w11
-; CHECK-NEXT: ldr w11, [sp, #368]
-; CHECK-NEXT: mov.b v3[2], w12
-; CHECK-NEXT: mov.b v0[3], w3
-; CHECK-NEXT: ldr w12, [sp, #376]
+; CHECK-NEXT: ldr w12, [sp, #368]
+; CHECK-NEXT: ldr w13, [sp, #384]
+; CHECK-NEXT: mov.b v1[2], w9
+; CHECK-NEXT: ldr w9, [sp, #360]
+; CHECK-NEXT: mov.b v2[2], w10
+; CHECK-NEXT: ldr w10, [sp, #88]
 ; CHECK-NEXT: mov.b v3[3], w8
+; CHECK-NEXT: ldr w8, [sp, #216]
+; CHECK-NEXT: mov.b v0[3], w3
+; CHECK-NEXT: ldr w14, [sp, #400]
+; CHECK-NEXT: mov.b v1[3], w10
+; CHECK-NEXT: ldr w10, [sp, #376]
+; CHECK-NEXT: mov.b v2[3], w8
+; CHECK-NEXT: ldr w8, [sp, #96]
+; CHECK-NEXT: mov.b v3[4], w11
+; CHECK-NEXT: ldr w11, [sp, #224]
 ; CHECK-NEXT: mov.b v0[4], w4
-; CHECK-NEXT: mov.b v3[4], w10
-; CHECK-NEXT: ldr w10, [sp]
-; CHECK-NEXT: mov.b v0[5], w5
-; CHECK-NEXT: ldr w14, [sp, #80]
+; CHECK-NEXT: ldr w15, [sp, #416]
+; CHECK-NEXT: mov.b v1[4], w8
+; CHECK-NEXT: ldr w8, [sp, #392]
+; CHECK-NEXT: mov.b v2[4], w11
+; CHECK-NEXT: ldr w11, [sp, #104]
 ; CHECK-NEXT: mov.b v3[5], w9
-; CHECK-NEXT: ldr w9, [sp, #8]
-; CHECK-NEXT: mov.b v1[1], w13
-; CHECK-NEXT: ldr w13, [sp, #208]
-; CHECK-NEXT: mov.b v0[6], w6
-; CHECK-NEXT: ldr w15, [sp, #88]
-; CHECK-NEXT: mov.b v3[6], w11
-; CHECK-NEXT: ldr w11, [sp, #16]
-; CHECK-NEXT: mov.b v0[7], w7
-; CHECK-NEXT: ldr w16, [sp, #96]
-; CHECK-NEXT: mov.b v3[7], w12
-; CHECK-NEXT: ldr w12, [sp, #24]
-; CHECK-NEXT: mov.b v0[8], w10
-; CHECK-NEXT: ldr w10, [sp, #104]
-; CHECK-NEXT: mov.b v1[2], w14
-; CHECK-NEXT: ldr w14, [sp, #216]
-; CHECK-NEXT: mov.b v0[9], w9
+; CHECK-NEXT: ldr w9, [sp, #232]
+; CHECK-NEXT: mov.b v0[5], w5
+; CHECK-NEXT: ldr w16, [sp, #432]
+; CHECK-NEXT: mov.b v1[5], w11
+; CHECK-NEXT: ldr w11, [sp, #408]
+; CHECK-NEXT: mov.b v2[5], w9
 ; CHECK-NEXT: ldr w9, [sp, #112]
-; CHECK-NEXT: mov.b v2[2], w13
-; CHECK-NEXT: ldr w13, [sp, #384]
-; CHECK-NEXT: mov.b v1[3], w15
-; CHECK-NEXT: ldr w15, [sp, #224]
-; CHECK-NEXT: mov.b v0[10], w11
-; CHECK-NEXT: ldr w11, [sp, #120]
-; CHECK-NEXT: mov.b v1[4], w16
-; CHECK-NEXT: ldr w16, [sp, #232]
-; CHECK-NEXT: mov.b v0[11], w12
-; CHECK-NEXT: ldr w12, [sp, #128]
-; CHECK-NEXT: mov.b v1[5], w10
-; CHECK-NEXT: ldr w10, [sp, #240]
-; CHECK-NEXT: mov.b v2[3], w14
+; CHECK-NEXT: mov.b v3[6], w12
+; CHECK-NEXT: ldr w12, [sp, #240]
+; CHECK-NEXT: mov.b v0[6], w6
 ; CHECK-NEXT: mov.b v1[6], w9
-; CHECK-NEXT: ldr w9, [sp, #248]
-; CHECK-NEXT: ldr w8, [sp, #392]
-; CHECK-NEXT: mov.b v2[4], w15
-; CHECK-NEXT: mov.b v1[7], w11
-; CHECK-NEXT: ldr w11, [sp, #256]
+; CHECK-NEXT: ldr w9, [sp, #424]
+; CHECK-NEXT: mov.b v2[6], w12
+; CHECK-NEXT: ldr w12, [sp, #120]
+; CHECK-NEXT: mov.b v3[7], w10
+; CHECK-NEXT: ldr w10, [sp, #248]
+; CHECK-NEXT: mov.b v0[7], w7
+; CHECK-NEXT: mov.b v1[7], w12
+; CHECK-NEXT: ldr w12, [sp]
+; CHECK-NEXT: mov.b v2[7], w10
+; CHECK-NEXT: ldr w10, [sp, #128]
 ; CHECK-NEXT: mov.b v3[8], w13
-; CHECK-NEXT: ldr w13, [sp, #32]
-; CHECK-NEXT: ldr w14, [sp, #400]
-; CHECK-NEXT: mov.b v2[5], w16
-; CHECK-NEXT: mov.b v1[8], w12
-; CHECK-NEXT: ldr w12, [sp, #264]
-; CHECK-NEXT: ldr w15, [sp, #408]
-; CHECK-NEXT: mov.b v2[6], w10
-; CHECK-NEXT: mov.b v2[7], w9
-; CHECK-NEXT: mov.b v2[8], w11
-; CHECK-NEXT: mov.b v0[12], w13
+; CHECK-NEXT: ldr w13, [sp, #256]
+; CHECK-NEXT: mov.b v0[8], w12
+; CHECK-NEXT: ldr w12, [sp, #440]
+; CHECK-NEXT: mov.b v1[8], w10
+; CHECK-NEXT: ldr w10, [sp, #8]
+; CHECK-NEXT: mov.b v2[8], w13
 ; CHECK-NEXT: ldr w13, [sp, #136]
 ; CHECK-NEXT: mov.b v3[9], w8
-; CHECK-NEXT: ldr w8, [sp, #40]
-; CHECK-NEXT: mov.b v2[9], w12
-; CHECK-NEXT: ldr w12, [sp, #272]
-; CHECK-NEXT: mov.b v3[10], w14
-; CHECK-NEXT: ldr w14, [sp, #48]
-; CHECK-NEXT: mov.b v3[11], w15
-; CHECK-NEXT: ldr w15, [sp, #56]
-; CHECK-NEXT: ldr w16, [sp, #416]
+; CHECK-NEXT: ldr w8, [sp, #264]
+; CHECK-NEXT: mov.b v0[9], w10
+; CHECK-NEXT: ldr w10, [sp, #272]
 ; CHECK-NEXT: mov.b v1[9], w13
-; CHECK-NEXT: ldr w13, [sp, #280]
-; CHECK-NEXT: mov.b v0[13], w8
+; CHECK-NEXT: ldr w13, [sp, #16]
+; CHECK-NEXT: mov.b v2[9], w8
 ; CHECK-NEXT: ldr w8, [sp, #144]
-; CHECK-NEXT: mov.b v2[10], w12
-; CHECK-NEXT: ldr w12, [sp, #288]
-; CHECK-NEXT: mov.b v0[14], w14
-; CHECK-NEXT: ldr w14, [sp, #152]
-; CHECK-NEXT: mov.b v0[15], w15
-; CHECK-NEXT: ldr w15, [sp, #160]
-; CHECK-NEXT: ldr w10, [sp, #424]
+; CHECK-NEXT: mov.b v3[10], w14
+; CHECK-NEXT: ldr w14, [sp, #280]
+; CHECK-NEXT: mov.b v0[10], w13
+; CHECK-NEXT: ldr w13, [sp, #296]
 ; CHECK-NEXT: mov.b v1[10], w8
-; CHECK-NEXT: ldr w8, [sp, #296]
-; CHECK-NEXT: mov.b v2[11], w13
-; CHECK-NEXT: mov.b v3[12], w16
-; CHECK-NEXT: ldr w16, [sp, #168]
-; CHECK-NEXT: ldr w9, [sp, #432]
-; CHECK-NEXT: ldr w13, [sp, #304]
-; CHECK-NEXT: mov.b v1[11], w14
-; CHECK-NEXT: mov.b v2[12], w12
-; CHECK-NEXT: ldr w12, [sp, #176]
-; CHECK-NEXT: ldr w11, [sp, #440]
-; CHECK-NEXT: ldr w14, [sp, #312]
-; CHECK-NEXT: mov.b v1[12], w15
-; CHECK-NEXT: ldr w15, [sp, #184]
-; CHECK-NEXT: mov.b v3[13], w10
-; CHECK-NEXT: mov.b v2[13], w8
-; CHECK-NEXT: mov.b v1[13], w16
-; CHECK-NEXT: mov.b v3[14], w9
-; CHECK-NEXT: mov.b v2[14], w13
-; CHECK-NEXT: mov.b v1[14], w12
+; CHECK-NEXT: ldr w8, [sp, #24]
+; CHECK-NEXT: mov.b v2[10], w10
+; CHECK-NEXT: ldr w10, [sp, #152]
+; CHECK-NEXT: mov.b v3[11], w11
+; CHECK-NEXT: ldr w11, [sp, #288]
+; CHECK-NEXT: mov.b v0[11], w8
+; CHECK-NEXT: ldr w8, [sp, #32]
+; CHECK-NEXT: mov.b v1[11], w10
+; CHECK-NEXT: ldr w10, [sp, #160]
+; CHECK-NEXT: mov.b v2[11], w14
+; CHECK-NEXT: mov.b v3[12], w15
+; CHECK-NEXT: mov.b v0[12], w8
+; CHECK-NEXT: ldr w8, [sp, #40]
+; CHECK-NEXT: mov.b v1[12], w10
+; CHECK-NEXT: ldr w10, [sp, #168]
+; CHECK-NEXT: mov.b v2[12], w11
+; CHECK-NEXT: ldr w11, [sp, #312]
+; CHECK-NEXT: mov.b v3[13], w9
+; CHECK-NEXT: ldr w9, [sp, #304]
+; CHECK-NEXT: mov.b v0[13], w8
+; CHECK-NEXT: ldr w8, [sp, #48]
+; CHECK-NEXT: mov.b v1[13], w10
+; CHECK-NEXT: ldr w10, [sp, #176]
+; CHECK-NEXT: mov.b v2[13], w13
+; CHECK-NEXT: mov.b v3[14], w16
+; CHECK-NEXT: mov.b v0[14], w8
+; CHECK-NEXT: ldr w8, [sp, #56]
+; CHECK-NEXT: mov.b v1[14], w10
+; CHECK-NEXT: mov.b v2[14], w9
+; CHECK-NEXT: ldr w9, [sp, #184]
 ; CHECK-NEXT: movi.16b v4, #1
-; CHECK-NEXT: mov.b v3[15], w11
-; CHECK-NEXT: mov.b v2[15], w14
-; CHECK-NEXT: mov.b v1[15], w15
+; CHECK-NEXT: mov.b v0[15], w8
+; CHECK-NEXT: mov.b v1[15], w9
+; CHECK-NEXT: mov.b v2[15], w11
+; CHECK-NEXT: mov.b v3[15], w12
 ; CHECK-NEXT: and.16b v0, v0, v4
 ; CHECK-NEXT: and.16b v1, v1, v4
 ; CHECK-NEXT: and.16b v2, v2, v4
@@ -491,134 +491,134 @@
 define <64 x i8> @sext_v64i1(<64 x i1> %arg) {
 ; CHECK-LABEL: sext_v64i1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w11, [sp, #64]
 ; CHECK-NEXT: ldr w8, [sp, #320]
+; CHECK-NEXT: fmov s3, w0
+; CHECK-NEXT: ldr w9, [sp, #64]
 ; CHECK-NEXT: ldr w10, [sp, #192]
-; CHECK-NEXT: ldr w9, [sp, #328]
-; CHECK-NEXT: ldr w12, [sp, #336]
-; CHECK-NEXT: fmov s3, w11
-; CHECK-NEXT: ldr w11, [sp, #200]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: ldr w8, [sp, #344]
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: fmov s2, w10
-; CHECK-NEXT: ldr w10, [sp, #352]
-; CHECK-NEXT: mov.b v0[1], w1
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr w8, [sp, #72]
+; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: ldr w9, [sp, #200]
+; CHECK-NEXT: fmov s1, w10
+; CHECK-NEXT: ldr w10, [sp, #328]
+; CHECK-NEXT: mov.b v3[1], w1
+; CHECK-NEXT: ldr w11, [sp, #344]
+; CHECK-NEXT: mov.b v2[1], w8
+; CHECK-NEXT: ldr w8, [sp, #336]
 ; CHECK-NEXT: mov.b v1[1], w9
-; CHECK-NEXT: ldr w9, [sp, #360]
-; CHECK-NEXT: mov.b v0[2], w2
-; CHECK-NEXT: ldr w13, [sp, #72]
-; CHECK-NEXT: mov.b v2[1], w11
-; CHECK-NEXT: ldr w11, [sp, #368]
-; CHECK-NEXT: mov.b v1[2], w12
-; CHECK-NEXT: mov.b v0[3], w3
-; CHECK-NEXT: ldr w12, [sp, #376]
+; CHECK-NEXT: ldr w9, [sp, #80]
+; CHECK-NEXT: mov.b v0[1], w10
+; CHECK-NEXT: ldr w10, [sp, #208]
+; CHECK-NEXT: mov.b v3[2], w2
+; CHECK-NEXT: ldr w12, [sp, #360]
+; CHECK-NEXT: mov.b v2[2], w9
+; CHECK-NEXT: ldr w9, [sp, #352]
+; CHECK-NEXT: mov.b v1[2], w10
+; CHECK-NEXT: ldr w10, [sp, #88]
+; CHECK-NEXT: mov.b v0[2], w8
+; CHECK-NEXT: ldr w8, [sp, #216]
+; CHECK-NEXT: mov.b v3[3], w3
+; CHECK-NEXT: ldr w13, [sp, #376]
+; CHECK-NEXT: mov.b v2[3], w10
+; CHECK-NEXT: ldr w10, [sp, #368]
 ; CHECK-NEXT: mov.b v1[3], w8
-; CHECK-NEXT: mov.b v0[4], w4
-; CHECK-NEXT: mov.b v1[4], w10
-; CHECK-NEXT: ldr w10, [sp]
-; CHECK-NEXT: mov.b v0[5], w5
-; CHECK-NEXT: ldr w14, [sp, #80]
+; CHECK-NEXT: ldr w8, [sp, #96]
+; CHECK-NEXT: mov.b v0[3], w11
+; CHECK-NEXT: ldr w11, [sp, #224]
+; CHECK-NEXT: mov.b v3[4], w4
+; CHECK-NEXT: ldr w14, [sp, #392]
+; CHECK-NEXT: mov.b v2[4], w8
+; CHECK-NEXT: ldr w8, [sp, #384]
+; CHECK-NEXT: mov.b v1[4], w11
+; CHECK-NEXT: ldr w11, [sp, #104]
+; CHECK-NEXT: mov.b v0[4], w9
+; CHECK-NEXT: ldr w9, [sp, #232]
+; CHECK-NEXT: mov.b v3[5], w5
+; CHECK-NEXT: ldr w15, [sp, #408]
+; CHECK-NEXT: mov.b v2[5], w11
+; CHECK-NEXT: ldr w11, [sp, #400]
 ; CHECK-NEXT: mov.b v1[5], w9
-; CHECK-NEXT: ldr w9, [sp, #8]
-; CHECK-NEXT: mov.b v3[1], w13
-; CHECK-NEXT: ldr w13, [sp, #208]
-; CHECK-NEXT: mov.b v0[6], w6
-; CHECK-NEXT: ldr w15, [sp, #88]
-; CHECK-NEXT: mov.b v1[6], w11
-; CHECK-NEXT: ldr w11, [sp, #16]
-; CHECK-NEXT: mov.b v0[7], w7
-; CHECK-NEXT: ldr w16, [sp, #96]
-; CHECK-NEXT: mov.b v1[7], w12
-; CHECK-NEXT: ldr w12, [sp, #24]
-; CHECK-NEXT: mov.b v0[8], w10
-; CHECK-NEXT: ldr w10, [sp, #104]
-; CHECK-NEXT: mov.b v3[2], w14
-; CHECK-NEXT: ldr w14, [sp, #216]
-; CHECK-NEXT: mov.b v0[9], w9
 ; CHECK-NEXT: ldr w9, [sp, #112]
-; CHECK-NEXT: mov.b v2[2], w13
-; CHECK-NEXT: ldr w13, [sp, #384]
-; CHECK-NEXT: mov.b v3[3], w15
-; CHECK-NEXT: ldr w15, [sp, #224]
-; CHECK-NEXT: mov.b v0[10], w11
-; CHECK-NEXT: ldr w11, [sp, #120]
-; CHECK-NEXT: mov.b v3[4], w16
-; CHECK-NEXT: ldr w16, [sp, #232]
-; CHECK-NEXT: mov.b v0[11], w12
-; CHECK-NEXT: ldr w12, [sp, #128]
-; CHECK-NEXT: mov.b v3[5], w10
-; CHECK-NEXT: ldr w10, [sp, #240]
-; CHECK-NEXT: mov.b v2[3], w14
-; CHECK-NEXT: mov.b v3[6], w9
-; CHECK-NEXT: ldr w9, [sp, #248]
-; CHECK-NEXT: ldr w8, [sp, #392]
-; CHECK-NEXT: mov.b v2[4], w15
-; CHECK-NEXT: mov.b v3[7], w11
-; CHECK-NEXT: ldr w11, [sp, #256]
-; CHECK-NEXT: mov.b v1[8], w13
-; CHECK-NEXT: ldr w13, [sp, #32]
-; CHECK-NEXT: ldr w14, [sp, #400]
-; CHECK-NEXT: mov.b v2[5], w16
+; CHECK-NEXT: mov.b v0[5], w12
+; CHECK-NEXT: ldr w12, [sp, #240]
+; CHECK-NEXT: mov.b v3[6], w6
+; CHECK-NEXT: ldr w16, [sp, #424]
+; CHECK-NEXT: mov.b v2[6], w9
+; CHECK-NEXT: ldr w9, [sp, #416]
+; CHECK-NEXT: mov.b v1[6], w12
+; CHECK-NEXT: ldr w12, [sp, #120]
+; CHECK-NEXT: mov.b v0[6], w10
+; CHECK-NEXT: ldr w10, [sp, #248]
+; CHECK-NEXT: mov.b v3[7], w7
+; CHECK-NEXT: mov.b v2[7], w12
+; CHECK-NEXT: ldr w12, [sp]
+; CHECK-NEXT: mov.b v1[7], w10
+; CHECK-NEXT: ldr w10, [sp, #128]
+; CHECK-NEXT: mov.b v0[7], w13
+; CHECK-NEXT: ldr w13, [sp, #256]
 ; CHECK-NEXT: mov.b v3[8], w12
-; CHECK-NEXT: ldr w12, [sp, #264]
-; CHECK-NEXT: ldr w15, [sp, #408]
-; CHECK-NEXT: mov.b v2[6], w10
-; CHECK-NEXT: mov.b v2[7], w9
-; CHECK-NEXT: mov.b v2[8], w11
-; CHECK-NEXT: mov.b v0[12], w13
+; CHECK-NEXT: ldr w12, [sp, #432]
+; CHECK-NEXT: mov.b v2[8], w10
+; CHECK-NEXT: ldr w10, [sp, #8]
+; CHECK-NEXT: mov.b v1[8], w13
 ; CHECK-NEXT: ldr w13, [sp, #136]
+; CHECK-NEXT: mov.b v0[8], w8
+; CHECK-NEXT: ldr w8, [sp, #264]
+; CHECK-NEXT: mov.b v3[9], w10
+; CHECK-NEXT: ldr w10, [sp, #440]
+; CHECK-NEXT: mov.b v2[9], w13
+; CHECK-NEXT: ldr w13, [sp, #16]
 ; CHECK-NEXT: mov.b v1[9], w8
-; CHECK-NEXT: ldr w8, [sp, #40]
-; CHECK-NEXT: mov.b v2[9], w12
-; CHECK-NEXT: ldr w12, [sp, #272]
-; CHECK-NEXT: mov.b v1[10], w14
-; CHECK-NEXT: ldr w14, [sp, #48]
-; CHECK-NEXT: mov.b v1[11], w15
-; CHECK-NEXT: ldr w15, [sp, #56]
-; CHECK-NEXT: ldr w16, [sp, #416]
-; CHECK-NEXT: mov.b v3[9], w13
-; CHECK-NEXT: ldr w13, [sp, #280]
-; CHECK-NEXT: mov.b v0[13], w8
 ; CHECK-NEXT: ldr w8, [sp, #144]
-; CHECK-NEXT: mov.b v2[10], w12
-; CHECK-NEXT: ldr w12, [sp, #288]
-; CHECK-NEXT: mov.b v0[14], w14
+; CHECK-NEXT: mov.b v0[9], w14
+; CHECK-NEXT: ldr w14, [sp, #272]
+; CHECK-NEXT: mov.b v3[10], w13
+; CHECK-NEXT: ldr w13, [sp, #280]
+; CHECK-NEXT: mov.b v2[10], w8
+; CHECK-NEXT: ldr w8, [sp, #24]
+; CHECK-NEXT: mov.b v1[10], w14
 ; CHECK-NEXT: ldr w14, [sp, #152]
-; CHECK-NEXT: mov.b v0[15], w15
-; CHECK-NEXT: ldr w15, [sp, #160]
-; CHECK-NEXT: ldr w10, [sp, #424]
-; CHECK-NEXT: mov.b v3[10], w8
-; CHECK-NEXT: ldr w8, [sp, #296]
-; CHECK-NEXT: mov.b v2[11], w13
-; CHECK-NEXT: mov.b v1[12], w16
-; CHECK-NEXT: ldr w16, [sp, #168]
-; CHECK-NEXT: ldr w9, [sp, #432]
-; CHECK-NEXT: ldr w13, [sp, #304]
-; CHECK-NEXT: mov.b v3[11], w14
-; CHECK-NEXT: mov.b v2[12], w12
-; CHECK-NEXT: ldr w12, [sp, #176]
-; CHECK-NEXT: ldr w11, [sp, #440]
-; CHECK-NEXT: ldr w14, [sp, #312]
-; CHECK-NEXT: mov.b v3[12], w15
-; CHECK-NEXT: ldr w15, [sp, #184]
-; CHECK-NEXT: mov.b v1[13], w10
-; CHECK-NEXT: mov.b v2[13], w8
-; CHECK-NEXT: mov.b v3[13], w16
+; CHECK-NEXT: mov.b v0[10], w11
+; CHECK-NEXT: ldr w11, [sp, #288]
+; CHECK-NEXT: mov.b v3[11], w8
+; CHECK-NEXT: ldr w8, [sp, #32]
+; CHECK-NEXT: mov.b v2[11], w14
+; CHECK-NEXT: ldr w14, [sp, #296]
+; CHECK-NEXT: mov.b v1[11], w13
+; CHECK-NEXT: ldr w13, [sp, #160]
+; CHECK-NEXT: mov.b v0[11], w15
+; CHECK-NEXT: mov.b v3[12], w8
+; CHECK-NEXT: ldr w8, [sp, #40]
+; CHECK-NEXT: mov.b v2[12], w13
+; CHECK-NEXT: ldr w13, [sp, #312]
+; CHECK-NEXT: mov.b v1[12], w11
+; CHECK-NEXT: ldr w11, [sp, #168]
+; CHECK-NEXT: mov.b v0[12], w9
+; CHECK-NEXT: ldr w9, [sp, #304]
+; CHECK-NEXT: mov.b v3[13], w8
+; CHECK-NEXT: ldr w8, [sp, #48]
+; CHECK-NEXT: mov.b v2[13], w11
+; CHECK-NEXT: ldr w11, [sp, #176]
+; CHECK-NEXT: mov.b v1[13], w14
+; CHECK-NEXT: mov.b v0[13], w16
+; CHECK-NEXT: mov.b v3[14], w8
+; CHECK-NEXT: ldr w8, [sp, #56]
+; CHECK-NEXT: mov.b v2[14], w11
 ; CHECK-NEXT: mov.b v1[14], w9
-; CHECK-NEXT: mov.b v2[14], w13
-; CHECK-NEXT: mov.b v3[14], w12
-; CHECK-NEXT: mov.b v1[15], w11
-; CHECK-NEXT: mov.b v2[15], w14
-; CHECK-NEXT: mov.b v3[15], w15
-; CHECK-NEXT: shl.16b v0, v0, #7
+; CHECK-NEXT: ldr w9, [sp, #184]
+; CHECK-NEXT: mov.b v0[14], w12
+; CHECK-NEXT: mov.b v3[15], w8
+; CHECK-NEXT: mov.b v2[15], w9
+; CHECK-NEXT: mov.b v1[15], w13
+; CHECK-NEXT: mov.b v0[15], w10
 ; CHECK-NEXT: shl.16b v3, v3, #7
 ; CHECK-NEXT: shl.16b v2, v2, #7
 ; CHECK-NEXT: shl.16b v4, v1, #7
-; CHECK-NEXT: sshr.16b v0, v0, #7
-; CHECK-NEXT: sshr.16b v1, v3, #7
-; CHECK-NEXT: sshr.16b v2, v2, #7
-; CHECK-NEXT: sshr.16b v3, v4, #7
+; CHECK-NEXT: shl.16b v5, v0, #7
+; CHECK-NEXT: sshr.16b v0, v3, #7
+; CHECK-NEXT: sshr.16b v1, v2, #7
+; CHECK-NEXT: sshr.16b v2, v4, #7
+; CHECK-NEXT: sshr.16b v3, v5, #7
 ; CHECK-NEXT: ret
 %res = sext <64 x i1> %arg to <64 x i8>
 ret <64 x i8> %res
@@ -638,8 +638,8 @@
 ; FALLBACK-LABEL: sext_v1x64:
 ; FALLBACK: // %bb.0:
 ; FALLBACK-NEXT: fmov x8, d0
-; FALLBACK-NEXT: asr x1, x8, #63
 ; FALLBACK-NEXT: fmov x0, d0
+; FALLBACK-NEXT: asr x1, x8, #63
 ; FALLBACK-NEXT: ret
 %res = sext <1 x i64> %arg to <1 x i128>
 ret <1 x i128> %res
diff --git a/llvm/test/CodeGen/AArch64/arm64-tls-dynamics.ll b/llvm/test/CodeGen/AArch64/arm64-tls-dynamics.ll
--- a/llvm/test/CodeGen/AArch64/arm64-tls-dynamics.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tls-dynamics.ll
@@ -91,8 +91,8 @@
 ; CHECK-NEXT: .tlsdesccall _TLS_MODULE_BASE_
 ; CHECK-NEXT: blr [[CALLEE]]
 ; CHECK-NEXT: add x[[TPOFF:[0-9]+]], x0, :dtprel_hi12:local_dynamic_var
-; CHECK-NEXT: add x[[TPOFF]], x[[TPOFF]], :dtprel_lo12_nc:local_dynamic_var
-; CHECK: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-DAG: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-DAG: add x[[TPOFF]], x[[TPOFF]], :dtprel_lo12_nc:local_dynamic_var
 ; CHECK: ldr w0, [x[[TPIDR]], x[[TPOFF]]]
 ; CHECK-NOLD: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:local_dynamic_var
@@ -127,8 +127,8 @@
 ; CHECK-NEXT: .tlsdesccall _TLS_MODULE_BASE_
 ; CHECK-NEXT: blr [[CALLEE]]
 ; CHECK-NEXT: add x[[TPOFF:[0-9]+]], x0, :dtprel_hi12:local_dynamic_var
-; CHECK-NEXT: add x[[TPOFF]], x[[TPOFF]], :dtprel_lo12_nc:local_dynamic_var
-; CHECK: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+; CHECK-DAG: add x[[TPOFF]], x[[TPOFF]], :dtprel_lo12_nc:local_dynamic_var
+; CHECK-DAG: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
 ; CHECK: add x0, x[[TPIDR]], x[[TPOFF]]
 ; CHECK-NOLD: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:local_dynamic_var
diff --git a/llvm/test/CodeGen/AArch64/arm64-tls-local-exec.ll b/llvm/test/CodeGen/AArch64/arm64-tls-local-exec.ll
--- a/llvm/test/CodeGen/AArch64/arm64-tls-local-exec.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tls-local-exec.ll
@@ -47,17 +47,17 @@
 ; CHECK-24-RELOC: R_AARCH64_TLSLE_ADD_TPREL_LO12_NC
 ; CHECK-32: movz x[[R2:[0-9]+]], #:tprel_g1:local_exec_var
-; CHECK-32: movk x[[R2]], #:tprel_g0_nc:local_exec_var
 ; CHECK-32: mrs x[[R1:[0-9]+]], TPIDR_EL0
+; CHECK-32: movk x[[R2]], #:tprel_g0_nc:local_exec_var
 ; CHECK-32: ldr w0, [x[[R1]], x[[R2]]]
 ; CHECK-32-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1
 ; CHECK-32-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC
 ; CHECK-48: movz x[[R2:[0-9]+]], #:tprel_g2:local_exec_var
+; CHECK-48: mrs x[[R1:[0-9]+]], TPIDR_EL0
 ; CHECK-48: movk x[[R2]], #:tprel_g1_nc:local_exec_var
 ; CHECK-48: movk x[[R2]], #:tprel_g0_nc:local_exec_var
-; CHECK-48: mrs x[[R1:[0-9]+]], TPIDR_EL0
 ; CHECK-48: ldr w0, [x[[R1]], x[[R2]]]
 ; CHECK-48-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G2
@@ -85,8 +85,8 @@
 ; CHECK-24-RELOC: R_AARCH64_TLSLE_ADD_TPREL_LO12_NC
 ; CHECK-32: movz x[[R2:[0-9]+]], #:tprel_g1:local_exec_var
-; CHECK-32: movk x[[R2]], #:tprel_g0_nc:local_exec_var
 ; CHECK-32: mrs x[[R1:[0-9]+]], TPIDR_EL0
+; CHECK-32: movk x[[R2]], #:tprel_g0_nc:local_exec_var
 ; CHECK-32: add x0, x[[R1]], x[[R2]]
 ; CHECK-32: ret
@@ -94,9 +94,9 @@
 ; CHECK-32-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G0_NC
 ; CHECK-48: movz x[[R2:[0-9]+]], #:tprel_g2:local_exec_var
+; CHECK-48: mrs x[[R1:[0-9]+]], TPIDR_EL0
 ; CHECK-48: movk x[[R2]], #:tprel_g1_nc:local_exec_var
 ; CHECK-48: movk x[[R2]], #:tprel_g0_nc:local_exec_var
-; CHECK-48: mrs x[[R1:[0-9]+]], TPIDR_EL0
 ; CHECK-48: add x0, x[[R1]], x[[R2]]
 ; CHECK-48: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-trunc-store.ll b/llvm/test/CodeGen/AArch64/arm64-trunc-store.ll
--- a/llvm/test/CodeGen/AArch64/arm64-trunc-store.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-trunc-store.ll
@@ -20,8 +20,8 @@
 ; CHECK-LABEL: fct32:
 ; CHECK: // %bb.0: // %bb
 ; CHECK-NEXT: adrp x8, :got:zptr32
-; CHECK-NEXT: ldr x8, [x8, :got_lo12:zptr32]
 ; CHECK-NEXT: sub w9, w0, #1
+; CHECK-NEXT: ldr x8, [x8, :got_lo12:zptr32]
 ; CHECK-NEXT: ldr x8, [x8]
 ; CHECK-NEXT: str w1, [x8, w9, sxtw #2]
 ; CHECK-NEXT: ret
@@ -39,8 +39,8 @@
 ; CHECK-LABEL: fct16:
 ; CHECK: // %bb.0: // %bb
 ; CHECK-NEXT: adrp x8, :got:zptr16
-; CHECK-NEXT: ldr x8, [x8, :got_lo12:zptr16]
 ; CHECK-NEXT: sub w9, w0, #1
+; CHECK-NEXT: ldr x8, [x8, :got_lo12:zptr16]
 ; CHECK-NEXT: ldr x8, [x8]
 ; CHECK-NEXT: strh w1, [x8, w9, sxtw #1]
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -310,8 +310,8 @@
 ; GISEL-NEXT: movi.2d v2, #0000000000000000
 ; GISEL-NEXT: usubl.4s v0, v0, v1
 ; GISEL-NEXT: cmgt.4s v1, v2, v0
-; GISEL-NEXT: shl.4s v1, v1, #31
 ; GISEL-NEXT: neg.4s v2, v0
+; GISEL-NEXT: shl.4s v1, v1, #31
 ; GISEL-NEXT: sshr.4s v1, v1, #31
 ; GISEL-NEXT: bit.16b v0, v2, v1
 ; GISEL-NEXT: addv.4s s0, v0
@@ -387,8 +387,8 @@
 ; GISEL-NEXT: movi.2d v2, #0000000000000000
 ; GISEL-NEXT: usubl.2d v0, v0, v1
 ; GISEL-NEXT: cmgt.2d v1, v2, v0
-; GISEL-NEXT: shl.2d v1, v1, #63
 ; GISEL-NEXT: neg.2d v2, v0
+; GISEL-NEXT: shl.2d v1, v1, #63
 ; GISEL-NEXT: sshr.2d v1, v1, #63
 ; GISEL-NEXT: bit.16b v0, v2, v1
 ; GISEL-NEXT: addp.2d d0, v0
@@ -908,13 +908,21 @@
 ; FALLBACK-NOT: remark:{{.*}} sabal8h
 define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
-; CHECK-LABEL: sabal8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: sabal.8h v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: sabal8h:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: sabal.8h v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: sabal8h:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: sabal.8h v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = load <8 x i16>, <8 x i16>* %C
@@ -926,13 +934,21 @@
 ; FALLBACK-NOT: remark:{{.*}} sabal4s
 define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
-; CHECK-LABEL: sabal4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: sabal.4s v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: sabal4s:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: sabal.4s v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: sabal4s:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: sabal.4s v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -944,13 +960,21 @@
 ; FALLBACK-NOT: remark:{{.*}} sabal2d
 define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
-; CHECK-LABEL: sabal2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: sabal.2d v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: sabal2d:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: sabal.2d v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: sabal2d:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: sabal.2d v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -1020,13 +1044,21 @@
 ; FALLBACK-NOT: remark:{{.*}} uabal8h
 define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
-; CHECK-LABEL: uabal8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: uabal.8h v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: uabal8h:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: uabal.8h v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: uabal8h:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: uabal.8h v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = load <8 x i16>, <8 x i16>* %C
@@ -1038,13 +1070,21 @@
 ; FALLBACK-NOT: remark:{{.*}} uabal8s
 define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
-; CHECK-LABEL: uabal4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: uabal.4s v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: uabal4s:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: uabal.4s v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: uabal4s:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: uabal.4s v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -1056,13 +1096,21 @@
 ; FALLBACK-NOT: remark:{{.*}} uabal2d
 define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
-; CHECK-LABEL: uabal2d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: uabal.2d v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: uabal2d:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: uabal.2d v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: uabal2d:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: uabal.2d v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -1130,13 +1178,21 @@
 }
 define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
-; CHECK-LABEL: saba_8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr d0, [x2]
-; CHECK-NEXT: saba.8b v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: saba_8b:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr d0, [x2]
+; DAG-NEXT: saba.8b v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: saba_8b:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr d0, [x2]
+; GISEL-NEXT: saba.8b v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -1146,13 +1202,21 @@
 }
 define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
-; CHECK-LABEL: saba_16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: saba.16b v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: saba_16b:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q1, [x1]
+; DAG-NEXT: ldr q2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: saba.16b v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: saba_16b:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr q1, [x0]
+; GISEL-NEXT: ldr q2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: saba.16b v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -1162,13 +1226,21 @@
 }
 define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
-; CHECK-LABEL: saba_4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr d0, [x2]
-; CHECK-NEXT: saba.4h v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: saba_4h:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr d0, [x2]
+; DAG-NEXT: saba.4h v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: saba_4h:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr d0, [x2]
+; GISEL-NEXT: saba.4h v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -1178,13 +1250,21 @@
 }
 define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
-; CHECK-LABEL: saba_8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: saba.8h v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: saba_8h:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q1, [x1]
+; DAG-NEXT: ldr q2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: saba.8h v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: saba_8h:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr q1, [x0]
+; GISEL-NEXT: ldr q2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: saba.8h v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -1194,13 +1274,21 @@
 }
 define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
-; CHECK-LABEL: saba_2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr d0, [x2]
-; CHECK-NEXT: saba.2s v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: saba_2s:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr d0, [x2]
+; DAG-NEXT: saba.2s v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: saba_2s:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr d0, [x2]
+; GISEL-NEXT: saba.2s v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -1210,13 +1298,21 @@
 }
 define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
-; CHECK-LABEL: saba_4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: saba.4s v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: saba_4s:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q1, [x1]
+; DAG-NEXT: ldr q2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: saba.4s v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: saba_4s:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr q1, [x0]
+; GISEL-NEXT: ldr q2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: saba.4s v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -1226,13 +1322,21 @@
 }
 define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
-; CHECK-LABEL: uaba_8b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr d0, [x2]
-; CHECK-NEXT: uaba.8b v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: uaba_8b:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr d0, [x2]
+; DAG-NEXT: uaba.8b v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: uaba_8b:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr d0, [x2]
+; GISEL-NEXT: uaba.8b v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <8 x i8>, <8 x i8>* %A
 %tmp2 = load <8 x i8>, <8 x i8>* %B
 %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
@@ -1242,13 +1346,21 @@
 }
 define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
-; CHECK-LABEL: uaba_16b:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: uaba.16b v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: uaba_16b:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q1, [x1]
+; DAG-NEXT: ldr q2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: uaba.16b v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: uaba_16b:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr q1, [x0]
+; GISEL-NEXT: ldr q2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: uaba.16b v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <16 x i8>, <16 x i8>* %A
 %tmp2 = load <16 x i8>, <16 x i8>* %B
 %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
@@ -1258,13 +1370,21 @@
 }
 define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
-; CHECK-LABEL: uaba_4h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr d0, [x2]
-; CHECK-NEXT: uaba.4h v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: uaba_4h:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr d0, [x2]
+; DAG-NEXT: uaba.4h v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: uaba_4h:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr d0, [x2]
+; GISEL-NEXT: uaba.4h v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
 %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
@@ -1274,13 +1394,21 @@
 }
 define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
-; CHECK-LABEL: uaba_8h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: uaba.8h v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: uaba_8h:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q1, [x1]
+; DAG-NEXT: ldr q2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: uaba.8h v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: uaba_8h:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr q1, [x0]
+; GISEL-NEXT: ldr q2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: uaba.8h v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <8 x i16>, <8 x i16>* %A
 %tmp2 = load <8 x i16>, <8 x i16>* %B
 %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
@@ -1290,13 +1418,21 @@
 }
 define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
-; CHECK-LABEL: uaba_2s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
-; CHECK-NEXT: ldr d0, [x2]
-; CHECK-NEXT: uaba.2s v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: uaba_2s:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr d1, [x1]
+; DAG-NEXT: ldr d2, [x0]
+; DAG-NEXT: ldr d0, [x2]
+; DAG-NEXT: uaba.2s v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: uaba_2s:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d1, [x0]
+; GISEL-NEXT: ldr d2, [x1]
+; GISEL-NEXT: ldr d0, [x2]
+; GISEL-NEXT: uaba.2s v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
 %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
@@ -1306,13 +1442,21 @@
 }
 define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
-; CHECK-LABEL: uaba_4s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: ldr q2, [x1]
-; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: uaba.4s v0, v1, v2
-; CHECK-NEXT: ret
+; DAG-LABEL: uaba_4s:
+; DAG: // %bb.0:
+; DAG-NEXT: ldr q1, [x1]
+; DAG-NEXT: ldr q2, [x0]
+; DAG-NEXT: ldr q0, [x2]
+; DAG-NEXT: uaba.4s v0, v2, v1
+; DAG-NEXT: ret
+;
+; GISEL-LABEL: uaba_4s:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr q1, [x0]
+; GISEL-NEXT: ldr q2, [x1]
+; GISEL-NEXT: ldr q0, [x2]
+; GISEL-NEXT: uaba.4s v0, v1, v2
+; GISEL-NEXT: ret
 %tmp1 = load <4 x i32>, <4 x i32>* %A
 %tmp2 = load <4 x i32>, <4 x i32>* %B
 %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
@@ -1439,9 +1583,9 @@
 ; GISEL-LABEL: abspattern1:
 ; GISEL: // %bb.0:
 ; GISEL-NEXT: movi.2d v1, #0000000000000000
+; GISEL-NEXT: neg.2s v2, v0
 ; GISEL-NEXT: cmge.2s v1, v0, v1
 ; GISEL-NEXT: shl.2s v1, v1, #31
-; GISEL-NEXT: neg.2s v2, v0
 ; GISEL-NEXT: sshr.2s v1, v1, #31
 ; GISEL-NEXT: bif.8b v0, v2, v1
 ; GISEL-NEXT: ret
@@ -1461,9 +1605,9 @@
 ; GISEL-LABEL: abspattern2:
 ; GISEL: // %bb.0:
 ; GISEL-NEXT: movi.2d v1, #0000000000000000
+; GISEL-NEXT: neg.4h v2, v0
 ; GISEL-NEXT: cmgt.4h v1, v0, v1
 ; GISEL-NEXT: shl.4h v1, v1, #15
-; GISEL-NEXT: neg.4h v2, v0
 ; GISEL-NEXT: sshr.4h v1, v1, #15
 ; GISEL-NEXT: bif.8b v0, v2, v1
 ; GISEL-NEXT: ret
@@ -1484,9 +1628,9 @@
 ; GISEL-LABEL: abspattern3:
 ; GISEL: // %bb.0:
 ; GISEL-NEXT: movi.2d v1, #0000000000000000
+; GISEL-NEXT: neg.8b v2, v0
 ; GISEL-NEXT: cmgt.8b v1, v1, v0
 ; GISEL-NEXT: shl.8b v1, v1, #7
-; GISEL-NEXT: neg.8b v2, v0
 ; GISEL-NEXT: sshr.8b v1, v1, #7
 ; GISEL-NEXT: bit.8b v0, v2, v1
 ; GISEL-NEXT: ret
@@ -1506,9 +1650,9 @@
 ; GISEL-LABEL: abspattern4:
 ; GISEL: // %bb.0:
 ; GISEL-NEXT: movi.2d v1, #0000000000000000
+; GISEL-NEXT: neg.4s v2, v0
 ; GISEL-NEXT: cmge.4s v1, v0, v1
 ; GISEL-NEXT: shl.4s v1, v1, #31
-; GISEL-NEXT: neg.4s v2, v0
 ; GISEL-NEXT: sshr.4s v1, v1, #31
 ; GISEL-NEXT: bif.16b v0, v2, v1
 ; GISEL-NEXT: ret
@@ -1528,9 +1672,9 @@
 ; GISEL-LABEL: abspattern5:
 ; GISEL: // %bb.0:
 ; GISEL-NEXT: movi.2d v1, #0000000000000000
+; GISEL-NEXT: neg.8h v2, v0
 ; GISEL-NEXT: cmgt.8h v1, v0, v1
 ; GISEL-NEXT: shl.8h v1, v1, #15
-; GISEL-NEXT: neg.8h v2, v0
 ; GISEL-NEXT: sshr.8h v1, v1, #15
 ; GISEL-NEXT: bif.16b v0, v2, v1
 ; GISEL-NEXT: ret
@@ -1550,9 +1694,9 @@
 ; GISEL-LABEL: abspattern6:
 ; GISEL: // %bb.0:
 ; GISEL-NEXT: movi.2d v1, #0000000000000000
+; GISEL-NEXT: neg.16b v2, v0
 ; GISEL-NEXT: cmgt.16b v1, v1, v0
 ; GISEL-NEXT: shl.16b v1, v1, #7
-; GISEL-NEXT: neg.16b v2, v0
 ; GISEL-NEXT: sshr.16b v1, v1, #7
 ; GISEL-NEXT: bit.16b v0, v2, v1
 ; GISEL-NEXT: ret
@@ -1572,9 +1716,9 @@
 ; GISEL-LABEL: abspattern7:
 ; GISEL: // %bb.0:
 ; GISEL-NEXT: movi.2d v1, #0000000000000000
+; GISEL-NEXT: neg.2d v2, v0
 ; GISEL-NEXT: cmge.2d v1, v1, v0
 ; GISEL-NEXT: shl.2d v1, v1, #63
-; GISEL-NEXT: neg.2d v2, v0
 ; GISEL-NEXT: sshr.2d v1, v1, #63
 ; GISEL-NEXT: bit.16b v0, v2, v1
 ; GISEL-NEXT: ret
@@ -1596,8 +1740,8 @@
 ; GISEL-NEXT: movi.2d v2, #0000000000000000
 ; GISEL-NEXT: ssubl.2d v0, v0, v1
 ; GISEL-NEXT: cmgt.2d v1, v2, v0
-; GISEL-NEXT: shl.2d v1, v1, #63
 ; GISEL-NEXT: neg.2d v2, v0
+; GISEL-NEXT: shl.2d v1, v1, #63
 ; GISEL-NEXT: sshr.2d v1, v1, #63
 ; GISEL-NEXT: bit.16b v0, v2, v1
 ; GISEL-NEXT: ret
@@ -1615,27 +1759,27 @@
 ; CHECK-LABEL: uabd_i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: fmov x12, d1
-; CHECK-NEXT: asr x10, x9, #63
-; CHECK-NEXT: asr x13, x12, #63
-; CHECK-NEXT: subs x9, x9, x12
+; CHECK-NEXT: fmov x11, d1
 ; CHECK-NEXT: mov.d x8, v0[1]
-; CHECK-NEXT: mov.d x11, v1[1]
-; CHECK-NEXT: sbcs x10, x10, x13
+; CHECK-NEXT: mov.d x10, v1[1]
+; CHECK-NEXT: asr x12, x9, #63
+; CHECK-NEXT: asr x13, x11, #63
+; CHECK-NEXT: subs x9, x9, x11
+; CHECK-NEXT: sbcs x11, x12, x13
 ; CHECK-NEXT: asr x12, x8, #63
-; CHECK-NEXT: asr x14, x11, #63
-; CHECK-NEXT: subs x8, x8, x11
-; CHECK-NEXT: sbcs x11, x12, x14
+; CHECK-NEXT: asr x13, x10, #63
+; CHECK-NEXT: subs x8, x8, x10
+; CHECK-NEXT: sbcs x10, x12, x13
 ; CHECK-NEXT: negs x12, x8
-; CHECK-NEXT: ngcs x13, x11
-; CHECK-NEXT: cmp x11, #0
+; CHECK-NEXT: ngcs x13, x10
+; CHECK-NEXT: cmp x10, #0
 ; CHECK-NEXT: csel x2, x12, x8, lt
-; CHECK-NEXT: csel x3, x13, x11, lt
+; CHECK-NEXT: csel x3, x13, x10, lt
 ; CHECK-NEXT: negs x8, x9
-; CHECK-NEXT: ngcs x11, x10
-; CHECK-NEXT: cmp x10, #0
+; CHECK-NEXT: ngcs x10, x11
+; CHECK-NEXT: cmp x11, #0
 ; CHECK-NEXT: csel x8, x8, x9, lt
-; CHECK-NEXT: csel x1, x11, x10, lt
+; CHECK-NEXT: csel x1, x10, x11, lt
 ; CHECK-NEXT: fmov d0, x8
 ; CHECK-NEXT: mov.d v0[1], x1
 ; CHECK-NEXT: fmov x0, d0
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -759,10 +759,10 @@
 define <4 x i64> @hadd32_sext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
 ; CHECK-LABEL: hadd32_sext_asr:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: saddl.2d v2, v0, v1
-; CHECK-NEXT: saddl2.2d v0, v0, v1
-; CHECK-NEXT: sshr.2d v1, v0, #1
-; CHECK-NEXT: sshr.2d v0, v2, #1
+; CHECK-NEXT: saddl2.2d v2, v0, v1
+; CHECK-NEXT: saddl.2d v0, v0, v1
+; CHECK-NEXT: sshr.2d v1, v2, #1
+; CHECK-NEXT: sshr.2d v0, v0, #1
 ; CHECK-NEXT: ret
 %zextsrc1 = sext <4 x i32> %src1 to <4 x i64>
 %zextsrc2 = sext <4 x i32> %src2 to <4 x i64>
@@ -774,10 +774,10 @@
 define <4 x i64> @hadd32_zext_asr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
 ; CHECK-LABEL: hadd32_zext_asr:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: uaddl.2d v2, v0, v1
-; CHECK-NEXT: uaddl2.2d v0, v0, v1
-; CHECK-NEXT: ushr.2d v1, v0, #1
-; CHECK-NEXT: ushr.2d v0, v2, #1
+; CHECK-NEXT: uaddl2.2d v2, v0, v1
+; CHECK-NEXT: uaddl.2d v0, v0, v1
+; CHECK-NEXT: ushr.2d v1, v2, #1
+; CHECK-NEXT: ushr.2d v0, v0, #1
 ; CHECK-NEXT: ret
 %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
 %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
@@ -789,10 +789,10 @@
 define <4 x i64> @hadd32_sext_lsr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
 ; CHECK-LABEL: hadd32_sext_lsr:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: saddl.2d v2, v0, v1
-; CHECK-NEXT: saddl2.2d v0, v0, v1
-; CHECK-NEXT: ushr.2d v1, v0, #1
-; CHECK-NEXT: ushr.2d v0, v2, #1
+; CHECK-NEXT: saddl2.2d v2, v0, v1
+; CHECK-NEXT: saddl.2d v0, v0, v1
+; CHECK-NEXT: ushr.2d v1, v2, #1
+; CHECK-NEXT: ushr.2d v0, v0, #1
 ; CHECK-NEXT: ret
 %zextsrc1 = sext <4 x i32> %src1 to <4 x i64>
 %zextsrc2 = sext <4 x i32> %src2 to <4 x i64>
@@ -804,10 +804,10 @@
 define <4 x i64> @hadd32_zext_lsr(<4 x i32> %src1, <4 x i32> %src2) nounwind {
 ; CHECK-LABEL: hadd32_zext_lsr:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: uaddl.2d v2, v0, v1
-; CHECK-NEXT: uaddl2.2d v0, v0, v1
-; CHECK-NEXT: ushr.2d v1, v0, #1
-; CHECK-NEXT: ushr.2d v0, v2, #1
+; CHECK-NEXT: uaddl2.2d v2, v0, v1
+; CHECK-NEXT: uaddl.2d v0, v0, v1
+; CHECK-NEXT: ushr.2d v1, v2, #1
+; CHECK-NEXT: ushr.2d v0, v0, #1
 ; CHECK-NEXT: ret
 %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
 %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -356,10 +356,10 @@
 define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
 ; CHECK-LABEL: smlal4s:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x0]
 ; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: smlal.4s v0, v1, v2
+; CHECK-NEXT: smlal.4s v0, v2, v1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -372,10 +372,10 @@
 define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
 ; CHECK-LABEL: smlal2d:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x0]
 ; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: smlal.2d v0, v1, v2
+; CHECK-NEXT: smlal.2d v0, v2, v1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
@@ -388,10 +388,10 @@
 define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
 ; CHECK-LABEL: smlsl4s:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x0]
 ; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: smlsl.4s v0, v1, v2
+; CHECK-NEXT: smlsl.4s v0, v2, v1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -404,10 +404,10 @@
 define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
 ; CHECK-LABEL: smlsl2d:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x0]
 ; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: smlsl.2d v0, v1, v2
+; CHECK-NEXT: smlsl.2d v0, v2, v1
 ; CHECK-NEXT: ret
 %tmp1 = load <2 x i32>, <2 x i32>* %A
 %tmp2 = load <2 x i32>, <2 x i32>* %B
@@ -425,10 +425,10 @@
 define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
 ; CHECK-LABEL: sqdmlal4s:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: ldr d2, [x1]
+; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ldr d2, [x0]
 ; CHECK-NEXT: ldr q0, [x2]
-; CHECK-NEXT: sqdmlal.4s v0, v1, v2
+; CHECK-NEXT: sqdmlal.4s v0, v2, v1
 ; CHECK-NEXT: ret
 %tmp1 = load <4 x i16>, <4 x i16>* %A
 %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -441,10 +441,10 @@
 define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
 ; CHECK-LABEL: sqdmlal2d:
; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlal.2d v0, v1, v2 +; CHECK-NEXT: sqdmlal.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -493,10 +493,10 @@ define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: sqdmlsl4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlsl.4s v0, v1, v2 +; CHECK-NEXT: sqdmlsl.4s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -509,10 +509,10 @@ define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: sqdmlsl2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlsl.2d v0, v1, v2 +; CHECK-NEXT: sqdmlsl.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -561,10 +561,10 @@ define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: umlal4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlal.4s v0, v1, v2 +; CHECK-NEXT: umlal.4s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -577,10 +577,10 @@ define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: umlal2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlal.2d v0, v1, v2 +; CHECK-NEXT: umlal.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -593,10 +593,10 @@ define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: umlsl4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlsl.4s v0, v1, v2 +; CHECK-NEXT: umlsl.4s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -609,10 +609,10 @@ define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: umlsl2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlsl.2d v0, v1, v2 +; CHECK-NEXT: umlsl.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -625,10 +625,10 @@ define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind { ; CHECK-LABEL: fmla_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr d0, [x2] -; CHECK-NEXT: fmla.2s v0, v2, v1 +; CHECK-NEXT: fmla.2s v0, v1, v2 ; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B @@ -640,10 +640,10 @@ define <4 x float> 
@fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { ; CHECK-LABEL: fmla_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: fmla.4s v0, v2, v1 +; CHECK-NEXT: fmla.4s v0, v1, v2 ; CHECK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B @@ -655,10 +655,10 @@ define <2 x double> @fmla_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind { ; CHECK-LABEL: fmla_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: fmla.2d v0, v2, v1 +; CHECK-NEXT: fmla.2d v0, v1, v2 ; CHECK-NEXT: ret %tmp1 = load <2 x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B @@ -674,10 +674,10 @@ define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind { ; CHECK-LABEL: fmls_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr d0, [x2] -; CHECK-NEXT: fmls.2s v0, v1, v2 +; CHECK-NEXT: fmls.2s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B @@ -690,10 +690,10 @@ define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { ; CHECK-LABEL: fmls_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: fmls.4s v0, v1, v2 +; CHECK-NEXT: fmls.4s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B @@ -706,10 +706,10 @@ define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind { ; CHECK-LABEL: fmls_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: fmls.2d v0, v1, v2 +; CHECK-NEXT: fmls.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B @@ -722,10 +722,10 @@ define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind { ; CHECK-LABEL: fmls_commuted_neg_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr d0, [x2] -; CHECK-NEXT: fmls.2s v0, v1, v2 +; CHECK-NEXT: fmls.2s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B @@ -738,10 +738,10 @@ define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { ; CHECK-LABEL: fmls_commuted_neg_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: fmls.4s v0, v1, v2 +; CHECK-NEXT: fmls.4s v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B @@ -754,10 +754,10 @@ define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind { ; CHECK-LABEL: fmls_commuted_neg_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q1, [x0] -; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q1, [x1] +; 
CHECK-NEXT: ldr q2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: fmls.2d v0, v1, v2 +; CHECK-NEXT: fmls.2d v0, v2, v1 ; CHECK-NEXT: ret %tmp1 = load <2 x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B @@ -904,13 +904,13 @@ define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind { ; CHECK-LABEL: mul_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov x10, d1 -; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x10, d0 ; CHECK-NEXT: mov.d x8, v1[1] -; CHECK-NEXT: mov.d x9, v0[1] -; CHECK-NEXT: mul x10, x11, x10 -; CHECK-NEXT: mul x8, x9, x8 -; CHECK-NEXT: fmov d0, x10 +; CHECK-NEXT: mov.d x11, v0[1] +; CHECK-NEXT: mul x9, x10, x9 +; CHECK-NEXT: mul x8, x11, x8 +; CHECK-NEXT: fmov d0, x9 ; CHECK-NEXT: mov.d v0[1], x8 ; CHECK-NEXT: ret %tmp1 = mul <2 x i64> %A, %B @@ -1276,10 +1276,10 @@ define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: smlal_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlal.4s v0, v1, v2[1] +; CHECK-NEXT: smlal.4s v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1293,10 +1293,10 @@ define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: smlal_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlal.2d v0, v1, v2[1] +; CHECK-NEXT: smlal.2d v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1310,10 +1310,10 @@ define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: sqdmlal_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlal.4s v0, v1, v2[1] +; CHECK-NEXT: sqdmlal.4s v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1327,10 +1327,10 @@ define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: sqdmlal_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlal.2d v0, v1, v2[1] +; CHECK-NEXT: sqdmlal.2d v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1416,11 +1416,11 @@ define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind { ; CHECK-LABEL: sqdmlal_lane_1d: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov d1, x0 -; CHECK-NEXT: fmov s2, w1 +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov d2, x0 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: sqdmlal.s d1, s2, v0[1] -; CHECK-NEXT: fmov x0, d1 +; CHECK-NEXT: sqdmlal.s d2, s1, v0[1] +; CHECK-NEXT: fmov x0, d2 ; CHECK-NEXT: ret %rhs = extractelement <2 x i32> %C, i32 1 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs) @@ -1433,11 +1433,11 @@ define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind { ; CHECK-LABEL: sqdmlsl_lane_1d: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov d1, x0 -; CHECK-NEXT: fmov s2, w1 +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov d2, x0 ; CHECK-NEXT: // kill: def 
$d0 killed $d0 def $q0 -; CHECK-NEXT: sqdmlsl.s d1, s2, v0[1] -; CHECK-NEXT: fmov x0, d1 +; CHECK-NEXT: sqdmlsl.s d2, s1, v0[1] +; CHECK-NEXT: fmov x0, d2 ; CHECK-NEXT: ret %rhs = extractelement <2 x i32> %C, i32 1 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs) @@ -1450,10 +1450,10 @@ define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: umlal_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlal.4s v0, v1, v2[1] +; CHECK-NEXT: umlal.4s v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1467,10 +1467,10 @@ define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: umlal_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlal.2d v0, v1, v2[1] +; CHECK-NEXT: umlal.2d v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1485,10 +1485,10 @@ define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: smlsl_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlsl.4s v0, v1, v2[1] +; CHECK-NEXT: smlsl.4s v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1502,10 +1502,10 @@ define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: smlsl_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlsl.2d v0, v1, v2[1] +; CHECK-NEXT: smlsl.2d v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1519,10 +1519,10 @@ define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: sqdmlsl_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlsl.4s v0, v1, v2[1] +; CHECK-NEXT: sqdmlsl.4s v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1536,10 +1536,10 @@ define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: sqdmlsl_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlsl.2d v0, v1, v2[1] +; CHECK-NEXT: sqdmlsl.2d v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B @@ -1589,10 +1589,10 @@ define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { ; CHECK-LABEL: umlsl_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlsl.4s v0, v1, v2[1] +; CHECK-NEXT: umlsl.4s v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A 
%tmp2 = load <4 x i16>, <4 x i16>* %B @@ -1606,10 +1606,10 @@ define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { ; CHECK-LABEL: umlsl_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlsl.2d v0, v1, v2[1] +; CHECK-NEXT: umlsl.2d v0, v2, v1[1] ; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll --- a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll +++ b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll @@ -7,8 +7,8 @@ define dso_local i64 @"?f1"() { entry: ; CHECK-LABEL: f1 -; CHECK: str xzr, [sp, #8] -; CHECK: mov x0, xzr +; CHECK-DAG: str xzr, [sp, #8] +; CHECK-DAG: mov x0, xzr %retval = alloca %struct.S1, align 4 %a = getelementptr inbounds %struct.S1, %struct.S1* %retval, i32 0, i32 0 @@ -29,10 +29,10 @@ ; CHECK: sub sp, sp, #16 ; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: stp xzr, xzr, [sp] -; CHECK-NEXT: mov x0, xzr -; CHECK-NEXT: mov x1, xzr -; CHECK-NEXT: .seh_startepilogue +; CHECK-DAG: stp xzr, xzr, [sp] +; CHECK-DAG: mov x0, xzr +; CHECK-DAG: mov x1, xzr +; CHECK: .seh_startepilogue ; CHECK-NEXT: add sp, sp, #16 %retval = alloca %struct.S2, align 4 @@ -90,8 +90,8 @@ define dso_local void @"?inst@C"(%class.C* %this, %class.A* inreg noalias sret(%class.A) %agg.result) { entry: ; CHECK-LABEL: inst@C -; CHECK: str x0, [sp, #8] -; CHECK: mov x0, x1 +; CHECK-DAG: mov x0, x1 +; CHECK-DAG: str x8, [sp, #8] %this.addr = alloca %class.C*, align 8 store %class.C* %this, %class.C** %this.addr, align 8 @@ -148,7 +148,8 @@ store %struct.Pod %x, %struct.Pod* @Pod ret void ; CHECK: bl copy_pod - ; CHECK-NEXT: stp d0, d1, [{{.*}}] + ; CHECK-NEXT: str d0, [{{.*}}] + ; CHECK-NEXT: str d1, [{{.*}}] } @NotCXX14Aggregate = external global %struct.NotCXX14Aggregate diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll @@ -19,10 +19,10 @@ entry: ; ALL-LABEL: t1: ; ALL-NOT: fmov -; NONEFP: ldr h0,{{.*}} -; NONEFP: fmov s1, wzr -; NONEFP: fmov d2, xzr -; NONEFP: movi{{(.16b)?}} v3{{(.2d)?}}, #0 +; NONEFP-DAG: ldr h0,{{.*}} +; NONEFP-DAG: fmov s1, wzr +; NONEFP-DAG: fmov d2, xzr +; NONEFP-DAG: movi{{(.16b)?}} v3{{(.2d)?}}, #0 ; NONE16: fmov h0, wzr ; NONE16: fmov s1, wzr ; NONE16: fmov d2, xzr diff --git a/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll b/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll --- a/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32-addrs.ll @@ -42,10 +42,10 @@ define i8 @test_valid_wrap_optimizable2(i8* %base, i32 %offset) { ; CHECK-LABEL: test_valid_wrap_optimizable2: ; CHECK: ; %bb.0: +; CHECK-NEXT: mov w8, #-100 ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: mov w9, #-100 -; CHECK-NEXT: ldrb w0, [x8, x9] +; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: ldrb w0, [x9, x8] ; CHECK-NEXT: ret %newaddr = getelementptr inbounds i8, i8* inttoptr(i32 -100 to i8*), i32 %offset diff --git a/llvm/test/CodeGen/AArch64/arm64_32-atomics.ll b/llvm/test/CodeGen/AArch64/arm64_32-atomics.ll --- a/llvm/test/CodeGen/AArch64/arm64_32-atomics.ll +++ b/llvm/test/CodeGen/AArch64/arm64_32-atomics.ll @@ 
-254,9 +254,9 @@ ; CHECK: ret ; CHECK: [[DONE]]: -; CHECK: clrex ; CHECK: mov w1, wzr ; CHECK: mov w0, [[OLD]] +; CHECK: clrex ; CHECK: ret %res = cmpxchg i8** %addr, i8* %cmp, i8* %new acq_rel acquire ret {i8*, i1} %res diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll --- a/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll +++ b/llvm/test/CodeGen/AArch64/atomic-ops-lse.ll @@ -3,7 +3,6 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics < %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS ; RUN: llc -mtriple=aarch64_be-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s --check-prefix=CHECK-REG -; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira -mattr=-lse2 < %s | FileCheck %s ; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created ; (i.e. reusing a register for status & data in store exclusive). @@ -1543,9 +1542,9 @@ %old = atomicrmw sub i8* @var8, i8 -1 seq_cst ; CHECK-NOT: dmb +; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8 -; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: ldaddalb w[[IMM]], w[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb @@ -1566,9 +1565,9 @@ %old = atomicrmw sub i16* @var16, i16 -1 seq_cst ; CHECK-NOT: dmb +; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16 -; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: ldaddalh w[[IMM]], w[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb @@ -1589,9 +1588,9 @@ %old = atomicrmw sub i32* @var32, i32 -1 seq_cst ; CHECK-NOT: dmb +; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 -; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: ldaddal w[[IMM]], w[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb @@ -1612,9 +1611,9 @@ %old = atomicrmw sub i64* @var64, i64 -1 seq_cst ; CHECK-NOT: dmb +; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 -; CHECK: mov w[[IMM:[0-9]+]], #1 ; CHECK: ldaddal x[[IMM]], x[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb @@ -1810,9 +1809,9 @@ ; OUTLINE-ATOMICS-NEXT: ret %old = atomicrmw and i8* @var8, i8 -2 seq_cst ; CHECK-NOT: dmb +; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var8 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8 -; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: ldclralb w[[CONST]], w[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb ret i8 %old @@ -1831,9 +1830,9 @@ ; OUTLINE-ATOMICS-NEXT: ret %old = atomicrmw and i16* @var16, i16 -2 seq_cst ; CHECK-NOT: dmb +; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var16 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16 -; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: ldclralh w[[CONST]], w[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb ret i16 %old @@ -1852,9 +1851,9 @@ ; OUTLINE-ATOMICS-NEXT: ret %old = atomicrmw and i32* @var32, i32 -2 seq_cst ; CHECK-NOT: dmb +; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32 -; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: ldclral w[[CONST]], w[[NEW:[0-9]+]], 
[x[[ADDR]]] ; CHECK-NOT: dmb ret i32 %old @@ -1873,9 +1872,9 @@ ; OUTLINE-ATOMICS-NEXT: ret %old = atomicrmw and i64* @var64, i64 -2 seq_cst ; CHECK-NOT: dmb +; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var64 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64 -; CHECK: mov w[[CONST:[0-9]+]], #1 ; CHECK: ldclral x[[CONST]], x[[NEW:[0-9]+]], [x[[ADDR]]] ; CHECK-NOT: dmb ret i64 %old diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-not-barriers.ll b/llvm/test/CodeGen/AArch64/atomic-ops-not-barriers.ll --- a/llvm/test/CodeGen/AArch64/atomic-ops-not-barriers.ll +++ b/llvm/test/CodeGen/AArch64/atomic-ops-not-barriers.ll @@ -21,7 +21,7 @@ ; The key point here is that the second dmb isn't immediately followed by the ; simple_ver basic block, which LLVM attempted to do when DMB had been marked ; with isBarrier. For now, look for something that looks like "somewhere". -; CHECK-NEXT: mov +; CHECK-NEXT: ret somewhere: %combined = phi i32 [ %val, %atomic_ver ], [ %newval, %simple_ver] ret i32 %combined diff --git a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll --- a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll +++ b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll @@ -76,12 +76,12 @@ ; CHECKN-NEXT: ldr x12, [x0, #16] ; CHECKN-NEXT: ldr x13, [x1, #16] ; CHECKN-NEXT: ldur x14, [x0, #23] -; CHECKN-NEXT: ldur x15, [x1, #23] ; CHECKN-NEXT: eor x8, x8, x10 +; CHECKN-NEXT: ldur x15, [x1, #23] ; CHECKN-NEXT: eor x9, x9, x11 ; CHECKN-NEXT: eor x10, x12, x13 -; CHECKN-NEXT: eor x11, x14, x15 ; CHECKN-NEXT: orr x8, x8, x9 +; CHECKN-NEXT: eor x11, x14, x15 ; CHECKN-NEXT: orr x9, x10, x11 ; CHECKN-NEXT: orr x8, x8, x9 ; CHECKN-NEXT: cmp x8, #0 diff --git a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll --- a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll +++ b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll @@ -8,10 +8,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: mov v1.s[1], w8 -; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: umov w9, v0.h[1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %y = bitcast <2 x half> %x to <2 x i16> ret <2 x i16> %y diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll --- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll +++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll @@ -10,9 +10,9 @@ ; CHECK-LABEL: from_clang: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov w8, #135 +; CHECK-NEXT: and x9, x0, #0xffffff00 ; CHECK-NEXT: and w8, w0, w8 ; CHECK-NEXT: bfi w8, w1, #3, #4 -; CHECK-NEXT: and x9, x0, #0xffffff00 ; CHECK-NEXT: orr x0, x8, x9 ; CHECK-NEXT: ret entry: @@ -96,8 +96,8 @@ ; CHECK-LABEL: test_32bit_masked: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: mov w10, #135 +; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: and w8, w8, w10 ; CHECK-NEXT: bfi w8, w9, #3, #4 ; CHECK-NEXT: str w8, [x0] @@ -142,8 +142,8 @@ ; CHECK-LABEL: test_32bit_complexmask: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: mov w10, #647 +; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: and w8, w8, w10 ; CHECK-NEXT: bfi w8, w9, #3, #4 ; CHECK-NEXT: str w8, [x0] @@ -166,8 +166,8 @@ ; CHECK-LABEL: test_32bit_badmask: ; CHECK: // %bb.0: ; 
CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: mov w10, #135 +; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: mov w11, #632 ; CHECK-NEXT: and w8, w8, w10 ; CHECK-NEXT: and w9, w11, w9, lsl #3 @@ -191,13 +191,13 @@ define void @test_64bit_badmask(i64 *%existing, i64 *%new) { ; CHECK-LABEL: test_64bit_badmask: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ldr x9, [x1] -; CHECK-NEXT: mov w10, #135 -; CHECK-NEXT: and x8, x8, x10 -; CHECK-NEXT: lsl w9, w9, #3 -; CHECK-NEXT: mov w10, #664 -; CHECK-NEXT: and x9, x9, x10 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: mov w8, #135 +; CHECK-NEXT: ldr x10, [x1] +; CHECK-NEXT: mov w11, #664 +; CHECK-NEXT: and x8, x9, x8 +; CHECK-NEXT: lsl w10, w10, #3 +; CHECK-NEXT: and x9, x10, x11 ; CHECK-NEXT: orr x8, x8, x9 ; CHECK-NEXT: str x8, [x0] ; CHECK-NEXT: ret @@ -380,8 +380,8 @@ ; CHECK-LABEL: test_or_and_and2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: lsr w8, w0, #4 -; CHECK-NEXT: bfi w1, w8, #4, #12 ; CHECK-NEXT: mov w0, w1 +; CHECK-NEXT: bfi w0, w8, #4, #12 ; CHECK-NEXT: ret entry: %and = and i32 %a, 65520 ; 0x0000fff0 @@ -528,10 +528,10 @@ define i64 @test8(i64 %a) { ; CHECK-LABEL: test8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x9, #2035482624 -; CHECK-NEXT: and x8, x0, #0xff000000000000ff -; CHECK-NEXT: movk x9, #36694, lsl #32 -; CHECK-NEXT: orr x0, x8, x9 +; CHECK-NEXT: mov x8, #2035482624 +; CHECK-NEXT: and x9, x0, #0xff000000000000ff +; CHECK-NEXT: movk x8, #36694, lsl #32 +; CHECK-NEXT: orr x0, x9, x8 ; CHECK-NEXT: ret %1 = and i64 %a, -72057594037927681 ; 0xff000000000000ff %2 = or i64 %1, 157601565442048 ; 0x00008f5679530000 @@ -545,8 +545,8 @@ define i32 @test9(i64 %b, i32 %e) { ; CHECK-LABEL: test9: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr x0, x0, #12 ; CHECK-NEXT: lsr w8, w1, #23 +; CHECK-NEXT: lsr x0, x0, #12 ; CHECK-NEXT: bfi w0, w8, #23, #9 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/build-one-lane.ll b/llvm/test/CodeGen/AArch64/build-one-lane.ll --- a/llvm/test/CodeGen/AArch64/build-one-lane.ll +++ b/llvm/test/CodeGen/AArch64/build-one-lane.ll @@ -318,13 +318,14 @@ define <32 x i8> @test_lanex_32xi8(<32 x i8> %a, i32 %x) { ; CHECK-LABEL: test_lanex_32xi8: ; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: stp q0, q1, [sp, #-32]! 
-; CHECK-NEXT: and x8, x0, #0x1f -; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: and x9, x0, #0x1f +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: mov w10, #30 -; CHECK-NEXT: strb w10, [x9, x8] +; CHECK-NEXT: stp q0, q1, [sp] +; CHECK-NEXT: strb w10, [x8, x9] ; CHECK-NEXT: ldp q0, q1, [sp], #32 ; CHECK-NEXT: ret %b = insertelement <32 x i8> %a, i8 30, i32 %x diff --git a/llvm/test/CodeGen/AArch64/build-vector-extract.ll b/llvm/test/CodeGen/AArch64/build-vector-extract.ll --- a/llvm/test/CodeGen/AArch64/build-vector-extract.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-extract.ll @@ -16,9 +16,10 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; CHECK-LABEL: extract0_i32_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 0 %z = zext i32 %e to i64 @@ -29,8 +30,8 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s ; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 1 @@ -42,9 +43,10 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; CHECK-LABEL: extract1_i32_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 1 %z = zext i32 %e to i64 @@ -55,8 +57,8 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract2_i32_zext_insert0_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s ; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 2 @@ -68,9 +70,10 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; CHECK-LABEL: extract2_i32_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov w8, v0.s[2] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 2 %z = zext i32 %e to i64 @@ -93,9 +96,10 @@ define <2 x i64> @extract3_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; CHECK-LABEL: extract3_i32_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov w8, v0.s[3] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 3 %z = zext i32 %e to i64 @@ -119,9 +123,10 @@ define <2 x i64> @extract0_i32_zext_insert1_i64_zero(<4 x i32> %x) { ; CHECK-LABEL: extract0_i32_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: 
ret %e = extractelement <4 x i32> %x, i32 0 %z = zext i32 %e to i64 @@ -132,8 +137,8 @@ define <2 x i64> @extract1_i32_zext_insert1_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract1_i32_zext_insert1_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 1 @@ -145,9 +150,10 @@ define <2 x i64> @extract1_i32_zext_insert1_i64_zero(<4 x i32> %x) { ; CHECK-LABEL: extract1_i32_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 1 %z = zext i32 %e to i64 @@ -169,9 +175,10 @@ define <2 x i64> @extract2_i32_zext_insert1_i64_zero(<4 x i32> %x) { ; CHECK-LABEL: extract2_i32_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov w8, v0.s[2] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 2 %z = zext i32 %e to i64 @@ -194,9 +201,10 @@ define <2 x i64> @extract3_i32_zext_insert1_i64_zero(<4 x i32> %x) { ; CHECK-LABEL: extract3_i32_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov w8, v0.s[3] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 3 %z = zext i32 %e to i64 @@ -219,9 +227,10 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; CHECK-LABEL: extract0_i16_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <8 x i16> %x, i32 0 %z = zext i16 %e to i64 @@ -244,9 +253,10 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; CHECK-LABEL: extract1_i16_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <8 x i16> %x, i32 1 %z = zext i16 %e to i64 @@ -269,9 +279,10 @@ define <2 x i64> @extract2_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; CHECK-LABEL: extract2_i16_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <8 x i16> %x, i32 2 %z = zext i16 %e to i64 @@ -294,9 +305,10 @@ define <2 x i64> @extract3_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; CHECK-LABEL: extract3_i16_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov 
v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <8 x i16> %x, i32 3 %z = zext i16 %e to i64 @@ -319,9 +331,10 @@ define <2 x i64> @extract0_i16_zext_insert1_i64_zero(<8 x i16> %x) { ; CHECK-LABEL: extract0_i16_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <8 x i16> %x, i32 0 %z = zext i16 %e to i64 @@ -344,9 +357,10 @@ define <2 x i64> @extract1_i16_zext_insert1_i64_zero(<8 x i16> %x) { ; CHECK-LABEL: extract1_i16_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <8 x i16> %x, i32 1 %z = zext i16 %e to i64 @@ -369,9 +383,10 @@ define <2 x i64> @extract2_i16_zext_insert1_i64_zero(<8 x i16> %x) { ; CHECK-LABEL: extract2_i16_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <8 x i16> %x, i32 2 %z = zext i16 %e to i64 @@ -394,9 +409,10 @@ define <2 x i64> @extract3_i16_zext_insert1_i64_zero(<8 x i16> %x) { ; CHECK-LABEL: extract3_i16_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <8 x i16> %x, i32 3 %z = zext i16 %e to i64 @@ -421,9 +437,10 @@ define <2 x i64> @extract0_i8_zext_insert0_i64_zero(<16 x i8> %x) { ; CHECK-LABEL: extract0_i8_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.b[0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <16 x i8> %x, i32 0 %z = zext i8 %e to i64 @@ -446,9 +463,10 @@ define <2 x i64> @extract1_i8_zext_insert0_i64_zero(<16 x i8> %x) { ; CHECK-LABEL: extract1_i8_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <16 x i8> %x, i32 1 %z = zext i8 %e to i64 @@ -471,9 +489,10 @@ define <2 x i64> @extract2_i8_zext_insert0_i64_zero(<16 x i8> %x) { ; CHECK-LABEL: extract2_i8_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.b[2] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <16 x i8> %x, i32 2 %z = zext i8 %e to i64 @@ -496,9 +515,10 @@ define <2 x i64> @extract3_i8_zext_insert0_i64_zero(<16 x i8> %x) { ; CHECK-LABEL: extract3_i8_zext_insert0_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov 
w8, v0.b[3] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[0], x8 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <16 x i8> %x, i32 3 %z = zext i8 %e to i64 @@ -521,9 +541,10 @@ define <2 x i64> @extract0_i8_zext_insert1_i64_zero(<16 x i8> %x) { ; CHECK-LABEL: extract0_i8_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.b[0] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <16 x i8> %x, i32 0 %z = zext i8 %e to i64 @@ -546,9 +567,10 @@ define <2 x i64> @extract1_i8_zext_insert1_i64_zero(<16 x i8> %x) { ; CHECK-LABEL: extract1_i8_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <16 x i8> %x, i32 1 %z = zext i8 %e to i64 @@ -571,9 +593,10 @@ define <2 x i64> @extract2_i8_zext_insert1_i64_zero(<16 x i8> %x) { ; CHECK-LABEL: extract2_i8_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.b[2] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <16 x i8> %x, i32 2 %z = zext i8 %e to i64 @@ -596,9 +619,10 @@ define <2 x i64> @extract3_i8_zext_insert1_i64_zero(<16 x i8> %x) { ; CHECK-LABEL: extract3_i8_zext_insert1_i64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: umov w8, v0.b[3] -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret %e = extractelement <16 x i8> %x, i32 3 %z = zext i8 %e to i64 diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll --- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll +++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll @@ -40,7 +40,8 @@ ; CHECK-NEXT: mov w9, #42 ; CHECK-NEXT: cmp w8, #42 ; CHECK-NEXT: sub w9, w9, w0 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: cset w8, hi +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: strb w9, [x1] ; CHECK-NEXT: ret %s = sub i8 42, %x @@ -58,7 +59,8 @@ ; CHECK-NEXT: mov w9, #43 ; CHECK-NEXT: cmp w8, #43 ; CHECK-NEXT: sub w9, w9, w0 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: cset w8, hi +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: strh w9, [x1] ; CHECK-NEXT: ret %s = sub i16 43, %x @@ -73,9 +75,10 @@ ; CHECK-LABEL: usubo_ult_constant_op1_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: cmp w8, #44 ; CHECK-NEXT: sub w9, w0, #44 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: cmp w8, #44 +; CHECK-NEXT: cset w8, lo +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: strh w9, [x1] ; CHECK-NEXT: ret %s = add i16 %x, -44 @@ -88,9 +91,9 @@ ; CHECK-LABEL: usubo_ugt_constant_op1_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: sub w9, w0, #45 ; CHECK-NEXT: cmp w8, #45 ; CHECK-NEXT: cset w8, lo -; CHECK-NEXT: sub w9, w0, #45 ; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: strb w9, [x1] ; CHECK-NEXT: ret @@ -106,9 +109,10 @@ ; CHECK-LABEL: usubo_eq_constant1_op1_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: sub w8, w0, #1 -; CHECK-NEXT: cset w0, eq -; CHECK-NEXT: 
str w8, [x1] +; CHECK-NEXT: sub w9, w0, #1 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: str w9, [x1] ; CHECK-NEXT: ret %s = add i32 %x, -1 %ov = icmp eq i32 %x, 0 @@ -157,10 +161,10 @@ ; CHECK-NEXT: tbz w3, #0, .LBB8_3 ; CHECK-NEXT: // %bb.1: // %t ; CHECK-NEXT: cmp x0, x1 -; CHECK-NEXT: cset w21, lo ; CHECK-NEXT: mov x23, x0 -; CHECK-NEXT: mov w0, w21 +; CHECK-NEXT: cset w21, lo ; CHECK-NEXT: mov x20, x2 +; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: mov x22, x1 ; CHECK-NEXT: bl call ; CHECK-NEXT: subs x8, x23, x22 diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll --- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll +++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll @@ -71,8 +71,8 @@ define i64 @not_sign_i64(i64 %a) { ; CHECK-LABEL: not_sign_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, #0 ; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: cmp x0, #0 ; CHECK-NEXT: cneg x0, x8, le ; CHECK-NEXT: ret %c = icmp sgt i64 %a, 0 @@ -114,8 +114,8 @@ define <7 x i8> @sign_7xi8(<7 x i8> %a) { ; CHECK-LABEL: sign_7xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.8b, v0.8b, #7 ; CHECK-NEXT: movi v1.8b, #1 +; CHECK-NEXT: sshr v0.8b, v0.8b, #7 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %c = icmp sgt <7 x i8> %a, @@ -126,8 +126,8 @@ define <8 x i8> @sign_8xi8(<8 x i8> %a) { ; CHECK-LABEL: sign_8xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.8b, v0.8b, #7 ; CHECK-NEXT: movi v1.8b, #1 +; CHECK-NEXT: sshr v0.8b, v0.8b, #7 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %c = icmp sgt <8 x i8> %a, @@ -138,8 +138,8 @@ define <16 x i8> @sign_16xi8(<16 x i8> %a) { ; CHECK-LABEL: sign_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: sshr v0.16b, v0.16b, #7 ; CHECK-NEXT: movi v1.16b, #1 +; CHECK-NEXT: sshr v0.16b, v0.16b, #7 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %c = icmp sgt <16 x i8> %a, @@ -198,9 +198,9 @@ ; CHECK-LABEL: not_sign_4xi32: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] -; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_0] +; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s ; CHECK-NEXT: and v1.16b, v0.16b, v1.16b ; CHECK-NEXT: orn v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret @@ -213,10 +213,10 @@ define <4 x i32> @not_sign_4xi32_2(<4 x i32> %a) { ; CHECK-LABEL: not_sign_4xi32_2: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff ; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] -; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff -; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s ; CHECK-NEXT: and v1.16b, v0.16b, v1.16b ; CHECK-NEXT: orn v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret @@ -229,12 +229,12 @@ define <4 x i32> @not_sign_4xi32_3(<4 x i32> %a) { ; CHECK-LABEL: not_sign_4xi32_3: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff ; CHECK-NEXT: adrp x8, .LCPI18_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] -; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff -; CHECK-NEXT: cmgt v0.4s, v0.4s, v2.4s -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: cmgt v0.4s, v0.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %c = icmp sgt <4 x i32> %a, %res = select <4 x i1> %c, <4 x i32> , <4 x i32> @@ -246,19 +246,19 @@ ; CHECK-LABEL: sign_4xi65: ; CHECK: // %bb.0: ; CHECK-NEXT: sbfx x8, x1, #0, #1 -; CHECK-NEXT: sbfx x9, 
x7, #0, #1 -; CHECK-NEXT: orr x6, x9, #0x1 -; CHECK-NEXT: lsr x7, x9, #63 +; CHECK-NEXT: sbfx x10, x5, #0, #1 ; CHECK-NEXT: orr x9, x8, #0x1 ; CHECK-NEXT: lsr x1, x8, #63 -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: sbfx x10, x5, #0, #1 -; CHECK-NEXT: sbfx x11, x3, #0, #1 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: orr x2, x11, #0x1 -; CHECK-NEXT: lsr x3, x11, #63 +; CHECK-NEXT: sbfx x8, x7, #0, #1 ; CHECK-NEXT: orr x4, x10, #0x1 ; CHECK-NEXT: lsr x5, x10, #63 +; CHECK-NEXT: orr x6, x8, #0x1 +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: sbfx x9, x3, #0, #1 +; CHECK-NEXT: orr x2, x9, #0x1 +; CHECK-NEXT: lsr x3, x9, #63 +; CHECK-NEXT: lsr x7, x8, #63 +; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret %c = icmp sgt <4 x i65> %a, diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll --- a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll +++ b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll @@ -18,8 +18,8 @@ ; CHECK-NEXT: mov w0, #1 ; CHECK-NEXT: ret ; CHECK-NEXT: LBB0_4: ; %cmpxchg.nostore -; CHECK-NEXT: clrex ; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: clrex ; CHECK-NEXT: ret ; ; OUTLINE-ATOMICS-LABEL: test_return: @@ -68,8 +68,8 @@ ; CHECK-NEXT: eor w0, w8, #0x1 ; CHECK-NEXT: ret ; CHECK-NEXT: LBB1_4: ; %cmpxchg.nostore -; CHECK-NEXT: clrex ; CHECK-NEXT: eor w0, wzr, #0x1 +; CHECK-NEXT: clrex ; CHECK-NEXT: ret ; ; OUTLINE-ATOMICS-LABEL: test_return_bool: @@ -191,8 +191,8 @@ ; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: b LBB3_5 ; CHECK-NEXT: LBB3_4: ; %cmpxchg.nostore -; CHECK-NEXT: clrex ; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: clrex ; CHECK-NEXT: LBB3_5: ; %for.cond.preheader ; CHECK-NEXT: mov w22, #2 ; CHECK-NEXT: LBB3_6: ; %for.cond @@ -201,8 +201,8 @@ ; CHECK-NEXT: ; %bb.7: ; %for.body ; CHECK-NEXT: ; in Loop: Header=BB3_6 Depth=1 ; CHECK-NEXT: sub w22, w22, #1 -; CHECK-NEXT: ldr w10, [x19, w22, sxtw #2] ; CHECK-NEXT: orr w9, w21, w20 +; CHECK-NEXT: ldr w10, [x19, w22, sxtw #2] ; CHECK-NEXT: cmp w9, w10 ; CHECK-NEXT: b.eq LBB3_6 ; CHECK-NEXT: ; %bb.8: ; %if.then @@ -214,8 +214,8 @@ ; CHECK-NEXT: b LBB3_6 ; CHECK-NEXT: LBB3_9: ; %for.cond.cleanup ; CHECK-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload -; CHECK-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload ; CHECK-NEXT: ret ; @@ -236,16 +236,16 @@ ; OUTLINE-ATOMICS-NEXT: mov w21, w0 ; OUTLINE-ATOMICS-NEXT: bl ___aarch64_cas4_acq_rel ; OUTLINE-ATOMICS-NEXT: cmp w0, w21 -; OUTLINE-ATOMICS-NEXT: cset w8, eq ; OUTLINE-ATOMICS-NEXT: mov w22, #2 +; OUTLINE-ATOMICS-NEXT: cset w8, eq ; OUTLINE-ATOMICS-NEXT: LBB3_1: ; %for.cond ; OUTLINE-ATOMICS-NEXT: ; =>This Inner Loop Header: Depth=1 ; OUTLINE-ATOMICS-NEXT: cbz w22, LBB3_4 ; OUTLINE-ATOMICS-NEXT: ; %bb.2: ; %for.body ; OUTLINE-ATOMICS-NEXT: ; in Loop: Header=BB3_1 Depth=1 ; OUTLINE-ATOMICS-NEXT: sub w22, w22, #1 -; OUTLINE-ATOMICS-NEXT: ldr w10, [x19, w22, sxtw #2] ; OUTLINE-ATOMICS-NEXT: orr w9, w21, w20 +; OUTLINE-ATOMICS-NEXT: ldr w10, [x19, w22, sxtw #2] ; OUTLINE-ATOMICS-NEXT: cmp w9, w10 ; OUTLINE-ATOMICS-NEXT: b.eq LBB3_1 ; OUTLINE-ATOMICS-NEXT: ; %bb.3: ; %if.then @@ -257,8 +257,8 @@ ; OUTLINE-ATOMICS-NEXT: b LBB3_1 ; OUTLINE-ATOMICS-NEXT: LBB3_4: ; %for.cond.cleanup ; OUTLINE-ATOMICS-NEXT: ldp x29, x30, [sp, #32] ; 16-byte Folded Reload -; OUTLINE-ATOMICS-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload ; OUTLINE-ATOMICS-NEXT: and w0, w8, #0x1 +; 
OUTLINE-ATOMICS-NEXT: ldp x20, x19, [sp, #16] ; 16-byte Folded Reload ; OUTLINE-ATOMICS-NEXT: ldp x22, x21, [sp], #48 ; 16-byte Folded Reload ; OUTLINE-ATOMICS-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll --- a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll +++ b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll @@ -13,17 +13,17 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:a ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] -; CHECK-NEXT: ldr w8, [x8] -; CHECK-NEXT: cmp w8, #10 +; CHECK-NEXT: ldr w9, [x8] ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] +; CHECK-NEXT: cmp w9, #10 ; CHECK-NEXT: b.le .LBB0_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true -; CHECK-NEXT: adrp x10, :got:c -; CHECK-NEXT: ldr w9, [x8] -; CHECK-NEXT: ldr x10, [x10, :got_lo12:c] -; CHECK-NEXT: ldr w10, [x10] -; CHECK-NEXT: cmp w9, w10 +; CHECK-NEXT: adrp x9, :got:c +; CHECK-NEXT: ldr x9, [x9, :got_lo12:c] +; CHECK-NEXT: ldr w10, [x8] +; CHECK-NEXT: ldr w9, [x9] +; CHECK-NEXT: cmp w10, w9 ; CHECK-NEXT: b.ne .LBB0_4 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, #1 @@ -32,8 +32,8 @@ ; CHECK-NEXT: b.lt .LBB0_6 ; CHECK-NEXT: .LBB0_4: // %land.lhs.true3 ; CHECK-NEXT: adrp x9, :got:d -; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] +; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: b.ne .LBB0_6 @@ -145,17 +145,17 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:a ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] -; CHECK-NEXT: ldr w8, [x8] -; CHECK-NEXT: cmp w8, #5 +; CHECK-NEXT: ldr w9, [x8] ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] +; CHECK-NEXT: cmp w9, #5 ; CHECK-NEXT: b.ge .LBB2_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true -; CHECK-NEXT: adrp x10, :got:c -; CHECK-NEXT: ldr w9, [x8] -; CHECK-NEXT: ldr x10, [x10, :got_lo12:c] -; CHECK-NEXT: ldr w10, [x10] -; CHECK-NEXT: cmp w9, w10 +; CHECK-NEXT: adrp x9, :got:c +; CHECK-NEXT: ldr x9, [x9, :got_lo12:c] +; CHECK-NEXT: ldr w10, [x8] +; CHECK-NEXT: ldr w9, [x9] +; CHECK-NEXT: cmp w10, w9 ; CHECK-NEXT: b.ne .LBB2_4 ; CHECK-NEXT: // %bb.2: ; CHECK-NEXT: mov w0, #1 @@ -164,8 +164,8 @@ ; CHECK-NEXT: b.gt .LBB2_6 ; CHECK-NEXT: .LBB2_4: // %land.lhs.true3 ; CHECK-NEXT: adrp x9, :got:d -; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] +; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: b.ne .LBB2_6 @@ -424,13 +424,13 @@ ; CHECK-NEXT: .cfi_offset w21, -24 ; CHECK-NEXT: .cfi_offset w22, -32 ; CHECK-NEXT: .cfi_offset w30, -48 -; CHECK-NEXT: ldr x19, [x0] -; CHECK-NEXT: mov w20, #24 +; CHECK-NEXT: ldr x20, [x0] +; CHECK-NEXT: mov w19, #24 ; CHECK-NEXT: adrp x22, glob -; CHECK-NEXT: add x21, x19, #2 +; CHECK-NEXT: add x21, x20, #2 ; CHECK-NEXT: .LBB6_1: // %land.rhs ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr x8, [x20] +; CHECK-NEXT: ldr x8, [x19] ; CHECK-NEXT: cmp x8, #1 ; CHECK-NEXT: b.lt .LBB6_3 ; CHECK-NEXT: // %bb.2: // %while.body @@ -438,7 +438,7 @@ ; CHECK-NEXT: ldr x0, [x22, :lo12:glob] ; CHECK-NEXT: bl Update ; CHECK-NEXT: sub x21, x21, #2 -; CHECK-NEXT: cmp x19, x21 +; CHECK-NEXT: cmp x20, x21 ; CHECK-NEXT: b.lt .LBB6_1 ; CHECK-NEXT: .LBB6_3: // %while.end ; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload @@ -662,15 +662,15 @@ ; CHECK-NEXT: mov w0, #-1 ; CHECK-NEXT: bl yoo ; CHECK-NEXT: cmp w19, #0 -; CHECK-NEXT: cinc w0, w19, gt ; CHECK-NEXT: mov w1, #2 
+; CHECK-NEXT: cinc w0, w19, gt ; CHECK-NEXT: fmov d8, d0 ; CHECK-NEXT: bl xoo ; CHECK-NEXT: fmov d0, #-1.00000000 -; CHECK-NEXT: fadd d0, d8, d0 ; CHECK-NEXT: fcmp d8, #0.0 -; CHECK-NEXT: fcsel d0, d8, d0, gt ; CHECK-NEXT: fmov d1, #-2.00000000 +; CHECK-NEXT: fadd d0, d8, d0 +; CHECK-NEXT: fcsel d0, d8, d0, gt ; CHECK-NEXT: bl woo ; CHECK-NEXT: mov w0, #4 ; CHECK-NEXT: .LBB9_4: // %return @@ -720,8 +720,8 @@ ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: csinc w8, w8, wzr, gt ; CHECK-NEXT: cmp w0, #2, lsl #12 // =8192 ; CHECK-NEXT: mov w9, #128 diff --git a/llvm/test/CodeGen/AArch64/cond-sel-value-prop.ll b/llvm/test/CodeGen/AArch64/cond-sel-value-prop.ll --- a/llvm/test/CodeGen/AArch64/cond-sel-value-prop.ll +++ b/llvm/test/CodeGen/AArch64/cond-sel-value-prop.ll @@ -5,8 +5,8 @@ define i32 @test1(i32 %x) { ; CHECK-LABEL: test1: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, #2 ; CHECK-NEXT: mov w8, #7 +; CHECK-NEXT: cmp w0, #2 ; CHECK-NEXT: csel w0, w0, w8, eq ; CHECK-NEXT: ret %cmp = icmp eq i32 %x, 2 @@ -18,8 +18,8 @@ define i64 @test2(i64 %x) { ; CHECK-LABEL: test2: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, #2 ; CHECK-NEXT: mov w8, #7 +; CHECK-NEXT: cmp x0, #2 ; CHECK-NEXT: csel x0, x0, x8, eq ; CHECK-NEXT: ret %cmp = icmp eq i64 %x, 2 @@ -31,8 +31,8 @@ define i64 @test3(i64 %x) { ; CHECK-LABEL: test3: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, #7 ; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: cmp x0, #7 ; CHECK-NEXT: csel x0, x8, x0, ne ; CHECK-NEXT: ret %cmp = icmp ne i64 %x, 7 @@ -45,8 +45,8 @@ define i64 @test4(i64 %x) { ; CHECK-LABEL: test4: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, #0 ; CHECK-NEXT: mov w8, #7 +; CHECK-NEXT: cmp x0, #0 ; CHECK-NEXT: csel x0, xzr, x8, eq ; CHECK-NEXT: ret %cmp = icmp eq i64 %x, 0 @@ -60,8 +60,8 @@ define i64 @test5(i64 %x) { ; CHECK-LABEL: test5: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp x0, #1 ; CHECK-NEXT: mov w8, #7 +; CHECK-NEXT: cmp x0, #1 ; CHECK-NEXT: csinc x0, x8, xzr, ne ; CHECK-NEXT: ret %cmp = icmp eq i64 %x, 1 @@ -75,8 +75,8 @@ define i64 @test6(i64 %x) { ; CHECK-LABEL: test6: ; CHECK: // %bb.0: -; CHECK-NEXT: cmn x0, #1 ; CHECK-NEXT: mov w8, #7 +; CHECK-NEXT: cmn x0, #1 ; CHECK-NEXT: csinv x0, x8, xzr, ne ; CHECK-NEXT: ret %cmp = icmp eq i64 %x, -1 diff --git a/llvm/test/CodeGen/AArch64/consthoist-gep.ll b/llvm/test/CodeGen/AArch64/consthoist-gep.ll --- a/llvm/test/CodeGen/AArch64/consthoist-gep.ll +++ b/llvm/test/CodeGen/AArch64/consthoist-gep.ll @@ -35,8 +35,8 @@ ; CHECK-NEXT: // %bb.1: // %bb3 ; CHECK-NEXT: mov w9, #44032 ; CHECK-NEXT: movk w9, #12296, lsl #16 -; CHECK-NEXT: ldr w10, [x9] ; CHECK-NEXT: orr w11, w9, #0x4 +; CHECK-NEXT: ldr w10, [x9] ; CHECK-NEXT: stur w10, [x8, #158] ; CHECK-NEXT: ldr w10, [x11] ; CHECK-NEXT: orr w11, w9, #0x8 @@ -51,26 +51,26 @@ ; CHECK-NEXT: orr w11, w9, w11 ; CHECK-NEXT: and w10, w10, #0x1f1f1f1f ; CHECK-NEXT: stur w10, [x8, #170] +; CHECK-NEXT: mov w10, #176 ; CHECK-NEXT: ldr w8, [x11] -; CHECK-NEXT: adrp x10, global+528 -; CHECK-NEXT: add x10, x10, :lo12:global+528 -; CHECK-NEXT: mov w11, #176 +; CHECK-NEXT: adrp x11, global+528 +; CHECK-NEXT: add x11, x11, :lo12:global+528 +; CHECK-NEXT: orr w10, w9, w10 ; CHECK-NEXT: and w8, w8, #0xffffff -; CHECK-NEXT: orr w11, w9, w11 -; CHECK-NEXT: str w8, [x10] -; CHECK-NEXT: ldr w8, [x11] -; CHECK-NEXT: mov w11, #180 -; CHECK-NEXT: orr w11, w9, w11 +; CHECK-NEXT: str w8, [x11] +; CHECK-NEXT: ldr w8, 
+; CHECK-NEXT: mov w10, #180
+; CHECK-NEXT: orr w10, w9, w10
; CHECK-NEXT: and w8, w8, #0xffffff
-; CHECK-NEXT: str w8, [x10, #4]
-; CHECK-NEXT: ldr w8, [x11]
-; CHECK-NEXT: mov w11, #184
+; CHECK-NEXT: str w8, [x11, #4]
+; CHECK-NEXT: ldr w8, [x10]
+; CHECK-NEXT: mov w10, #184
+; CHECK-NEXT: orr w9, w9, w10
; CHECK-NEXT: and w8, w8, #0xffffff
-; CHECK-NEXT: str w8, [x10, #8]
-; CHECK-NEXT: orr w8, w9, w11
-; CHECK-NEXT: ldr w8, [x8]
+; CHECK-NEXT: str w8, [x11, #8]
+; CHECK-NEXT: ldr w8, [x9]
; CHECK-NEXT: and w8, w8, #0xffffff
-; CHECK-NEXT: str w8, [x10, #12]
+; CHECK-NEXT: str w8, [x11, #12]
; CHECK-NEXT: .LBB0_2: // %bb19
; CHECK-NEXT: ret
bb:
diff --git a/llvm/test/CodeGen/AArch64/csr-split.ll b/llvm/test/CodeGen/AArch64/csr-split.ll
--- a/llvm/test/CodeGen/AArch64/csr-split.ll
+++ b/llvm/test/CodeGen/AArch64/csr-split.ll
@@ -85,8 +85,8 @@
; CHECK-NEXT: cbz x0, .LBB1_3
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: adrp x8, a
-; CHECK-NEXT: ldrsw x8, [x8, :lo12:a]
; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: ldrsw x8, [x8, :lo12:a]
; CHECK-NEXT: cmp x8, x0
; CHECK-NEXT: b.ne .LBB1_3
; CHECK-NEXT: // %bb.2: // %if.then2
@@ -112,9 +112,9 @@
; CHECK-APPLE-NEXT: ; %bb.1: ; %entry
; CHECK-APPLE-NEXT: Lloh2:
; CHECK-APPLE-NEXT: adrp x8, _a@PAGE
+; CHECK-APPLE-NEXT: mov x19, x0
; CHECK-APPLE-NEXT: Lloh3:
; CHECK-APPLE-NEXT: ldrsw x8, [x8, _a@PAGEOFF]
-; CHECK-APPLE-NEXT: mov x19, x0
; CHECK-APPLE-NEXT: cmp x8, x0
; CHECK-APPLE-NEXT: b.ne LBB1_3
; CHECK-APPLE-NEXT: ; %bb.2: ; %if.then2
diff --git a/llvm/test/CodeGen/AArch64/ctpop-nonean.ll b/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
--- a/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
@@ -7,29 +7,29 @@
; CHECK-LABEL: ctpop_i128:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr x8, x1, #1
+; CHECK-NEXT: lsr x9, x0, #1
; CHECK-NEXT: and x8, x8, #0x5555555555555555
+; CHECK-NEXT: and x9, x9, #0x5555555555555555
; CHECK-NEXT: sub x8, x1, x8
-; CHECK-NEXT: lsr x10, x0, #1
-; CHECK-NEXT: and x10, x10, #0x5555555555555555
-; CHECK-NEXT: and x11, x8, #0x3333333333333333
+; CHECK-NEXT: sub x9, x0, x9
+; CHECK-NEXT: and x10, x8, #0x3333333333333333
; CHECK-NEXT: lsr x8, x8, #2
-; CHECK-NEXT: sub x10, x0, x10
+; CHECK-NEXT: and x11, x9, #0x3333333333333333
+; CHECK-NEXT: lsr x9, x9, #2
; CHECK-NEXT: and x8, x8, #0x3333333333333333
-; CHECK-NEXT: add x8, x11, x8
-; CHECK-NEXT: and x11, x10, #0x3333333333333333
-; CHECK-NEXT: lsr x10, x10, #2
-; CHECK-NEXT: and x10, x10, #0x3333333333333333
-; CHECK-NEXT: add x10, x11, x10
+; CHECK-NEXT: and x9, x9, #0x3333333333333333
+; CHECK-NEXT: add x8, x10, x8
+; CHECK-NEXT: add x9, x11, x9
+; CHECK-NEXT: mov x10, #72340172838076673
+; CHECK-NEXT: mov x1, xzr
; CHECK-NEXT: add x8, x8, x8, lsr #4
-; CHECK-NEXT: add x10, x10, x10, lsr #4
-; CHECK-NEXT: mov x9, #72340172838076673
+; CHECK-NEXT: add x9, x9, x9, lsr #4
; CHECK-NEXT: and x8, x8, #0xf0f0f0f0f0f0f0f
-; CHECK-NEXT: and x10, x10, #0xf0f0f0f0f0f0f0f
-; CHECK-NEXT: mul x8, x8, x9
-; CHECK-NEXT: mul x9, x10, x9
+; CHECK-NEXT: and x9, x9, #0xf0f0f0f0f0f0f0f
+; CHECK-NEXT: mul x8, x8, x10
+; CHECK-NEXT: mul x9, x9, x10
; CHECK-NEXT: lsr x9, x9, #56
; CHECK-NEXT: add x0, x9, x8, lsr #56
-; CHECK-NEXT: mov x1, xzr
; CHECK-NEXT: ret
%c = call i128 @llvm.ctpop.i128(i128 %i)
ret i128 %c
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-select.ll b/llvm/test/CodeGen/AArch64/dag-combine-select.ll
--- a/llvm/test/CodeGen/AArch64/dag-combine-select.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-select.ll
@@ -26,11 +26,11 @@
; CHECK-LABEL: test1:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp w0, #7
+; CHECK-NEXT: adrp x8, out
; CHECK-NEXT: csel w9, w1, w2, eq
; CHECK-NEXT: cmp w9, #13
; CHECK-NEXT: csel w9, w1, w2, lo
; CHECK-NEXT: cmp w0, #42
-; CHECK-NEXT: adrp x8, out
; CHECK-NEXT: csel w10, w1, w9, eq
; CHECK-NEXT: str w9, [x8, :lo12:out]
; CHECK-NEXT: str w10, [x8, :lo12:out]
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll b/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll
--- a/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll
@@ -10,13 +10,13 @@
define void @no_combine(i32 %p) local_unnamed_addr {
; CHECK-LABEL: no_combine:
; CHECK: // %bb.0:
-; CHECK-NEXT: dup v0.4s, w0
-; CHECK-NEXT: movi v1.4h, #4
-; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: xtn v1.8b, v0.8h
-; CHECK-NEXT: xtn2 v1.16b, v0.8h
-; CHECK-NEXT: str q1, [x8]
+; CHECK-NEXT: movi v0.4h, #4
+; CHECK-NEXT: dup v1.4s, w0
+; CHECK-NEXT: xtn v1.4h, v1.4s
+; CHECK-NEXT: mov v1.d[1], v0.d[0]
+; CHECK-NEXT: xtn v0.8b, v1.8h
+; CHECK-NEXT: xtn2 v0.16b, v1.8h
+; CHECK-NEXT: str q0, [x8]
; CHECK-NEXT: ret
; The two shufflevector operations are needed to force the DAGCombine to happen
diff --git a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
--- a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
+++ b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
@@ -7,21 +7,21 @@
; CHECK-LABEL: signbits_vXi1:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT: adrp x8, .LCPI0_1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT: mov w1, wzr
; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: mov w2, wzr
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: adrp x8, .LCPI0_1
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-NEXT: movi v1.4h, #1
; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: sshr v0.4h, v0.4h, #15
; CHECK-NEXT: umov w0, v0.h[0]
; CHECK-NEXT: umov w3, v0.h[3]
-; CHECK-NEXT: mov w1, wzr
-; CHECK-NEXT: mov w2, wzr
; CHECK-NEXT: b foo
%tmp3 = shufflevector <4 x i16> %a1, <4 x i16> undef, <4 x i32> zeroinitializer
%tmp5 = add <4 x i16> %tmp3,
diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll
--- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll
@@ -70,69 +70,69 @@
define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst) nounwind {
; ALL-LABEL: vector_i128_i8:
; ALL: // %bb.0:
-; ALL-NEXT: smov w10, v1.b[0]
-; ALL-NEXT: smov w11, v0.b[0]
; ALL-NEXT: smov w8, v1.b[1]
; ALL-NEXT: smov w9, v0.b[1]
-; ALL-NEXT: sdiv w10, w11, w10
-; ALL-NEXT: smov w12, v1.b[2]
-; ALL-NEXT: smov w13, v0.b[2]
+; ALL-NEXT: smov w10, v0.b[0]
+; ALL-NEXT: smov w11, v0.b[2]
+; ALL-NEXT: smov w12, v0.b[3]
+; ALL-NEXT: smov w13, v0.b[4]
+; ALL-NEXT: smov w14, v0.b[5]
+; ALL-NEXT: smov w15, v0.b[6]
; ALL-NEXT: sdiv w8, w9, w8
-; ALL-NEXT: fmov s2, w10
-; ALL-NEXT: smov w14, v1.b[3]
-; ALL-NEXT: smov w15, v0.b[3]
-; ALL-NEXT: sdiv w12, w13, w12
+; ALL-NEXT: smov w9, v1.b[0]
+; ALL-NEXT: smov w16, v0.b[7]
+; ALL-NEXT: smov w17, v0.b[8]
+; ALL-NEXT: sdiv w9, w10, w9
+; ALL-NEXT: smov w10, v1.b[2]
+; ALL-NEXT: sdiv w10, w11, w10
+; ALL-NEXT: smov w11, v1.b[3]
+; ALL-NEXT: fmov s2, w9
+; ALL-NEXT: smov w9, v1.b[9]
; ALL-NEXT: mov v2.b[1], w8
-; ALL-NEXT: smov w16, v1.b[4]
-; ALL-NEXT: smov w17, v0.b[4]
+; ALL-NEXT: sdiv w11, w12, w11
+; ALL-NEXT: smov w12, v1.b[4]
+; ALL-NEXT: mov v2.b[2], w10
+; ALL-NEXT: smov w10, v0.b[10]
+; ALL-NEXT: sdiv w12, w13, w12
+; ALL-NEXT: smov w13, v1.b[5]
+; ALL-NEXT: mov v2.b[3], w11
+; ALL-NEXT: smov w11, v0.b[11]
+; ALL-NEXT: sdiv w13, w14, w13
+; ALL-NEXT: smov w14, v1.b[6]
+; ALL-NEXT: mov v2.b[4], w12
+; ALL-NEXT: smov w12, v0.b[12]
; ALL-NEXT: sdiv w14, w15, w14
-; ALL-NEXT: mov v2.b[2], w12
-; ALL-NEXT: smov w18, v1.b[5]
-; ALL-NEXT: smov w1, v0.b[5]
+; ALL-NEXT: smov w15, v1.b[7]
+; ALL-NEXT: mov v2.b[5], w13
+; ALL-NEXT: smov w13, v0.b[13]
+; ALL-NEXT: sdiv w15, w16, w15
+; ALL-NEXT: smov w16, v1.b[8]
+; ALL-NEXT: mov v2.b[6], w14
; ALL-NEXT: sdiv w16, w17, w16
-; ALL-NEXT: mov v2.b[3], w14
-; ALL-NEXT: smov w2, v1.b[6]
-; ALL-NEXT: smov w3, v0.b[6]
-; ALL-NEXT: sdiv w18, w1, w18
-; ALL-NEXT: mov v2.b[4], w16
-; ALL-NEXT: smov w4, v1.b[7]
-; ALL-NEXT: smov w5, v0.b[7]
-; ALL-NEXT: sdiv w2, w3, w2
-; ALL-NEXT: mov v2.b[5], w18
-; ALL-NEXT: smov w9, v1.b[8]
-; ALL-NEXT: smov w11, v0.b[8]
-; ALL-NEXT: sdiv w4, w5, w4
-; ALL-NEXT: mov v2.b[6], w2
-; ALL-NEXT: smov w13, v1.b[9]
-; ALL-NEXT: smov w15, v0.b[9]
-; ALL-NEXT: sdiv w9, w11, w9
-; ALL-NEXT: mov v2.b[7], w4
-; ALL-NEXT: smov w17, v1.b[10]
-; ALL-NEXT: smov w1, v0.b[10]
-; ALL-NEXT: sdiv w13, w15, w13
-; ALL-NEXT: mov v2.b[8], w9
-; ALL-NEXT: smov w3, v1.b[11]
-; ALL-NEXT: smov w5, v0.b[11]
-; ALL-NEXT: sdiv w17, w1, w17
-; ALL-NEXT: mov v2.b[9], w13
+; ALL-NEXT: smov w17, v0.b[9]
+; ALL-NEXT: mov v2.b[7], w15
+; ALL-NEXT: sdiv w8, w17, w9
+; ALL-NEXT: smov w9, v1.b[10]
+; ALL-NEXT: mov v2.b[8], w16
+; ALL-NEXT: sdiv w9, w10, w9
+; ALL-NEXT: smov w10, v1.b[11]
+; ALL-NEXT: mov v2.b[9], w8
+; ALL-NEXT: sdiv w10, w11, w10
; ALL-NEXT: smov w11, v1.b[12]
-; ALL-NEXT: smov w15, v0.b[12]
-; ALL-NEXT: sdiv w3, w5, w3
-; ALL-NEXT: mov v2.b[10], w17
-; ALL-NEXT: smov w1, v1.b[13]
-; ALL-NEXT: smov w5, v0.b[13]
-; ALL-NEXT: sdiv w11, w15, w11
-; ALL-NEXT: mov v2.b[11], w3
-; ALL-NEXT: smov w15, v1.b[14]
-; ALL-NEXT: sdiv w1, w5, w1
-; ALL-NEXT: smov w5, v0.b[14]
+; ALL-NEXT: mov v2.b[10], w9
+; ALL-NEXT: smov w9, v1.b[14]
+; ALL-NEXT: sdiv w11, w12, w11
+; ALL-NEXT: smov w12, v1.b[13]
+; ALL-NEXT: mov v2.b[11], w10
+; ALL-NEXT: smov w10, v1.b[15]
+; ALL-NEXT: sdiv w8, w13, w12
+; ALL-NEXT: smov w12, v0.b[14]
; ALL-NEXT: mov v2.b[12], w11
-; ALL-NEXT: sdiv w15, w5, w15
-; ALL-NEXT: smov w8, v1.b[15]
-; ALL-NEXT: mov v2.b[13], w1
-; ALL-NEXT: smov w9, v0.b[15]
-; ALL-NEXT: mov v2.b[14], w15
-; ALL-NEXT: sdiv w8, w9, w8
+; ALL-NEXT: smov w11, v0.b[15]
+; ALL-NEXT: sdiv w9, w12, w9
+; ALL-NEXT: mov v2.b[13], w8
+; ALL-NEXT: sdiv w8, w11, w10
+; ALL-NEXT: mov v2.b[14], w9
; ALL-NEXT: mov v2.b[15], w8
; ALL-NEXT: mls v0.16b, v2.16b, v1.16b
; ALL-NEXT: str q2, [x0]
@@ -147,37 +147,37 @@
define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, <8 x i16>* %divdst) nounwind {
; ALL-LABEL: vector_i128_i16:
; ALL: // %bb.0:
-; ALL-NEXT: smov w10, v1.h[0]
-; ALL-NEXT: smov w11, v0.h[0]
; ALL-NEXT: smov w8, v1.h[1]
; ALL-NEXT: smov w9, v0.h[1]
-; ALL-NEXT: sdiv w10, w11, w10
-; ALL-NEXT: smov w12, v1.h[2]
-; ALL-NEXT: smov w13, v0.h[2]
+; ALL-NEXT: smov w10, v0.h[0]
+; ALL-NEXT: smov w11, v0.h[2]
+; ALL-NEXT: smov w12, v0.h[3]
+; ALL-NEXT: smov w13, v0.h[4]
; ALL-NEXT: sdiv w8, w9, w8
-; ALL-NEXT: fmov s2, w10
-; ALL-NEXT: smov w14, v1.h[3]
-; ALL-NEXT: smov w15, v0.h[3]
-; ALL-NEXT: sdiv w12, w13, w12
+; ALL-NEXT: smov w9, v1.h[0]
+; ALL-NEXT: sdiv w9, w10, w9
+; ALL-NEXT: smov w10, v1.h[2]
+; ALL-NEXT: sdiv w10, w11, w10
+; ALL-NEXT: smov w11, v1.h[3]
+; ALL-NEXT: fmov s2, w9
+; ALL-NEXT: smov w9, v1.h[5]
; ALL-NEXT: mov v2.h[1], w8
-; ALL-NEXT: smov w9, v1.h[4]
-; ALL-NEXT: smov w11, v0.h[4]
-; ALL-NEXT: sdiv w14, w15, w14
-; ALL-NEXT: mov v2.h[2], w12
-; ALL-NEXT: smov w13, v1.h[5]
-; ALL-NEXT: smov w15, v0.h[5]
-; ALL-NEXT: sdiv w9, w11, w9
-; ALL-NEXT: mov v2.h[3], w14
-; ALL-NEXT: smov w11, v1.h[6]
-; ALL-NEXT: sdiv w13, w15, w13
-; ALL-NEXT: smov w15, v0.h[6]
-; ALL-NEXT: mov v2.h[4], w9
-; ALL-NEXT: sdiv w11, w15, w11
-; ALL-NEXT: smov w8, v1.h[7]
-; ALL-NEXT: mov v2.h[5], w13
-; ALL-NEXT: smov w9, v0.h[7]
-; ALL-NEXT: mov v2.h[6], w11
-; ALL-NEXT: sdiv w8, w9, w8
+; ALL-NEXT: sdiv w11, w12, w11
+; ALL-NEXT: smov w12, v1.h[4]
+; ALL-NEXT: mov v2.h[2], w10
+; ALL-NEXT: smov w10, v0.h[6]
+; ALL-NEXT: sdiv w12, w13, w12
+; ALL-NEXT: smov w13, v0.h[5]
+; ALL-NEXT: mov v2.h[3], w11
+; ALL-NEXT: smov w11, v0.h[7]
+; ALL-NEXT: sdiv w8, w13, w9
+; ALL-NEXT: smov w9, v1.h[6]
+; ALL-NEXT: mov v2.h[4], w12
+; ALL-NEXT: sdiv w9, w10, w9
+; ALL-NEXT: smov w10, v1.h[7]
+; ALL-NEXT: mov v2.h[5], w8
+; ALL-NEXT: sdiv w8, w11, w10
+; ALL-NEXT: mov v2.h[6], w9
; ALL-NEXT: mov v2.h[7], w8
; ALL-NEXT: mls v0.8h, v2.8h, v1.8h
; ALL-NEXT: str q2, [x0]
@@ -192,21 +192,21 @@
define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst) nounwind {
; ALL-LABEL: vector_i128_i32:
; ALL: // %bb.0:
-; ALL-NEXT: fmov w9, s1
-; ALL-NEXT: fmov w10, s0
; ALL-NEXT: mov w8, v1.s[1]
+; ALL-NEXT: mov w9, v0.s[1]
+; ALL-NEXT: fmov w10, s0
+; ALL-NEXT: mov w11, v0.s[2]
+; ALL-NEXT: mov w12, v0.s[3]
+; ALL-NEXT: sdiv w8, w9, w8
+; ALL-NEXT: fmov w9, s1
; ALL-NEXT: sdiv w9, w10, w9
-; ALL-NEXT: mov w10, v0.s[1]
-; ALL-NEXT: sdiv w8, w10, w8
; ALL-NEXT: mov w10, v1.s[2]
+; ALL-NEXT: sdiv w10, w11, w10
+; ALL-NEXT: mov w11, v1.s[3]
; ALL-NEXT: fmov s2, w9
-; ALL-NEXT: mov w9, v0.s[2]
-; ALL-NEXT: sdiv w9, w9, w10
-; ALL-NEXT: mov w10, v1.s[3]
; ALL-NEXT: mov v2.s[1], w8
-; ALL-NEXT: mov w8, v0.s[3]
-; ALL-NEXT: mov v2.s[2], w9
-; ALL-NEXT: sdiv w8, w8, w10
+; ALL-NEXT: sdiv w8, w12, w11
+; ALL-NEXT: mov v2.s[2], w10
; ALL-NEXT: mov v2.s[3], w8
; ALL-NEXT: mls v0.4s, v2.4s, v1.4s
; ALL-NEXT: str q2, [x0]
@@ -221,19 +221,19 @@
define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst) nounwind {
; ALL-LABEL: vector_i128_i64:
; ALL: // %bb.0:
-; ALL-NEXT: fmov x10, d1
-; ALL-NEXT: fmov x11, d0
-; ALL-NEXT: mov x8, v1.d[1]
-; ALL-NEXT: mov x9, v0.d[1]
-; ALL-NEXT: sdiv x11, x11, x10
+; ALL-NEXT: fmov x8, d1
+; ALL-NEXT: fmov x9, d0
+; ALL-NEXT: mov x10, v1.d[1]
+; ALL-NEXT: mov x11, v0.d[1]
; ALL-NEXT: sdiv x9, x9, x8
-; ALL-NEXT: mul x10, x11, x10
; ALL-NEXT: mul x8, x9, x8
-; ALL-NEXT: fmov d1, x10
-; ALL-NEXT: mov v1.d[1], x8
+; ALL-NEXT: sdiv x11, x11, x10
+; ALL-NEXT: fmov d1, x8
+; ALL-NEXT: mul x10, x11, x10
+; ALL-NEXT: mov v1.d[1], x10
; ALL-NEXT: sub v0.2d, v0.2d, v1.2d
-; ALL-NEXT: fmov d1, x11
-; ALL-NEXT: mov v1.d[1], x9
+; ALL-NEXT: fmov d1, x9
+; ALL-NEXT: mov v1.d[1], x11
; ALL-NEXT: str q1, [x0]
; ALL-NEXT: ret
%div = sdiv <2 x i64> %x, %y
@@ -266,10 +266,10 @@
; ALL-LABEL: extrause:
; ALL: // %bb.0:
; ALL-NEXT: sdiv w8, w0, w1
+; ALL-NEXT: mul w9, w8, w1
; ALL-NEXT: str w8, [x2]
-; ALL-NEXT: mul w8, w8, w1
-; ALL-NEXT: sub w0, w0, w8
-; ALL-NEXT: str w8, [x3]
+; ALL-NEXT: sub w0, w0, w9
+; ALL-NEXT: str w9, [x3]
; ALL-NEXT: ret
%div = sdiv i32 %x, %y
store i32 %div, i32* %divdst, align 4
diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll
--- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll
@@ -70,69 +70,69 @@
define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst) nounwind {
; ALL-LABEL: vector_i128_i8:
; ALL: // %bb.0:
-; ALL-NEXT: umov w10, v1.b[0]
-; ALL-NEXT: umov w11, v0.b[0]
; ALL-NEXT: umov w8, v1.b[1]
; ALL-NEXT: umov w9, v0.b[1]
-; ALL-NEXT: udiv w10, w11, w10
-; ALL-NEXT: umov w12, v1.b[2]
-; ALL-NEXT: umov w13, v0.b[2]
+; ALL-NEXT: umov w10, v0.b[0]
+; ALL-NEXT: umov w11, v0.b[2]
+; ALL-NEXT: umov w12, v0.b[3]
+; ALL-NEXT: umov w13, v0.b[4]
+; ALL-NEXT: umov w14, v0.b[5]
+; ALL-NEXT: umov w15, v0.b[6]
; ALL-NEXT: udiv w8, w9, w8
-; ALL-NEXT: fmov s2, w10
-; ALL-NEXT: umov w14, v1.b[3]
-; ALL-NEXT: umov w15, v0.b[3]
-; ALL-NEXT: udiv w12, w13, w12
+; ALL-NEXT: umov w9, v1.b[0]
+; ALL-NEXT: umov w16, v0.b[7]
+; ALL-NEXT: umov w17, v0.b[8]
+; ALL-NEXT: udiv w9, w10, w9
+; ALL-NEXT: umov w10, v1.b[2]
+; ALL-NEXT: udiv w10, w11, w10
+; ALL-NEXT: umov w11, v1.b[3]
+; ALL-NEXT: fmov s2, w9
+; ALL-NEXT: umov w9, v1.b[9]
; ALL-NEXT: mov v2.b[1], w8
-; ALL-NEXT: umov w16, v1.b[4]
-; ALL-NEXT: umov w17, v0.b[4]
+; ALL-NEXT: udiv w11, w12, w11
+; ALL-NEXT: umov w12, v1.b[4]
+; ALL-NEXT: mov v2.b[2], w10
+; ALL-NEXT: umov w10, v0.b[10]
+; ALL-NEXT: udiv w12, w13, w12
+; ALL-NEXT: umov w13, v1.b[5]
+; ALL-NEXT: mov v2.b[3], w11
+; ALL-NEXT: umov w11, v0.b[11]
+; ALL-NEXT: udiv w13, w14, w13
+; ALL-NEXT: umov w14, v1.b[6]
+; ALL-NEXT: mov v2.b[4], w12
+; ALL-NEXT: umov w12, v0.b[12]
; ALL-NEXT: udiv w14, w15, w14
-; ALL-NEXT: mov v2.b[2], w12
-; ALL-NEXT: umov w18, v1.b[5]
-; ALL-NEXT: umov w1, v0.b[5]
+; ALL-NEXT: umov w15, v1.b[7]
+; ALL-NEXT: mov v2.b[5], w13
+; ALL-NEXT: umov w13, v0.b[13]
+; ALL-NEXT: udiv w15, w16, w15
+; ALL-NEXT: umov w16, v1.b[8]
+; ALL-NEXT: mov v2.b[6], w14
; ALL-NEXT: udiv w16, w17, w16
-; ALL-NEXT: mov v2.b[3], w14
-; ALL-NEXT: umov w2, v1.b[6]
-; ALL-NEXT: umov w3, v0.b[6]
-; ALL-NEXT: udiv w18, w1, w18
-; ALL-NEXT: mov v2.b[4], w16
-; ALL-NEXT: umov w4, v1.b[7]
-; ALL-NEXT: umov w5, v0.b[7]
-; ALL-NEXT: udiv w2, w3, w2
-; ALL-NEXT: mov v2.b[5], w18
-; ALL-NEXT: umov w9, v1.b[8]
-; ALL-NEXT: umov w11, v0.b[8]
-; ALL-NEXT: udiv w4, w5, w4
-; ALL-NEXT: mov v2.b[6], w2
-; ALL-NEXT: umov w13, v1.b[9]
-; ALL-NEXT: umov w15, v0.b[9]
-; ALL-NEXT: udiv w9, w11, w9
-; ALL-NEXT: mov v2.b[7], w4
-; ALL-NEXT: umov w17, v1.b[10]
-; ALL-NEXT: umov w1, v0.b[10]
-; ALL-NEXT: udiv w13, w15, w13
-; ALL-NEXT: mov v2.b[8], w9
-; ALL-NEXT: umov w3, v1.b[11]
-; ALL-NEXT: umov w5, v0.b[11]
-; ALL-NEXT: udiv w17, w1, w17
-; ALL-NEXT: mov v2.b[9], w13
+; ALL-NEXT: umov w17, v0.b[9]
+; ALL-NEXT: mov v2.b[7], w15
+; ALL-NEXT: udiv w8, w17, w9
+; ALL-NEXT: umov w9, v1.b[10]
+; ALL-NEXT: mov v2.b[8], w16
+; ALL-NEXT: udiv w9, w10, w9
+; ALL-NEXT: umov w10, v1.b[11]
+; ALL-NEXT: mov v2.b[9], w8
+; ALL-NEXT: udiv w10, w11, w10
; ALL-NEXT: umov w11, v1.b[12]
-; ALL-NEXT: umov w15, v0.b[12]
-; ALL-NEXT: udiv w3, w5, w3
-; ALL-NEXT: mov v2.b[10], w17
-; ALL-NEXT: umov w1, v1.b[13]
-; ALL-NEXT: umov w5, v0.b[13]
-; ALL-NEXT: udiv w11, w15, w11
-; ALL-NEXT: mov v2.b[11], w3
-; ALL-NEXT: umov w15, v1.b[14]
-; ALL-NEXT: udiv w1, w5, w1
-; ALL-NEXT: umov w5, v0.b[14]
+; ALL-NEXT: mov v2.b[10], w9
+; ALL-NEXT: umov w9, v1.b[14]
+; ALL-NEXT: udiv w11, w12, w11
+; ALL-NEXT: umov w12, v1.b[13]
+; ALL-NEXT: mov v2.b[11], w10
+; ALL-NEXT: umov w10, v1.b[15]
+; ALL-NEXT: udiv w8, w13, w12
+; ALL-NEXT: umov w12, v0.b[14]
; ALL-NEXT: mov v2.b[12], w11
-; ALL-NEXT: udiv w15, w5, w15
-; ALL-NEXT: umov w8, v1.b[15]
-; ALL-NEXT: mov v2.b[13], w1
-; ALL-NEXT: umov w9, v0.b[15]
-; ALL-NEXT: mov v2.b[14], w15
-; ALL-NEXT: udiv w8, w9, w8
+; ALL-NEXT: umov w11, v0.b[15]
+; ALL-NEXT: udiv w9, w12, w9
+; ALL-NEXT: mov v2.b[13], w8
+; ALL-NEXT: udiv w8, w11, w10
+; ALL-NEXT: mov v2.b[14], w9
; ALL-NEXT: mov v2.b[15], w8
; ALL-NEXT: mls v0.16b, v2.16b, v1.16b
; ALL-NEXT: str q2, [x0]
@@ -147,37 +147,37 @@
define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, <8 x i16>* %divdst) nounwind {
; ALL-LABEL: vector_i128_i16:
; ALL: // %bb.0:
-; ALL-NEXT: umov w10, v1.h[0]
-; ALL-NEXT: umov w11, v0.h[0]
; ALL-NEXT: umov w8, v1.h[1]
; ALL-NEXT: umov w9, v0.h[1]
-; ALL-NEXT: udiv w10, w11, w10
-; ALL-NEXT: umov w12, v1.h[2]
-; ALL-NEXT: umov w13, v0.h[2]
+; ALL-NEXT: umov w10, v0.h[0]
+; ALL-NEXT: umov w11, v0.h[2]
+; ALL-NEXT: umov w12, v0.h[3]
+; ALL-NEXT: umov w13, v0.h[4]
; ALL-NEXT: udiv w8, w9, w8
-; ALL-NEXT: fmov s2, w10
-; ALL-NEXT: umov w14, v1.h[3]
-; ALL-NEXT: umov w15, v0.h[3]
-; ALL-NEXT: udiv w12, w13, w12
+; ALL-NEXT: umov w9, v1.h[0]
+; ALL-NEXT: udiv w9, w10, w9
+; ALL-NEXT: umov w10, v1.h[2]
+; ALL-NEXT: udiv w10, w11, w10
+; ALL-NEXT: umov w11, v1.h[3]
+; ALL-NEXT: fmov s2, w9
+; ALL-NEXT: umov w9, v1.h[5]
; ALL-NEXT: mov v2.h[1], w8
-; ALL-NEXT: umov w9, v1.h[4]
-; ALL-NEXT: umov w11, v0.h[4]
-; ALL-NEXT: udiv w14, w15, w14
-; ALL-NEXT: mov v2.h[2], w12
-; ALL-NEXT: umov w13, v1.h[5]
-; ALL-NEXT: umov w15, v0.h[5]
-; ALL-NEXT: udiv w9, w11, w9
-; ALL-NEXT: mov v2.h[3], w14
-; ALL-NEXT: umov w11, v1.h[6]
-; ALL-NEXT: udiv w13, w15, w13
-; ALL-NEXT: umov w15, v0.h[6]
-; ALL-NEXT: mov v2.h[4], w9
-; ALL-NEXT: udiv w11, w15, w11
-; ALL-NEXT: umov w8, v1.h[7]
-; ALL-NEXT: mov v2.h[5], w13
-; ALL-NEXT: umov w9, v0.h[7]
-; ALL-NEXT: mov v2.h[6], w11
-; ALL-NEXT: udiv w8, w9, w8
+; ALL-NEXT: udiv w11, w12, w11
+; ALL-NEXT: umov w12, v1.h[4]
+; ALL-NEXT: mov v2.h[2], w10
+; ALL-NEXT: umov w10, v0.h[6]
+; ALL-NEXT: udiv w12, w13, w12
+; ALL-NEXT: umov w13, v0.h[5]
+; ALL-NEXT: mov v2.h[3], w11
+; ALL-NEXT: umov w11, v0.h[7]
+; ALL-NEXT: udiv w8, w13, w9
+; ALL-NEXT: umov w9, v1.h[6]
+; ALL-NEXT: mov v2.h[4], w12
+; ALL-NEXT: udiv w9, w10, w9
+; ALL-NEXT: umov w10, v1.h[7]
+; ALL-NEXT: mov v2.h[5], w8
+; ALL-NEXT: udiv w8, w11, w10
+; ALL-NEXT: mov v2.h[6], w9
; ALL-NEXT: mov v2.h[7], w8
; ALL-NEXT: mls v0.8h, v2.8h, v1.8h
; ALL-NEXT: str q2, [x0]
@@ -192,21 +192,21 @@
define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst) nounwind {
; ALL-LABEL: vector_i128_i32:
; ALL: // %bb.0:
-; ALL-NEXT: fmov w9, s1
-; ALL-NEXT: fmov w10, s0
; ALL-NEXT: mov w8, v1.s[1]
+; ALL-NEXT: mov w9, v0.s[1]
+; ALL-NEXT: fmov w10, s0
+; ALL-NEXT: mov w11, v0.s[2]
+; ALL-NEXT: mov w12, v0.s[3]
+; ALL-NEXT: udiv w8, w9, w8
+; ALL-NEXT: fmov w9, s1
; ALL-NEXT: udiv w9, w10, w9
-; ALL-NEXT: mov w10, v0.s[1]
-; ALL-NEXT: udiv w8, w10, w8
; ALL-NEXT: mov w10, v1.s[2]
+; ALL-NEXT: udiv w10, w11, w10
+; ALL-NEXT: mov w11, v1.s[3]
; ALL-NEXT: fmov s2, w9
-; ALL-NEXT: mov w9, v0.s[2]
-; ALL-NEXT: udiv w9, w9, w10
-; ALL-NEXT: mov w10, v1.s[3]
; ALL-NEXT: mov v2.s[1], w8
-; ALL-NEXT: mov w8, v0.s[3]
-; ALL-NEXT: mov v2.s[2], w9
-; ALL-NEXT: udiv w8, w8, w10
+; ALL-NEXT: udiv w8, w12, w11
+; ALL-NEXT: mov v2.s[2], w10
; ALL-NEXT: mov v2.s[3], w8
; ALL-NEXT: mls v0.4s, v2.4s, v1.4s
; ALL-NEXT: str q2, [x0]
@@ -221,19 +221,19 @@
define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst) nounwind {
; ALL-LABEL: vector_i128_i64:
; ALL: // %bb.0:
-; ALL-NEXT: fmov x10, d1
-; ALL-NEXT: fmov x11, d0
-; ALL-NEXT: mov x8, v1.d[1]
-; ALL-NEXT: mov x9, v0.d[1]
-; ALL-NEXT: udiv x11, x11, x10
+; ALL-NEXT: fmov x8, d1
+; ALL-NEXT: fmov x9, d0
+; ALL-NEXT: mov x10, v1.d[1]
+; ALL-NEXT: mov x11, v0.d[1]
; ALL-NEXT: udiv x9, x9, x8
-; ALL-NEXT: mul x10, x11, x10
; ALL-NEXT: mul x8, x9, x8
-; ALL-NEXT: fmov d1, x10
-; ALL-NEXT: mov v1.d[1], x8
+; ALL-NEXT: udiv x11, x11, x10
+; ALL-NEXT: fmov d1, x8
+; ALL-NEXT: mul x10, x11, x10
+; ALL-NEXT: mov v1.d[1], x10
; ALL-NEXT: sub v0.2d, v0.2d, v1.2d
-; ALL-NEXT: fmov d1, x11
-; ALL-NEXT: mov v1.d[1], x9
+; ALL-NEXT: fmov d1, x9
+; ALL-NEXT: mov v1.d[1], x11
; ALL-NEXT: str q1, [x0]
; ALL-NEXT: ret
%div = udiv <2 x i64> %x, %y
@@ -266,10 +266,10 @@
; ALL-LABEL: extrause:
; ALL: // %bb.0:
; ALL-NEXT: udiv w8, w0, w1
+; ALL-NEXT: mul w9, w8, w1
; ALL-NEXT: str w8, [x2]
-; ALL-NEXT: mul w8, w8, w1
-; ALL-NEXT: sub w0, w0, w8
-; ALL-NEXT: str w8, [x3]
+; ALL-NEXT: sub w0, w0, w9
+; ALL-NEXT: str w9, [x3]
; ALL-NEXT: ret
%div = udiv i32 %x, %y
store i32 %div, i32* %divdst, align 4
diff --git a/llvm/test/CodeGen/AArch64/emutls.ll b/llvm/test/CodeGen/AArch64/emutls.ll
--- a/llvm/test/CodeGen/AArch64/emutls.ll
+++ b/llvm/test/CodeGen/AArch64/emutls.ll
@@ -95,9 +95,9 @@
; ARM64: adrp x0, :got:__emutls_v._ZN1AIiE1xE
; ARM64: ldr x0, [x0, :got_lo12:__emutls_v._ZN1AIiE1xE]
; ARM64-NEXT: bl __emutls_get_address
-; ARM64-NEXT: ldr {{.*}}, [x0]
+; ARM64: ldr {{.*}}, [x0]
; ARM64: add
-; ARM64: str {{.*}}, [x0]
+; ARM64: str {{.*}}, [x8]
entry:
%0 = load i32, i32* @_ZN1AIiE1xE, align 4
@@ -111,7 +111,7 @@
; ARM64: adrp x0, :got:__emutls_v._ZN1AIfE1xE
; ARM64: ldr x0, [x0, :got_lo12:__emutls_v._ZN1AIfE1xE]
; ARM64-NEXT: bl __emutls_get_address
-; ARM64-NEXT: ldr {{.*}}, [x0]
+; ARM64: ldr {{.*}}, [x0]
; ARM64: fadd s{{.*}}, s
; ARM64: str s{{.*}}, [x0]
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -7,17 +7,17 @@
; CHECK-NEXT: and w8, w0, #0x1
; CHECK-NEXT: fmov s0, wzr
; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: ldp x8, x9, [sp, #8]
-; CHECK-NEXT: ldr x10, [sp]
+; CHECK-NEXT: ldp x9, x8, [sp]
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w11, s0
-; CHECK-NEXT: tst w11, #0x1
-; CHECK-NEXT: csel x11, x2, x6, ne
-; CHECK-NEXT: csel x12, x3, x7, ne
-; CHECK-NEXT: csel x10, x4, x10, ne
+; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: tst w10, #0x1
+; CHECK-NEXT: ldr x10, [sp, #16]
; CHECK-NEXT: csel x8, x5, x8, ne
-; CHECK-NEXT: stp x10, x8, [x9, #16]
-; CHECK-NEXT: stp x11, x12, [x9]
+; CHECK-NEXT: csel x9, x4, x9, ne
+; CHECK-NEXT: csel x11, x3, x7, ne
+; CHECK-NEXT: csel x12, x2, x6, ne
+; CHECK-NEXT: stp x9, x8, [x10, #16]
+; CHECK-NEXT: stp x12, x11, [x10]
; CHECK-NEXT: ret
%cond = and i32 %In1, 1
%cbool = icmp eq i32 %cond, 0
@@ -31,26 +31,26 @@
define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, <2 x i96> *%Out) {
; CHECK-LABEL: bar:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w0, #0x1
+; CHECK-NEXT: and w8, w0, #0x1
; CHECK-NEXT: fmov s0, wzr
-; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: ldp x10, x9, [sp]
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: ldr x11, [sp, #16]
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: ldp x11, x8, [sp, #8]
-; CHECK-NEXT: ldr x10, [sp]
; CHECK-NEXT: dup v1.4s, v0.s[0]
-; CHECK-NEXT: mov x9, v1.d[1]
-; CHECK-NEXT: lsr x9, x9, #32
-; CHECK-NEXT: tst w9, #0x1
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: csel x11, x5, x11, ne
+; CHECK-NEXT: mov x8, v1.d[1]
+; CHECK-NEXT: lsr x8, x8, #32
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: csel x9, x5, x9, ne
; CHECK-NEXT: csel x10, x4, x10, ne
-; CHECK-NEXT: tst w9, #0x1
-; CHECK-NEXT: csel x9, x3, x7, ne
-; CHECK-NEXT: csel x12, x2, x6, ne
-; CHECK-NEXT: stur x10, [x8, #12]
-; CHECK-NEXT: str x12, [x8]
-; CHECK-NEXT: str w9, [x8, #8]
-; CHECK-NEXT: str w11, [x8, #20]
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel x8, x2, x6, ne
+; CHECK-NEXT: csel x12, x3, x7, ne
+; CHECK-NEXT: stur x10, [x11, #12]
+; CHECK-NEXT: str w9, [x11, #20]
+; CHECK-NEXT: str x8, [x11]
+; CHECK-NEXT: str w12, [x11, #8]
; CHECK-NEXT: ret
%cond = and i32 %In1, 1
%cbool = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AArch64/expand-vector-rot.ll b/llvm/test/CodeGen/AArch64/expand-vector-rot.ll
--- a/llvm/test/CodeGen/AArch64/expand-vector-rot.ll
+++ b/llvm/test/CodeGen/AArch64/expand-vector-rot.ll
@@ -6,15 +6,15 @@
define <2 x i16> @rotlv2_16(<2 x i16> %vec2_16, <2 x i16> %shift) {
; CHECK-LABEL: rotlv2_16:
; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.2s, #15
; CHECK-NEXT: neg v3.2s, v1.2s
-; CHECK-NEXT: movi v4.2s, #15
-; CHECK-NEXT: movi d2, #0x00ffff0000ffff
-; CHECK-NEXT: and v3.8b, v3.8b, v4.8b
-; CHECK-NEXT: and v2.8b, v0.8b, v2.8b
-; CHECK-NEXT: and v1.8b, v1.8b, v4.8b
+; CHECK-NEXT: movi d4, #0x00ffff0000ffff
+; CHECK-NEXT: and v3.8b, v3.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
+; CHECK-NEXT: and v4.8b, v0.8b, v4.8b
; CHECK-NEXT: neg v3.2s, v3.2s
-; CHECK-NEXT: ushl v2.2s, v2.2s, v3.2s
; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ushl v2.2s, v4.2s, v3.2s
; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b
; CHECK-NEXT: ret
%1 = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %vec2_16, <2 x i16> %vec2_16, <2 x i16> %shift)
diff --git a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll
--- a/llvm/test/CodeGen/AArch64/extract-bits.ll
+++ b/llvm/test/CodeGen/AArch64/extract-bits.ll
@@ -21,11 +21,11 @@
define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_a0:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: lsr w8, w0, w1
-; CHECK-NEXT: sub w9, w9, #1
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: lsr w9, w0, w1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%shifted = lshr i32 %val, %numskipbits
%onebit = shl i32 1, %numlowbits
@@ -37,11 +37,11 @@
define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_a0_arithmetic:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: asr w8, w0, w1
-; CHECK-NEXT: sub w9, w9, #1
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: asr w9, w0, w1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%shifted = ashr i32 %val, %numskipbits
%onebit = shl i32 1, %numlowbits
@@ -53,11 +53,11 @@
define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
; CHECK-LABEL: bextr32_a1_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: lsr w8, w0, w1
-; CHECK-NEXT: sub w9, w9, #1
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: lsr w9, w0, w1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%skip = zext i8 %numskipbits to i32
%shifted = lshr i32 %val, %skip
@@ -71,12 +71,12 @@
define i32 @bextr32_a2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_a2_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: sub w9, w9, #1
-; CHECK-NEXT: lsr w8, w8, w1
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: lsr w9, w9, w1
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%val = load i32, i32* %w
%shifted = lshr i32 %val, %numskipbits
@@ -89,12 +89,12 @@
define i32 @bextr32_a3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
; CHECK-LABEL: bextr32_a3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: sub w9, w9, #1
-; CHECK-NEXT: lsr w8, w8, w1
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: lsr w9, w9, w1
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%val = load i32, i32* %w
%skip = zext i8 %numskipbits to i32
@@ -109,11 +109,11 @@
define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_a4_commutative:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: lsr w8, w0, w1
-; CHECK-NEXT: sub w9, w9, #1
-; CHECK-NEXT: and w0, w8, w9
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: lsr w9, w0, w1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: and w0, w9, w8
; CHECK-NEXT: ret
%shifted = lshr i32 %val, %numskipbits
%onebit = shl i32 1, %numlowbits
@@ -127,11 +127,11 @@
define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_a0:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl x9, x9, x2
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: sub x9, x9, #1
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: and x0, x8, x9
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%onebit = shl i64 1, %numlowbits
@@ -143,11 +143,11 @@
define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_a0_arithmetic:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl x9, x9, x2
-; CHECK-NEXT: asr x8, x0, x1
-; CHECK-NEXT: sub x9, x9, #1
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: asr x9, x0, x1
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: and x0, x8, x9
; CHECK-NEXT: ret
%shifted = ashr i64 %val, %numskipbits
%onebit = shl i64 1, %numlowbits
@@ -159,13 +159,13 @@
define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
; CHECK-LABEL: bextr64_a1_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #1
+; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT: lsl x9, x9, x2
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: sub x9, x9, #1
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: and x0, x8, x9
; CHECK-NEXT: ret
%skip = zext i8 %numskipbits to i64
%shifted = lshr i64 %val, %skip
@@ -179,12 +179,12 @@
define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_a2_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl x9, x9, x2
-; CHECK-NEXT: sub x9, x9, #1
-; CHECK-NEXT: lsr x8, x8, x1
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: lsr x9, x9, x1
+; CHECK-NEXT: and x0, x8, x9
; CHECK-NEXT: ret
%val = load i64, i64* %w
%shifted = lshr i64 %val, %numskipbits
@@ -197,14 +197,14 @@
define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
; CHECK-LABEL: bextr64_a3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov w8, #1
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT: lsl x9, x9, x2
+; CHECK-NEXT: lsl x8, x8, x2
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sub x9, x9, #1
-; CHECK-NEXT: lsr x8, x8, x1
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: lsr x9, x9, x1
+; CHECK-NEXT: and x0, x8, x9
; CHECK-NEXT: ret
%val = load i64, i64* %w
%skip = zext i8 %numskipbits to i64
@@ -219,11 +219,11 @@
define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_a4_commutative:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl x9, x9, x2
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: sub x9, x9, #1
-; CHECK-NEXT: and x0, x8, x9
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: and x0, x9, x8
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%onebit = shl i64 1, %numlowbits
@@ -238,11 +238,11 @@
define i32 @bextr64_32_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_32_a0:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl x9, x9, x2
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: sub w9, w9, #1
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%onebit = shl i64 1, %numlowbits
@@ -256,11 +256,11 @@
define i32 @bextr64_32_a1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_32_a1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: sub w9, w9, #1
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%truncshifted = trunc i64 %shifted to i32
@@ -275,11 +275,11 @@
define i32 @bextr64_32_a2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_32_a2:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: sub w9, w9, #1
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%onebit = shl i32 1, %numlowbits
@@ -297,10 +297,10 @@
define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_b0:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #-1
-; CHECK-NEXT: lsr w8, w0, w1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: bic w0, w8, w9
+; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: lsr w9, w0, w1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: bic w0, w9, w8
; CHECK-NEXT: ret
%shifted = lshr i32 %val, %numskipbits
%notmask = shl i32 -1, %numlowbits
@@ -312,10 +312,10 @@
define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
; CHECK-LABEL: bextr32_b1_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #-1
-; CHECK-NEXT: lsr w8, w0, w1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: bic w0, w8, w9
+; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: lsr w9, w0, w1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: bic w0, w9, w8
; CHECK-NEXT: ret
%skip = zext i8 %numskipbits to i32
%shifted = lshr i32 %val, %skip
@@ -329,11 +329,11 @@
define i32 @bextr32_b2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_b2_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: mov w9, #-1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: lsr w8, w8, w1
-; CHECK-NEXT: bic w0, w8, w9
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: lsr w9, w9, w1
+; CHECK-NEXT: bic w0, w9, w8
; CHECK-NEXT: ret
%val = load i32, i32* %w
%shifted = lshr i32 %val, %numskipbits
@@ -346,11 +346,11 @@
define i32 @bextr32_b3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
; CHECK-LABEL: bextr32_b3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: mov w9, #-1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: lsr w8, w8, w1
-; CHECK-NEXT: bic w0, w8, w9
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: lsr w9, w9, w1
+; CHECK-NEXT: bic w0, w9, w8
; CHECK-NEXT: ret
%val = load i32, i32* %w
%skip = zext i8 %numskipbits to i32
@@ -365,10 +365,10 @@
define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_b4_commutative:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #-1
-; CHECK-NEXT: lsr w8, w0, w1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: bic w0, w8, w9
+; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: lsr w9, w0, w1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: bic w0, w9, w8
; CHECK-NEXT: ret
%shifted = lshr i32 %val, %numskipbits
%notmask = shl i32 -1, %numlowbits
@@ -382,10 +382,10 @@
define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_b0:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, #-1
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsl x9, x9, x2
-; CHECK-NEXT: bic x0, x8, x9
+; CHECK-NEXT: mov x8, #-1
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: bic x0, x9, x8
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%notmask = shl i64 -1, %numlowbits
@@ -397,12 +397,12 @@
define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
; CHECK-LABEL: bextr64_b1_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, #-1
+; CHECK-NEXT: mov x8, #-1
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsl x9, x9, x2
-; CHECK-NEXT: bic x0, x8, x9
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: bic x0, x9, x8
; CHECK-NEXT: ret
%skip = zext i8 %numskipbits to i64
%shifted = lshr i64 %val, %skip
@@ -416,11 +416,11 @@
define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_b2_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov x9, #-1
-; CHECK-NEXT: lsl x9, x9, x2
-; CHECK-NEXT: lsr x8, x8, x1
-; CHECK-NEXT: bic x0, x8, x9
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov x8, #-1
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: lsr x9, x9, x1
+; CHECK-NEXT: bic x0, x9, x8
; CHECK-NEXT: ret
%val = load i64, i64* %w
%shifted = lshr i64 %val, %numskipbits
@@ -433,13 +433,13 @@
define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
; CHECK-LABEL: bextr64_b3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov x9, #-1
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: mov x8, #-1
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: lsl x9, x9, x2
-; CHECK-NEXT: lsr x8, x8, x1
-; CHECK-NEXT: bic x0, x8, x9
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: lsr x9, x9, x1
+; CHECK-NEXT: bic x0, x9, x8
; CHECK-NEXT: ret
%val = load i64, i64* %w
%skip = zext i8 %numskipbits to i64
@@ -454,10 +454,10 @@
define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_b4_commutative:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, #-1
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsl x9, x9, x2
-; CHECK-NEXT: bic x0, x8, x9
+; CHECK-NEXT: mov x8, #-1
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: bic x0, x9, x8
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%notmask = shl i64 -1, %numlowbits
@@ -472,11 +472,11 @@
define i32 @bextr64_32_b0(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_32_b0:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, #-1
+; CHECK-NEXT: mov x8, #-1
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsl x9, x9, x2
-; CHECK-NEXT: bic w0, w8, w9
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl x8, x8, x2
+; CHECK-NEXT: bic w0, w9, w8
; CHECK-NEXT: ret
%shiftedval = lshr i64 %val, %numskipbits
%widenumlowbits = zext i8 %numlowbits to i64
@@ -491,11 +491,11 @@
define i32 @bextr64_32_b1(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_32_b1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #-1
+; CHECK-NEXT: mov w8, #-1
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: bic w0, w8, w9
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: bic w0, w9, w8
; CHECK-NEXT: ret
%shiftedval = lshr i64 %val, %numskipbits
%truncshiftedval = trunc i64 %shiftedval to i32
@@ -511,11 +511,11 @@
define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_32_b2:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #-1
+; CHECK-NEXT: mov w8, #-1
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsl w9, w9, w2
-; CHECK-NEXT: bic w0, w8, w9
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl w8, w8, w2
+; CHECK-NEXT: bic w0, w9, w8
; CHECK-NEXT: ret
%shiftedval = lshr i64 %val, %numskipbits
%widenumlowbits = zext i8 %numlowbits to i32
@@ -534,11 +534,11 @@
define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_c0:
; CHECK: // %bb.0:
-; CHECK-NEXT: neg w9, w2
-; CHECK-NEXT: mov w10, #-1
-; CHECK-NEXT: lsr w8, w0, w1
-; CHECK-NEXT: lsr w9, w10, w9
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: neg w8, w2
+; CHECK-NEXT: mov w9, #-1
+; CHECK-NEXT: lsr w10, w0, w1
+; CHECK-NEXT: lsr w8, w9, w8
+; CHECK-NEXT: and w0, w8, w10
; CHECK-NEXT: ret
%shifted = lshr i32 %val, %numskipbits
%numhighbits = sub i32 32, %numlowbits
@@ -550,13 +550,13 @@
define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_c1_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #32
-; CHECK-NEXT: sub w9, w9, w2
-; CHECK-NEXT: mov w10, #-1
+; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: mov w9, #-1
+; CHECK-NEXT: sub w8, w8, w2
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: lsr w8, w0, w1
-; CHECK-NEXT: lsr w9, w10, w9
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: lsr w10, w0, w1
+; CHECK-NEXT: lsr w8, w9, w8
+; CHECK-NEXT: and w0, w8, w10
; CHECK-NEXT: ret
%skip = zext i8 %numskipbits to i32
%shifted = lshr i32 %val, %skip
@@ -570,12 +570,12 @@
define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_c2_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: neg w9, w2
+; CHECK-NEXT: neg w8, w2
+; CHECK-NEXT: ldr w9, [x0]
; CHECK-NEXT: mov w10, #-1
-; CHECK-NEXT: lsr w9, w10, w9
-; CHECK-NEXT: lsr w8, w8, w1
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: lsr w9, w9, w1
+; CHECK-NEXT: lsr w8, w10, w8
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%val = load i32, i32* %w
%shifted = lshr i32 %val, %numskipbits
@@ -588,14 +588,14 @@
define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_c3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: mov w9, #32
+; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: sub w8, w8, w2
; CHECK-NEXT: mov w10, #-1
-; CHECK-NEXT: sub w9, w9, w2
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: lsr w8, w8, w1
-; CHECK-NEXT: lsr w9, w10, w9
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: lsr w9, w9, w1
+; CHECK-NEXT: lsr w8, w10, w8
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%val = load i32, i32* %w
%skip = zext i8 %numskipbits to i32
@@ -610,11 +610,11 @@
define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_c4_commutative:
; CHECK: // %bb.0:
-; CHECK-NEXT: neg w9, w2
-; CHECK-NEXT: mov w10, #-1
-; CHECK-NEXT: lsr w8, w0, w1
-; CHECK-NEXT: lsr w9, w10, w9
-; CHECK-NEXT: and w0, w8, w9
+; CHECK-NEXT: neg w8, w2
+; CHECK-NEXT: mov w9, #-1
+; CHECK-NEXT: lsr w10, w0, w1
+; CHECK-NEXT: lsr w8, w9, w8
+; CHECK-NEXT: and w0, w10, w8
; CHECK-NEXT: ret
%shifted = lshr i32 %val, %numskipbits
%numhighbits = sub i32 32, %numlowbits
@@ -628,11 +628,11 @@
define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_c0:
; CHECK: // %bb.0:
-; CHECK-NEXT: neg x9, x2
-; CHECK-NEXT: mov x10, #-1
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsr x9, x10, x9
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: neg x8, x2
+; CHECK-NEXT: mov x9, #-1
+; CHECK-NEXT: lsr x10, x0, x1
+; CHECK-NEXT: lsr x8, x9, x8
+; CHECK-NEXT: and x0, x8, x10
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%numhighbits = sub i64 64, %numlowbits
@@ -644,13 +644,13 @@
define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_c1_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #64
-; CHECK-NEXT: sub w9, w9, w2
-; CHECK-NEXT: mov x10, #-1
+; CHECK-NEXT: mov w8, #64
+; CHECK-NEXT: mov x9, #-1
+; CHECK-NEXT: sub w8, w8, w2
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsr x9, x10, x9
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: lsr x10, x0, x1
+; CHECK-NEXT: lsr x8, x9, x8
+; CHECK-NEXT: and x0, x8, x10
; CHECK-NEXT: ret
%skip = zext i8 %numskipbits to i64
%shifted = lshr i64 %val, %skip
@@ -664,12 +664,12 @@
define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_c2_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: neg x9, x2
+; CHECK-NEXT: neg x8, x2
+; CHECK-NEXT: ldr x9, [x0]
; CHECK-NEXT: mov x10, #-1
-; CHECK-NEXT: lsr x9, x10, x9
-; CHECK-NEXT: lsr x8, x8, x1
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: lsr x9, x9, x1
+; CHECK-NEXT: lsr x8, x10, x8
+; CHECK-NEXT: and x0, x8, x9
; CHECK-NEXT: ret
%val = load i64, i64* %w
%shifted = lshr i64 %val, %numskipbits
@@ -682,14 +682,14 @@
define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_c3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #64
+; CHECK-NEXT: mov w8, #64
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: sub w8, w8, w2
; CHECK-NEXT: mov x10, #-1
-; CHECK-NEXT: sub w9, w9, w2
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: lsr x8, x8, x1
-; CHECK-NEXT: lsr x9, x10, x9
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: lsr x9, x9, x1
+; CHECK-NEXT: lsr x8, x10, x8
+; CHECK-NEXT: and x0, x8, x9
; CHECK-NEXT: ret
%val = load i64, i64* %w
%skip = zext i8 %numskipbits to i64
@@ -704,11 +704,11 @@
define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_c4_commutative:
; CHECK: // %bb.0:
-; CHECK-NEXT: neg x9, x2
-; CHECK-NEXT: mov x10, #-1
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsr x9, x10, x9
-; CHECK-NEXT: and x0, x8, x9
+; CHECK-NEXT: neg x8, x2
+; CHECK-NEXT: mov x9, #-1
+; CHECK-NEXT: lsr x10, x0, x1
+; CHECK-NEXT: lsr x8, x9, x8
+; CHECK-NEXT: and x0, x10, x8
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%numhighbits = sub i64 64, %numlowbits
@@ -723,11 +723,11 @@
define i32 @bextr64_32_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_32_c0:
; CHECK: // %bb.0:
-; CHECK-NEXT: neg x9, x2
-; CHECK-NEXT: mov x10, #-1
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsr x9, x10, x9
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: neg x8, x2
+; CHECK-NEXT: mov x9, #-1
+; CHECK-NEXT: lsr x10, x0, x1
+; CHECK-NEXT: lsr x8, x9, x8
+; CHECK-NEXT: and w0, w8, w10
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%numhighbits = sub i64 64, %numlowbits
@@ -741,11 +741,11 @@
define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_32_c1:
; CHECK: // %bb.0:
-; CHECK-NEXT: neg w9, w2
-; CHECK-NEXT: mov w10, #-1
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsr w9, w10, w9
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: neg w8, w2
+; CHECK-NEXT: mov w9, #-1
+; CHECK-NEXT: lsr x10, x0, x1
+; CHECK-NEXT: lsr w8, w9, w8
+; CHECK-NEXT: and w0, w8, w10
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%truncshifted = trunc i64 %shifted to i32
@@ -760,11 +760,11 @@
define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_32_c2:
; CHECK: // %bb.0:
-; CHECK-NEXT: neg w9, w2
-; CHECK-NEXT: mov w10, #-1
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: lsr w9, w10, w9
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: neg w8, w2
+; CHECK-NEXT: mov w9, #-1
+; CHECK-NEXT: lsr x10, x0, x1
+; CHECK-NEXT: lsr w8, w9, w8
+; CHECK-NEXT: and w0, w8, w10
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%numhighbits = sub i32 32, %numlowbits
@@ -782,10 +782,10 @@
define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_d0:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsr w8, w0, w1
-; CHECK-NEXT: neg w9, w2
-; CHECK-NEXT: lsl w8, w8, w9
-; CHECK-NEXT: lsr w0, w8, w9
+; CHECK-NEXT: neg w8, w2
+; CHECK-NEXT: lsr w9, w0, w1
+; CHECK-NEXT: lsl w9, w9, w8
+; CHECK-NEXT: lsr w0, w9, w8
; CHECK-NEXT: ret
%shifted = lshr i32 %val, %numskipbits
%numhighbits = sub i32 32, %numlowbits
@@ -797,12 +797,12 @@
define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_d1_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #32
+; CHECK-NEXT: mov w8, #32
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: lsr w8, w0, w1
-; CHECK-NEXT: sub w9, w9, w2
-; CHECK-NEXT: lsl w8, w8, w9
-; CHECK-NEXT: lsr w0, w8, w9
+; CHECK-NEXT: lsr w9, w0, w1
+; CHECK-NEXT: sub w8, w8, w2
+; CHECK-NEXT: lsl w9, w9, w8
+; CHECK-NEXT: lsr w0, w9, w8
; CHECK-NEXT: ret
%skip = zext i8 %numskipbits to i32
%shifted = lshr i32 %val, %skip
@@ -833,13 +833,13 @@
define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits) nounwind {
; CHECK-LABEL: bextr32_d3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: mov w9, #32
+; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: sub w8, w8, w2
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sub w9, w9, w2
-; CHECK-NEXT: lsr w8, w8, w1
-; CHECK-NEXT: lsl w8, w8, w9
-; CHECK-NEXT: lsr w0, w8, w9
+; CHECK-NEXT: lsr w9, w9, w1
+; CHECK-NEXT: lsl w9, w9, w8
+; CHECK-NEXT: lsr w0, w9, w8
; CHECK-NEXT: ret
%val = load i32, i32* %w
%skip = zext i8 %numskipbits to i32
@@ -856,10 +856,10 @@
define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_d0:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: neg x9, x2
-; CHECK-NEXT: lsl x8, x8, x9
-; CHECK-NEXT: lsr x0, x8, x9
+; CHECK-NEXT: neg x8, x2
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl x9, x9, x8
+; CHECK-NEXT: lsr x0, x9, x8
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%numhighbits = sub i64 64, %numlowbits
@@ -871,12 +871,12 @@
define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_d1_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #64
+; CHECK-NEXT: mov w8, #64
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: sub w9, w9, w2
-; CHECK-NEXT: lsl x8, x8, x9
-; CHECK-NEXT: lsr x0, x8, x9
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: sub w8, w8, w2
+; CHECK-NEXT: lsl x9, x9, x8
+; CHECK-NEXT: lsr x0, x9, x8
; CHECK-NEXT: ret
%skip = zext i8 %numskipbits to i64
%shifted = lshr i64 %val, %skip
@@ -907,13 +907,13 @@
define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_d3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #64
+; CHECK-NEXT: mov w8, #64
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: sub w8, w8, w2
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: sub w9, w9, w2
-; CHECK-NEXT: lsr x8, x8, x1
-; CHECK-NEXT: lsl x8, x8, x9
-; CHECK-NEXT: lsr x0, x8, x9
+; CHECK-NEXT: lsr x9, x9, x1
+; CHECK-NEXT: lsl x9, x9, x8
+; CHECK-NEXT: lsr x0, x9, x8
; CHECK-NEXT: ret
%val = load i64, i64* %w
%skip = zext i8 %numskipbits to i64
@@ -931,10 +931,10 @@
define i32 @bextr64_32_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_32_d0:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: neg x9, x2
-; CHECK-NEXT: lsl x8, x8, x9
-; CHECK-NEXT: lsr x0, x8, x9
+; CHECK-NEXT: neg x8, x2
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl x9, x9, x8
+; CHECK-NEXT: lsr x0, x9, x8
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
@@ -949,10 +949,10 @@
define i32 @bextr64_32_d1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind {
; CHECK-LABEL: bextr64_32_d1:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsr x8, x0, x1
-; CHECK-NEXT: neg w9, w2
-; CHECK-NEXT: lsl w8, w8, w9
-; CHECK-NEXT: lsr w0, w8, w9
+; CHECK-NEXT: neg w8, w2
+; CHECK-NEXT: lsr x9, x0, x1
+; CHECK-NEXT: lsl w9, w9, w8
+; CHECK-NEXT: lsr w0, w9, w8
; CHECK-NEXT: ret
%shifted = lshr i64 %val, %numskipbits
%truncshifted = trunc i64 %shifted to i32
diff --git a/llvm/test/CodeGen/AArch64/extract-lowbits.ll b/llvm/test/CodeGen/AArch64/extract-lowbits.ll
--- a/llvm/test/CodeGen/AArch64/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AArch64/extract-lowbits.ll
@@ -50,11 +50,11 @@
define i32 @bzhi32_a2_load(i32* %w, i32 %numlowbits) nounwind {
; CHECK-LABEL: bzhi32_a2_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl w9, w9, w1
-; CHECK-NEXT: sub w9, w9, #1
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: lsl w8, w8, w1
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%val = load i32, i32* %w
%onebit = shl i32 1, %numlowbits
@@ -66,11 +66,11 @@
define i32 @bzhi32_a3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind {
; CHECK-LABEL: bzhi32_a3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl w9, w9, w1
-; CHECK-NEXT: sub w9, w9, #1
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: lsl w8, w8, w1
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%val = load i32, i32* %w
%conv = zext i8 %numlowbits to i32
@@ -129,11 +129,11 @@
define i64 @bzhi64_a2_load(i64* %w, i64 %numlowbits) nounwind {
; CHECK-LABEL: bzhi64_a2_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1
-; CHECK-NEXT: lsl x9, x9, x1
-; CHECK-NEXT: sub x9, x9, #1
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: lsl x8, x8, x1
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: and x0, x8, x9
; CHECK-NEXT: ret
%val = load i64, i64* %w
%onebit = shl i64 1, %numlowbits
@@ -145,12 +145,12 @@
define i64 @bzhi64_a3_load_indexzext(i64* %w, i8 zeroext %numlowbits) nounwind {
; CHECK-LABEL: bzhi64_a3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #1
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: ldr x9, [x0]
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: lsl x9, x9, x1
-; CHECK-NEXT: sub x9, x9, #1
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: lsl x8, x8, x1
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: and x0, x8, x9
; CHECK-NEXT: ret
%val = load i64, i64* %w
%conv = zext i8 %numlowbits to i64
@@ -346,8 +346,8 @@
; CHECK-LABEL: bzhi32_c1_indexzext:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #32
-; CHECK-NEXT: sub w8, w8, w1
; CHECK-NEXT: mov w9, #-1
+; CHECK-NEXT: sub w8, w8, w1
; CHECK-NEXT: lsr w8, w9, w8
; CHECK-NEXT: and w0, w8, w0
; CHECK-NEXT: ret
@@ -361,11 +361,11 @@
define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
; CHECK-LABEL: bzhi32_c2_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: neg w9, w1
+; CHECK-NEXT: neg w8, w1
+; CHECK-NEXT: ldr w9, [x0]
; CHECK-NEXT: mov w10, #-1
-; CHECK-NEXT: lsr w9, w10, w9
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: lsr w8, w10, w8
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%val = load i32, i32* %w
%numhighbits = sub i32 32, %numlowbits
@@ -377,12 +377,12 @@
define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
; CHECK-LABEL: bzhi32_c3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: mov w9, #32
-; CHECK-NEXT: sub w9, w9, w1
+; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: sub w8, w8, w1
; CHECK-NEXT: mov w10, #-1
-; CHECK-NEXT: lsr w9, w10, w9
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: lsr w8, w10, w8
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%val = load i32, i32* %w
%numhighbits = sub i8 32, %numlowbits
@@ -426,8 +426,8 @@
; CHECK-LABEL: bzhi64_c1_indexzext:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #64
-; CHECK-NEXT: sub w8, w8, w1
; CHECK-NEXT: mov x9, #-1
+; CHECK-NEXT: sub w8, w8, w1
; CHECK-NEXT: lsr x8, x9, x8
; CHECK-NEXT: and x0, x8, x0
; CHECK-NEXT: ret
@@ -441,11 +441,11 @@
define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
; CHECK-LABEL: bzhi64_c2_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: neg x9, x1
+; CHECK-NEXT: neg x8, x1
+; CHECK-NEXT: ldr x9, [x0]
; CHECK-NEXT: mov x10, #-1
-; CHECK-NEXT: lsr x9, x10, x9
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: lsr x8, x10, x8
+; CHECK-NEXT: and x0, x8, x9
; CHECK-NEXT: ret
%val = load i64, i64* %w
%numhighbits = sub i64 64, %numlowbits
@@ -457,12 +457,12 @@
define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
; CHECK-LABEL: bzhi64_c3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: mov w9, #64
-; CHECK-NEXT: sub w9, w9, w1
+; CHECK-NEXT: mov w8, #64
+; CHECK-NEXT: ldr x9, [x0]
+; CHECK-NEXT: sub w8, w8, w1
; CHECK-NEXT: mov x10, #-1
-; CHECK-NEXT: lsr x9, x10, x9
-; CHECK-NEXT: and x0, x9, x8
+; CHECK-NEXT: lsr x8, x10, x8
+; CHECK-NEXT: and x0, x8, x9
; CHECK-NEXT: ret
%val = load i64, i64* %w
%numhighbits = sub i8 64, %numlowbits
@@ -521,10 +521,10 @@
define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
; CHECK-LABEL: bzhi32_d2_load:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: neg w9, w1
-; CHECK-NEXT: lsl w8, w8, w9
-; CHECK-NEXT: lsr w0, w8, w9
+; CHECK-NEXT: neg w8, w1
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: lsl w9, w9, w8
+; CHECK-NEXT: lsr w0, w9, w8
; CHECK-NEXT: ret
%val = load i32, i32* %w
%numhighbits = sub i32 32, %numlowbits
@@ -536,11 +536,11 @@
define i32 @bzhi32_d3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
; CHECK-LABEL: bzhi32_d3_load_indexzext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr w8, [x0]
-; CHECK-NEXT: mov w9, #32
-; CHECK-NEXT: sub w9, w9, w1
-; CHECK-NEXT: lsl w8, w8, w9
-; CHECK-NEXT: lsr w0, w8, w9
CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: sub w8, w8, w1 +; CHECK-NEXT: lsl w9, w9, w8 +; CHECK-NEXT: lsr w0, w9, w8 ; CHECK-NEXT: ret %val = load i32, i32* %w %numhighbits = sub i8 32, %numlowbits @@ -583,10 +583,10 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_d2_load: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: neg x9, x1 -; CHECK-NEXT: lsl x8, x8, x9 -; CHECK-NEXT: lsr x0, x8, x9 +; CHECK-NEXT: neg x8, x1 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: lsl x9, x9, x8 +; CHECK-NEXT: lsr x0, x9, x8 ; CHECK-NEXT: ret %val = load i64, i64* %w %numhighbits = sub i64 64, %numlowbits @@ -598,11 +598,11 @@ define i64 @bzhi64_d3_load_indexzext(i64* %w, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_d3_load_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov w9, #64 -; CHECK-NEXT: sub w9, w9, w1 -; CHECK-NEXT: lsl x8, x8, x9 -; CHECK-NEXT: lsr x0, x8, x9 +; CHECK-NEXT: mov w8, #64 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: sub w8, w8, w1 +; CHECK-NEXT: lsl x9, x9, x8 +; CHECK-NEXT: lsr x0, x9, x8 ; CHECK-NEXT: ret %val = load i64, i64* %w %numhighbits = sub i8 64, %numlowbits diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll --- a/llvm/test/CodeGen/AArch64/f16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll @@ -486,13 +486,13 @@ ; CHECK-COMMON-LABEL: test_fccmp: ; CHECK-CVT: fcvt s0, h0 ; CHECK-CVT-NEXT: fmov s1, #8.00000000 -; CHECK-CVT-NEXT: fmov s2, #5.00000000 ; CHECK-CVT-NEXT: fcmp s0, s1 +; CHECK-CVT-NEXT: fmov s1, #5.00000000 ; CHECK-CVT-NEXT: cset w8, gt -; CHECK-CVT-NEXT: fcmp s0, s2 +; CHECK-CVT-NEXT: fcmp s0, s1 ; CHECK-CVT-NEXT: cset w9, mi ; CHECK-CVT-NEXT: tst w8, w9 -; CHECK-CVT-NEXT: fcsel s0, s0, s2, ne +; CHECK-CVT-NEXT: fcsel s0, s0, s1, ne ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: str h0, [x0] ; CHECK-CVT-NEXT: ret @@ -1100,9 +1100,9 @@ } ; CHECK-CVT-LABEL: test_copysign: +; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: bit.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret @@ -1118,15 +1118,15 @@ } ; CHECK-CVT-LABEL: test_copysign_f32: -; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 +; CHECK-CVT-NEXT: fcvt s0, h0 ; CHECK-CVT-NEXT: bit.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign_f32: -; CHECK-FP16-NEXT: fcvt h1, s1 ; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: fcvt h1, s1 ; CHECK-FP16-NEXT: bit.16b v0, v1, v2 ; CHECK-FP16-NEXT: ret @@ -1137,16 +1137,16 @@ } ; CHECK-CVT-LABEL: test_copysign_f64: +; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: fcvt s1, d1 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: bit.16b v0, v1, v2 ; CHECK-CVT-NEXT: fcvt h0, s0 ; CHECK-CVT-NEXT: ret ; CHECK-FP16-LABEL: test_copysign_f64: -; CHECK-FP16-NEXT: fcvt h1, d1 ; CHECK-FP16-NEXT: movi.8h v2, #128, lsl #8 +; CHECK-FP16-NEXT: fcvt h1, d1 ; CHECK-FP16-NEXT: bit.16b v0, v1, v2 ; CHECK-FP16-NEXT: ret @@ -1160,9 +1160,9 @@ ; away the (fpext (fp_round )) here. 
; CHECK-CVT-LABEL: test_copysign_extended: +; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: movi.4s v2, #128, lsl #24 ; CHECK-CVT-NEXT: bit.16b v0, v1, v2 ; CHECK-CVT-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll --- a/llvm/test/CodeGen/AArch64/fabs.ll +++ b/llvm/test/CodeGen/AArch64/fabs.ll @@ -23,10 +23,10 @@ ; CHECK-LABEL: still_not_fabs: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-2147483648 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fneg s1, s0 -; CHECK-NEXT: fcmp s0, s2 -; CHECK-NEXT: fcsel s0, s0, s1, ge +; CHECK-NEXT: fneg s2, s0 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fcsel s0, s0, s2, ge ; CHECK-NEXT: ret %cmp = fcmp nnan oge float %x, -0.0 %sub = fsub nnan float -0.0, %x diff --git a/llvm/test/CodeGen/AArch64/fadd-combines.ll b/llvm/test/CodeGen/AArch64/fadd-combines.ll --- a/llvm/test/CodeGen/AArch64/fadd-combines.ll +++ b/llvm/test/CodeGen/AArch64/fadd-combines.ll @@ -28,9 +28,9 @@ define double @test3(double %a, double %b, double %c) { ; CHECK-LABEL: test3: ; CHECK: // %bb.0: +; CHECK-NEXT: fadd d2, d2, d2 ; CHECK-NEXT: fmul d0, d0, d1 -; CHECK-NEXT: fadd d1, d2, d2 -; CHECK-NEXT: fsub d0, d0, d1 +; CHECK-NEXT: fsub d0, d0, d2 ; CHECK-NEXT: ret %mul = fmul double %a, %b %mul1 = fmul double %c, 2.000000e+00 @@ -41,9 +41,9 @@ define double @test4(double %a, double %b, double %c) { ; CHECK-LABEL: test4: ; CHECK: // %bb.0: +; CHECK-NEXT: fadd d2, d2, d2 ; CHECK-NEXT: fmul d0, d0, d1 -; CHECK-NEXT: fadd d1, d2, d2 -; CHECK-NEXT: fsub d0, d0, d1 +; CHECK-NEXT: fsub d0, d0, d2 ; CHECK-NEXT: ret %mul = fmul double %a, %b %mul1 = fmul double %c, -2.000000e+00 @@ -114,10 +114,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str d8, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: fmov d2, #-2.00000000 +; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: fmul d1, d1, d2 ; CHECK-NEXT: fadd d8, d0, d1 ; CHECK-NEXT: fmov d0, d1 -; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: bl use ; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: fmov d0, d8 @@ -268,8 +268,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: fmul s1, s0, s1 ; CHECK-NEXT: fmadd s0, s2, s3, s1 -; CHECK-NEXT: fadd s0, s4, s0 ; CHECK-NEXT: str s1, [x0] +; CHECK-NEXT: fadd s0, s4, s0 ; CHECK-NEXT: ret %m1 = fmul fast float %a, %b store float %m1, float* %p @@ -286,8 +286,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: fmul s2, s2, s3 ; CHECK-NEXT: fmadd s0, s0, s1, s2 -; CHECK-NEXT: fadd s0, s4, s0 ; CHECK-NEXT: str s2, [x0] +; CHECK-NEXT: fadd s0, s4, s0 ; CHECK-NEXT: ret %m1 = fmul fast float %a, %b %m2 = fmul fast float %c, %d diff --git a/llvm/test/CodeGen/AArch64/faddp-half.ll b/llvm/test/CodeGen/AArch64/faddp-half.ll --- a/llvm/test/CodeGen/AArch64/faddp-half.ll +++ b/llvm/test/CodeGen/AArch64/faddp-half.ll @@ -13,8 +13,8 @@ ; CHECKNOFP16: // %bb.0: // %entry ; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECKNOFP16-NEXT: dup v1.4h, v0.h[1] -; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECKNOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECKNOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s ; CHECKNOFP16-NEXT: fcvtn v0.4h, v0.4s ; CHECKNOFP16-NEXT: // kill: def $h0 killed $h0 killed $q0 @@ -37,8 +37,8 @@ ; CHECKNOFP16: // %bb.0: // %entry ; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECKNOFP16-NEXT: dup v1.4h, v0.h[1] -; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECKNOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECKNOFP16-NEXT: fadd v0.4s, v1.4s, v0.4s ; CHECKNOFP16-NEXT: fcvtn v0.4h, v0.4s ; CHECKNOFP16-NEXT: // kill: def $h0 killed $h0 killed $q0 @@ -61,8 +61,8 @@ ; CHECKNOFP16: // %bb.0: // %entry ; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECKNOFP16-NEXT: dup v1.4h, v0.h[1] -; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECKNOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECKNOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s ; CHECKNOFP16-NEXT: fcvtn v0.4h, v0.4s ; CHECKNOFP16-NEXT: // kill: def $h0 killed $h0 killed $q0 @@ -85,8 +85,8 @@ ; CHECKNOFP16: // %bb.0: // %entry ; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECKNOFP16-NEXT: dup v1.4h, v0.h[1] -; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECKNOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECKNOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECKNOFP16-NEXT: fadd v0.4s, v1.4s, v0.4s ; CHECKNOFP16-NEXT: fcvtn v0.4h, v0.4s ; CHECKNOFP16-NEXT: // kill: def $h0 killed $h0 killed $q0 diff --git a/llvm/test/CodeGen/AArch64/fast-isel-addressing-modes.ll b/llvm/test/CodeGen/AArch64/fast-isel-addressing-modes.ll --- a/llvm/test/CodeGen/AArch64/fast-isel-addressing-modes.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-addressing-modes.ll @@ -410,9 +410,9 @@ define i64 @load_breg_offreg_immoff_2(i64 %a, i64 %b) { ; SDAG-LABEL: load_breg_offreg_immoff_2: ; SDAG: ; %bb.0: -; SDAG-NEXT: add x8, x0, x1 -; SDAG-NEXT: mov w9, #61440 -; SDAG-NEXT: ldr x0, [x8, x9] +; SDAG-NEXT: mov w8, #61440 +; SDAG-NEXT: add x9, x0, x1 +; SDAG-NEXT: ldr x0, [x9, x8] ; SDAG-NEXT: ret ; ; FAST-LABEL: load_breg_offreg_immoff_2: diff --git a/llvm/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll b/llvm/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll --- a/llvm/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll +++ 
b/llvm/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll @@ -139,11 +139,11 @@ ; CHECK-LABEL: test_or_unpredictable: ; CHECK: ; %bb.0: ; %bb1 ; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: cset w8, eq ; CHECK-NEXT: cmp w1, #0 ; CHECK-NEXT: cset w9, eq ; CHECK-NEXT: orr w8, w8, w9 -; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: tbnz w8, #0, LBB4_2 ; CHECK-NEXT: ; %bb.1: ; %bb4 ; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill @@ -172,11 +172,11 @@ ; CHECK-LABEL: test_and_unpredictable: ; CHECK: ; %bb.0: ; %bb1 ; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: cmp w1, #0 ; CHECK-NEXT: cset w9, ne ; CHECK-NEXT: and w8, w8, w9 -; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: tbz w8, #0, LBB5_2 ; CHECK-NEXT: ; %bb.1: ; %bb4 ; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill diff --git a/llvm/test/CodeGen/AArch64/fast-isel-gep.ll b/llvm/test/CodeGen/AArch64/fast-isel-gep.ll --- a/llvm/test/CodeGen/AArch64/fast-isel-gep.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-gep.ll @@ -54,9 +54,9 @@ ; CHECK-LABEL: test_array5: ; CHECK: ; %bb.0: ; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: mov x9, #4 -; CHECK-NEXT: madd x0, x8, x9, x0 +; CHECK-NEXT: mov x8, #4 +; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: madd x0, x9, x8, x0 ; CHECK-NEXT: ret %1 = getelementptr inbounds i32, i32* %a, i32 %i ret i32* %1 diff --git a/llvm/test/CodeGen/AArch64/fast-isel-memcpy.ll b/llvm/test/CodeGen/AArch64/fast-isel-memcpy.ll --- a/llvm/test/CodeGen/AArch64/fast-isel-memcpy.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-memcpy.ll @@ -5,9 +5,9 @@ define void @test(i64 %a, i8* %b) { ; CHECK-LABEL: test: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldr x8, [x1] -; CHECK-NEXT: and x9, x0, #0x7fffffffffffffff -; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: and x8, x0, #0x7fffffffffffffff +; CHECK-NEXT: ldr x9, [x1] +; CHECK-NEXT: str x9, [x8] ; CHECK-NEXT: ret %1 = and i64 %a, 9223372036854775807 %2 = inttoptr i64 %1 to i8* diff --git a/llvm/test/CodeGen/AArch64/fast-isel-shift.ll b/llvm/test/CodeGen/AArch64/fast-isel-shift.ll --- a/llvm/test/CodeGen/AArch64/fast-isel-shift.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-shift.ll @@ -391,9 +391,9 @@ define zeroext i8 @lsrv_i8(i8 %a, i8 %b) { ; CHECK-LABEL: lsrv_i8: ; CHECK: ; %bb.0: -; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: and w9, w1, #0xff -; CHECK-NEXT: lsr w8, w8, w9 +; CHECK-NEXT: and w8, w1, #0xff +; CHECK-NEXT: and w9, w0, #0xff +; CHECK-NEXT: lsr w8, w9, w8 ; CHECK-NEXT: and w8, w8, #0xff ; CHECK-NEXT: uxtb w0, w8 ; CHECK-NEXT: ret @@ -458,9 +458,9 @@ define zeroext i16 @lsrv_i16(i16 %a, i16 %b) { ; CHECK-LABEL: lsrv_i16: ; CHECK: ; %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: and w9, w1, #0xffff -; CHECK-NEXT: lsr w8, w8, w9 +; CHECK-NEXT: and w8, w1, #0xffff +; CHECK-NEXT: and w9, w0, #0xffff +; CHECK-NEXT: lsr w8, w9, w8 ; CHECK-NEXT: and w8, w8, #0xffff ; CHECK-NEXT: uxth w0, w8 ; CHECK-NEXT: ret @@ -517,9 +517,9 @@ define zeroext i8 @asrv_i8(i8 %a, i8 %b) { ; CHECK-LABEL: asrv_i8: ; CHECK: ; %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: and w9, w1, #0xff -; CHECK-NEXT: asr w8, w8, w9 +; CHECK-NEXT: and w8, w1, #0xff +; CHECK-NEXT: sxtb w9, w0 +; CHECK-NEXT: asr w8, w9, w8 ; CHECK-NEXT: and w8, w8, #0xff ; CHECK-NEXT: uxtb w0, w8 ; CHECK-NEXT: ret @@ -582,9 +582,9 @@ define zeroext i16 @asrv_i16(i16 %a, i16 %b) { ; CHECK-LABEL: asrv_i16: ; CHECK: ; %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: and w9, w1, #0xffff -; CHECK-NEXT: asr w8, w8, w9 +; 
CHECK-NEXT: and w8, w1, #0xffff +; CHECK-NEXT: sxth w9, w0 +; CHECK-NEXT: asr w8, w9, w8 ; CHECK-NEXT: and w8, w8, #0xffff ; CHECK-NEXT: uxth w0, w8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fdiv_combine.ll b/llvm/test/CodeGen/AArch64/fdiv_combine.ll --- a/llvm/test/CodeGen/AArch64/fdiv_combine.ll +++ b/llvm/test/CodeGen/AArch64/fdiv_combine.ll @@ -29,8 +29,8 @@ define <2 x float> @test3(<2 x i32> %in) { ; CHECK-LABEL: test3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf.2s v0, v0 ; CHECK-NEXT: fmov.2s v1, #9.00000000 +; CHECK-NEXT: scvtf.2s v0, v0 ; CHECK-NEXT: fdiv.2s v0, v0, v1 ; CHECK-NEXT: ret entry: @@ -43,8 +43,8 @@ define <2 x float> @test4(<2 x i32> %in) { ; CHECK-LABEL: test4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf.2s v0, v0 ; CHECK-NEXT: movi.2s v1, #80, lsl #24 +; CHECK-NEXT: scvtf.2s v0, v0 ; CHECK-NEXT: fdiv.2s v0, v0, v1 ; CHECK-NEXT: ret entry: @@ -106,8 +106,8 @@ ; CHECK-LABEL: test9: ; CHECK: // %bb.0: ; CHECK-NEXT: ucvtf.2d v0, v0 -; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: movi.2s v1, #64, lsl #24 +; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fdiv.2s v0, v0, v1 ; CHECK-NEXT: ret %conv = uitofp <2 x i64> %in to <2 x float> diff --git a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll --- a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll +++ b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll @@ -100,16 +100,16 @@ define i64 @f6() { ; CHECK-LABEL: f6: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, x2 -; CHECK-NEXT: add x8, x8, :lo12:x2 -; CHECK-NEXT: mov w9, #2097152 -; CHECK-NEXT: ldr x0, [x8, x9] +; CHECK-NEXT: mov w8, #2097152 +; CHECK-NEXT: adrp x9, x2 +; CHECK-NEXT: add x9, x9, :lo12:x2 +; CHECK-NEXT: ldr x0, [x9, x8] ; CHECK-NEXT: ret ; ; GISEL-LABEL: f6: ; GISEL: // %bb.0: -; GISEL-NEXT: adrp x9, x2 ; GISEL-NEXT: mov w8, #2097152 +; GISEL-NEXT: adrp x9, x2 ; GISEL-NEXT: add x9, x9, :lo12:x2 ; GISEL-NEXT: ldr x0, [x9, x8] ; GISEL-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll --- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll @@ -7,57 +7,58 @@ ; CHECK-CVT: // %bb.0: // %entry ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] +; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s5, h0 ; CHECK-CVT-NEXT: mov h6, v1.h[2] ; CHECK-CVT-NEXT: mov h7, v0.h[2] ; CHECK-CVT-NEXT: mov h16, v1.h[3] ; CHECK-CVT-NEXT: mov h17, v0.h[3] -; CHECK-CVT-NEXT: fcvt s4, h1 -; CHECK-CVT-NEXT: fcvt s5, h0 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s6, h6 -; CHECK-CVT-NEXT: fcvt s7, h7 -; CHECK-CVT-NEXT: fcvt s16, h16 -; CHECK-CVT-NEXT: fcvt s17, h17 ; CHECK-CVT-NEXT: fadd s4, s5, s4 +; CHECK-CVT-NEXT: fcvt s5, h6 +; CHECK-CVT-NEXT: fcvt s6, h7 +; CHECK-CVT-NEXT: fcvt s7, h16 +; CHECK-CVT-NEXT: fcvt s16, h17 +; CHECK-CVT-NEXT: fadd s3, s3, s2 +; CHECK-CVT-NEXT: fcvt h2, s4 +; CHECK-CVT-NEXT: fadd s4, s6, s5 ; CHECK-CVT-NEXT: mov h5, v1.h[4] -; CHECK-CVT-NEXT: fadd s2, s3, s2 -; CHECK-CVT-NEXT: mov h3, v0.h[4] -; CHECK-CVT-NEXT: fadd s6, s7, s6 -; CHECK-CVT-NEXT: mov h7, v1.h[5] -; CHECK-CVT-NEXT: fadd s16, s17, s16 -; CHECK-CVT-NEXT: mov h17, v0.h[5] -; CHECK-CVT-NEXT: fcvt s5, h5 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s7, h7 -; CHECK-CVT-NEXT: fcvt s17, h17 -; CHECK-CVT-NEXT: fadd s3, s3, s5 -; CHECK-CVT-NEXT: mov h5, v1.h[6] -; CHECK-CVT-NEXT: fadd s7, s17, s7 -; CHECK-CVT-NEXT: mov h17, v0.h[6] 
+; CHECK-CVT-NEXT: mov h6, v0.h[4] +; CHECK-CVT-NEXT: fadd s7, s16, s7 +; CHECK-CVT-NEXT: fcvt h3, s3 +; CHECK-CVT-NEXT: mov h16, v0.h[5] +; CHECK-CVT-NEXT: fcvt h7, s7 +; CHECK-CVT-NEXT: mov v2.h[1], v3.h[0] +; CHECK-CVT-NEXT: fcvt h3, s4 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: fcvt s5, h6 +; CHECK-CVT-NEXT: mov h6, v1.h[5] +; CHECK-CVT-NEXT: mov v2.h[2], v3.h[0] +; CHECK-CVT-NEXT: fadd s3, s5, s4 +; CHECK-CVT-NEXT: fcvt s4, h6 +; CHECK-CVT-NEXT: fcvt s5, h16 +; CHECK-CVT-NEXT: mov h6, v1.h[6] +; CHECK-CVT-NEXT: mov h16, v0.h[6] ; CHECK-CVT-NEXT: mov h1, v1.h[7] +; CHECK-CVT-NEXT: mov v2.h[3], v7.h[0] ; CHECK-CVT-NEXT: mov h0, v0.h[7] +; CHECK-CVT-NEXT: fcvt h3, s3 +; CHECK-CVT-NEXT: fadd s4, s5, s4 +; CHECK-CVT-NEXT: fcvt s5, h6 +; CHECK-CVT-NEXT: fcvt s6, h16 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: fadd s1, s0, s1 -; CHECK-CVT-NEXT: fcvt h0, s4 -; CHECK-CVT-NEXT: fcvt h2, s2 -; CHECK-CVT-NEXT: mov v0.h[1], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s6 -; CHECK-CVT-NEXT: mov v0.h[2], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s16 -; CHECK-CVT-NEXT: fcvt s5, h5 -; CHECK-CVT-NEXT: fcvt s17, h17 -; CHECK-CVT-NEXT: mov v0.h[3], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s3 -; CHECK-CVT-NEXT: fadd s5, s17, s5 -; CHECK-CVT-NEXT: fcvt h3, s7 -; CHECK-CVT-NEXT: mov v0.h[4], v2.h[0] -; CHECK-CVT-NEXT: fcvt h4, s5 -; CHECK-CVT-NEXT: mov v0.h[5], v3.h[0] -; CHECK-CVT-NEXT: mov v0.h[6], v4.h[0] -; CHECK-CVT-NEXT: fcvt h1, s1 -; CHECK-CVT-NEXT: mov v0.h[7], v1.h[0] +; CHECK-CVT-NEXT: mov v2.h[4], v3.h[0] +; CHECK-CVT-NEXT: fcvt h3, s4 +; CHECK-CVT-NEXT: fadd s4, s6, s5 +; CHECK-CVT-NEXT: fadd s0, s0, s1 +; CHECK-CVT-NEXT: mov v2.h[5], v3.h[0] +; CHECK-CVT-NEXT: fcvt h3, s4 +; CHECK-CVT-NEXT: fcvt h0, s0 +; CHECK-CVT-NEXT: mov v2.h[6], v3.h[0] +; CHECK-CVT-NEXT: mov v2.h[7], v0.h[0] +; CHECK-CVT-NEXT: mov v0.16b, v2.16b ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: add_h: @@ -75,57 +76,58 @@ ; CHECK-CVT: // %bb.0: // %entry ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] +; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s5, h0 ; CHECK-CVT-NEXT: mov h6, v1.h[2] ; CHECK-CVT-NEXT: mov h7, v0.h[2] ; CHECK-CVT-NEXT: mov h16, v1.h[3] ; CHECK-CVT-NEXT: mov h17, v0.h[3] -; CHECK-CVT-NEXT: fcvt s4, h1 -; CHECK-CVT-NEXT: fcvt s5, h0 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s6, h6 -; CHECK-CVT-NEXT: fcvt s7, h7 -; CHECK-CVT-NEXT: fcvt s16, h16 -; CHECK-CVT-NEXT: fcvt s17, h17 ; CHECK-CVT-NEXT: fsub s4, s5, s4 +; CHECK-CVT-NEXT: fcvt s5, h6 +; CHECK-CVT-NEXT: fcvt s6, h7 +; CHECK-CVT-NEXT: fcvt s7, h16 +; CHECK-CVT-NEXT: fcvt s16, h17 +; CHECK-CVT-NEXT: fsub s3, s3, s2 +; CHECK-CVT-NEXT: fcvt h2, s4 +; CHECK-CVT-NEXT: fsub s4, s6, s5 ; CHECK-CVT-NEXT: mov h5, v1.h[4] -; CHECK-CVT-NEXT: fsub s2, s3, s2 -; CHECK-CVT-NEXT: mov h3, v0.h[4] -; CHECK-CVT-NEXT: fsub s6, s7, s6 -; CHECK-CVT-NEXT: mov h7, v1.h[5] -; CHECK-CVT-NEXT: fsub s16, s17, s16 -; CHECK-CVT-NEXT: mov h17, v0.h[5] -; CHECK-CVT-NEXT: fcvt s5, h5 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s7, h7 -; CHECK-CVT-NEXT: fcvt s17, h17 -; CHECK-CVT-NEXT: fsub s3, s3, s5 -; CHECK-CVT-NEXT: mov h5, v1.h[6] -; CHECK-CVT-NEXT: fsub s7, s17, s7 -; CHECK-CVT-NEXT: mov h17, v0.h[6] +; CHECK-CVT-NEXT: mov h6, v0.h[4] +; CHECK-CVT-NEXT: fsub s7, s16, s7 +; CHECK-CVT-NEXT: fcvt h3, s3 +; CHECK-CVT-NEXT: mov h16, v0.h[5] +; CHECK-CVT-NEXT: fcvt h7, s7 +; CHECK-CVT-NEXT: mov v2.h[1], v3.h[0] +; CHECK-CVT-NEXT: fcvt h3, s4 +; CHECK-CVT-NEXT: fcvt s4, h5 +; 
CHECK-CVT-NEXT: fcvt s5, h6 +; CHECK-CVT-NEXT: mov h6, v1.h[5] +; CHECK-CVT-NEXT: mov v2.h[2], v3.h[0] +; CHECK-CVT-NEXT: fsub s3, s5, s4 +; CHECK-CVT-NEXT: fcvt s4, h6 +; CHECK-CVT-NEXT: fcvt s5, h16 +; CHECK-CVT-NEXT: mov h6, v1.h[6] +; CHECK-CVT-NEXT: mov h16, v0.h[6] ; CHECK-CVT-NEXT: mov h1, v1.h[7] +; CHECK-CVT-NEXT: mov v2.h[3], v7.h[0] ; CHECK-CVT-NEXT: mov h0, v0.h[7] +; CHECK-CVT-NEXT: fcvt h3, s3 +; CHECK-CVT-NEXT: fsub s4, s5, s4 +; CHECK-CVT-NEXT: fcvt s5, h6 +; CHECK-CVT-NEXT: fcvt s6, h16 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: fsub s1, s0, s1 -; CHECK-CVT-NEXT: fcvt h0, s4 -; CHECK-CVT-NEXT: fcvt h2, s2 -; CHECK-CVT-NEXT: mov v0.h[1], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s6 -; CHECK-CVT-NEXT: mov v0.h[2], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s16 -; CHECK-CVT-NEXT: fcvt s5, h5 -; CHECK-CVT-NEXT: fcvt s17, h17 -; CHECK-CVT-NEXT: mov v0.h[3], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s3 -; CHECK-CVT-NEXT: fsub s5, s17, s5 -; CHECK-CVT-NEXT: fcvt h3, s7 -; CHECK-CVT-NEXT: mov v0.h[4], v2.h[0] -; CHECK-CVT-NEXT: fcvt h4, s5 -; CHECK-CVT-NEXT: mov v0.h[5], v3.h[0] -; CHECK-CVT-NEXT: mov v0.h[6], v4.h[0] -; CHECK-CVT-NEXT: fcvt h1, s1 -; CHECK-CVT-NEXT: mov v0.h[7], v1.h[0] +; CHECK-CVT-NEXT: mov v2.h[4], v3.h[0] +; CHECK-CVT-NEXT: fcvt h3, s4 +; CHECK-CVT-NEXT: fsub s4, s6, s5 +; CHECK-CVT-NEXT: fsub s0, s0, s1 +; CHECK-CVT-NEXT: mov v2.h[5], v3.h[0] +; CHECK-CVT-NEXT: fcvt h3, s4 +; CHECK-CVT-NEXT: fcvt h0, s0 +; CHECK-CVT-NEXT: mov v2.h[6], v3.h[0] +; CHECK-CVT-NEXT: mov v2.h[7], v0.h[0] +; CHECK-CVT-NEXT: mov v0.16b, v2.16b ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: sub_h: @@ -143,57 +145,58 @@ ; CHECK-CVT: // %bb.0: // %entry ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] -; CHECK-CVT-NEXT: mov h6, v1.h[2] -; CHECK-CVT-NEXT: mov h7, v0.h[2] -; CHECK-CVT-NEXT: mov h16, v1.h[3] -; CHECK-CVT-NEXT: mov h17, v0.h[3] ; CHECK-CVT-NEXT: fcvt s4, h1 ; CHECK-CVT-NEXT: fcvt s5, h0 +; CHECK-CVT-NEXT: mov h6, v1.h[2] +; CHECK-CVT-NEXT: mov h7, v0.h[2] +; CHECK-CVT-NEXT: mov h16, v0.h[3] ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 +; CHECK-CVT-NEXT: fmul s4, s5, s4 +; CHECK-CVT-NEXT: mov h5, v1.h[3] ; CHECK-CVT-NEXT: fcvt s6, h6 ; CHECK-CVT-NEXT: fcvt s7, h7 -; CHECK-CVT-NEXT: fcvt s16, h16 -; CHECK-CVT-NEXT: fcvt s17, h17 -; CHECK-CVT-NEXT: fmul s4, s5, s4 -; CHECK-CVT-NEXT: mov h5, v1.h[4] -; CHECK-CVT-NEXT: fmul s2, s3, s2 -; CHECK-CVT-NEXT: mov h3, v0.h[4] +; CHECK-CVT-NEXT: fmul s3, s3, s2 +; CHECK-CVT-NEXT: fcvt h2, s4 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: fcvt s5, h16 ; CHECK-CVT-NEXT: fmul s6, s7, s6 -; CHECK-CVT-NEXT: mov h7, v1.h[5] -; CHECK-CVT-NEXT: fmul s16, s17, s16 -; CHECK-CVT-NEXT: mov h17, v0.h[5] +; CHECK-CVT-NEXT: mov h7, v1.h[4] +; CHECK-CVT-NEXT: mov h16, v0.h[4] +; CHECK-CVT-NEXT: fcvt h3, s3 +; CHECK-CVT-NEXT: fmul s4, s5, s4 +; CHECK-CVT-NEXT: mov h5, v0.h[5] +; CHECK-CVT-NEXT: fcvt h6, s6 +; CHECK-CVT-NEXT: fcvt s7, h7 +; CHECK-CVT-NEXT: mov v2.h[1], v3.h[0] +; CHECK-CVT-NEXT: mov h3, v1.h[5] +; CHECK-CVT-NEXT: fcvt s16, h16 +; CHECK-CVT-NEXT: fcvt h4, s4 ; CHECK-CVT-NEXT: fcvt s5, h5 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s7, h7 -; CHECK-CVT-NEXT: fcvt s17, h17 -; CHECK-CVT-NEXT: fmul s3, s3, s5 -; CHECK-CVT-NEXT: mov h5, v1.h[6] -; CHECK-CVT-NEXT: fmul s7, s17, s7 -; CHECK-CVT-NEXT: mov h17, v0.h[6] +; CHECK-CVT-NEXT: mov v2.h[2], v6.h[0] +; CHECK-CVT-NEXT: fmul s6, s16, s7 +; CHECK-CVT-NEXT: mov h7, v1.h[6] +; CHECK-CVT-NEXT: mov h16, v0.h[6] ; CHECK-CVT-NEXT: mov 
h1, v1.h[7] +; CHECK-CVT-NEXT: fmul s3, s5, s3 ; CHECK-CVT-NEXT: mov h0, v0.h[7] +; CHECK-CVT-NEXT: mov v2.h[3], v4.h[0] +; CHECK-CVT-NEXT: fcvt h4, s6 +; CHECK-CVT-NEXT: fcvt s5, h7 +; CHECK-CVT-NEXT: fcvt s6, h16 ; CHECK-CVT-NEXT: fcvt s1, h1 +; CHECK-CVT-NEXT: fcvt h3, s3 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: fmul s1, s0, s1 -; CHECK-CVT-NEXT: fcvt h0, s4 -; CHECK-CVT-NEXT: fcvt h2, s2 -; CHECK-CVT-NEXT: mov v0.h[1], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s6 -; CHECK-CVT-NEXT: mov v0.h[2], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s16 -; CHECK-CVT-NEXT: fcvt s5, h5 -; CHECK-CVT-NEXT: fcvt s17, h17 -; CHECK-CVT-NEXT: mov v0.h[3], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s3 -; CHECK-CVT-NEXT: fmul s5, s17, s5 -; CHECK-CVT-NEXT: fcvt h3, s7 -; CHECK-CVT-NEXT: mov v0.h[4], v2.h[0] -; CHECK-CVT-NEXT: fcvt h4, s5 -; CHECK-CVT-NEXT: mov v0.h[5], v3.h[0] -; CHECK-CVT-NEXT: mov v0.h[6], v4.h[0] -; CHECK-CVT-NEXT: fcvt h1, s1 -; CHECK-CVT-NEXT: mov v0.h[7], v1.h[0] +; CHECK-CVT-NEXT: mov v2.h[4], v4.h[0] +; CHECK-CVT-NEXT: fmul s4, s6, s5 +; CHECK-CVT-NEXT: fmul s0, s0, s1 +; CHECK-CVT-NEXT: mov v2.h[5], v3.h[0] +; CHECK-CVT-NEXT: fcvt h3, s4 +; CHECK-CVT-NEXT: fcvt h0, s0 +; CHECK-CVT-NEXT: mov v2.h[6], v3.h[0] +; CHECK-CVT-NEXT: mov v2.h[7], v0.h[0] +; CHECK-CVT-NEXT: mov v0.16b, v2.16b ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: mul_h: @@ -211,57 +214,58 @@ ; CHECK-CVT: // %bb.0: // %entry ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] -; CHECK-CVT-NEXT: mov h6, v1.h[2] -; CHECK-CVT-NEXT: mov h7, v0.h[2] -; CHECK-CVT-NEXT: mov h16, v1.h[3] -; CHECK-CVT-NEXT: mov h17, v0.h[3] -; CHECK-CVT-NEXT: fcvt s4, h1 -; CHECK-CVT-NEXT: fcvt s5, h0 +; CHECK-CVT-NEXT: fcvt s4, h0 +; CHECK-CVT-NEXT: mov h5, v0.h[2] +; CHECK-CVT-NEXT: mov h6, v0.h[3] +; CHECK-CVT-NEXT: mov h7, v0.h[4] +; CHECK-CVT-NEXT: mov h16, v0.h[5] +; CHECK-CVT-NEXT: mov h17, v0.h[6] ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 +; CHECK-CVT-NEXT: fcvt s5, h5 +; CHECK-CVT-NEXT: mov h0, v0.h[7] ; CHECK-CVT-NEXT: fcvt s6, h6 ; CHECK-CVT-NEXT: fcvt s7, h7 ; CHECK-CVT-NEXT: fcvt s16, h16 ; CHECK-CVT-NEXT: fcvt s17, h17 -; CHECK-CVT-NEXT: fdiv s4, s5, s4 -; CHECK-CVT-NEXT: mov h5, v1.h[4] ; CHECK-CVT-NEXT: fdiv s2, s3, s2 -; CHECK-CVT-NEXT: mov h3, v0.h[4] +; CHECK-CVT-NEXT: fcvt s3, h1 +; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: fdiv s3, s4, s3 +; CHECK-CVT-NEXT: mov h4, v1.h[2] +; CHECK-CVT-NEXT: fcvt h18, s2 +; CHECK-CVT-NEXT: fcvt s4, h4 +; CHECK-CVT-NEXT: fdiv s4, s5, s4 +; CHECK-CVT-NEXT: mov h5, v1.h[3] +; CHECK-CVT-NEXT: fcvt h2, s3 +; CHECK-CVT-NEXT: fcvt s5, h5 +; CHECK-CVT-NEXT: mov v2.h[1], v18.h[0] +; CHECK-CVT-NEXT: fdiv s5, s6, s5 +; CHECK-CVT-NEXT: mov h6, v1.h[4] +; CHECK-CVT-NEXT: fcvt h4, s4 +; CHECK-CVT-NEXT: fcvt s6, h6 +; CHECK-CVT-NEXT: mov v2.h[2], v4.h[0] ; CHECK-CVT-NEXT: fdiv s6, s7, s6 ; CHECK-CVT-NEXT: mov h7, v1.h[5] -; CHECK-CVT-NEXT: fdiv s16, s17, s16 -; CHECK-CVT-NEXT: mov h17, v0.h[5] -; CHECK-CVT-NEXT: fcvt s5, h5 -; CHECK-CVT-NEXT: fcvt s3, h3 +; CHECK-CVT-NEXT: fcvt h4, s5 ; CHECK-CVT-NEXT: fcvt s7, h7 -; CHECK-CVT-NEXT: fcvt s17, h17 -; CHECK-CVT-NEXT: fdiv s3, s3, s5 -; CHECK-CVT-NEXT: mov h5, v1.h[6] -; CHECK-CVT-NEXT: fdiv s7, s17, s7 -; CHECK-CVT-NEXT: mov h17, v0.h[6] +; CHECK-CVT-NEXT: mov v2.h[3], v4.h[0] +; CHECK-CVT-NEXT: fdiv s7, s16, s7 +; CHECK-CVT-NEXT: mov h16, v1.h[6] ; CHECK-CVT-NEXT: mov h1, v1.h[7] -; CHECK-CVT-NEXT: mov h0, v0.h[7] +; CHECK-CVT-NEXT: fcvt s16, h16 ; CHECK-CVT-NEXT: fcvt s1, h1 -; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: fdiv 
s1, s0, s1 -; CHECK-CVT-NEXT: fcvt h0, s4 -; CHECK-CVT-NEXT: fcvt h2, s2 -; CHECK-CVT-NEXT: mov v0.h[1], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s6 -; CHECK-CVT-NEXT: mov v0.h[2], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s16 -; CHECK-CVT-NEXT: fcvt s5, h5 -; CHECK-CVT-NEXT: fcvt s17, h17 -; CHECK-CVT-NEXT: mov v0.h[3], v2.h[0] -; CHECK-CVT-NEXT: fcvt h2, s3 -; CHECK-CVT-NEXT: fdiv s5, s17, s5 -; CHECK-CVT-NEXT: fcvt h3, s7 -; CHECK-CVT-NEXT: mov v0.h[4], v2.h[0] -; CHECK-CVT-NEXT: fcvt h4, s5 -; CHECK-CVT-NEXT: mov v0.h[5], v3.h[0] -; CHECK-CVT-NEXT: mov v0.h[6], v4.h[0] -; CHECK-CVT-NEXT: fcvt h1, s1 -; CHECK-CVT-NEXT: mov v0.h[7], v1.h[0] +; CHECK-CVT-NEXT: fdiv s3, s17, s16 +; CHECK-CVT-NEXT: fdiv s0, s0, s1 +; CHECK-CVT-NEXT: fcvt h1, s6 +; CHECK-CVT-NEXT: mov v2.h[4], v1.h[0] +; CHECK-CVT-NEXT: fcvt h1, s7 +; CHECK-CVT-NEXT: mov v2.h[5], v1.h[0] +; CHECK-CVT-NEXT: fcvt h1, s3 +; CHECK-CVT-NEXT: mov v2.h[6], v1.h[0] +; CHECK-CVT-NEXT: fcvt h0, s0 +; CHECK-CVT-NEXT: mov v2.h[7], v0.h[0] +; CHECK-CVT-NEXT: mov v0.16b, v2.16b ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: div_h: @@ -311,22 +315,22 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov d4, v0.d[1] ; CHECK-NEXT: fcvt h0, d0 +; CHECK-NEXT: mov d5, v1.d[1] +; CHECK-NEXT: fcvt h1, d1 ; CHECK-NEXT: fcvt h4, d4 ; CHECK-NEXT: mov v0.h[1], v4.h[0] -; CHECK-NEXT: fcvt h4, d1 -; CHECK-NEXT: mov d1, v1.d[1] -; CHECK-NEXT: mov v0.h[2], v4.h[0] -; CHECK-NEXT: fcvt h1, d1 -; CHECK-NEXT: fcvt h4, d2 +; CHECK-NEXT: fcvt h4, d5 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: fcvt h1, d2 ; CHECK-NEXT: mov d2, v2.d[1] -; CHECK-NEXT: mov v0.h[3], v1.h[0] +; CHECK-NEXT: mov v0.h[3], v4.h[0] ; CHECK-NEXT: fcvt h2, d2 -; CHECK-NEXT: mov v0.h[4], v4.h[0] +; CHECK-NEXT: mov v0.h[4], v1.h[0] ; CHECK-NEXT: fcvt h1, d3 -; CHECK-NEXT: mov d3, v3.d[1] ; CHECK-NEXT: mov v0.h[5], v2.h[0] +; CHECK-NEXT: mov d2, v3.d[1] ; CHECK-NEXT: mov v0.h[6], v1.h[0] -; CHECK-NEXT: fcvt h1, d3 +; CHECK-NEXT: fcvt h1, d2 ; CHECK-NEXT: mov v0.h[7], v1.h[0] ; CHECK-NEXT: ret %1 = fptrunc <8 x double> %a to <8 x half> @@ -346,26 +350,25 @@ define <8 x double> @h_to_d(<8 x half> %a) { ; CHECK-LABEL: h_to_d: ; CHECK: // %bb.0: +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: fcvt d4, h0 -; CHECK-NEXT: mov h2, v0.h[3] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fcvt d5, h1 -; CHECK-NEXT: fcvt d2, h2 -; CHECK-NEXT: fcvt d1, h3 -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: mov v1.d[1], v2.d[0] -; CHECK-NEXT: fcvt d2, h0 -; CHECK-NEXT: fcvt d3, h3 -; CHECK-NEXT: mov v2.d[1], v3.d[0] ; CHECK-NEXT: mov h3, v0.h[3] -; CHECK-NEXT: mov h0, v0.h[2] -; CHECK-NEXT: mov v4.d[1], v5.d[0] -; CHECK-NEXT: fcvt d5, h3 -; CHECK-NEXT: fcvt d3, h0 +; CHECK-NEXT: mov h4, v0.h[2] +; CHECK-NEXT: fcvt d0, h0 +; CHECK-NEXT: mov h5, v2.h[1] +; CHECK-NEXT: mov h6, v2.h[3] +; CHECK-NEXT: mov h7, v2.h[2] +; CHECK-NEXT: fcvt d16, h1 +; CHECK-NEXT: fcvt d17, h3 +; CHECK-NEXT: fcvt d1, h4 +; CHECK-NEXT: fcvt d2, h2 +; CHECK-NEXT: fcvt d4, h5 +; CHECK-NEXT: fcvt d5, h6 +; CHECK-NEXT: fcvt d3, h7 +; CHECK-NEXT: mov v0.d[1], v16.d[0] +; CHECK-NEXT: mov v1.d[1], v17.d[0] +; CHECK-NEXT: mov v2.d[1], v4.d[0] ; CHECK-NEXT: mov v3.d[1], v5.d[0] -; CHECK-NEXT: mov v0.16b, v4.16b ; CHECK-NEXT: ret %1 = fpext <8 x half> %a to <8 x double> ret <8 x double> %1 @@ -550,8 +553,8 @@ ; CHECK-CVT-NEXT: fcvtl v1.4s, v0.4h ; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s -; CHECK-CVT-NEXT: xtn v1.4h, v1.4s ; CHECK-CVT-NEXT: fcvtzs 
v0.4s, v0.4s +; CHECK-CVT-NEXT: xtn v1.4h, v1.4s ; CHECK-CVT-NEXT: xtn2 v1.8h, v0.4s ; CHECK-CVT-NEXT: xtn v0.8b, v1.8h ; CHECK-CVT-NEXT: ret @@ -576,11 +579,11 @@ ; CHECK-CVT-LABEL: fptosi_i16: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: fcvtl v1.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h -; CHECK-CVT-NEXT: fcvtzs v0.4s, v1.4s -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: fcvtzs v1.4s, v2.4s -; CHECK-CVT-NEXT: xtn2 v0.8h, v1.4s +; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-CVT-NEXT: fcvtzs v2.4s, v0.4s +; CHECK-CVT-NEXT: xtn v0.4h, v1.4s +; CHECK-CVT-NEXT: xtn2 v0.8h, v2.4s ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: fptosi_i16: @@ -597,8 +600,8 @@ ; CHECK-CVT-NEXT: fcvtl v1.4s, v0.4h ; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h ; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s -; CHECK-CVT-NEXT: xtn v1.4h, v1.4s ; CHECK-CVT-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-CVT-NEXT: xtn v1.4h, v1.4s ; CHECK-CVT-NEXT: xtn2 v1.8h, v0.4s ; CHECK-CVT-NEXT: xtn v0.8b, v1.8h ; CHECK-CVT-NEXT: ret @@ -616,11 +619,11 @@ ; CHECK-CVT-LABEL: fptoui_i16: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: fcvtl v1.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h -; CHECK-CVT-NEXT: fcvtzu v0.4s, v1.4s -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: fcvtzu v1.4s, v2.4s -; CHECK-CVT-NEXT: xtn2 v0.8h, v1.4s +; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s +; CHECK-CVT-NEXT: fcvtzu v2.4s, v0.4s +; CHECK-CVT-NEXT: xtn v0.4h, v1.4s +; CHECK-CVT-NEXT: xtn2 v0.8h, v2.4s ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: fptoui_i16: @@ -636,59 +639,59 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] +; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s5, h0 +; CHECK-CVT-NEXT: mov h6, v1.h[4] +; CHECK-CVT-NEXT: mov h7, v0.h[4] +; CHECK-CVT-NEXT: mov h16, v1.h[5] ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s6, h6 +; CHECK-CVT-NEXT: fcvt s7, h7 ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: fcvt s2, h0 -; CHECK-CVT-NEXT: mov h3, v1.h[2] +; CHECK-CVT-NEXT: mov h2, v1.h[2] +; CHECK-CVT-NEXT: mov h3, v0.h[2] ; CHECK-CVT-NEXT: csetm w8, ne -; CHECK-CVT-NEXT: fcmp s2, s4 -; CHECK-CVT-NEXT: mov h2, v0.h[2] -; CHECK-CVT-NEXT: fcvt s3, h3 +; CHECK-CVT-NEXT: fcmp s5, s4 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: mov h4, v1.h[3] -; CHECK-CVT-NEXT: csetm w9, ne -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v0.h[3] -; CHECK-CVT-NEXT: fcvt s3, h4 -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fmov s4, w9 -; CHECK-CVT-NEXT: csetm w9, ne -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v1.h[4] -; CHECK-CVT-NEXT: mov h3, v0.h[4] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[1], w8 -; CHECK-CVT-NEXT: csetm w8, ne -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[5] -; CHECK-CVT-NEXT: mov h3, v0.h[5] -; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[2], w9 +; CHECK-CVT-NEXT: mov h5, v0.h[3] ; CHECK-CVT-NEXT: csetm w9, ne ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[6] -; CHECK-CVT-NEXT: mov h3, v0.h[6] +; CHECK-CVT-NEXT: fmov s2, w9 +; CHECK-CVT-NEXT: fcvt s3, h4 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v0.h[5] +; CHECK-CVT-NEXT: mov v2.h[1], w8 +; CHECK-CVT-NEXT: csetm w8, ne +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: fcvt s3, h16 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v1.h[6] +; 
CHECK-CVT-NEXT: mov v2.h[2], w8 ; CHECK-CVT-NEXT: mov h1, v1.h[7] +; CHECK-CVT-NEXT: csetm w8, ne +; CHECK-CVT-NEXT: fcmp s7, s6 +; CHECK-CVT-NEXT: mov h6, v0.h[6] ; CHECK-CVT-NEXT: mov h0, v0.h[7] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[3], w8 +; CHECK-CVT-NEXT: mov v2.h[3], w8 +; CHECK-CVT-NEXT: csetm w8, ne +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: fcvt s3, h5 +; CHECK-CVT-NEXT: fcvt s4, h6 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: mov v2.h[4], w8 +; CHECK-CVT-NEXT: csetm w8, ne +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: mov v2.h[5], w8 ; CHECK-CVT-NEXT: csetm w8, ne -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov v4.h[4], w9 -; CHECK-CVT-NEXT: csetm w9, ne ; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: mov v4.h[5], w8 -; CHECK-CVT-NEXT: mov v4.h[6], w9 +; CHECK-CVT-NEXT: mov v2.h[6], w8 ; CHECK-CVT-NEXT: csetm w8, ne -; CHECK-CVT-NEXT: mov v4.h[7], w8 -; CHECK-CVT-NEXT: xtn v0.8b, v4.8h +; CHECK-CVT-NEXT: mov v2.h[7], w8 +; CHECK-CVT-NEXT: xtn v0.8b, v2.8h ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_une: @@ -706,67 +709,67 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] +; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s5, h0 +; CHECK-CVT-NEXT: mov h6, v0.h[4] ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: fcvt s4, h1 -; CHECK-CVT-NEXT: fcvt s2, h0 +; CHECK-CVT-NEXT: mov h2, v1.h[2] +; CHECK-CVT-NEXT: mov h3, v0.h[2] ; CHECK-CVT-NEXT: csetm w8, eq -; CHECK-CVT-NEXT: mov h3, v1.h[2] ; CHECK-CVT-NEXT: csinv w8, w8, wzr, vc -; CHECK-CVT-NEXT: fcmp s2, s4 -; CHECK-CVT-NEXT: mov h2, v0.h[2] -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: csetm w9, eq +; CHECK-CVT-NEXT: fcmp s5, s4 ; CHECK-CVT-NEXT: mov h4, v1.h[3] -; CHECK-CVT-NEXT: csinv w9, w9, wzr, vc -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v0.h[3] -; CHECK-CVT-NEXT: fcvt s3, h4 -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fmov s4, w9 -; CHECK-CVT-NEXT: csetm w9, eq -; CHECK-CVT-NEXT: csinv w9, w9, wzr, vc -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v1.h[4] -; CHECK-CVT-NEXT: mov h3, v0.h[4] ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[1], w8 -; CHECK-CVT-NEXT: csetm w8, eq -; CHECK-CVT-NEXT: csinv w8, w8, wzr, vc -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[5] -; CHECK-CVT-NEXT: mov h3, v0.h[5] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[2], w9 +; CHECK-CVT-NEXT: mov h5, v0.h[3] ; CHECK-CVT-NEXT: csetm w9, eq ; CHECK-CVT-NEXT: csinv w9, w9, wzr, vc +; CHECK-CVT-NEXT: fcvt s4, h4 ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[6] -; CHECK-CVT-NEXT: mov h3, v0.h[6] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[3], w8 -; CHECK-CVT-NEXT: csetm w8, eq +; CHECK-CVT-NEXT: fcvt s2, h5 +; CHECK-CVT-NEXT: fmov s3, w9 +; CHECK-CVT-NEXT: mov h5, v1.h[4] +; CHECK-CVT-NEXT: csetm w9, eq +; CHECK-CVT-NEXT: mov v3.h[1], w8 +; CHECK-CVT-NEXT: csinv w8, w9, wzr, vc +; CHECK-CVT-NEXT: fcmp s2, s4 +; CHECK-CVT-NEXT: fcvt s2, h5 +; CHECK-CVT-NEXT: fcvt s4, h6 +; CHECK-CVT-NEXT: mov h5, v1.h[5] +; CHECK-CVT-NEXT: mov h6, v0.h[5] +; CHECK-CVT-NEXT: csetm w9, eq +; CHECK-CVT-NEXT: mov v3.h[2], w8 +; CHECK-CVT-NEXT: csinv w8, w9, wzr, vc +; CHECK-CVT-NEXT: fcmp s4, s2 +; CHECK-CVT-NEXT: fcvt s2, h5 +; 
CHECK-CVT-NEXT: fcvt s4, h6 +; CHECK-CVT-NEXT: mov h5, v1.h[6] +; CHECK-CVT-NEXT: mov h6, v0.h[6] +; CHECK-CVT-NEXT: csetm w9, eq ; CHECK-CVT-NEXT: mov h1, v1.h[7] +; CHECK-CVT-NEXT: mov v3.h[3], w8 +; CHECK-CVT-NEXT: csinv w8, w9, wzr, vc +; CHECK-CVT-NEXT: fcmp s4, s2 +; CHECK-CVT-NEXT: fcvt s2, h5 +; CHECK-CVT-NEXT: fcvt s4, h6 ; CHECK-CVT-NEXT: mov h0, v0.h[7] -; CHECK-CVT-NEXT: mov v4.h[4], w9 -; CHECK-CVT-NEXT: csinv w8, w8, wzr, vc -; CHECK-CVT-NEXT: fcmp s3, s2 ; CHECK-CVT-NEXT: fcvt s1, h1 +; CHECK-CVT-NEXT: csetm w9, eq +; CHECK-CVT-NEXT: mov v3.h[4], w8 +; CHECK-CVT-NEXT: csinv w8, w9, wzr, vc +; CHECK-CVT-NEXT: fcmp s4, s2 ; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: mov v4.h[5], w8 +; CHECK-CVT-NEXT: mov v3.h[5], w8 ; CHECK-CVT-NEXT: csetm w8, eq ; CHECK-CVT-NEXT: csinv w8, w8, wzr, vc ; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: mov v4.h[6], w8 +; CHECK-CVT-NEXT: mov v3.h[6], w8 ; CHECK-CVT-NEXT: csetm w8, eq ; CHECK-CVT-NEXT: csinv w8, w8, wzr, vc -; CHECK-CVT-NEXT: mov v4.h[7], w8 -; CHECK-CVT-NEXT: xtn v0.8b, v4.8h +; CHECK-CVT-NEXT: mov v3.h[7], w8 +; CHECK-CVT-NEXT: xtn v0.8b, v3.8h ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ueq: @@ -786,59 +789,59 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] +; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s5, h0 +; CHECK-CVT-NEXT: mov h6, v1.h[4] +; CHECK-CVT-NEXT: mov h7, v0.h[4] +; CHECK-CVT-NEXT: mov h16, v1.h[5] ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s6, h6 +; CHECK-CVT-NEXT: fcvt s7, h7 ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: fcvt s2, h0 -; CHECK-CVT-NEXT: mov h3, v1.h[2] +; CHECK-CVT-NEXT: mov h2, v1.h[2] +; CHECK-CVT-NEXT: mov h3, v0.h[2] ; CHECK-CVT-NEXT: csetm w8, hi -; CHECK-CVT-NEXT: fcmp s2, s4 -; CHECK-CVT-NEXT: mov h2, v0.h[2] -; CHECK-CVT-NEXT: fcvt s3, h3 +; CHECK-CVT-NEXT: fcmp s5, s4 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: mov h4, v1.h[3] -; CHECK-CVT-NEXT: csetm w9, hi -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v0.h[3] -; CHECK-CVT-NEXT: fcvt s3, h4 -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fmov s4, w9 -; CHECK-CVT-NEXT: csetm w9, hi -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v1.h[4] -; CHECK-CVT-NEXT: mov h3, v0.h[4] -; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[1], w8 -; CHECK-CVT-NEXT: csetm w8, hi -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[5] -; CHECK-CVT-NEXT: mov h3, v0.h[5] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[2], w9 +; CHECK-CVT-NEXT: mov h5, v0.h[3] ; CHECK-CVT-NEXT: csetm w9, hi ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[6] -; CHECK-CVT-NEXT: mov h3, v0.h[6] +; CHECK-CVT-NEXT: fmov s2, w9 +; CHECK-CVT-NEXT: fcvt s3, h4 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v0.h[5] +; CHECK-CVT-NEXT: mov v2.h[1], w8 +; CHECK-CVT-NEXT: csetm w8, hi +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: fcvt s3, h16 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v1.h[6] +; CHECK-CVT-NEXT: mov v2.h[2], w8 ; CHECK-CVT-NEXT: mov h1, v1.h[7] +; CHECK-CVT-NEXT: csetm w8, hi +; CHECK-CVT-NEXT: fcmp s7, s6 +; CHECK-CVT-NEXT: mov h6, v0.h[6] ; CHECK-CVT-NEXT: mov h0, v0.h[7] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[3], w8 +; CHECK-CVT-NEXT: mov v2.h[3], w8 +; CHECK-CVT-NEXT: csetm w8, hi +; CHECK-CVT-NEXT: fcmp s4, s3 +; 
CHECK-CVT-NEXT: fcvt s3, h5 +; CHECK-CVT-NEXT: fcvt s4, h6 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: mov v2.h[4], w8 +; CHECK-CVT-NEXT: csetm w8, hi +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: mov v2.h[5], w8 ; CHECK-CVT-NEXT: csetm w8, hi -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov v4.h[4], w9 -; CHECK-CVT-NEXT: csetm w9, hi ; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: mov v4.h[5], w8 -; CHECK-CVT-NEXT: mov v4.h[6], w9 +; CHECK-CVT-NEXT: mov v2.h[6], w8 ; CHECK-CVT-NEXT: csetm w8, hi -; CHECK-CVT-NEXT: mov v4.h[7], w8 -; CHECK-CVT-NEXT: xtn v0.8b, v4.8h +; CHECK-CVT-NEXT: mov v2.h[7], w8 +; CHECK-CVT-NEXT: xtn v0.8b, v2.8h ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ugt: @@ -856,59 +859,59 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] +; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s5, h0 +; CHECK-CVT-NEXT: mov h6, v1.h[4] +; CHECK-CVT-NEXT: mov h7, v0.h[4] +; CHECK-CVT-NEXT: mov h16, v1.h[5] ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s6, h6 +; CHECK-CVT-NEXT: fcvt s7, h7 ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: fcvt s2, h0 -; CHECK-CVT-NEXT: mov h3, v1.h[2] +; CHECK-CVT-NEXT: mov h2, v1.h[2] +; CHECK-CVT-NEXT: mov h3, v0.h[2] ; CHECK-CVT-NEXT: csetm w8, pl -; CHECK-CVT-NEXT: fcmp s2, s4 -; CHECK-CVT-NEXT: mov h2, v0.h[2] -; CHECK-CVT-NEXT: fcvt s3, h3 +; CHECK-CVT-NEXT: fcmp s5, s4 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: mov h4, v1.h[3] -; CHECK-CVT-NEXT: csetm w9, pl -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v0.h[3] -; CHECK-CVT-NEXT: fcvt s3, h4 -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fmov s4, w9 -; CHECK-CVT-NEXT: csetm w9, pl -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v1.h[4] -; CHECK-CVT-NEXT: mov h3, v0.h[4] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[1], w8 -; CHECK-CVT-NEXT: csetm w8, pl -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[5] -; CHECK-CVT-NEXT: mov h3, v0.h[5] -; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[2], w9 +; CHECK-CVT-NEXT: mov h5, v0.h[3] ; CHECK-CVT-NEXT: csetm w9, pl ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[6] -; CHECK-CVT-NEXT: mov h3, v0.h[6] +; CHECK-CVT-NEXT: fmov s2, w9 +; CHECK-CVT-NEXT: fcvt s3, h4 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v0.h[5] +; CHECK-CVT-NEXT: mov v2.h[1], w8 +; CHECK-CVT-NEXT: csetm w8, pl +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: fcvt s3, h16 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v1.h[6] +; CHECK-CVT-NEXT: mov v2.h[2], w8 ; CHECK-CVT-NEXT: mov h1, v1.h[7] +; CHECK-CVT-NEXT: csetm w8, pl +; CHECK-CVT-NEXT: fcmp s7, s6 +; CHECK-CVT-NEXT: mov h6, v0.h[6] ; CHECK-CVT-NEXT: mov h0, v0.h[7] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[3], w8 +; CHECK-CVT-NEXT: mov v2.h[3], w8 +; CHECK-CVT-NEXT: csetm w8, pl +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: fcvt s3, h5 +; CHECK-CVT-NEXT: fcvt s4, h6 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: mov v2.h[4], w8 +; CHECK-CVT-NEXT: csetm w8, pl +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: mov v2.h[5], w8 ; CHECK-CVT-NEXT: csetm w8, pl -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov v4.h[4], w9 -; CHECK-CVT-NEXT: csetm w9, pl ; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: mov v4.h[5], 
w8 -; CHECK-CVT-NEXT: mov v4.h[6], w9 +; CHECK-CVT-NEXT: mov v2.h[6], w8 ; CHECK-CVT-NEXT: csetm w8, pl -; CHECK-CVT-NEXT: mov v4.h[7], w8 -; CHECK-CVT-NEXT: xtn v0.8b, v4.8h +; CHECK-CVT-NEXT: mov v2.h[7], w8 +; CHECK-CVT-NEXT: xtn v0.8b, v2.8h ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_uge: @@ -926,59 +929,59 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] +; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s5, h0 +; CHECK-CVT-NEXT: mov h6, v1.h[4] +; CHECK-CVT-NEXT: mov h7, v0.h[4] +; CHECK-CVT-NEXT: mov h16, v1.h[5] ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s6, h6 +; CHECK-CVT-NEXT: fcvt s7, h7 ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: fcvt s2, h0 -; CHECK-CVT-NEXT: mov h3, v1.h[2] +; CHECK-CVT-NEXT: mov h2, v1.h[2] +; CHECK-CVT-NEXT: mov h3, v0.h[2] ; CHECK-CVT-NEXT: csetm w8, lt -; CHECK-CVT-NEXT: fcmp s2, s4 -; CHECK-CVT-NEXT: mov h2, v0.h[2] -; CHECK-CVT-NEXT: fcvt s3, h3 +; CHECK-CVT-NEXT: fcmp s5, s4 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: mov h4, v1.h[3] -; CHECK-CVT-NEXT: csetm w9, lt -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v0.h[3] -; CHECK-CVT-NEXT: fcvt s3, h4 -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fmov s4, w9 -; CHECK-CVT-NEXT: csetm w9, lt -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v1.h[4] -; CHECK-CVT-NEXT: mov h3, v0.h[4] -; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[1], w8 -; CHECK-CVT-NEXT: csetm w8, lt -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[5] -; CHECK-CVT-NEXT: mov h3, v0.h[5] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[2], w9 +; CHECK-CVT-NEXT: mov h5, v0.h[3] ; CHECK-CVT-NEXT: csetm w9, lt ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[6] -; CHECK-CVT-NEXT: mov h3, v0.h[6] +; CHECK-CVT-NEXT: fmov s2, w9 +; CHECK-CVT-NEXT: fcvt s3, h4 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v0.h[5] +; CHECK-CVT-NEXT: mov v2.h[1], w8 +; CHECK-CVT-NEXT: csetm w8, lt +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: fcvt s3, h16 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v1.h[6] +; CHECK-CVT-NEXT: mov v2.h[2], w8 ; CHECK-CVT-NEXT: mov h1, v1.h[7] +; CHECK-CVT-NEXT: csetm w8, lt +; CHECK-CVT-NEXT: fcmp s7, s6 +; CHECK-CVT-NEXT: mov h6, v0.h[6] ; CHECK-CVT-NEXT: mov h0, v0.h[7] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[3], w8 +; CHECK-CVT-NEXT: mov v2.h[3], w8 +; CHECK-CVT-NEXT: csetm w8, lt +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: fcvt s3, h5 +; CHECK-CVT-NEXT: fcvt s4, h6 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: mov v2.h[4], w8 +; CHECK-CVT-NEXT: csetm w8, lt +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: mov v2.h[5], w8 ; CHECK-CVT-NEXT: csetm w8, lt -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov v4.h[4], w9 -; CHECK-CVT-NEXT: csetm w9, lt ; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: mov v4.h[5], w8 -; CHECK-CVT-NEXT: mov v4.h[6], w9 +; CHECK-CVT-NEXT: mov v2.h[6], w8 ; CHECK-CVT-NEXT: csetm w8, lt -; CHECK-CVT-NEXT: mov v4.h[7], w8 -; CHECK-CVT-NEXT: xtn v0.8b, v4.8h +; CHECK-CVT-NEXT: mov v2.h[7], w8 +; CHECK-CVT-NEXT: xtn v0.8b, v2.8h ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ult: @@ -996,59 +999,59 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] +; CHECK-CVT-NEXT: fcvt 
s4, h1 +; CHECK-CVT-NEXT: fcvt s5, h0 +; CHECK-CVT-NEXT: mov h6, v1.h[4] +; CHECK-CVT-NEXT: mov h7, v0.h[4] +; CHECK-CVT-NEXT: mov h16, v1.h[5] ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s6, h6 +; CHECK-CVT-NEXT: fcvt s7, h7 ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: fcvt s2, h0 -; CHECK-CVT-NEXT: mov h3, v1.h[2] +; CHECK-CVT-NEXT: mov h2, v1.h[2] +; CHECK-CVT-NEXT: mov h3, v0.h[2] ; CHECK-CVT-NEXT: csetm w8, le -; CHECK-CVT-NEXT: fcmp s2, s4 -; CHECK-CVT-NEXT: mov h2, v0.h[2] -; CHECK-CVT-NEXT: fcvt s3, h3 +; CHECK-CVT-NEXT: fcmp s5, s4 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: mov h4, v1.h[3] -; CHECK-CVT-NEXT: csetm w9, le -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v0.h[3] -; CHECK-CVT-NEXT: fcvt s3, h4 -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fmov s4, w9 -; CHECK-CVT-NEXT: csetm w9, le -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v1.h[4] -; CHECK-CVT-NEXT: mov h3, v0.h[4] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[1], w8 -; CHECK-CVT-NEXT: csetm w8, le -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[5] -; CHECK-CVT-NEXT: mov h3, v0.h[5] -; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[2], w9 +; CHECK-CVT-NEXT: mov h5, v0.h[3] ; CHECK-CVT-NEXT: csetm w9, le ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[6] -; CHECK-CVT-NEXT: mov h3, v0.h[6] +; CHECK-CVT-NEXT: fmov s2, w9 +; CHECK-CVT-NEXT: fcvt s3, h4 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v0.h[5] +; CHECK-CVT-NEXT: mov v2.h[1], w8 +; CHECK-CVT-NEXT: csetm w8, le +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: fcvt s3, h16 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v1.h[6] +; CHECK-CVT-NEXT: mov v2.h[2], w8 ; CHECK-CVT-NEXT: mov h1, v1.h[7] +; CHECK-CVT-NEXT: csetm w8, le +; CHECK-CVT-NEXT: fcmp s7, s6 +; CHECK-CVT-NEXT: mov h6, v0.h[6] ; CHECK-CVT-NEXT: mov h0, v0.h[7] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[3], w8 +; CHECK-CVT-NEXT: mov v2.h[3], w8 +; CHECK-CVT-NEXT: csetm w8, le +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: fcvt s3, h5 +; CHECK-CVT-NEXT: fcvt s4, h6 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: mov v2.h[4], w8 +; CHECK-CVT-NEXT: csetm w8, le +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: mov v2.h[5], w8 ; CHECK-CVT-NEXT: csetm w8, le -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov v4.h[4], w9 -; CHECK-CVT-NEXT: csetm w9, le ; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: mov v4.h[5], w8 -; CHECK-CVT-NEXT: mov v4.h[6], w9 +; CHECK-CVT-NEXT: mov v2.h[6], w8 ; CHECK-CVT-NEXT: csetm w8, le -; CHECK-CVT-NEXT: mov v4.h[7], w8 -; CHECK-CVT-NEXT: xtn v0.8b, v4.8h +; CHECK-CVT-NEXT: mov v2.h[7], w8 +; CHECK-CVT-NEXT: xtn v0.8b, v2.8h ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ule: @@ -1066,59 +1069,59 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] +; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s5, h0 +; CHECK-CVT-NEXT: mov h6, v1.h[4] +; CHECK-CVT-NEXT: mov h7, v0.h[4] +; CHECK-CVT-NEXT: mov h16, v1.h[5] ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s6, h6 +; CHECK-CVT-NEXT: fcvt s7, h7 ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: fcvt s2, h0 -; CHECK-CVT-NEXT: mov h3, v1.h[2] +; CHECK-CVT-NEXT: mov h2, v1.h[2] +; 
CHECK-CVT-NEXT: mov h3, v0.h[2] ; CHECK-CVT-NEXT: csetm w8, vs -; CHECK-CVT-NEXT: fcmp s2, s4 -; CHECK-CVT-NEXT: mov h2, v0.h[2] -; CHECK-CVT-NEXT: fcvt s3, h3 +; CHECK-CVT-NEXT: fcmp s5, s4 ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: mov h4, v1.h[3] -; CHECK-CVT-NEXT: csetm w9, vs -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v0.h[3] -; CHECK-CVT-NEXT: fcvt s3, h4 -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fmov s4, w9 -; CHECK-CVT-NEXT: csetm w9, vs -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v1.h[4] -; CHECK-CVT-NEXT: mov h3, v0.h[4] -; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[1], w8 -; CHECK-CVT-NEXT: csetm w8, vs -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[5] -; CHECK-CVT-NEXT: mov h3, v0.h[5] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[2], w9 +; CHECK-CVT-NEXT: mov h5, v0.h[3] ; CHECK-CVT-NEXT: csetm w9, vs ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov h2, v1.h[6] -; CHECK-CVT-NEXT: mov h3, v0.h[6] +; CHECK-CVT-NEXT: fmov s2, w9 +; CHECK-CVT-NEXT: fcvt s3, h4 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v0.h[5] +; CHECK-CVT-NEXT: mov v2.h[1], w8 +; CHECK-CVT-NEXT: csetm w8, vs +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: fcvt s3, h16 +; CHECK-CVT-NEXT: fcvt s4, h5 +; CHECK-CVT-NEXT: mov h5, v1.h[6] +; CHECK-CVT-NEXT: mov v2.h[2], w8 ; CHECK-CVT-NEXT: mov h1, v1.h[7] +; CHECK-CVT-NEXT: csetm w8, vs +; CHECK-CVT-NEXT: fcmp s7, s6 +; CHECK-CVT-NEXT: mov h6, v0.h[6] ; CHECK-CVT-NEXT: mov h0, v0.h[7] -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: mov v4.h[3], w8 +; CHECK-CVT-NEXT: mov v2.h[3], w8 +; CHECK-CVT-NEXT: csetm w8, vs +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: fcvt s3, h5 +; CHECK-CVT-NEXT: fcvt s4, h6 ; CHECK-CVT-NEXT: fcvt s1, h1 ; CHECK-CVT-NEXT: fcvt s0, h0 +; CHECK-CVT-NEXT: mov v2.h[4], w8 +; CHECK-CVT-NEXT: csetm w8, vs +; CHECK-CVT-NEXT: fcmp s4, s3 +; CHECK-CVT-NEXT: mov v2.h[5], w8 ; CHECK-CVT-NEXT: csetm w8, vs -; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: mov v4.h[4], w9 -; CHECK-CVT-NEXT: csetm w9, vs ; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: mov v4.h[5], w8 -; CHECK-CVT-NEXT: mov v4.h[6], w9 +; CHECK-CVT-NEXT: mov v2.h[6], w8 ; CHECK-CVT-NEXT: csetm w8, vs -; CHECK-CVT-NEXT: mov v4.h[7], w8 -; CHECK-CVT-NEXT: xtn v0.8b, v4.8h +; CHECK-CVT-NEXT: mov v2.h[7], w8 +; CHECK-CVT-NEXT: xtn v0.8b, v2.8h ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_uno: @@ -1138,67 +1141,67 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: mov h2, v1.h[1] ; CHECK-CVT-NEXT: mov h3, v0.h[1] +; CHECK-CVT-NEXT: fcvt s4, h1 +; CHECK-CVT-NEXT: fcvt s5, h0 +; CHECK-CVT-NEXT: mov h6, v0.h[4] ; CHECK-CVT-NEXT: fcvt s2, h2 ; CHECK-CVT-NEXT: fcvt s3, h3 ; CHECK-CVT-NEXT: fcmp s3, s2 -; CHECK-CVT-NEXT: fcvt s4, h1 -; CHECK-CVT-NEXT: fcvt s2, h0 +; CHECK-CVT-NEXT: mov h2, v1.h[2] +; CHECK-CVT-NEXT: mov h3, v0.h[2] ; CHECK-CVT-NEXT: csetm w8, mi -; CHECK-CVT-NEXT: mov h3, v1.h[2] ; CHECK-CVT-NEXT: csinv w8, w8, wzr, le -; CHECK-CVT-NEXT: fcmp s2, s4 -; CHECK-CVT-NEXT: mov h2, v0.h[2] -; CHECK-CVT-NEXT: fcvt s3, h3 -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: csetm w9, mi +; CHECK-CVT-NEXT: fcmp s5, s4 ; CHECK-CVT-NEXT: mov h4, v1.h[3] -; CHECK-CVT-NEXT: csinv w9, w9, wzr, le -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: mov h2, v0.h[3] -; CHECK-CVT-NEXT: fcvt s3, h4 -; CHECK-CVT-NEXT: fcvt s2, h2 -; CHECK-CVT-NEXT: fmov s4, w9 -; CHECK-CVT-NEXT: csetm w9, mi -; 
CHECK-CVT-NEXT: csinv w9, w9, wzr, le
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v1.h[4]
-; CHECK-CVT-NEXT: mov h3, v0.h[4]
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[1], w8
-; CHECK-CVT-NEXT: csetm w8, mi
-; CHECK-CVT-NEXT: csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[5]
-; CHECK-CVT-NEXT: mov h3, v0.h[5]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[2], w9
+; CHECK-CVT-NEXT: mov h5, v0.h[3]
; CHECK-CVT-NEXT: csetm w9, mi
; CHECK-CVT-NEXT: csinv w9, w9, wzr, le
+; CHECK-CVT-NEXT: fcvt s4, h4
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[6]
-; CHECK-CVT-NEXT: mov h3, v0.h[6]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[3], w8
-; CHECK-CVT-NEXT: csetm w8, mi
+; CHECK-CVT-NEXT: fcvt s2, h5
+; CHECK-CVT-NEXT: fmov s3, w9
+; CHECK-CVT-NEXT: mov h5, v1.h[4]
+; CHECK-CVT-NEXT: csetm w9, mi
+; CHECK-CVT-NEXT: mov v3.h[1], w8
+; CHECK-CVT-NEXT: csinv w8, w9, wzr, le
+; CHECK-CVT-NEXT: fcmp s2, s4
+; CHECK-CVT-NEXT: fcvt s2, h5
+; CHECK-CVT-NEXT: fcvt s4, h6
+; CHECK-CVT-NEXT: mov h5, v1.h[5]
+; CHECK-CVT-NEXT: mov h6, v0.h[5]
+; CHECK-CVT-NEXT: csetm w9, mi
+; CHECK-CVT-NEXT: mov v3.h[2], w8
+; CHECK-CVT-NEXT: csinv w8, w9, wzr, le
+; CHECK-CVT-NEXT: fcmp s4, s2
+; CHECK-CVT-NEXT: fcvt s2, h5
+; CHECK-CVT-NEXT: fcvt s4, h6
+; CHECK-CVT-NEXT: mov h5, v1.h[6]
+; CHECK-CVT-NEXT: mov h6, v0.h[6]
+; CHECK-CVT-NEXT: csetm w9, mi
; CHECK-CVT-NEXT: mov h1, v1.h[7]
+; CHECK-CVT-NEXT: mov v3.h[3], w8
+; CHECK-CVT-NEXT: csinv w8, w9, wzr, le
+; CHECK-CVT-NEXT: fcmp s4, s2
+; CHECK-CVT-NEXT: fcvt s2, h5
+; CHECK-CVT-NEXT: fcvt s4, h6
; CHECK-CVT-NEXT: mov h0, v0.h[7]
-; CHECK-CVT-NEXT: mov v4.h[4], w9
-; CHECK-CVT-NEXT: csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT: fcmp s3, s2
; CHECK-CVT-NEXT: fcvt s1, h1
+; CHECK-CVT-NEXT: csetm w9, mi
+; CHECK-CVT-NEXT: mov v3.h[4], w8
+; CHECK-CVT-NEXT: csinv w8, w9, wzr, le
+; CHECK-CVT-NEXT: fcmp s4, s2
; CHECK-CVT-NEXT: fcvt s0, h0
-; CHECK-CVT-NEXT: mov v4.h[5], w8
+; CHECK-CVT-NEXT: mov v3.h[5], w8
; CHECK-CVT-NEXT: csetm w8, mi
; CHECK-CVT-NEXT: csinv w8, w8, wzr, le
; CHECK-CVT-NEXT: fcmp s0, s1
-; CHECK-CVT-NEXT: mov v4.h[6], w8
+; CHECK-CVT-NEXT: mov v3.h[6], w8
; CHECK-CVT-NEXT: csetm w8, mi
; CHECK-CVT-NEXT: csinv w8, w8, wzr, le
-; CHECK-CVT-NEXT: mov v4.h[7], w8
-; CHECK-CVT-NEXT: xtn v0.8b, v4.8h
+; CHECK-CVT-NEXT: mov v3.h[7], w8
+; CHECK-CVT-NEXT: xtn v0.8b, v3.8h
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_one:
@@ -1217,59 +1220,59 @@
; CHECK-CVT: // %bb.0:
; CHECK-CVT-NEXT: mov h2, v1.h[1]
; CHECK-CVT-NEXT: mov h3, v0.h[1]
+; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s5, h0
+; CHECK-CVT-NEXT: mov h6, v1.h[4]
+; CHECK-CVT-NEXT: mov h7, v0.h[4]
+; CHECK-CVT-NEXT: mov h16, v1.h[5]
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s6, h6
+; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: fcvt s2, h0
-; CHECK-CVT-NEXT: mov h3, v1.h[2]
+; CHECK-CVT-NEXT: mov h2, v1.h[2]
+; CHECK-CVT-NEXT: mov h3, v0.h[2]
; CHECK-CVT-NEXT: csetm w8, eq
-; CHECK-CVT-NEXT: fcmp s2, s4
-; CHECK-CVT-NEXT: mov h2, v0.h[2]
-; CHECK-CVT-NEXT: fcvt s3, h3
+; CHECK-CVT-NEXT: fcmp s5, s4
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: mov h4, v1.h[3]
-; CHECK-CVT-NEXT: csetm w9, eq
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v0.h[3]
-; CHECK-CVT-NEXT: fcvt s3, h4
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fmov s4, w9
-; CHECK-CVT-NEXT: csetm w9, eq
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v1.h[4]
-; CHECK-CVT-NEXT: mov h3, v0.h[4]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[1], w8
-; CHECK-CVT-NEXT: csetm w8, eq
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[5]
-; CHECK-CVT-NEXT: mov h3, v0.h[5]
-; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[2], w9
+; CHECK-CVT-NEXT: mov h5, v0.h[3]
; CHECK-CVT-NEXT: csetm w9, eq
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[6]
-; CHECK-CVT-NEXT: mov h3, v0.h[6]
+; CHECK-CVT-NEXT: fmov s2, w9
+; CHECK-CVT-NEXT: fcvt s3, h4
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v0.h[5]
+; CHECK-CVT-NEXT: mov v2.h[1], w8
+; CHECK-CVT-NEXT: csetm w8, eq
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h16
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v1.h[6]
+; CHECK-CVT-NEXT: mov v2.h[2], w8
; CHECK-CVT-NEXT: mov h1, v1.h[7]
+; CHECK-CVT-NEXT: csetm w8, eq
+; CHECK-CVT-NEXT: fcmp s7, s6
+; CHECK-CVT-NEXT: mov h6, v0.h[6]
; CHECK-CVT-NEXT: mov h0, v0.h[7]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[3], w8
+; CHECK-CVT-NEXT: mov v2.h[3], w8
+; CHECK-CVT-NEXT: csetm w8, eq
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h5
+; CHECK-CVT-NEXT: fcvt s4, h6
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fcvt s0, h0
+; CHECK-CVT-NEXT: mov v2.h[4], w8
+; CHECK-CVT-NEXT: csetm w8, eq
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: mov v2.h[5], w8
; CHECK-CVT-NEXT: csetm w8, eq
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov v4.h[4], w9
-; CHECK-CVT-NEXT: csetm w9, eq
; CHECK-CVT-NEXT: fcmp s0, s1
-; CHECK-CVT-NEXT: mov v4.h[5], w8
-; CHECK-CVT-NEXT: mov v4.h[6], w9
+; CHECK-CVT-NEXT: mov v2.h[6], w8
; CHECK-CVT-NEXT: csetm w8, eq
-; CHECK-CVT-NEXT: mov v4.h[7], w8
-; CHECK-CVT-NEXT: xtn v0.8b, v4.8h
+; CHECK-CVT-NEXT: mov v2.h[7], w8
+; CHECK-CVT-NEXT: xtn v0.8b, v2.8h
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_oeq:
@@ -1286,59 +1289,59 @@
; CHECK-CVT: // %bb.0:
; CHECK-CVT-NEXT: mov h2, v1.h[1]
; CHECK-CVT-NEXT: mov h3, v0.h[1]
+; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s5, h0
+; CHECK-CVT-NEXT: mov h6, v1.h[4]
+; CHECK-CVT-NEXT: mov h7, v0.h[4]
+; CHECK-CVT-NEXT: mov h16, v1.h[5]
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s6, h6
+; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: fcvt s2, h0
-; CHECK-CVT-NEXT: mov h3, v1.h[2]
+; CHECK-CVT-NEXT: mov h2, v1.h[2]
+; CHECK-CVT-NEXT: mov h3, v0.h[2]
; CHECK-CVT-NEXT: csetm w8, gt
-; CHECK-CVT-NEXT: fcmp s2, s4
-; CHECK-CVT-NEXT: mov h2, v0.h[2]
-; CHECK-CVT-NEXT: fcvt s3, h3
+; CHECK-CVT-NEXT: fcmp s5, s4
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: mov h4, v1.h[3]
-; CHECK-CVT-NEXT: csetm w9, gt
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v0.h[3]
-; CHECK-CVT-NEXT: fcvt s3, h4
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fmov s4, w9
-; CHECK-CVT-NEXT: csetm w9, gt
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v1.h[4]
-; CHECK-CVT-NEXT: mov h3, v0.h[4]
-; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[1], w8
-; CHECK-CVT-NEXT: csetm w8, gt
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[5]
-; CHECK-CVT-NEXT: mov h3, v0.h[5]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[2], w9
+; CHECK-CVT-NEXT: mov h5, v0.h[3]
; CHECK-CVT-NEXT: csetm w9, gt
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[6]
-; CHECK-CVT-NEXT: mov h3, v0.h[6]
+; CHECK-CVT-NEXT: fmov s2, w9
+; CHECK-CVT-NEXT: fcvt s3, h4
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v0.h[5]
+; CHECK-CVT-NEXT: mov v2.h[1], w8
+; CHECK-CVT-NEXT: csetm w8, gt
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h16
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v1.h[6]
+; CHECK-CVT-NEXT: mov v2.h[2], w8
; CHECK-CVT-NEXT: mov h1, v1.h[7]
+; CHECK-CVT-NEXT: csetm w8, gt
+; CHECK-CVT-NEXT: fcmp s7, s6
+; CHECK-CVT-NEXT: mov h6, v0.h[6]
; CHECK-CVT-NEXT: mov h0, v0.h[7]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[3], w8
+; CHECK-CVT-NEXT: mov v2.h[3], w8
+; CHECK-CVT-NEXT: csetm w8, gt
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h5
+; CHECK-CVT-NEXT: fcvt s4, h6
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fcvt s0, h0
+; CHECK-CVT-NEXT: mov v2.h[4], w8
+; CHECK-CVT-NEXT: csetm w8, gt
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: mov v2.h[5], w8
; CHECK-CVT-NEXT: csetm w8, gt
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov v4.h[4], w9
-; CHECK-CVT-NEXT: csetm w9, gt
; CHECK-CVT-NEXT: fcmp s0, s1
-; CHECK-CVT-NEXT: mov v4.h[5], w8
-; CHECK-CVT-NEXT: mov v4.h[6], w9
+; CHECK-CVT-NEXT: mov v2.h[6], w8
; CHECK-CVT-NEXT: csetm w8, gt
-; CHECK-CVT-NEXT: mov v4.h[7], w8
-; CHECK-CVT-NEXT: xtn v0.8b, v4.8h
+; CHECK-CVT-NEXT: mov v2.h[7], w8
+; CHECK-CVT-NEXT: xtn v0.8b, v2.8h
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ogt:
@@ -1355,59 +1358,59 @@
; CHECK-CVT: // %bb.0:
; CHECK-CVT-NEXT: mov h2, v1.h[1]
; CHECK-CVT-NEXT: mov h3, v0.h[1]
+; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s5, h0
+; CHECK-CVT-NEXT: mov h6, v1.h[4]
+; CHECK-CVT-NEXT: mov h7, v0.h[4]
+; CHECK-CVT-NEXT: mov h16, v1.h[5]
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s6, h6
+; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: fcvt s2, h0
-; CHECK-CVT-NEXT: mov h3, v1.h[2]
+; CHECK-CVT-NEXT: mov h2, v1.h[2]
+; CHECK-CVT-NEXT: mov h3, v0.h[2]
; CHECK-CVT-NEXT: csetm w8, ge
-; CHECK-CVT-NEXT: fcmp s2, s4
-; CHECK-CVT-NEXT: mov h2, v0.h[2]
-; CHECK-CVT-NEXT: fcvt s3, h3
+; CHECK-CVT-NEXT: fcmp s5, s4
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: mov h4, v1.h[3]
-; CHECK-CVT-NEXT: csetm w9, ge
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v0.h[3]
-; CHECK-CVT-NEXT: fcvt s3, h4
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fmov s4, w9
-; CHECK-CVT-NEXT: csetm w9, ge
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v1.h[4]
-; CHECK-CVT-NEXT: mov h3, v0.h[4]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[1], w8
-; CHECK-CVT-NEXT: csetm w8, ge
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[5]
-; CHECK-CVT-NEXT: mov h3, v0.h[5]
-; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[2], w9
+; CHECK-CVT-NEXT: mov h5, v0.h[3]
; CHECK-CVT-NEXT: csetm w9, ge
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[6]
-; CHECK-CVT-NEXT: mov h3, v0.h[6]
+; CHECK-CVT-NEXT: fmov s2, w9
+; CHECK-CVT-NEXT: fcvt s3, h4
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v0.h[5]
+; CHECK-CVT-NEXT: mov v2.h[1], w8
+; CHECK-CVT-NEXT: csetm w8, ge
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h16
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v1.h[6]
+; CHECK-CVT-NEXT: mov v2.h[2], w8
; CHECK-CVT-NEXT: mov h1, v1.h[7]
+; CHECK-CVT-NEXT: csetm w8, ge
+; CHECK-CVT-NEXT: fcmp s7, s6
+; CHECK-CVT-NEXT: mov h6, v0.h[6]
; CHECK-CVT-NEXT: mov h0, v0.h[7]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[3], w8
+; CHECK-CVT-NEXT: mov v2.h[3], w8
+; CHECK-CVT-NEXT: csetm w8, ge
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h5
+; CHECK-CVT-NEXT: fcvt s4, h6
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fcvt s0, h0
+; CHECK-CVT-NEXT: mov v2.h[4], w8
+; CHECK-CVT-NEXT: csetm w8, ge
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: mov v2.h[5], w8
; CHECK-CVT-NEXT: csetm w8, ge
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov v4.h[4], w9
-; CHECK-CVT-NEXT: csetm w9, ge
; CHECK-CVT-NEXT: fcmp s0, s1
-; CHECK-CVT-NEXT: mov v4.h[5], w8
-; CHECK-CVT-NEXT: mov v4.h[6], w9
+; CHECK-CVT-NEXT: mov v2.h[6], w8
; CHECK-CVT-NEXT: csetm w8, ge
-; CHECK-CVT-NEXT: mov v4.h[7], w8
-; CHECK-CVT-NEXT: xtn v0.8b, v4.8h
+; CHECK-CVT-NEXT: mov v2.h[7], w8
+; CHECK-CVT-NEXT: xtn v0.8b, v2.8h
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_oge:
@@ -1424,59 +1427,59 @@
; CHECK-CVT: // %bb.0:
; CHECK-CVT-NEXT: mov h2, v1.h[1]
; CHECK-CVT-NEXT: mov h3, v0.h[1]
+; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s5, h0
+; CHECK-CVT-NEXT: mov h6, v1.h[4]
+; CHECK-CVT-NEXT: mov h7, v0.h[4]
+; CHECK-CVT-NEXT: mov h16, v1.h[5]
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s6, h6
+; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: fcvt s2, h0
-; CHECK-CVT-NEXT: mov h3, v1.h[2]
+; CHECK-CVT-NEXT: mov h2, v1.h[2]
+; CHECK-CVT-NEXT: mov h3, v0.h[2]
; CHECK-CVT-NEXT: csetm w8, mi
-; CHECK-CVT-NEXT: fcmp s2, s4
-; CHECK-CVT-NEXT: mov h2, v0.h[2]
-; CHECK-CVT-NEXT: fcvt s3, h3
+; CHECK-CVT-NEXT: fcmp s5, s4
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: mov h4, v1.h[3]
-; CHECK-CVT-NEXT: csetm w9, mi
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v0.h[3]
-; CHECK-CVT-NEXT: fcvt s3, h4
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fmov s4, w9
-; CHECK-CVT-NEXT: csetm w9, mi
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v1.h[4]
-; CHECK-CVT-NEXT: mov h3, v0.h[4]
-; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[1], w8
-; CHECK-CVT-NEXT: csetm w8, mi
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[5]
-; CHECK-CVT-NEXT: mov h3, v0.h[5]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[2], w9
+; CHECK-CVT-NEXT: mov h5, v0.h[3]
; CHECK-CVT-NEXT: csetm w9, mi
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[6]
-; CHECK-CVT-NEXT: mov h3, v0.h[6]
+; CHECK-CVT-NEXT: fmov s2, w9
+; CHECK-CVT-NEXT: fcvt s3, h4
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v0.h[5]
+; CHECK-CVT-NEXT: mov v2.h[1], w8
+; CHECK-CVT-NEXT: csetm w8, mi
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h16
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v1.h[6]
+; CHECK-CVT-NEXT: mov v2.h[2], w8
; CHECK-CVT-NEXT: mov h1, v1.h[7]
+; CHECK-CVT-NEXT: csetm w8, mi
+; CHECK-CVT-NEXT: fcmp s7, s6
+; CHECK-CVT-NEXT: mov h6, v0.h[6]
; CHECK-CVT-NEXT: mov h0, v0.h[7]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[3], w8
+; CHECK-CVT-NEXT: mov v2.h[3], w8
+; CHECK-CVT-NEXT: csetm w8, mi
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h5
+; CHECK-CVT-NEXT: fcvt s4, h6
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fcvt s0, h0
+; CHECK-CVT-NEXT: mov v2.h[4], w8
+; CHECK-CVT-NEXT: csetm w8, mi
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: mov v2.h[5], w8
; CHECK-CVT-NEXT: csetm w8, mi
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov v4.h[4], w9
-; CHECK-CVT-NEXT: csetm w9, mi
; CHECK-CVT-NEXT: fcmp s0, s1
-; CHECK-CVT-NEXT: mov v4.h[5], w8
-; CHECK-CVT-NEXT: mov v4.h[6], w9
+; CHECK-CVT-NEXT: mov v2.h[6], w8
; CHECK-CVT-NEXT: csetm w8, mi
-; CHECK-CVT-NEXT: mov v4.h[7], w8
-; CHECK-CVT-NEXT: xtn v0.8b, v4.8h
+; CHECK-CVT-NEXT: mov v2.h[7], w8
+; CHECK-CVT-NEXT: xtn v0.8b, v2.8h
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_olt:
@@ -1493,59 +1496,59 @@
; CHECK-CVT: // %bb.0:
; CHECK-CVT-NEXT: mov h2, v1.h[1]
; CHECK-CVT-NEXT: mov h3, v0.h[1]
+; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s5, h0
+; CHECK-CVT-NEXT: mov h6, v1.h[4]
+; CHECK-CVT-NEXT: mov h7, v0.h[4]
+; CHECK-CVT-NEXT: mov h16, v1.h[5]
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s6, h6
+; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: fcvt s2, h0
-; CHECK-CVT-NEXT: mov h3, v1.h[2]
+; CHECK-CVT-NEXT: mov h2, v1.h[2]
+; CHECK-CVT-NEXT: mov h3, v0.h[2]
; CHECK-CVT-NEXT: csetm w8, ls
-; CHECK-CVT-NEXT: fcmp s2, s4
-; CHECK-CVT-NEXT: mov h2, v0.h[2]
-; CHECK-CVT-NEXT: fcvt s3, h3
+; CHECK-CVT-NEXT: fcmp s5, s4
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: mov h4, v1.h[3]
-; CHECK-CVT-NEXT: csetm w9, ls
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v0.h[3]
-; CHECK-CVT-NEXT: fcvt s3, h4
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fmov s4, w9
-; CHECK-CVT-NEXT: csetm w9, ls
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v1.h[4]
-; CHECK-CVT-NEXT: mov h3, v0.h[4]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[1], w8
-; CHECK-CVT-NEXT: csetm w8, ls
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[5]
-; CHECK-CVT-NEXT: mov h3, v0.h[5]
-; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[2], w9
+; CHECK-CVT-NEXT: mov h5, v0.h[3]
; CHECK-CVT-NEXT: csetm w9, ls
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[6]
-; CHECK-CVT-NEXT: mov h3, v0.h[6]
+; CHECK-CVT-NEXT: fmov s2, w9
+; CHECK-CVT-NEXT: fcvt s3, h4
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v0.h[5]
+; CHECK-CVT-NEXT: mov v2.h[1], w8
+; CHECK-CVT-NEXT: csetm w8, ls
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h16
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v1.h[6]
+; CHECK-CVT-NEXT: mov v2.h[2], w8
; CHECK-CVT-NEXT: mov h1, v1.h[7]
+; CHECK-CVT-NEXT: csetm w8, ls
+; CHECK-CVT-NEXT: fcmp s7, s6
+; CHECK-CVT-NEXT: mov h6, v0.h[6]
; CHECK-CVT-NEXT: mov h0, v0.h[7]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[3], w8
+; CHECK-CVT-NEXT: mov v2.h[3], w8
+; CHECK-CVT-NEXT: csetm w8, ls
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h5
+; CHECK-CVT-NEXT: fcvt s4, h6
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fcvt s0, h0
+; CHECK-CVT-NEXT: mov v2.h[4], w8
+; CHECK-CVT-NEXT: csetm w8, ls
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: mov v2.h[5], w8
; CHECK-CVT-NEXT: csetm w8, ls
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov v4.h[4], w9
-; CHECK-CVT-NEXT: csetm w9, ls
; CHECK-CVT-NEXT: fcmp s0, s1
-; CHECK-CVT-NEXT: mov v4.h[5], w8
-; CHECK-CVT-NEXT: mov v4.h[6], w9
+; CHECK-CVT-NEXT: mov v2.h[6], w8
; CHECK-CVT-NEXT: csetm w8, ls
-; CHECK-CVT-NEXT: mov v4.h[7], w8
-; CHECK-CVT-NEXT: xtn v0.8b, v4.8h
+; CHECK-CVT-NEXT: mov v2.h[7], w8
+; CHECK-CVT-NEXT: xtn v0.8b, v2.8h
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ole:
@@ -1562,59 +1565,59 @@
; CHECK-CVT: // %bb.0:
; CHECK-CVT-NEXT: mov h2, v1.h[1]
; CHECK-CVT-NEXT: mov h3, v0.h[1]
+; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s5, h0
+; CHECK-CVT-NEXT: mov h6, v1.h[4]
+; CHECK-CVT-NEXT: mov h7, v0.h[4]
+; CHECK-CVT-NEXT: mov h16, v1.h[5]
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: fcvt s4, h1
+; CHECK-CVT-NEXT: fcvt s6, h6
+; CHECK-CVT-NEXT: fcvt s7, h7
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: fcvt s2, h0
-; CHECK-CVT-NEXT: mov h3, v1.h[2]
+; CHECK-CVT-NEXT: mov h2, v1.h[2]
+; CHECK-CVT-NEXT: mov h3, v0.h[2]
; CHECK-CVT-NEXT: csetm w8, vc
-; CHECK-CVT-NEXT: fcmp s2, s4
-; CHECK-CVT-NEXT: mov h2, v0.h[2]
-; CHECK-CVT-NEXT: fcvt s3, h3
+; CHECK-CVT-NEXT: fcmp s5, s4
; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: mov h4, v1.h[3]
-; CHECK-CVT-NEXT: csetm w9, vc
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v0.h[3]
-; CHECK-CVT-NEXT: fcvt s3, h4
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fmov s4, w9
-; CHECK-CVT-NEXT: csetm w9, vc
-; CHECK-CVT-NEXT: fcmp s2, s3
-; CHECK-CVT-NEXT: mov h2, v1.h[4]
-; CHECK-CVT-NEXT: mov h3, v0.h[4]
-; CHECK-CVT-NEXT: fcvt s2, h2
; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[1], w8
-; CHECK-CVT-NEXT: csetm w8, vc
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[5]
-; CHECK-CVT-NEXT: mov h3, v0.h[5]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[2], w9
+; CHECK-CVT-NEXT: mov h5, v0.h[3]
; CHECK-CVT-NEXT: csetm w9, vc
; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov h2, v1.h[6]
-; CHECK-CVT-NEXT: mov h3, v0.h[6]
+; CHECK-CVT-NEXT: fmov s2, w9
+; CHECK-CVT-NEXT: fcvt s3, h4
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v0.h[5]
+; CHECK-CVT-NEXT: mov v2.h[1], w8
+; CHECK-CVT-NEXT: csetm w8, vc
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h16
+; CHECK-CVT-NEXT: fcvt s4, h5
+; CHECK-CVT-NEXT: mov h5, v1.h[6]
+; CHECK-CVT-NEXT: mov v2.h[2], w8
; CHECK-CVT-NEXT: mov h1, v1.h[7]
+; CHECK-CVT-NEXT: csetm w8, vc
+; CHECK-CVT-NEXT: fcmp s7, s6
+; CHECK-CVT-NEXT: mov h6, v0.h[6]
; CHECK-CVT-NEXT: mov h0, v0.h[7]
-; CHECK-CVT-NEXT: fcvt s2, h2
-; CHECK-CVT-NEXT: fcvt s3, h3
-; CHECK-CVT-NEXT: mov v4.h[3], w8
+; CHECK-CVT-NEXT: mov v2.h[3], w8
+; CHECK-CVT-NEXT: csetm w8, vc
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: fcvt s3, h5
+; CHECK-CVT-NEXT: fcvt s4, h6
; CHECK-CVT-NEXT: fcvt s1, h1
; CHECK-CVT-NEXT: fcvt s0, h0
+; CHECK-CVT-NEXT: mov v2.h[4], w8
+; CHECK-CVT-NEXT: csetm w8, vc
+; CHECK-CVT-NEXT: fcmp s4, s3
+; CHECK-CVT-NEXT: mov v2.h[5], w8
; CHECK-CVT-NEXT: csetm w8, vc
-; CHECK-CVT-NEXT: fcmp s3, s2
-; CHECK-CVT-NEXT: mov v4.h[4], w9
-; CHECK-CVT-NEXT: csetm w9, vc
; CHECK-CVT-NEXT: fcmp s0, s1
-; CHECK-CVT-NEXT: mov v4.h[5], w8
-; CHECK-CVT-NEXT: mov v4.h[6], w9
+; CHECK-CVT-NEXT: mov v2.h[6], w8
; CHECK-CVT-NEXT: csetm w8, vc
-; CHECK-CVT-NEXT: mov v4.h[7], w8
-; CHECK-CVT-NEXT: xtn v0.8b, v4.8h
+; CHECK-CVT-NEXT: mov v2.h[7], w8
+; CHECK-CVT-NEXT: xtn v0.8b, v2.8h
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ord:
diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
--- a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
@@ -312,8 +312,8 @@
; CHECK-LABEL: set_lane_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.h[2], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
@@ -22,10 +22,10 @@
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, #-1.00000000
; CHECK-NEXT: movi d2, #0000000000000000
+; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: fmaxnm s1, s0, s1
; CHECK-NEXT: fminnm s1, s1, s2
; CHECK-NEXT: fcvtzs w8, s1
-; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel w8, wzr, w8, vs
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
@@ -37,13 +37,13 @@
; CHECK-LABEL: test_signed_i8_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-1023410176
-; CHECK-NEXT: mov w9, #1123942400
+; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #1123942400
; CHECK-NEXT: fmaxnm s1, s0, s1
-; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: fminnm s1, s1, s2
; CHECK-NEXT: fcvtzs w8, s1
-; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i8 @llvm.fptosi.sat.i8.f32(float %f)
@@ -54,14 +54,14 @@
; CHECK-LABEL: test_signed_i13_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-981467136
-; CHECK-NEXT: mov w9, #61440
-; CHECK-NEXT: movk w9, #17791, lsl #16
+; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #61440
+; CHECK-NEXT: movk w8, #17791, lsl #16
; CHECK-NEXT: fmaxnm s1, s0, s1
-; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: fminnm s1, s1, s2
; CHECK-NEXT: fcvtzs w8, s1
-; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i13 @llvm.fptosi.sat.i13.f32(float %f)
@@ -72,14 +72,14 @@
; CHECK-LABEL: test_signed_i16_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-956301312
-; CHECK-NEXT: mov w9, #65024
-; CHECK-NEXT: movk w9, #18175, lsl #16
+; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #65024
+; CHECK-NEXT: movk w8, #18175, lsl #16
; CHECK-NEXT: fmaxnm s1, s0, s1
-; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: fminnm s1, s1, s2
; CHECK-NEXT: fcvtzs w8, s1
-; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i16 @llvm.fptosi.sat.i16.f32(float %f)
@@ -90,14 +90,14 @@
; CHECK-LABEL: test_signed_i19_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-931135488
-; CHECK-NEXT: mov w9, #65472
-; CHECK-NEXT: movk w9, #18559, lsl #16
+; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #65472
+; CHECK-NEXT: movk w8, #18559, lsl #16
; CHECK-NEXT: fmaxnm s1, s0, s1
-; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: fminnm s1, s1, s2
; CHECK-NEXT: fcvtzs w8, s1
-; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i19 @llvm.fptosi.sat.i19.f32(float %f)
@@ -116,17 +116,17 @@
define i50 @test_signed_i50_f32(float %f) nounwind {
; CHECK-LABEL: test_signed_i50_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #-671088640
-; CHECK-NEXT: mov w11, #1476395007
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: fcvtzs x8, s0
-; CHECK-NEXT: mov x10, #-562949953421312
+; CHECK-NEXT: mov w8, #-671088640
+; CHECK-NEXT: fcvtzs x9, s0
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #1476395007
; CHECK-NEXT: fcmp s0, s1
-; CHECK-NEXT: fmov s1, w11
-; CHECK-NEXT: mov x12, #562949953421311
-; CHECK-NEXT: csel x8, x10, x8, lt
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov x8, #-562949953421312
+; CHECK-NEXT: csel x8, x8, x9, lt
; CHECK-NEXT: fcmp s0, s1
-; CHECK-NEXT: csel x8, x12, x8, gt
+; CHECK-NEXT: mov x9, #562949953421311
+; CHECK-NEXT: csel x8, x9, x8, gt
; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel x0, xzr, x8, vs
; CHECK-NEXT: ret
@@ -151,18 +151,18 @@
; CHECK-NEXT: fmov s8, s0
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: mov w8, #-251658240
+; CHECK-NEXT: mov x10, #34359738367
+; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov w8, #1895825407
; CHECK-NEXT: fcmp s8, s0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov x8, #-34359738368
-; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: csel x8, x8, x1, lt
-; CHECK-NEXT: mov x9, #34359738367
-; CHECK-NEXT: csel x10, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s0
-; CHECK-NEXT: csel x8, x9, x8, gt
-; CHECK-NEXT: csinv x9, x10, xzr, le
+; CHECK-NEXT: csel x8, x10, x8, gt
+; CHECK-NEXT: csinv x9, x9, xzr, le
; CHECK-NEXT: fcmp s8, s8
; CHECK-NEXT: csel x0, xzr, x9, vs
; CHECK-NEXT: csel x1, xzr, x8, vs
@@ -180,18 +180,18 @@
; CHECK-NEXT: fmov s8, s0
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: mov w8, #-16777216
+; CHECK-NEXT: mov x10, #9223372036854775807
+; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov w8, #2130706431
; CHECK-NEXT: fcmp s8, s0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov x8, #-9223372036854775808
-; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: csel x8, x8, x1, lt
-; CHECK-NEXT: mov x9, #9223372036854775807
-; CHECK-NEXT: csel x10, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s0
-; CHECK-NEXT: csel x8, x9, x8, gt
-; CHECK-NEXT: csinv x9, x10, xzr, le
+; CHECK-NEXT: csel x8, x10, x8, gt
+; CHECK-NEXT: csinv x9, x9, xzr, le
; CHECK-NEXT: fcmp s8, s8
; CHECK-NEXT: csel x0, xzr, x9, vs
; CHECK-NEXT: csel x1, xzr, x8, vs
@@ -221,10 +221,10 @@
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d1, #-1.00000000
; CHECK-NEXT: movi d2, #0000000000000000
+; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: fmaxnm d1, d0, d1
; CHECK-NEXT: fminnm d1, d1, d2
; CHECK-NEXT: fcvtzs w8, d1
-; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: csel w8, wzr, w8, vs
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
@@ -236,14 +236,14 @@
; CHECK-LABEL: test_signed_i8_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-4584664420663164928
-; CHECK-NEXT: mov x9, #211106232532992
-; CHECK-NEXT: movk x9, #16479, lsl #48
+; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: mov x8, #211106232532992
+; CHECK-NEXT: movk x8, #16479, lsl #48
; CHECK-NEXT: fmaxnm d1, d0, d1
-; CHECK-NEXT: fmov d2, x9
+; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: fminnm d1, d1, d2
; CHECK-NEXT: fcvtzs w8, d1
-; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i8 @llvm.fptosi.sat.i8.f64(double %f)
@@ -254,14 +254,14 @@
; CHECK-LABEL: test_signed_i13_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-4562146422526312448
-; CHECK-NEXT: mov x9, #279275953455104
-; CHECK-NEXT: movk x9, #16559, lsl #48
+; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: mov x8, #279275953455104
+; CHECK-NEXT: movk x8, #16559, lsl #48
; CHECK-NEXT: fmaxnm d1, d0, d1
-; CHECK-NEXT: fmov d2, x9
+; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: fminnm d1, d1, d2
; CHECK-NEXT: fcvtzs w8, d1
-; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i13 @llvm.fptosi.sat.i13.f64(double %f)
@@ -272,14 +272,14 @@
; CHECK-LABEL: test_signed_i16_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-4548635623644200960
-; CHECK-NEXT: mov x9, #281200098803712
-; CHECK-NEXT: movk x9, #16607, lsl #48
+; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: mov x8, #281200098803712
+; CHECK-NEXT: movk x8, #16607, lsl #48
; CHECK-NEXT: fmaxnm d1, d0, d1
-; CHECK-NEXT: fmov d2, x9
+; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: fminnm d1, d1, d2
; CHECK-NEXT: fcvtzs w8, d1
-; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i16 @llvm.fptosi.sat.i16.f64(double %f)
@@ -290,14 +290,14 @@
; CHECK-LABEL: test_signed_i19_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-4535124824762089472
-; CHECK-NEXT: mov x9, #281440616972288
-; CHECK-NEXT: movk x9, #16655, lsl #48
+; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: mov x8, #281440616972288
+; CHECK-NEXT: movk x8, #16655, lsl #48
; CHECK-NEXT: fmaxnm d1, d0, d1
-; CHECK-NEXT: fmov d2, x9
+; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: fminnm d1, d1, d2
; CHECK-NEXT: fcvtzs w8, d1
-; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i19 @llvm.fptosi.sat.i19.f64(double %f)
@@ -317,14 +317,14 @@
; CHECK-LABEL: test_signed_i50_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-4395513236313604096
-; CHECK-NEXT: mov x9, #-16
-; CHECK-NEXT: movk x9, #17151, lsl #48
+; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: mov x8, #-16
+; CHECK-NEXT: movk x8, #17151, lsl #48
; CHECK-NEXT: fmaxnm d1, d0, d1
-; CHECK-NEXT: fmov d2, x9
+; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: fminnm d1, d1, d2
; CHECK-NEXT: fcvtzs x8, d1
-; CHECK-NEXT: fcmp d0, d0
; CHECK-NEXT: csel x0, xzr, x8, vs
; CHECK-NEXT: ret
%x = call i50 @llvm.fptosi.sat.i50.f64(double %f)
@@ -348,18 +348,18 @@
; CHECK-NEXT: fmov d8, d0
; CHECK-NEXT: bl __fixdfti
; CHECK-NEXT: mov x8, #-4170333254945079296
+; CHECK-NEXT: mov x10, #34359738367
+; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: mov x8, #5053038781909696511
; CHECK-NEXT: fcmp d8, d0
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: mov x8, #-34359738368
-; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: csel x8, x8, x1, lt
-; CHECK-NEXT: mov x9, #34359738367
-; CHECK-NEXT: csel x10, xzr, x0, lt
; CHECK-NEXT: fcmp d8, d0
-; CHECK-NEXT: csel x8, x9, x8, gt
-; CHECK-NEXT: csinv x9, x10, xzr, le
+; CHECK-NEXT: csel x8, x10, x8, gt
+; CHECK-NEXT: csinv x9, x9, xzr, le
; CHECK-NEXT: fcmp d8, d8
; CHECK-NEXT: csel x0, xzr, x9, vs
; CHECK-NEXT: csel x1, xzr, x8, vs
@@ -377,18 +377,18 @@
; CHECK-NEXT: fmov d8, d0
; CHECK-NEXT: bl __fixdfti
; CHECK-NEXT: mov x8, #-4044232465378705408
+; CHECK-NEXT: mov x10, #9223372036854775807
+; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: mov x8, #5179139571476070399
; CHECK-NEXT: fcmp d8, d0
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: mov x8, #-9223372036854775808
-; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: csel x8, x8, x1, lt
-; CHECK-NEXT: mov x9, #9223372036854775807
-; CHECK-NEXT: csel x10, xzr, x0, lt
; CHECK-NEXT: fcmp d8, d0
-; CHECK-NEXT: csel x8, x9, x8, gt
-; CHECK-NEXT: csinv x9, x10, xzr, le
+; CHECK-NEXT: csel x8, x10, x8, gt
+; CHECK-NEXT: csinv x9, x9, xzr, le
; CHECK-NEXT: fcmp d8, d8
; CHECK-NEXT: csel x0, xzr, x9, vs
; CHECK-NEXT: csel x1, xzr, x8, vs
@@ -420,9 +420,9 @@
; CHECK-NEXT: fmov s1, #-1.00000000
; CHECK-NEXT: movi d2, #0000000000000000
; CHECK-NEXT: fmaxnm s1, s0, s1
+; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: fminnm s1, s1, s2
; CHECK-NEXT: fcvtzs w8, s1
-; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel w8, wzr, w8, vs
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
@@ -435,13 +435,13 @@
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-1023410176
; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: mov w9, #1123942400
; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #1123942400
+; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: fmaxnm s1, s0, s1
-; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: fminnm s1, s1, s2
; CHECK-NEXT: fcvtzs w8, s1
-; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i8 @llvm.fptosi.sat.i8.f16(half %f)
@@ -452,15 +452,15 @@
; CHECK-LABEL: test_signed_i13_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-981467136
-; CHECK-NEXT: mov w9, #61440
; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: movk w9, #17791, lsl #16
; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #61440
+; CHECK-NEXT: movk w8, #17791, lsl #16
+; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: fmaxnm s1, s0, s1
-; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: fminnm s1, s1, s2
; CHECK-NEXT: fcvtzs w8, s1
-; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i13 @llvm.fptosi.sat.i13.f16(half %f)
@@ -471,15 +471,15 @@
; CHECK-LABEL: test_signed_i16_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-956301312
-; CHECK-NEXT: mov w9, #65024
; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: movk w9, #18175, lsl #16
; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #65024
+; CHECK-NEXT: movk w8, #18175, lsl #16
+; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: fmaxnm s1, s0, s1
-; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: fminnm s1, s1, s2
; CHECK-NEXT: fcvtzs w8, s1
-; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i16 @llvm.fptosi.sat.i16.f16(half %f)
@@ -490,15 +490,15 @@
; CHECK-LABEL: test_signed_i19_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-931135488
-; CHECK-NEXT: mov w9, #65472
; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: movk w9, #18559, lsl #16
; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #65472
+; CHECK-NEXT: movk w8, #18559, lsl #16
+; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: fmaxnm s1, s0, s1
-; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: fminnm s1, s1, s2
; CHECK-NEXT: fcvtzs w8, s1
-; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel w0, wzr, w8, vs
; CHECK-NEXT: ret
%x = call i19 @llvm.fptosi.sat.i19.f16(half %f)
@@ -525,15 +525,15 @@
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-671088640
; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov w8, #1476395007
-; CHECK-NEXT: mov x9, #-562949953421312
-; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: mov w9, #1476395007
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: fcvtzs x8, s0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: mov x9, #-562949953421312
; CHECK-NEXT: csel x8, x9, x8, lt
-; CHECK-NEXT: mov x9, #562949953421311
; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: mov x9, #562949953421311
; CHECK-NEXT: csel x8, x9, x8, gt
; CHECK-NEXT: fcmp s0, s0
; CHECK-NEXT: csel x0, xzr, x8, vs
@@ -562,22 +562,22 @@
; CHECK: // %bb.0:
; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: mov w8, #-251658240
+; CHECK-NEXT: mov x10, #34359738367
+; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov w8, #1895825407
; CHECK-NEXT: fcmp s8, s0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov x8, #-34359738368
-; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: csel x8, x8, x1, lt
-; CHECK-NEXT: mov x9, #34359738367
-; CHECK-NEXT: csel x10, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s0
-; CHECK-NEXT: csel x8, x9, x8, gt
-; CHECK-NEXT: csinv x9, x10, xzr, le
+; CHECK-NEXT: csel x8, x10, x8, gt
+; CHECK-NEXT: csinv x9, x9, xzr, le
; CHECK-NEXT: fcmp s8, s8
; CHECK-NEXT: csel x0, xzr, x9, vs
; CHECK-NEXT: csel x1, xzr, x8, vs
@@ -592,22 +592,22 @@
; CHECK: // %bb.0:
; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: mov w8, #-16777216
+; CHECK-NEXT: mov x10, #9223372036854775807
+; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov w8, #2130706431
; CHECK-NEXT: fcmp s8, s0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov x8, #-9223372036854775808
-; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: csel x8, x8, x1, lt
-; CHECK-NEXT: mov x9, #9223372036854775807
-; CHECK-NEXT: csel x10, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s0
-; CHECK-NEXT: csel x8, x9, x8, gt
-; CHECK-NEXT: csinv x9, x10, xzr, le
+; CHECK-NEXT: csel x8, x10, x8, gt
+; CHECK-NEXT: csinv x9, x9, xzr, le
; CHECK-NEXT: fcmp s8, s8
; CHECK-NEXT: csel x0, xzr, x9, vs
; CHECK-NEXT: csel x1, xzr, x8, vs
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -57,18 +57,18 @@
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
-; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4
; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3
+; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4
; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: fcvtzs v4.4s, v4.4s
; CHECK-NEXT: mov v0.s[2], v2.s[0]
+; CHECK-NEXT: fmov w4, s4
; CHECK-NEXT: mov v0.s[3], v3.s[0]
-; CHECK-NEXT: fcvtzs v4.4s, v4.4s
; CHECK-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-NEXT: mov w1, v0.s[1]
; CHECK-NEXT: mov w2, v0.s[2]
; CHECK-NEXT: mov w3, v0.s[3]
; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: fmov w4, s4
; CHECK-NEXT: ret
%x = call <5 x i32> @llvm.fptosi.sat.v5f32.v5i32(<5 x float> %f)
ret <5 x i32> %x
@@ -79,21 +79,21 @@
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
-; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4
; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5
; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3
; CHECK-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-NEXT: mov v0.s[2], v2.s[0]
; CHECK-NEXT: mov v4.s[1], v5.s[0]
-; CHECK-NEXT: mov v0.s[3], v3.s[0]
+; CHECK-NEXT: mov v0.s[2], v2.s[0]
; CHECK-NEXT: fcvtzs v1.4s, v4.4s
-; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: mov v0.s[3], v3.s[0]
; CHECK-NEXT: mov w5, v1.s[1]
+; CHECK-NEXT: fmov w4, s1
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-NEXT: mov w1, v0.s[1]
; CHECK-NEXT: mov w2, v0.s[2]
; CHECK-NEXT: mov w3, v0.s[3]
-; CHECK-NEXT: fmov w4, s1
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%x = call <6 x i32> @llvm.fptosi.sat.v6f32.v6i32(<6 x float> %f)
@@ -104,8 +104,8 @@
; CHECK-LABEL: test_signed_v7f32_v7i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4
; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4
; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5
; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2
; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6
@@ -119,10 +119,10 @@
; CHECK-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-NEXT: mov w5, v1.s[1]
; CHECK-NEXT: mov w6, v1.s[2]
+; CHECK-NEXT: fmov w4, s1
; CHECK-NEXT: mov w1, v0.s[1]
; CHECK-NEXT: mov w2, v0.s[2]
; CHECK-NEXT: mov w3, v0.s[3]
-; CHECK-NEXT: fmov w4, s1
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%x = call <7 x i32> @llvm.fptosi.sat.v7f32.v7i32(<7 x float> %f)
@@ -163,8 +163,8 @@
define <2 x i32> @test_signed_v2f64_v2i32(<2 x double> %f) {
; CHECK-LABEL: test_signed_v2f64_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs w8, d0
; CHECK-NEXT: mov d1, v0.d[1]
+; CHECK-NEXT: fcvtzs w8, d0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzs w8, d1
; CHECK-NEXT: mov v0.s[1], w8
@@ -178,11 +178,11 @@
; CHECK-LABEL: test_signed_v3f64_v3i32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtzs w8, d0
-; CHECK-NEXT: fcvtzs w9, d1
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fcvtzs w10, d2
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: mov v0.s[2], w10
+; CHECK-NEXT: fcvtzs w8, d1
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: fcvtzs w8, d2
+; CHECK-NEXT: mov v0.s[2], w8
; CHECK-NEXT: fcvtzs w8, d0
; CHECK-NEXT: mov v0.s[3], w8
; CHECK-NEXT: ret
@@ -193,14 +193,14 @@
define <4 x i32> @test_signed_v4f64_v4i32(<4 x double> %f) {
; CHECK-LABEL: test_signed_v4f64_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs w8, d0
; CHECK-NEXT: mov d2, v0.d[1]
+; CHECK-NEXT: fcvtzs w8, d0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzs w8, d2
-; CHECK-NEXT: fcvtzs w9, d1
-; CHECK-NEXT: mov d1, v1.d[1]
; CHECK-NEXT: mov v0.s[1], w8
-; CHECK-NEXT: mov v0.s[2], w9
+; CHECK-NEXT: fcvtzs w8, d1
+; CHECK-NEXT: mov d1, v1.d[1]
+; CHECK-NEXT: mov v0.s[2], w8
; CHECK-NEXT: fcvtzs w8, d1
; CHECK-NEXT: mov v0.s[3], w8
; CHECK-NEXT: ret
@@ -253,8 +253,8 @@
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: adrp x8, .LCPI14_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: mov w19, w0
@@ -263,12 +263,12 @@
; CHECK-NEXT: mov w8, #-2147483648
; CHECK-NEXT: csel w19, w8, w0, lt
; CHECK-NEXT: adrp x8, .LCPI14_1
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_1]
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_1]
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: mov w8, #2147483647
+; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: csel w19, w8, w19, gt
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: bl __unordtf2
@@ -295,8 +295,8 @@
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
; CHECK-NEXT: .cfi_offset w30, -48
-; CHECK-NEXT: adrp x8, .LCPI15_0
; CHECK-NEXT: mov v2.16b, v1.16b
+; CHECK-NEXT: adrp x8, .LCPI15_0
; CHECK-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
; CHECK-NEXT: mov v0.16b, v2.16b
@@ -306,30 +306,30 @@
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixtfsi
; CHECK-NEXT: adrp x8, .LCPI15_1
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1]
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
; CHECK-NEXT: mov w20, #-2147483648
; CHECK-NEXT: csel w19, w20, w0, lt
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1]
; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: mov w21, #2147483647
+; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: csel w19, w21, w19, gt
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: bl __unordtf2
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csel w22, wzr, w19, ne
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixtfsi
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: csel w19, w20, w0, lt
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -338,12 +338,12 @@
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: bl __unordtf2
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: csel w8, wzr, w19, ne
+; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov v0.s[1], w22
-; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
@@ -364,8 +364,8 @@
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
; CHECK-NEXT: .cfi_offset w30, -48
-; CHECK-NEXT: adrp x8, .LCPI16_0
; CHECK-NEXT: stp q0, q2, [sp, #48] // 32-byte Folded Spill
+; CHECK-NEXT: adrp x8, .LCPI16_0
; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: str q1, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
@@ -376,30 +376,30 @@
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixtfsi
; CHECK-NEXT: adrp x8, .LCPI16_1
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_1]
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
; CHECK-NEXT: mov w20, #-2147483648
; CHECK-NEXT: csel w19, w20, w0, lt
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_1]
; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: mov w21, #2147483647
+; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: csel w19, w21, w19, gt
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: bl __unordtf2
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csel w22, wzr, w19, ne
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixtfsi
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: csel w19, w20, w0, lt
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -408,19 +408,19 @@
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: bl __unordtf2
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csel w8, wzr, w19, ne
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov v0.s[1], w22
; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixtfsi
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: csel w19, w20, w0, lt
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
@@ -431,9 +431,9 @@
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: csel w8, wzr, w19, ne
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: mov v0.s[2], w8
; CHECK-NEXT: add sp, sp, #128
; CHECK-NEXT: ret
@@ -454,12 +454,12 @@
; CHECK-NEXT: .cfi_offset w21, -24
; CHECK-NEXT: .cfi_offset w22, -32
; CHECK-NEXT: .cfi_offset w30, -48
-; CHECK-NEXT: adrp x8, .LCPI17_0
; CHECK-NEXT: stp q2, q3, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT: adrp x8, .LCPI17_0
; CHECK-NEXT: mov v2.16b, v1.16b
+; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
-; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: str q1, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: bl __getf2
@@ -467,16 +467,16 @@
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixtfsi
; CHECK-NEXT: adrp x8, .LCPI17_1
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_1]
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
; CHECK-NEXT: mov w20, #-2147483648
; CHECK-NEXT: csel w19, w20, w0, lt
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_1]
; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: mov w21, #2147483647
+; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: csel w19, w21, w19, gt
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: bl __unordtf2
@@ -488,8 +488,8 @@
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixtfsi
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csel w19, w20, w0, lt
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -498,19 +498,19 @@
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: bl __unordtf2
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: csel w8, wzr, w19, ne
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov v0.s[1], w22
; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixtfsi
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csel w19, w20, w0, lt
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
@@ -529,8 +529,8 @@
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixtfsi
; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csel w19, w20, w0, lt
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
@@ -541,9 +541,9 @@
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: csel w8, wzr, w19, ne
+; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT: mov v0.s[3], w8
; CHECK-NEXT: add sp, sp, #144
; CHECK-NEXT: ret
@@ -592,8 +592,8 @@
; CHECK-FP16-LABEL: test_signed_v2f16_v2i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-NEXT: fcvtzs w8, h0
; CHECK-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-FP16-NEXT: fcvtzs w8, h0
; CHECK-FP16-NEXT: fmov s0, w8
; CHECK-FP16-NEXT: fcvtzs w8, h1
; CHECK-FP16-NEXT: mov v0.s[1], w8
@@ -613,17 +613,18 @@
; CHECK-FP16-LABEL: test_signed_v3f16_v3i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP16-NEXT: mov h2, v0.h[1]
; CHECK-FP16-NEXT: fcvtzs w8, h0
-; CHECK-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-FP16-NEXT: fmov s1, w8
+; CHECK-FP16-NEXT: fcvtzs w8, h2
; CHECK-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-NEXT: fmov s0, w8
-; CHECK-FP16-NEXT: fcvtzs w8, h1
-; CHECK-FP16-NEXT: fcvtzs w9, h2
-; CHECK-FP16-NEXT: mov v0.s[1], w8
-; CHECK-FP16-NEXT: mov v0.s[2], w9
-; CHECK-FP16-NEXT: fcvtzs w8, h3
-; CHECK-FP16-NEXT: mov v0.s[3], w8
+; CHECK-FP16-NEXT: mov h0, v0.h[3]
+; CHECK-FP16-NEXT: mov v1.s[1], w8
+; CHECK-FP16-NEXT: fcvtzs w8, h2
+; CHECK-FP16-NEXT: mov v1.s[2], w8
+; CHECK-FP16-NEXT: fcvtzs w8, h0
+; CHECK-FP16-NEXT: mov v1.s[3], w8
+; CHECK-FP16-NEXT: mov v0.16b, v1.16b
; CHECK-FP16-NEXT: ret
%x = call <3 x i32> @llvm.fptosi.sat.v3f16.v3i32(<3 x half> %f)
ret <3 x i32> %x
@@ -639,17 +640,18 @@
; CHECK-FP16-LABEL: test_signed_v4f16_v4i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP16-NEXT: mov h2, v0.h[1]
; CHECK-FP16-NEXT: fcvtzs w8, h0
-; CHECK-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-FP16-NEXT: fmov s1, w8
+; CHECK-FP16-NEXT: fcvtzs w8, h2
; CHECK-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-NEXT: fmov s0, w8
-; CHECK-FP16-NEXT: fcvtzs w8, h1
-; CHECK-FP16-NEXT: fcvtzs w9, h2
-; CHECK-FP16-NEXT: mov v0.s[1], w8
-; CHECK-FP16-NEXT: mov v0.s[2], w9
-; CHECK-FP16-NEXT: fcvtzs w8, h3
-; CHECK-FP16-NEXT: mov v0.s[3], w8
+; CHECK-FP16-NEXT: mov h0, v0.h[3]
+; CHECK-FP16-NEXT: mov v1.s[1], w8
+; CHECK-FP16-NEXT: fcvtzs w8, h2
+; CHECK-FP16-NEXT: mov v1.s[2], w8
+; CHECK-FP16-NEXT: fcvtzs w8, h0
+; CHECK-FP16-NEXT: mov v1.s[3], w8
+; CHECK-FP16-NEXT: mov v0.16b, v1.16b
; CHECK-FP16-NEXT: ret
%x = call <4 x i32> @llvm.fptosi.sat.v4f16.v4i32(<4 x half> %f)
ret <4 x i32> %x
@@ -665,21 +667,21 @@
; CHECK-CVT-NEXT: mov w1, v1.s[1]
; CHECK-CVT-NEXT: mov w2, v1.s[2]
; CHECK-CVT-NEXT: mov w3, v1.s[3]
-; CHECK-CVT-NEXT: fmov w4, s0
; CHECK-CVT-NEXT: fmov w0, s1
+; CHECK-CVT-NEXT: fmov w4, s0
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_signed_v5f16_v5i32:
; CHECK-FP16: // %bb.0:
-; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-FP16-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-FP16-NEXT: mov h4, v0.h[3]
; CHECK-FP16-NEXT: fcvtzs w0, h0
-; CHECK-FP16-NEXT: mov h2, v0.h[1]
-; CHECK-FP16-NEXT: mov h3, v0.h[2]
-; CHECK-FP16-NEXT: mov h0, v0.h[3]
-; CHECK-FP16-NEXT: fcvtzs w4, h1
-; CHECK-FP16-NEXT: fcvtzs w1, h2
-; CHECK-FP16-NEXT: fcvtzs w2, h3
-; CHECK-FP16-NEXT: fcvtzs w3, h0
+; CHECK-FP16-NEXT: fcvtzs w1, h1
+; CHECK-FP16-NEXT: fcvtzs w2, h2
+; CHECK-FP16-NEXT: fcvtzs w4, h3
+; CHECK-FP16-NEXT: fcvtzs w3, h4
; CHECK-FP16-NEXT: ret
%x = call <5 x i32> @llvm.fptosi.sat.v5f16.v5i32(<5 x half> %f)
ret <5 x i32> %x
@@ -692,30 +694,30 @@
; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-CVT-NEXT: mov w5, v0.s[1]
; CHECK-CVT-NEXT: mov w1, v1.s[1]
; CHECK-CVT-NEXT: mov w2, v1.s[2]
; CHECK-CVT-NEXT: mov w3, v1.s[3]
-; CHECK-CVT-NEXT: fmov w4, s0
+; CHECK-CVT-NEXT: mov w5, v0.s[1]
; CHECK-CVT-NEXT: fmov w0, s1
+; CHECK-CVT-NEXT: fmov w4, s0
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_signed_v6f16_v6i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-FP16-NEXT: mov h3, v0.h[2]
+; CHECK-FP16-NEXT: mov h4, v0.h[3]
+; CHECK-FP16-NEXT: fcvtzs w0, h0
+; CHECK-FP16-NEXT: mov h2, v1.h[1]
+; CHECK-FP16-NEXT: fcvtzs w8, h1
+; CHECK-FP16-NEXT: fcvtzs w2, h3
+; CHECK-FP16-NEXT: fcvtzs w3, h4
+; CHECK-FP16-NEXT: fmov s1, w8
+; CHECK-FP16-NEXT: fcvtzs w5, h2
; CHECK-FP16-NEXT: mov h2, v0.h[1]
+; CHECK-FP16-NEXT: mov v1.s[1], w5
; CHECK-FP16-NEXT: fcvtzs w1, h2
-; CHECK-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-NEXT: fcvtzs w8, h1
-; CHECK-FP16-NEXT: mov h1, v1.h[1]
-; CHECK-FP16-NEXT: fcvtzs w2, h2
-; CHECK-FP16-NEXT: fmov s2, w8
-; CHECK-FP16-NEXT: fcvtzs w5, h1
-; CHECK-FP16-NEXT: fcvtzs w0, h0
-; CHECK-FP16-NEXT: mov h0, v0.h[3]
-; CHECK-FP16-NEXT: mov v2.s[1], w5
-; CHECK-FP16-NEXT: fcvtzs w3, h0
-; CHECK-FP16-NEXT: fmov w4, s2
+; CHECK-FP16-NEXT: fmov w4, s1
; CHECK-FP16-NEXT: ret
%x = call <6 x i32> @llvm.fptosi.sat.v6f16.v6i32(<6 x half> %f)
ret <6 x i32> %x
@@ -728,33 +730,33 @@
; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-CVT-NEXT: mov w5, v0.s[1]
-; CHECK-CVT-NEXT: mov w6, v0.s[2]
; CHECK-CVT-NEXT: mov w1, v1.s[1]
; CHECK-CVT-NEXT: mov w2, v1.s[2]
; CHECK-CVT-NEXT: mov w3, v1.s[3]
-; CHECK-CVT-NEXT: fmov w4, s0
+; CHECK-CVT-NEXT: mov w5, v0.s[1]
+; CHECK-CVT-NEXT: mov w6, v0.s[2]
; CHECK-CVT-NEXT: fmov w0, s1
+; CHECK-CVT-NEXT: fmov w4, s0
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_signed_v7f16_v7i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-FP16-NEXT: mov h2, v0.h[1]
-; CHECK-FP16-NEXT: mov h3, v0.h[2]
-; CHECK-FP16-NEXT: fcvtzs w8, h1
-; CHECK-FP16-NEXT: fcvtzs w1, h2
+; CHECK-FP16-NEXT: mov h4, v0.h[3]
+; CHECK-FP16-NEXT: fcvtzs w0, h0
; CHECK-FP16-NEXT: mov h2, v1.h[1]
+; CHECK-FP16-NEXT: fcvtzs w8, h1
; CHECK-FP16-NEXT: mov h1, v1.h[2]
-; CHECK-FP16-NEXT: fcvtzs w2, h3
+; CHECK-FP16-NEXT: fcvtzs w3, h4
; CHECK-FP16-NEXT: fmov s3, w8
; CHECK-FP16-NEXT: fcvtzs w8, h2
+; CHECK-FP16-NEXT: mov h2, v0.h[2]
; CHECK-FP16-NEXT: fcvtzs w6, h1
+; CHECK-FP16-NEXT: mov h1, v0.h[1]
; CHECK-FP16-NEXT: mov v3.s[1], w8
-; CHECK-FP16-NEXT: fcvtzs w0, h0
-; CHECK-FP16-NEXT: mov h0, v0.h[3]
+; CHECK-FP16-NEXT: fcvtzs w2, h2
+; CHECK-FP16-NEXT: fcvtzs w1, h1
; CHECK-FP16-NEXT: mov v3.s[2], w6
-; CHECK-FP16-NEXT: fcvtzs w3, h0
; CHECK-FP16-NEXT: mov w5, v3.s[1]
; CHECK-FP16-NEXT: fmov w4, s3
; CHECK-FP16-NEXT: ret
@@ -774,28 +776,28 @@
; CHECK-FP16-LABEL: test_signed_v8f16_v8i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-NEXT: fcvtzs w9, h3
-; CHECK-FP16-NEXT: fcvtzs w10, h1
-; CHECK-FP16-NEXT: mov h1, v3.h[1]
-; CHECK-FP16-NEXT: fcvtzs w8, h0
-; CHECK-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-NEXT: mov h4, v3.h[2]
-; CHECK-FP16-NEXT: fcvtzs w12, h1
-; CHECK-FP16-NEXT: fmov s1, w9
-; CHECK-FP16-NEXT: fcvtzs w11, h2
-; CHECK-FP16-NEXT: fmov s2, w8
-; CHECK-FP16-NEXT: fcvtzs w8, h4
-; CHECK-FP16-NEXT: mov v1.s[1], w12
+; CHECK-FP16-NEXT: mov h4, v0.h[1]
+; CHECK-FP16-NEXT: fcvtzs w9, h0
+; CHECK-FP16-NEXT: mov h2, v3.h[1]
+; CHECK-FP16-NEXT: fcvtzs w8, h3
+; CHECK-FP16-NEXT: mov h5, v3.h[2]
; CHECK-FP16-NEXT: mov h3, v3.h[3]
-; CHECK-FP16-NEXT: mov v2.s[1], w10
+; CHECK-FP16-NEXT: fmov s1, w8
+; CHECK-FP16-NEXT: fcvtzs w8, h2
+; CHECK-FP16-NEXT: fmov s2, w9
+; CHECK-FP16-NEXT: fcvtzs w9, h4
+; CHECK-FP16-NEXT: mov h4, v0.h[2]
; CHECK-FP16-NEXT: mov h0, v0.h[3]
+; CHECK-FP16-NEXT: mov v1.s[1], w8
+; CHECK-FP16-NEXT: fcvtzs w8, h5
+; CHECK-FP16-NEXT: mov v2.s[1], w9
+; CHECK-FP16-NEXT: fcvtzs w9, h4
; CHECK-FP16-NEXT: mov v1.s[2], w8
; CHECK-FP16-NEXT: fcvtzs w8, h3
-; CHECK-FP16-NEXT: mov v2.s[2], w11
+; CHECK-FP16-NEXT: mov v2.s[2], w9
+; CHECK-FP16-NEXT: fcvtzs w9, h0
; CHECK-FP16-NEXT: mov v1.s[3], w8
-; CHECK-FP16-NEXT: fcvtzs w8, h0
-; CHECK-FP16-NEXT: mov v2.s[3], w8
+; CHECK-FP16-NEXT: mov v2.s[3], w9
; CHECK-FP16-NEXT: mov v0.16b, v2.16b
; CHECK-FP16-NEXT: ret
%x = call <8 x i32> @llvm.fptosi.sat.v8f16.v8i32(<8 x half> %f)
@@ -824,11 +826,11 @@
; CHECK-NEXT: fmov s2, #-1.00000000
; CHECK-NEXT: movi d3, #0000000000000000
; CHECK-NEXT: fmaxnm s4, s1, s2
+; CHECK-NEXT: fmaxnm s2, s0, s2
; CHECK-NEXT: fcmp s1, s1
-; CHECK-NEXT: fmaxnm s1, s0, s2
-; CHECK-NEXT: fminnm s2, s4, s3
-; CHECK-NEXT: fminnm s1, s1, s3
-; CHECK-NEXT: fcvtzs w8, s2
+; CHECK-NEXT: fminnm s4, s4, s3
+; CHECK-NEXT: fminnm s1, s2, s3
+; CHECK-NEXT: fcvtzs w8, s4
; CHECK-NEXT: fcvtzs w9, s1
; CHECK-NEXT: csel w8, wzr, w8, vs
; CHECK-NEXT: fcmp s0, s0
@@ -847,15 +849,15 @@
; CHECK-NEXT: mov w8, #-1023410176
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s1, v0.s[1]
-; CHECK-NEXT: mov w9, #1123942400
; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: fmaxnm s3, s1, s2
-; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: mov w8, #1123942400
; CHECK-NEXT: fcmp s1, s1
-; CHECK-NEXT: fmaxnm s1, s0, s2
-; CHECK-NEXT: fminnm s2, s3, s4
-; CHECK-NEXT: fminnm s1, s1, s4
-; CHECK-NEXT: fcvtzs w8, s2
+; CHECK-NEXT: fmaxnm s3, s1, s2
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmaxnm s2, s0, s2
+; CHECK-NEXT: fminnm s3, s3, s4
+; CHECK-NEXT: fminnm s1, s2, s4
+; CHECK-NEXT: fcvtzs w8, s3
; CHECK-NEXT: fcvtzs w9, s1
; CHECK-NEXT: csel w8, wzr, w8, vs
; CHECK-NEXT: fcmp s0, s0
@@ -872,18 +874,18 @@
; CHECK-LABEL: test_signed_v2f32_v2i13:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-981467136
-; CHECK-NEXT: mov w9, #61440
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s1, v0.s[1]
-; CHECK-NEXT: movk w9, #17791, lsl #16
; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: fmaxnm s3, s1, s2
-; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: mov w8, #61440
+; CHECK-NEXT: movk w8, #17791, lsl #16
; CHECK-NEXT: fcmp s1, s1
-; CHECK-NEXT: fmaxnm s1, s0, s2
-; CHECK-NEXT: fminnm s2, s3, s4
-; CHECK-NEXT: fminnm s1, s1, s4
-; CHECK-NEXT: fcvtzs w8, s2
+; CHECK-NEXT: fmaxnm s3, s1, s2
+; CHECK-NEXT: fmaxnm s2, s0, s2
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fminnm s3, s3, s4
+; CHECK-NEXT: fminnm s1, s2, s4
+; CHECK-NEXT: fcvtzs w8, s3
; CHECK-NEXT: fcvtzs w9, s1
; CHECK-NEXT: csel w8, wzr, w8, vs
; CHECK-NEXT: fcmp s0, s0
@@ -900,18 +902,18 @@
; CHECK-LABEL: test_signed_v2f32_v2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-956301312
-; CHECK-NEXT: mov w9, #65024
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s1, v0.s[1]
-; CHECK-NEXT: movk w9, #18175, lsl #16
; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: fmaxnm s3, s1, s2
-; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: mov w8, #65024
+; CHECK-NEXT: movk w8, #18175, lsl #16
; CHECK-NEXT: fcmp s1, s1
-; CHECK-NEXT: fmaxnm s1, s0, s2
-; CHECK-NEXT: fminnm s2, s3, s4
-; CHECK-NEXT: fminnm s1, s1, s4
-; CHECK-NEXT: fcvtzs w8, s2
+; CHECK-NEXT: fmaxnm s3, s1, s2
+; CHECK-NEXT: fmaxnm s2, s0, s2
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fminnm s3, s3, s4
+; CHECK-NEXT: fminnm s1, s2, s4
+; CHECK-NEXT: fcvtzs w8, s3
; CHECK-NEXT: fcvtzs w9, s1
; CHECK-NEXT: csel w8, wzr, w8, vs
; CHECK-NEXT: fcmp s0, s0
@@ -928,18 +930,18 @@
; CHECK-LABEL: test_signed_v2f32_v2i19:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-931135488
-; CHECK-NEXT: mov w9, #65472
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s1, v0.s[1]
-; CHECK-NEXT: movk w9, #18559, lsl #16
; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: fmaxnm s3, s1, s2
-; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: mov w8, #65472
+; CHECK-NEXT: movk w8, #18559, lsl #16
; CHECK-NEXT: fcmp s1, s1
-; CHECK-NEXT: fmaxnm s1, s0, s2
-; CHECK-NEXT: fminnm s2, s3, s4
-; CHECK-NEXT: fminnm s1, s1, s4
-; CHECK-NEXT: fcvtzs w8, s2
+; CHECK-NEXT: fmaxnm s3, s1, s2
+; CHECK-NEXT: fmaxnm s2, s0, s2
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fminnm s3, s3, s4
+; CHECK-NEXT: fminnm s1, s2, s4
+; CHECK-NEXT: fcvtzs w8, s3
; CHECK-NEXT: fcvtzs w9, s1
; CHECK-NEXT: csel w8, wzr, w8, vs
; CHECK-NEXT: fcmp s0, s0
@@ -967,27 +969,27 @@
; CHECK-NEXT: mov w8, #-671088640
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s1, v0.s[1]
-; CHECK-NEXT: mov w10, #1476395007
+; CHECK-NEXT: mov w9, #1476395007
+; CHECK-NEXT: mov x10, #562949953421311
+; CHECK-NEXT: fcvtzs x11, s0
; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fcvtzs x8, s1
; CHECK-NEXT: mov x9, #-562949953421312
-; CHECK-NEXT: fmov s3, w10
-; CHECK-NEXT: fcvtzs x10, s1
; CHECK-NEXT: fcmp s1, s2
-; CHECK-NEXT: mov x11, #562949953421311
-; CHECK-NEXT: csel x10, x9, x10, lt
+; CHECK-NEXT: csel x8, x9, x8, lt
; CHECK-NEXT: fcmp s1, s3
-; CHECK-NEXT: csel x10, x11, x10, gt
+; CHECK-NEXT: csel x8, x10, x8, gt
; CHECK-NEXT: fcmp s1, s1
-; CHECK-NEXT: fcvtzs x8, s0
-; CHECK-NEXT: csel x10, xzr, x10, vs
+; CHECK-NEXT: csel x8, xzr, x8, vs
; CHECK-NEXT: fcmp s0, s2
-; CHECK-NEXT: csel x8, x9, x8, lt
+; CHECK-NEXT: csel x9, x9, x11, lt
; CHECK-NEXT: fcmp s0, s3
-; CHECK-NEXT: csel x8, x11, x8, gt
+; CHECK-NEXT: csel x9, x10, x9, gt
; CHECK-NEXT: fcmp s0, s0
-; CHECK-NEXT: csel x8, xzr, x8, vs
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x10
+; CHECK-NEXT: csel x9, xzr, x9, vs
+; CHECK-NEXT: fmov d0, x9
+; CHECK-NEXT: mov v0.d[1], x8
; CHECK-NEXT: ret
%x = call <2 x i50> @llvm.fptosi.sat.v2f32.v2i50(<2 x float> %f)
ret <2 x i50> %x
@@ -997,8 +999,8 @@
; CHECK-LABEL: test_signed_v2f32_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fcvtzs x8, s0
; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: fcvtzs x8, s0
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: fcvtzs x8, s1
; CHECK-NEXT: mov v0.d[1], x8
@@ -1031,39 +1033,39 @@
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: mov w8, #-251658240
-; CHECK-NEXT: mov w9, #1895825407
-; CHECK-NEXT: fmov s9, w8
; CHECK-NEXT: mov x21, #-34359738368
-; CHECK-NEXT: fmov s10, w9
-; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: mov x22, #34359738367
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: fmov s9, w8
+; CHECK-NEXT: mov w8, #1895825407
+; CHECK-NEXT: fcmp s8, s9
+; CHECK-NEXT: fmov s10, w8
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, x21, x1, lt
; CHECK-NEXT: fcmp s8, s10
; CHECK-NEXT: csel x9, x22, x9, gt
; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: fcmp s8, s8
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: csel x19, xzr, x8, vs
; CHECK-NEXT: csel x20, xzr, x9, vs
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: mov x2, x19
; CHECK-NEXT: mov x3, x20
+; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s0, s9
+; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
; CHECK-NEXT: csel x8, x21, x1, lt
; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: fcmp s0, s10
+; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: csinv x9, x9, xzr, le
; CHECK-NEXT: csel x8, x22, x8, gt
; CHECK-NEXT: fcmp s0, s0
-; CHECK-NEXT: csel x9, xzr, x9, vs
; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: csel x9, xzr, x9, vs
; CHECK-NEXT: csel x1, xzr, x8, vs
; CHECK-NEXT: fmov d0, x9
; CHECK-NEXT: mov v0.d[1], x1
@@ -1098,39 +1100,39 @@
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: mov w8, #-16777216
-; CHECK-NEXT: mov w9, #2130706431
-; CHECK-NEXT: fmov s9, w8
; CHECK-NEXT: mov x21, #-9223372036854775808
-; CHECK-NEXT: fmov s10, w9
-; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: mov x22, #9223372036854775807
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: fmov s9, w8
+; CHECK-NEXT: mov w8, #2130706431
+; CHECK-NEXT: fcmp s8, s9
+; CHECK-NEXT: fmov s10, w8
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, x21, x1, lt
; CHECK-NEXT: fcmp s8, s10
; CHECK-NEXT: csel x9, x22, x9, gt
; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: fcmp s8, s8
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: csel x19, xzr, x8, vs
; CHECK-NEXT: csel x20, xzr, x9, vs
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
; CHECK-NEXT: bl __fixsfti
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: mov x2, x19
; CHECK-NEXT: mov x3, x20
+; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s0, s9
+; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
; CHECK-NEXT: csel x8, x21, x1, lt
; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: fcmp s0, s10
+; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: csinv x9, x9, xzr, le
; CHECK-NEXT: csel x8, x22, x8, gt
; CHECK-NEXT: fcmp s0, s0
-; CHECK-NEXT: csel x9, xzr, x9, vs
; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
-; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
-; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: csel x9, xzr, x9, vs
; CHECK-NEXT: csel x1, xzr, x8, vs
; CHECK-NEXT: fmov d0, x9
; CHECK-NEXT: mov v0.d[1], x1
@@ -1162,11 +1164,11 @@
; CHECK-NEXT: fmov d2, #-1.00000000
; CHECK-NEXT: movi d3, #0000000000000000
; CHECK-NEXT: fmaxnm d4, d1, d2
+; CHECK-NEXT: fmaxnm d2, d0, d2
; CHECK-NEXT: fcmp d1, d1
-; CHECK-NEXT: fmaxnm d1, d0, d2
-; CHECK-NEXT: fminnm d2, d4, d3
-; CHECK-NEXT: fminnm d1, d1, d3
-; CHECK-NEXT: fcvtzs w8, d2
+; CHECK-NEXT: fminnm d4, d4, d3
+; CHECK-NEXT: fminnm d1, d2, d3
+; CHECK-NEXT: fcvtzs w8, d4
; CHECK-NEXT: fcvtzs w9, d1
; CHECK-NEXT: csel w8, wzr, w8, vs
; CHECK-NEXT: fcmp d0, d0
@@ -1183,17 +1185,17 @@
; CHECK-LABEL: test_signed_v2f64_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-4584664420663164928
-; CHECK-NEXT: mov x9, #211106232532992
; CHECK-NEXT: mov d1, v0.d[1]
-; CHECK-NEXT: movk x9, #16479, lsl #48
; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: fmaxnm d3, d1, d2
-; CHECK-NEXT: fmov d4, x9
+; CHECK-NEXT: mov x8, #211106232532992
+; CHECK-NEXT: movk x8, #16479, lsl #48
; CHECK-NEXT: fcmp d1, d1
-; CHECK-NEXT: fmaxnm d1, d0, d2
-; CHECK-NEXT: fminnm d2, d3, d4
-; CHECK-NEXT: fminnm d1, d1, d4
-; CHECK-NEXT: fcvtzs w8, d2
+; CHECK-NEXT: fmaxnm d3, d1, d2
+; CHECK-NEXT: fmaxnm d2, d0, d2
+; CHECK-NEXT: fmov d4, x8
+; CHECK-NEXT: fminnm d3, d3, d4
+; CHECK-NEXT: fminnm d1, d2, d4
+; CHECK-NEXT: fcvtzs w8, d3
; CHECK-NEXT: fcvtzs w9, d1
; CHECK-NEXT: csel w8, wzr, w8, vs
; CHECK-NEXT: fcmp d0, d0
@@ -1210,17 +1212,17 @@
; CHECK-LABEL: test_signed_v2f64_v2i13:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-4562146422526312448
-; CHECK-NEXT: mov x9, #279275953455104
; CHECK-NEXT: mov d1, v0.d[1]
-; CHECK-NEXT: movk x9, #16559, lsl #48
; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: fmaxnm d3, d1, d2
-; CHECK-NEXT: fmov d4, x9
+; CHECK-NEXT: mov x8, #279275953455104
+; CHECK-NEXT: movk x8, #16559, lsl #48
; CHECK-NEXT: fcmp d1, d1
-; CHECK-NEXT: fmaxnm d1, d0, d2
-; CHECK-NEXT: fminnm d2, d3, d4
-; CHECK-NEXT: fminnm d1, d1, d4
-; CHECK-NEXT: fcvtzs w8, d2
+; CHECK-NEXT: fmaxnm d3, d1, d2
+; CHECK-NEXT: fmaxnm d2, d0, d2
+; CHECK-NEXT: fmov d4, x8
+; CHECK-NEXT: fminnm d3, d3, d4
+; CHECK-NEXT: fminnm d1, d2, d4
+; CHECK-NEXT: fcvtzs w8, d3
; CHECK-NEXT: fcvtzs w9, d1
; CHECK-NEXT: csel w8, wzr, w8, vs
; CHECK-NEXT: fcmp d0, d0
@@ -1237,17 +1239,17 @@
; CHECK-LABEL: test_signed_v2f64_v2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-4548635623644200960
-; CHECK-NEXT: mov x9, #281200098803712
; CHECK-NEXT: mov d1, v0.d[1]
-; CHECK-NEXT: movk x9, #16607, lsl #48
; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: fmaxnm d3, d1, d2 -; CHECK-NEXT: fmov d4, x9 +; CHECK-NEXT: mov x8, #281200098803712 +; CHECK-NEXT: movk x8, #16607, lsl #48 ; CHECK-NEXT: fcmp d1, d1 -; CHECK-NEXT: fmaxnm d1, d0, d2 -; CHECK-NEXT: fminnm d2, d3, d4 -; CHECK-NEXT: fminnm d1, d1, d4 -; CHECK-NEXT: fcvtzs w8, d2 +; CHECK-NEXT: fmaxnm d3, d1, d2 +; CHECK-NEXT: fmaxnm d2, d0, d2 +; CHECK-NEXT: fmov d4, x8 +; CHECK-NEXT: fminnm d3, d3, d4 +; CHECK-NEXT: fminnm d1, d2, d4 +; CHECK-NEXT: fcvtzs w8, d3 ; CHECK-NEXT: fcvtzs w9, d1 ; CHECK-NEXT: csel w8, wzr, w8, vs ; CHECK-NEXT: fcmp d0, d0 @@ -1264,17 +1266,17 @@ ; CHECK-LABEL: test_signed_v2f64_v2i19: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #-4535124824762089472 -; CHECK-NEXT: mov x9, #281440616972288 ; CHECK-NEXT: mov d1, v0.d[1] -; CHECK-NEXT: movk x9, #16655, lsl #48 ; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: fmaxnm d3, d1, d2 -; CHECK-NEXT: fmov d4, x9 +; CHECK-NEXT: mov x8, #281440616972288 +; CHECK-NEXT: movk x8, #16655, lsl #48 ; CHECK-NEXT: fcmp d1, d1 -; CHECK-NEXT: fmaxnm d1, d0, d2 -; CHECK-NEXT: fminnm d2, d3, d4 -; CHECK-NEXT: fminnm d1, d1, d4 -; CHECK-NEXT: fcvtzs w8, d2 +; CHECK-NEXT: fmaxnm d3, d1, d2 +; CHECK-NEXT: fmaxnm d2, d0, d2 +; CHECK-NEXT: fmov d4, x8 +; CHECK-NEXT: fminnm d3, d3, d4 +; CHECK-NEXT: fminnm d1, d2, d4 +; CHECK-NEXT: fcvtzs w8, d3 ; CHECK-NEXT: fcvtzs w9, d1 ; CHECK-NEXT: csel w8, wzr, w8, vs ; CHECK-NEXT: fcmp d0, d0 @@ -1290,8 +1292,8 @@ define <2 x i32> @test_signed_v2f64_v2i32_duplicate(<2 x double> %f) { ; CHECK-LABEL: test_signed_v2f64_v2i32_duplicate: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzs w8, d0 ; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: fcvtzs w8, d0 ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: fcvtzs w8, d1 ; CHECK-NEXT: mov v0.s[1], w8 @@ -1305,17 +1307,17 @@ ; CHECK-LABEL: test_signed_v2f64_v2i50: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #-4395513236313604096 -; CHECK-NEXT: mov x9, #-16 ; CHECK-NEXT: mov d1, v0.d[1] -; CHECK-NEXT: movk x9, #17151, lsl #48 ; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: fmaxnm d3, d1, d2 -; CHECK-NEXT: fmov d4, x9 +; CHECK-NEXT: mov x8, #-16 +; CHECK-NEXT: movk x8, #17151, lsl #48 ; CHECK-NEXT: fcmp d1, d1 -; CHECK-NEXT: fmaxnm d1, d0, d2 -; CHECK-NEXT: fminnm d2, d3, d4 -; CHECK-NEXT: fminnm d1, d1, d4 -; CHECK-NEXT: fcvtzs x8, d2 +; CHECK-NEXT: fmaxnm d3, d1, d2 +; CHECK-NEXT: fmaxnm d2, d0, d2 +; CHECK-NEXT: fmov d4, x8 +; CHECK-NEXT: fminnm d3, d3, d4 +; CHECK-NEXT: fminnm d1, d2, d4 +; CHECK-NEXT: fcvtzs x8, d3 ; CHECK-NEXT: fcvtzs x9, d1 ; CHECK-NEXT: csel x8, xzr, x8, vs ; CHECK-NEXT: fcmp d0, d0 @@ -1359,39 +1361,39 @@ ; CHECK-NEXT: fmov d0, d8 ; CHECK-NEXT: bl __fixdfti ; CHECK-NEXT: mov x8, #-4170333254945079296 -; CHECK-NEXT: mov x9, #5053038781909696511 -; CHECK-NEXT: fmov d9, x8 ; CHECK-NEXT: mov x21, #-34359738368 -; CHECK-NEXT: fmov d10, x9 -; CHECK-NEXT: fcmp d8, d9 ; CHECK-NEXT: mov x22, #34359738367 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: fmov d9, x8 +; CHECK-NEXT: mov x8, #5053038781909696511 +; CHECK-NEXT: fcmp d8, d9 +; CHECK-NEXT: fmov d10, x8 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x21, x1, lt ; CHECK-NEXT: fcmp d8, d10 ; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp d8, d8 -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: csel x19, xzr, x8, vs ; CHECK-NEXT: csel x20, xzr, x9, vs -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bl __fixdfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload 
; CHECK-NEXT: mov x2, x19 ; CHECK-NEXT: mov x3, x20 +; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload ; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: fcmp d0, d9 +; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, x21, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp d0, d10 +; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: csinv x9, x9, xzr, le ; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp d0, d0 -; CHECK-NEXT: csel x9, xzr, x9, vs ; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: csel x9, xzr, x9, vs ; CHECK-NEXT: csel x1, xzr, x8, vs ; CHECK-NEXT: fmov d0, x9 ; CHECK-NEXT: mov v0.d[1], x1 @@ -1425,39 +1427,39 @@ ; CHECK-NEXT: fmov d0, d8 ; CHECK-NEXT: bl __fixdfti ; CHECK-NEXT: mov x8, #-4044232465378705408 -; CHECK-NEXT: mov x9, #5179139571476070399 -; CHECK-NEXT: fmov d9, x8 ; CHECK-NEXT: mov x21, #-9223372036854775808 -; CHECK-NEXT: fmov d10, x9 -; CHECK-NEXT: fcmp d8, d9 ; CHECK-NEXT: mov x22, #9223372036854775807 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: fmov d9, x8 +; CHECK-NEXT: mov x8, #5179139571476070399 +; CHECK-NEXT: fcmp d8, d9 +; CHECK-NEXT: fmov d10, x8 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x21, x1, lt ; CHECK-NEXT: fcmp d8, d10 ; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp d8, d8 -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: csel x19, xzr, x8, vs ; CHECK-NEXT: csel x20, xzr, x9, vs -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bl __fixdfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov x2, x19 ; CHECK-NEXT: mov x3, x20 +; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload ; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: fcmp d0, d9 +; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, x21, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp d0, d10 +; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: csinv x9, x9, xzr, le ; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp d0, d0 -; CHECK-NEXT: csel x9, xzr, x9, vs ; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: csel x9, xzr, x9, vs ; CHECK-NEXT: csel x1, xzr, x8, vs ; CHECK-NEXT: fmov d0, x9 ; CHECK-NEXT: mov v0.d[1], x1 @@ -1486,36 +1488,36 @@ ; CHECK-LABEL: test_signed_v4f16_v4i1: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov s2, #-1.00000000 -; CHECK-NEXT: fcvt s4, h0 -; CHECK-NEXT: movi d3, #0000000000000000 -; CHECK-NEXT: fmaxnm s5, s4, s2 ; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: fminnm s5, s5, s3 -; CHECK-NEXT: fcvtzs w8, s5 -; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: fmov s2, #-1.00000000 +; CHECK-NEXT: fcvt s3, h0 +; CHECK-NEXT: movi d5, #0000000000000000 +; CHECK-NEXT: mov h6, v0.h[2] ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: fcvt s6, h0 -; CHECK-NEXT: fmaxnm s0, s1, s2 -; CHECK-NEXT: fminnm s0, s0, s3 -; CHECK-NEXT: fcvt 
s5, h5 +; CHECK-NEXT: fmaxnm s7, s3, s2 +; CHECK-NEXT: fcvt s6, h6 +; CHECK-NEXT: fmaxnm s4, s1, s2 ; CHECK-NEXT: fcmp s1, s1 -; CHECK-NEXT: fcvtzs w9, s0 -; CHECK-NEXT: fmaxnm s0, s5, s2 -; CHECK-NEXT: csel w9, wzr, w9, vs -; CHECK-NEXT: fcmp s4, s4 +; CHECK-NEXT: fminnm s7, s7, s5 ; CHECK-NEXT: fmaxnm s1, s6, s2 -; CHECK-NEXT: fminnm s2, s0, s3 -; CHECK-NEXT: csel w8, wzr, w8, vs -; CHECK-NEXT: fminnm s1, s1, s3 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fcvtzs w8, s2 -; CHECK-NEXT: fcmp s5, s5 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: fcvtzs w9, s1 +; CHECK-NEXT: fminnm s4, s4, s5 +; CHECK-NEXT: fcvtzs w9, s7 +; CHECK-NEXT: fminnm s1, s1, s5 +; CHECK-NEXT: fcvtzs w8, s4 +; CHECK-NEXT: fcvt s4, h0 ; CHECK-NEXT: csel w8, wzr, w8, vs +; CHECK-NEXT: fcmp s3, s3 +; CHECK-NEXT: fmaxnm s2, s4, s2 +; CHECK-NEXT: csel w9, wzr, w9, vs ; CHECK-NEXT: fcmp s6, s6 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: fcvtzs w9, s1 +; CHECK-NEXT: fminnm s1, s2, s5 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: csel w8, wzr, w9, vs +; CHECK-NEXT: fcmp s4, s4 +; CHECK-NEXT: fcvtzs w9, s1 ; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: csel w8, wzr, w9, vs ; CHECK-NEXT: mov v0.h[3], w8 @@ -1528,39 +1530,39 @@ define <4 x i8> @test_signed_v4f16_v4i8(<4 x half> %f) { ; CHECK-LABEL: test_signed_v4f16_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-1023410176 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w9, #1123942400 -; CHECK-NEXT: fcvt s2, h0 -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmaxnm s5, s2, s3 ; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: fminnm s5, s5, s4 -; CHECK-NEXT: fcvtzs w8, s5 -; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov w8, #-1023410176 +; CHECK-NEXT: fcvt s3, h0 +; CHECK-NEXT: mov h6, v0.h[2] ; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: mov w8, #1123942400 ; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: fcvt s6, h0 -; CHECK-NEXT: fmaxnm s0, s1, s3 -; CHECK-NEXT: fminnm s0, s0, s4 -; CHECK-NEXT: fcvt s5, h5 +; CHECK-NEXT: fcvt s6, h6 +; CHECK-NEXT: fmov s5, w8 +; CHECK-NEXT: fmaxnm s7, s3, s2 +; CHECK-NEXT: fmaxnm s4, s1, s2 ; CHECK-NEXT: fcmp s1, s1 -; CHECK-NEXT: fcvtzs w9, s0 -; CHECK-NEXT: fmaxnm s0, s5, s3 -; CHECK-NEXT: csel w9, wzr, w9, vs -; CHECK-NEXT: fcmp s2, s2 -; CHECK-NEXT: fmaxnm s1, s6, s3 -; CHECK-NEXT: fminnm s3, s0, s4 -; CHECK-NEXT: csel w8, wzr, w8, vs -; CHECK-NEXT: fminnm s1, s1, s4 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fcvtzs w8, s3 -; CHECK-NEXT: fcmp s5, s5 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: fcvtzs w9, s1 +; CHECK-NEXT: fmaxnm s1, s6, s2 +; CHECK-NEXT: fminnm s7, s7, s5 +; CHECK-NEXT: fminnm s4, s4, s5 +; CHECK-NEXT: fminnm s1, s1, s5 +; CHECK-NEXT: fcvtzs w9, s7 +; CHECK-NEXT: fcvtzs w8, s4 +; CHECK-NEXT: fcvt s4, h0 ; CHECK-NEXT: csel w8, wzr, w8, vs +; CHECK-NEXT: fcmp s3, s3 +; CHECK-NEXT: fmaxnm s2, s4, s2 +; CHECK-NEXT: csel w9, wzr, w9, vs ; CHECK-NEXT: fcmp s6, s6 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: fcvtzs w9, s1 +; CHECK-NEXT: fminnm s1, s2, s5 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: csel w8, wzr, w9, vs +; CHECK-NEXT: fcmp s4, s4 +; CHECK-NEXT: fcvtzs w9, s1 ; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: csel w8, wzr, w9, vs ; CHECK-NEXT: mov v0.h[3], w8 @@ -1573,40 +1575,40 @@ define <4 x i13> @test_signed_v4f16_v4i13(<4 x half> %f) { ; CHECK-LABEL: test_signed_v4f16_v4i13: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-981467136 -; CHECK-NEXT: mov w9, #61440 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: movk w9, #17791, lsl #16 -; CHECK-NEXT: fcvt 
s2, h0 -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmaxnm s5, s2, s3 ; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: fminnm s5, s5, s4 -; CHECK-NEXT: fcvtzs w8, s5 -; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov w8, #-981467136 +; CHECK-NEXT: fcvt s3, h0 +; CHECK-NEXT: mov h6, v0.h[2] ; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: mov w8, #61440 ; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: fcvt s6, h0 -; CHECK-NEXT: fmaxnm s0, s1, s3 -; CHECK-NEXT: fminnm s0, s0, s4 -; CHECK-NEXT: fcvt s5, h5 +; CHECK-NEXT: movk w8, #17791, lsl #16 +; CHECK-NEXT: fcvt s6, h6 +; CHECK-NEXT: fmaxnm s7, s3, s2 +; CHECK-NEXT: fmov s5, w8 +; CHECK-NEXT: fmaxnm s4, s1, s2 ; CHECK-NEXT: fcmp s1, s1 -; CHECK-NEXT: fcvtzs w9, s0 -; CHECK-NEXT: fmaxnm s0, s5, s3 -; CHECK-NEXT: csel w9, wzr, w9, vs -; CHECK-NEXT: fcmp s2, s2 -; CHECK-NEXT: fmaxnm s1, s6, s3 -; CHECK-NEXT: fminnm s3, s0, s4 -; CHECK-NEXT: csel w8, wzr, w8, vs -; CHECK-NEXT: fminnm s1, s1, s4 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fcvtzs w8, s3 -; CHECK-NEXT: fcmp s5, s5 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: fcvtzs w9, s1 +; CHECK-NEXT: fmaxnm s1, s6, s2 +; CHECK-NEXT: fminnm s7, s7, s5 +; CHECK-NEXT: fminnm s4, s4, s5 +; CHECK-NEXT: fminnm s1, s1, s5 +; CHECK-NEXT: fcvtzs w9, s7 +; CHECK-NEXT: fcvtzs w8, s4 +; CHECK-NEXT: fcvt s4, h0 ; CHECK-NEXT: csel w8, wzr, w8, vs +; CHECK-NEXT: fcmp s3, s3 +; CHECK-NEXT: fmaxnm s2, s4, s2 +; CHECK-NEXT: csel w9, wzr, w9, vs ; CHECK-NEXT: fcmp s6, s6 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: fcvtzs w9, s1 +; CHECK-NEXT: fminnm s1, s2, s5 +; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: csel w8, wzr, w9, vs +; CHECK-NEXT: fcmp s4, s4 +; CHECK-NEXT: fcvtzs w9, s1 ; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: csel w8, wzr, w9, vs ; CHECK-NEXT: mov v0.h[3], w8 @@ -1621,34 +1623,34 @@ ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h ; CHECK-CVT-NEXT: mov w8, #-956301312 -; CHECK-CVT-NEXT: mov w9, #65024 -; CHECK-CVT-NEXT: movk w9, #18175, lsl #16 -; CHECK-CVT-NEXT: mov s1, v0.s[1] ; CHECK-CVT-NEXT: fmov s2, w8 -; CHECK-CVT-NEXT: fmov s3, w9 +; CHECK-CVT-NEXT: mov w8, #65024 +; CHECK-CVT-NEXT: mov s1, v0.s[1] +; CHECK-CVT-NEXT: movk w8, #18175, lsl #16 +; CHECK-CVT-NEXT: mov s6, v0.s[2] +; CHECK-CVT-NEXT: fmaxnm s5, s0, s2 +; CHECK-CVT-NEXT: fmov s4, w8 +; CHECK-CVT-NEXT: fmaxnm s3, s1, s2 ; CHECK-CVT-NEXT: fcmp s1, s1 -; CHECK-CVT-NEXT: fmaxnm s1, s1, s2 -; CHECK-CVT-NEXT: fmaxnm s4, s0, s2 -; CHECK-CVT-NEXT: fminnm s1, s1, s3 -; CHECK-CVT-NEXT: mov s5, v0.s[2] -; CHECK-CVT-NEXT: fminnm s4, s4, s3 -; CHECK-CVT-NEXT: fcvtzs w9, s1 -; CHECK-CVT-NEXT: fcvtzs w8, s4 -; CHECK-CVT-NEXT: mov s4, v0.s[3] -; CHECK-CVT-NEXT: fmaxnm s1, s5, s2 -; CHECK-CVT-NEXT: csel w9, wzr, w9, vs -; CHECK-CVT-NEXT: fcmp s0, s0 -; CHECK-CVT-NEXT: fmaxnm s2, s4, s2 -; CHECK-CVT-NEXT: fminnm s1, s1, s3 +; CHECK-CVT-NEXT: fmaxnm s1, s6, s2 +; CHECK-CVT-NEXT: fminnm s5, s5, s4 +; CHECK-CVT-NEXT: fminnm s3, s3, s4 +; CHECK-CVT-NEXT: fminnm s1, s1, s4 +; CHECK-CVT-NEXT: fcvtzs w9, s5 +; CHECK-CVT-NEXT: fcvtzs w8, s3 +; CHECK-CVT-NEXT: mov s3, v0.s[3] ; CHECK-CVT-NEXT: csel w8, wzr, w8, vs -; CHECK-CVT-NEXT: fminnm s2, s2, s3 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fcvtzs w8, s1 -; CHECK-CVT-NEXT: fcmp s5, s5 -; CHECK-CVT-NEXT: mov v0.h[1], w9 -; CHECK-CVT-NEXT: fcvtzs w9, s2 -; CHECK-CVT-NEXT: csel w8, wzr, w8, vs -; CHECK-CVT-NEXT: fcmp s4, s4 +; CHECK-CVT-NEXT: fcmp s0, s0 +; CHECK-CVT-NEXT: fmaxnm s2, s3, s2 +; CHECK-CVT-NEXT: csel w9, wzr, w9, vs +; CHECK-CVT-NEXT: fcmp s6, 
s6 +; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: fcvtzs w9, s1 +; CHECK-CVT-NEXT: fminnm s1, s2, s4 +; CHECK-CVT-NEXT: mov v0.h[1], w8 +; CHECK-CVT-NEXT: csel w8, wzr, w9, vs +; CHECK-CVT-NEXT: fcmp s3, s3 +; CHECK-CVT-NEXT: fcvtzs w9, s1 ; CHECK-CVT-NEXT: mov v0.h[2], w8 ; CHECK-CVT-NEXT: csel w8, wzr, w9, vs ; CHECK-CVT-NEXT: mov v0.h[3], w8 @@ -1666,40 +1668,40 @@ define <4 x i19> @test_signed_v4f16_v4i19(<4 x half> %f) { ; CHECK-LABEL: test_signed_v4f16_v4i19: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-931135488 -; CHECK-NEXT: mov w9, #65472 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: movk w9, #18559, lsl #16 -; CHECK-NEXT: fcvt s2, h0 -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmaxnm s5, s2, s3 ; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: fminnm s5, s5, s4 -; CHECK-NEXT: fcvtzs w8, s5 -; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov w8, #-931135488 +; CHECK-NEXT: fcvt s3, h0 +; CHECK-NEXT: mov h6, v0.h[2] ; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: mov w8, #65472 ; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: fcvt s6, h0 -; CHECK-NEXT: fmaxnm s0, s1, s3 -; CHECK-NEXT: fminnm s0, s0, s4 -; CHECK-NEXT: fcvt s5, h5 +; CHECK-NEXT: movk w8, #18559, lsl #16 +; CHECK-NEXT: fcvt s6, h6 +; CHECK-NEXT: fmaxnm s7, s3, s2 +; CHECK-NEXT: fmov s5, w8 +; CHECK-NEXT: fmaxnm s4, s1, s2 ; CHECK-NEXT: fcmp s1, s1 -; CHECK-NEXT: fcvtzs w9, s0 -; CHECK-NEXT: fmaxnm s0, s5, s3 -; CHECK-NEXT: csel w9, wzr, w9, vs -; CHECK-NEXT: fcmp s2, s2 -; CHECK-NEXT: fmaxnm s1, s6, s3 -; CHECK-NEXT: fminnm s3, s0, s4 -; CHECK-NEXT: csel w8, wzr, w8, vs -; CHECK-NEXT: fminnm s1, s1, s4 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fcvtzs w8, s3 -; CHECK-NEXT: fcmp s5, s5 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: fcvtzs w9, s1 +; CHECK-NEXT: fmaxnm s1, s6, s2 +; CHECK-NEXT: fminnm s7, s7, s5 +; CHECK-NEXT: fminnm s4, s4, s5 +; CHECK-NEXT: fminnm s1, s1, s5 +; CHECK-NEXT: fcvtzs w9, s7 +; CHECK-NEXT: fcvtzs w8, s4 +; CHECK-NEXT: fcvt s4, h0 ; CHECK-NEXT: csel w8, wzr, w8, vs +; CHECK-NEXT: fcmp s3, s3 +; CHECK-NEXT: fmaxnm s2, s4, s2 +; CHECK-NEXT: csel w9, wzr, w9, vs ; CHECK-NEXT: fcmp s6, s6 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: fcvtzs w9, s1 +; CHECK-NEXT: fminnm s1, s2, s5 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: csel w8, wzr, w9, vs +; CHECK-NEXT: fcmp s4, s4 +; CHECK-NEXT: fcvtzs w9, s1 ; CHECK-NEXT: mov v0.s[2], w8 ; CHECK-NEXT: csel w8, wzr, w9, vs ; CHECK-NEXT: mov v0.s[3], w8 @@ -1718,17 +1720,18 @@ ; CHECK-FP16-LABEL: test_signed_v4f16_v4i32_duplicate: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-FP16-NEXT: mov h2, v0.h[1] ; CHECK-FP16-NEXT: fcvtzs w8, h0 -; CHECK-FP16-NEXT: mov h1, v0.h[1] +; CHECK-FP16-NEXT: fmov s1, w8 +; CHECK-FP16-NEXT: fcvtzs w8, h2 ; CHECK-FP16-NEXT: mov h2, v0.h[2] -; CHECK-FP16-NEXT: mov h3, v0.h[3] -; CHECK-FP16-NEXT: fmov s0, w8 -; CHECK-FP16-NEXT: fcvtzs w8, h1 -; CHECK-FP16-NEXT: fcvtzs w9, h2 -; CHECK-FP16-NEXT: mov v0.s[1], w8 -; CHECK-FP16-NEXT: mov v0.s[2], w9 -; CHECK-FP16-NEXT: fcvtzs w8, h3 -; CHECK-FP16-NEXT: mov v0.s[3], w8 +; CHECK-FP16-NEXT: mov h0, v0.h[3] +; CHECK-FP16-NEXT: mov v1.s[1], w8 +; CHECK-FP16-NEXT: fcvtzs w8, h2 +; CHECK-FP16-NEXT: mov v1.s[2], w8 +; CHECK-FP16-NEXT: fcvtzs w8, h0 +; CHECK-FP16-NEXT: mov v1.s[3], w8 +; CHECK-FP16-NEXT: mov v0.16b, v1.16b ; CHECK-FP16-NEXT: ret %x = call <4 x i32> @llvm.fptosi.sat.v4f16.v4i32(<4 x half> %f) ret <4 x i32> %x @@ -1740,43 +1743,43 @@ ; CHECK-NEXT: mov w8, #-671088640 ; CHECK-NEXT: // 
kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: fcvt s1, h0 -; CHECK-NEXT: mov w10, #1476395007 +; CHECK-NEXT: mov w9, #1476395007 +; CHECK-NEXT: mov h4, v0.h[1] +; CHECK-NEXT: mov x10, #562949953421311 ; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: fmov s3, w9 +; CHECK-NEXT: fcvtzs x8, s1 ; CHECK-NEXT: mov x9, #-562949953421312 -; CHECK-NEXT: fcvtzs x12, s1 +; CHECK-NEXT: fcvt s4, h4 ; CHECK-NEXT: fcmp s1, s2 -; CHECK-NEXT: fmov s3, w10 -; CHECK-NEXT: mov x11, #562949953421311 -; CHECK-NEXT: csel x8, x9, x12, lt -; CHECK-NEXT: fcmp s1, s3 -; CHECK-NEXT: csel x8, x11, x8, gt -; CHECK-NEXT: fcmp s1, s1 -; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: fcvtzs x10, s1 -; CHECK-NEXT: csel x0, xzr, x8, vs -; CHECK-NEXT: fcmp s1, s2 -; CHECK-NEXT: csel x8, x9, x10, lt +; CHECK-NEXT: csel x8, x9, x8, lt ; CHECK-NEXT: fcmp s1, s3 -; CHECK-NEXT: csel x8, x11, x8, gt +; CHECK-NEXT: fcvtzs x11, s4 +; CHECK-NEXT: csel x8, x10, x8, gt ; CHECK-NEXT: fcmp s1, s1 ; CHECK-NEXT: mov h1, v0.h[2] +; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: csel x0, xzr, x8, vs +; CHECK-NEXT: fcmp s4, s2 ; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: fcvtzs x10, s1 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: csel x8, x9, x11, lt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: csel x8, x10, x8, gt +; CHECK-NEXT: fcmp s4, s4 +; CHECK-NEXT: fcvtzs x11, s1 ; CHECK-NEXT: csel x1, xzr, x8, vs ; CHECK-NEXT: fcmp s1, s2 -; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: csel x8, x9, x10, lt +; CHECK-NEXT: csel x8, x9, x11, lt ; CHECK-NEXT: fcmp s1, s3 -; CHECK-NEXT: fcvt s0, h0 -; CHECK-NEXT: csel x8, x11, x8, gt +; CHECK-NEXT: csel x8, x10, x8, gt ; CHECK-NEXT: fcmp s1, s1 -; CHECK-NEXT: fcvtzs x12, s0 ; CHECK-NEXT: csel x2, xzr, x8, vs +; CHECK-NEXT: fcvtzs x8, s0 ; CHECK-NEXT: fcmp s0, s2 -; CHECK-NEXT: csel x8, x9, x12, lt +; CHECK-NEXT: csel x8, x9, x8, lt ; CHECK-NEXT: fcmp s0, s3 -; CHECK-NEXT: csel x8, x11, x8, gt +; CHECK-NEXT: csel x8, x10, x8, gt ; CHECK-NEXT: fcmp s0, s0 ; CHECK-NEXT: csel x3, xzr, x8, vs ; CHECK-NEXT: ret @@ -1788,38 +1791,37 @@ ; CHECK-CVT-LABEL: test_signed_v4f16_v4i64: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-CVT-NEXT: fcvt s1, h0 -; CHECK-CVT-NEXT: mov h2, v0.h[1] -; CHECK-CVT-NEXT: fcvtzs x8, s1 -; CHECK-CVT-NEXT: fcvt s1, h2 -; CHECK-CVT-NEXT: fmov d2, x8 -; CHECK-CVT-NEXT: fcvtzs x8, s1 ; CHECK-CVT-NEXT: mov h1, v0.h[2] +; CHECK-CVT-NEXT: mov h2, v0.h[1] +; CHECK-CVT-NEXT: fcvt s3, h0 ; CHECK-CVT-NEXT: mov h0, v0.h[3] ; CHECK-CVT-NEXT: fcvt s1, h1 -; CHECK-CVT-NEXT: mov v2.d[1], x8 -; CHECK-CVT-NEXT: fcvtzs x8, s1 -; CHECK-CVT-NEXT: fcvt s0, h0 -; CHECK-CVT-NEXT: fmov d1, x8 -; CHECK-CVT-NEXT: fcvtzs x8, s0 -; CHECK-CVT-NEXT: mov v1.d[1], x8 -; CHECK-CVT-NEXT: mov v0.16b, v2.16b +; CHECK-CVT-NEXT: fcvt s2, h2 +; CHECK-CVT-NEXT: fcvtzs x8, s3 +; CHECK-CVT-NEXT: fcvt s3, h0 +; CHECK-CVT-NEXT: fcvtzs x9, s1 +; CHECK-CVT-NEXT: fmov d0, x8 +; CHECK-CVT-NEXT: fcvtzs x8, s2 +; CHECK-CVT-NEXT: fmov d1, x9 +; CHECK-CVT-NEXT: fcvtzs x9, s3 +; CHECK-CVT-NEXT: mov v0.d[1], x8 +; CHECK-CVT-NEXT: mov v1.d[1], x9 ; CHECK-CVT-NEXT: ret ; ; CHECK-FP16-LABEL: test_signed_v4f16_v4i64: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-FP16-NEXT: fcvtzs x8, h0 -; CHECK-FP16-NEXT: mov h1, v0.h[1] -; CHECK-FP16-NEXT: mov h2, v0.h[2] +; CHECK-FP16-NEXT: mov h1, v0.h[2] +; CHECK-FP16-NEXT: mov h2, v0.h[1] ; CHECK-FP16-NEXT: mov h3, v0.h[3] +; CHECK-FP16-NEXT: fcvtzs x8, h0 +; CHECK-FP16-NEXT: fcvtzs x9, h1 ; CHECK-FP16-NEXT: fmov d0, x8 -; 
CHECK-FP16-NEXT: fcvtzs x8, h1 -; CHECK-FP16-NEXT: fcvtzs x9, h2 -; CHECK-FP16-NEXT: mov v0.d[1], x8 +; CHECK-FP16-NEXT: fcvtzs x8, h2 ; CHECK-FP16-NEXT: fmov d1, x9 -; CHECK-FP16-NEXT: fcvtzs x8, h3 -; CHECK-FP16-NEXT: mov v1.d[1], x8 +; CHECK-FP16-NEXT: fcvtzs x9, h3 +; CHECK-FP16-NEXT: mov v0.d[1], x8 +; CHECK-FP16-NEXT: mov v1.d[1], x9 ; CHECK-FP16-NEXT: ret %x = call <4 x i64> @llvm.fptosi.sat.v4f16.v4i64(<4 x half> %f) ret <4 x i64> %x @@ -1851,46 +1853,46 @@ ; CHECK-NEXT: .cfi_offset b10, -96 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: fcvt s8, h1 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: fcvt s8, h1 ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov w8, #-251658240 -; CHECK-NEXT: mov w9, #1895825407 -; CHECK-NEXT: fmov s9, w8 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov x25, #-34359738368 -; CHECK-NEXT: fmov s10, w9 -; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: mov x26, #34359738367 +; CHECK-NEXT: fmov s9, w8 +; CHECK-NEXT: mov w8, #1895825407 +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: fmov s10, w8 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: mov h0, v0.h[2] ; CHECK-NEXT: csel x9, x26, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: csel x19, xzr, x8, vs ; CHECK-NEXT: csel x20, xzr, x9, vs +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt -; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: csel x9, x26, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: csel x21, xzr, x8, vs ; CHECK-NEXT: csel x22, xzr, x9, vs +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 @@ -1898,34 +1900,34 @@ ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: csel x23, xzr, x8, vs ; CHECK-NEXT: csel x24, xzr, x9, vs +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x8, x25, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt -; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x26, x8, gt -; CHECK-NEXT: fcmp s8, s8 -; CHECK-NEXT: csel x9, xzr, x9, vs ; CHECK-NEXT: mov x2, x19 ; CHECK-NEXT: mov x3, x20 ; CHECK-NEXT: mov x4, x21 ; CHECK-NEXT: mov x5, x22 ; CHECK-NEXT: mov x6, x23 +; CHECK-NEXT: csel x8, x25, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: mov x7, x24 ; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: csinv x9, x9, xzr, le +; CHECK-NEXT: csel x8, x26, x8, gt +; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded 
Reload -; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: csel x9, xzr, x9, vs ; CHECK-NEXT: csel x1, xzr, x8, vs +; CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov v0.d[1], x1 +; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret @@ -1959,46 +1961,46 @@ ; CHECK-NEXT: .cfi_offset b10, -96 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: fcvt s8, h1 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: fcvt s8, h1 ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov w8, #-16777216 -; CHECK-NEXT: mov w9, #2130706431 -; CHECK-NEXT: fmov s9, w8 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov x25, #-9223372036854775808 -; CHECK-NEXT: fmov s10, w9 -; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: mov x26, #9223372036854775807 +; CHECK-NEXT: fmov s9, w8 +; CHECK-NEXT: mov w8, #2130706431 +; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: fmov s10, w8 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: mov h0, v0.h[2] ; CHECK-NEXT: csel x9, x26, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: csel x19, xzr, x8, vs ; CHECK-NEXT: csel x20, xzr, x9, vs +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt -; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: csel x9, x26, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: csel x21, xzr, x8, vs ; CHECK-NEXT: csel x22, xzr, x9, vs +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 @@ -2006,34 +2008,34 @@ ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: csel x23, xzr, x8, vs ; CHECK-NEXT: csel x24, xzr, x9, vs +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x8, x25, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt -; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x26, x8, gt -; CHECK-NEXT: fcmp s8, s8 -; CHECK-NEXT: csel x9, xzr, x9, vs ; CHECK-NEXT: mov x2, x19 ; CHECK-NEXT: mov x3, x20 ; CHECK-NEXT: mov x4, x21 ; CHECK-NEXT: mov x5, x22 ; CHECK-NEXT: mov x6, x23 +; CHECK-NEXT: csel x8, x25, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: mov x7, x24 ; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload -; 
CHECK-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: csinv x9, x9, xzr, le +; CHECK-NEXT: csel x8, x26, x8, gt +; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: csel x9, xzr, x9, vs ; CHECK-NEXT: csel x1, xzr, x8, vs +; CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov v0.d[1], x1 +; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll --- a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll @@ -48,8 +48,8 @@ define i13 @test_unsigned_i13_f32(float %f) nounwind { ; CHECK-LABEL: test_unsigned_i13_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #63488 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov w8, #63488 ; CHECK-NEXT: movk w8, #17919, lsl #16 ; CHECK-NEXT: fmaxnm s0, s0, s1 ; CHECK-NEXT: fmov s1, w8 @@ -63,8 +63,8 @@ define i16 @test_unsigned_i16_f32(float %f) nounwind { ; CHECK-LABEL: test_unsigned_i16_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65280 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov w8, #65280 ; CHECK-NEXT: movk w8, #18303, lsl #16 ; CHECK-NEXT: fmaxnm s0, s0, s1 ; CHECK-NEXT: fmov s1, w8 @@ -78,8 +78,8 @@ define i19 @test_unsigned_i19_f32(float %f) nounwind { ; CHECK-LABEL: test_unsigned_i19_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65504 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov w8, #65504 ; CHECK-NEXT: movk w8, #18687, lsl #16 ; CHECK-NEXT: fmaxnm s0, s0, s1 ; CHECK-NEXT: fmov s1, w8 @@ -102,13 +102,13 @@ define i50 @test_unsigned_i50_f32(float %f) nounwind { ; CHECK-LABEL: test_unsigned_i50_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #1484783615 -; CHECK-NEXT: fcvtzu x8, s0 +; CHECK-NEXT: mov w8, #1484783615 +; CHECK-NEXT: fcvtzu x9, s0 ; CHECK-NEXT: fcmp s0, #0.0 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: csel x8, xzr, x8, lt -; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: csel x8, xzr, x9, lt ; CHECK-NEXT: mov x9, #1125899906842623 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: csel x0, x9, x8, gt ; CHECK-NEXT: ret %x = call i50 @llvm.fptoui.sat.i50.f32(float %f) @@ -132,15 +132,15 @@ ; CHECK-NEXT: fmov s8, s0 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: mov w8, #1904214015 -; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 +; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: mov x10, #68719476735 ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov x9, #68719476735 -; CHECK-NEXT: csel x10, xzr, x0, lt -; CHECK-NEXT: csel x11, xzr, x1, lt +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s0 -; CHECK-NEXT: csel x1, x9, x11, gt -; CHECK-NEXT: csinv x0, x10, xzr, le +; CHECK-NEXT: csel x1, x10, x9, gt +; CHECK-NEXT: csinv x0, x8, xzr, le ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %x = call i100 @llvm.fptoui.sat.i100.f32(float %f) @@ -155,14 +155,14 @@ ; CHECK-NEXT: fmov s8, s0 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: mov 
w8, #2139095039 -; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 +; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: csel x10, xzr, x0, lt +; CHECK-NEXT: csel x8, xzr, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s0 -; CHECK-NEXT: csinv x0, x10, xzr, le -; CHECK-NEXT: csinv x1, x9, xzr, le +; CHECK-NEXT: csinv x0, x9, xzr, le +; CHECK-NEXT: csinv x1, x8, xzr, le ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %x = call i128 @llvm.fptoui.sat.i128.f32(float %f) @@ -201,8 +201,8 @@ define i8 @test_unsigned_i8_f64(double %f) nounwind { ; CHECK-LABEL: test_unsigned_i8_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #246290604621824 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov x8, #246290604621824 ; CHECK-NEXT: movk x8, #16495, lsl #48 ; CHECK-NEXT: fmaxnm d0, d0, d1 ; CHECK-NEXT: fmov d1, x8 @@ -216,8 +216,8 @@ define i13 @test_unsigned_i13_f64(double %f) nounwind { ; CHECK-LABEL: test_unsigned_i13_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #280375465082880 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov x8, #280375465082880 ; CHECK-NEXT: movk x8, #16575, lsl #48 ; CHECK-NEXT: fmaxnm d0, d0, d1 ; CHECK-NEXT: fmov d1, x8 @@ -231,8 +231,8 @@ define i16 @test_unsigned_i16_f64(double %f) nounwind { ; CHECK-LABEL: test_unsigned_i16_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #281337537757184 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov x8, #281337537757184 ; CHECK-NEXT: movk x8, #16623, lsl #48 ; CHECK-NEXT: fmaxnm d0, d0, d1 ; CHECK-NEXT: fmov d1, x8 @@ -246,8 +246,8 @@ define i19 @test_unsigned_i19_f64(double %f) nounwind { ; CHECK-LABEL: test_unsigned_i19_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #281457796841472 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov x8, #281457796841472 ; CHECK-NEXT: movk x8, #16671, lsl #48 ; CHECK-NEXT: fmaxnm d0, d0, d1 ; CHECK-NEXT: fmov d1, x8 @@ -270,8 +270,8 @@ define i50 @test_unsigned_i50_f64(double %f) nounwind { ; CHECK-LABEL: test_unsigned_i50_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-8 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov x8, #-8 ; CHECK-NEXT: movk x8, #17167, lsl #48 ; CHECK-NEXT: fmaxnm d0, d0, d1 ; CHECK-NEXT: fmov d1, x8 @@ -299,15 +299,15 @@ ; CHECK-NEXT: fmov d8, d0 ; CHECK-NEXT: bl __fixunsdfti ; CHECK-NEXT: mov x8, #5057542381537067007 -; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: fcmp d8, #0.0 +; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: mov x10, #68719476735 ; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov x9, #68719476735 -; CHECK-NEXT: csel x10, xzr, x0, lt -; CHECK-NEXT: csel x11, xzr, x1, lt +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp d8, d0 -; CHECK-NEXT: csel x1, x9, x11, gt -; CHECK-NEXT: csinv x0, x10, xzr, le +; CHECK-NEXT: csel x1, x10, x9, gt +; CHECK-NEXT: csinv x0, x8, xzr, le ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %x = call i100 @llvm.fptoui.sat.i100.f64(double %f) @@ -322,14 +322,14 @@ ; CHECK-NEXT: fmov d8, d0 ; CHECK-NEXT: bl __fixunsdfti ; CHECK-NEXT: mov x8, #5183643171103440895 -; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: fcmp d8, #0.0 +; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: csel x10, xzr, x0, lt +; CHECK-NEXT: csel x8, xzr, 
x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp d8, d0 -; CHECK-NEXT: csinv x0, x10, xzr, le -; CHECK-NEXT: csinv x1, x9, xzr, le +; CHECK-NEXT: csinv x0, x9, xzr, le +; CHECK-NEXT: csinv x1, x8, xzr, le ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %x = call i128 @llvm.fptoui.sat.i128.f64(double %f) @@ -354,8 +354,8 @@ define i1 @test_unsigned_i1_f16(half %f) nounwind { ; CHECK-LABEL: test_unsigned_i1_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: fmaxnm s0, s0, s1 ; CHECK-NEXT: fmov s1, #1.00000000 ; CHECK-NEXT: fminnm s0, s0, s1 @@ -369,8 +369,8 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind { ; CHECK-LABEL: test_unsigned_i8_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: mov w8, #1132396544 ; CHECK-NEXT: fmaxnm s0, s0, s1 ; CHECK-NEXT: fmov s1, w8 @@ -384,9 +384,9 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind { ; CHECK-LABEL: test_unsigned_i13_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #63488 -; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: mov w8, #63488 ; CHECK-NEXT: movk w8, #17919, lsl #16 ; CHECK-NEXT: fmaxnm s0, s0, s1 ; CHECK-NEXT: fmov s1, w8 @@ -400,9 +400,9 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind { ; CHECK-LABEL: test_unsigned_i16_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65280 -; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: mov w8, #65280 ; CHECK-NEXT: movk w8, #18303, lsl #16 ; CHECK-NEXT: fmaxnm s0, s0, s1 ; CHECK-NEXT: fmov s1, w8 @@ -416,9 +416,9 @@ define i19 @test_unsigned_i19_f16(half %f) nounwind { ; CHECK-LABEL: test_unsigned_i19_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #65504 -; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: mov w8, #65504 ; CHECK-NEXT: movk w8, #18687, lsl #16 ; CHECK-NEXT: fmaxnm s0, s0, s1 ; CHECK-NEXT: fmov s1, w8 @@ -449,9 +449,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: mov w8, #1484783615 +; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: fcvtzu x9, s0 ; CHECK-NEXT: fcmp s0, #0.0 -; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: csel x8, xzr, x9, lt ; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov x9, #1125899906842623 @@ -481,19 +481,19 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: mov w8, #1904214015 -; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 +; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: mov x10, #68719476735 ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov x9, #68719476735 -; CHECK-NEXT: csel x10, xzr, x0, lt -; CHECK-NEXT: csel x11, xzr, x1, lt +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s0 -; CHECK-NEXT: csel x1, x9, x11, gt -; CHECK-NEXT: csinv x0, x10, xzr, le +; CHECK-NEXT: csel x1, x10, x9, gt +; CHECK-NEXT: csinv x0, x8, xzr, le ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %x = call i100 @llvm.fptoui.sat.i100.f16(half %f) @@ -505,18 +505,18 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str d8, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: mov w8, #2139095039 -; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 +; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: csel x10, xzr, x0, lt +; CHECK-NEXT: csel x8, xzr, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s0 -; CHECK-NEXT: csinv x0, x10, xzr, le -; CHECK-NEXT: csinv x1, x9, xzr, le +; CHECK-NEXT: csinv x0, x9, xzr, le +; CHECK-NEXT: csinv x1, x8, xzr, le ; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %x = call i128 @llvm.fptoui.sat.i128.f16(half %f) diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -57,18 +57,18 @@ ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 -; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 +; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-NEXT: mov v0.s[1], v1.s[0] +; CHECK-NEXT: fcvtzu v4.4s, v4.4s ; CHECK-NEXT: mov v0.s[2], v2.s[0] +; CHECK-NEXT: fmov w4, s4 ; CHECK-NEXT: mov v0.s[3], v3.s[0] -; CHECK-NEXT: fcvtzu v4.4s, v4.4s ; CHECK-NEXT: fcvtzu v0.4s, v0.4s ; CHECK-NEXT: mov w1, v0.s[1] ; CHECK-NEXT: mov w2, v0.s[2] ; CHECK-NEXT: mov w3, v0.s[3] ; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: fmov w4, s4 ; CHECK-NEXT: ret %x = call <5 x i32> @llvm.fptoui.sat.v5f32.v5i32(<5 x float> %f) ret <5 x i32> %x @@ -79,21 +79,21 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 ; CHECK-NEXT: mov v0.s[1], v1.s[0] -; CHECK-NEXT: mov v0.s[2], v2.s[0] ; CHECK-NEXT: mov v4.s[1], v5.s[0] -; CHECK-NEXT: mov v0.s[3], v3.s[0] +; CHECK-NEXT: mov v0.s[2], v2.s[0] ; CHECK-NEXT: fcvtzu v1.4s, v4.4s -; CHECK-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-NEXT: mov v0.s[3], v3.s[0] ; CHECK-NEXT: mov w5, v1.s[1] +; CHECK-NEXT: fmov w4, s1 +; CHECK-NEXT: fcvtzu v0.4s, v0.4s ; CHECK-NEXT: mov w1, v0.s[1] ; CHECK-NEXT: mov w2, v0.s[2] ; CHECK-NEXT: mov w3, v0.s[3] -; CHECK-NEXT: fmov w4, s1 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %x = call <6 x i32> @llvm.fptoui.sat.v6f32.v6i32(<6 x float> %f) @@ -104,8 +104,8 @@ ; CHECK-LABEL: test_unsigned_v7f32_v7i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 @@ -119,10 +119,10 @@ ; CHECK-NEXT: fcvtzu v0.4s, v0.4s ; CHECK-NEXT: mov w5, v1.s[1] ; CHECK-NEXT: mov w6, v1.s[2] +; CHECK-NEXT: fmov w4, s1 ; CHECK-NEXT: mov w1, v0.s[1] ; CHECK-NEXT: mov w2, v0.s[2] ; CHECK-NEXT: mov w3, v0.s[3] -; CHECK-NEXT: fmov w4, s1 ; CHECK-NEXT: fmov 
w0, s0 ; CHECK-NEXT: ret %x = call <7 x i32> @llvm.fptoui.sat.v7f32.v7i32(<7 x float> %f) @@ -163,8 +163,8 @@ define <2 x i32> @test_unsigned_v2f64_v2i32(<2 x double> %f) { ; CHECK-LABEL: test_unsigned_v2f64_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzu w8, d0 ; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: fcvtzu w8, d0 ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: fcvtzu w8, d1 ; CHECK-NEXT: mov v0.s[1], w8 @@ -178,11 +178,11 @@ ; CHECK-LABEL: test_unsigned_v3f64_v3i32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcvtzu w8, d0 -; CHECK-NEXT: fcvtzu w9, d1 ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fcvtzu w10, d2 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: mov v0.s[2], w10 +; CHECK-NEXT: fcvtzu w8, d1 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: fcvtzu w8, d2 +; CHECK-NEXT: mov v0.s[2], w8 ; CHECK-NEXT: fcvtzu w8, d0 ; CHECK-NEXT: mov v0.s[3], w8 ; CHECK-NEXT: ret @@ -193,14 +193,14 @@ define <4 x i32> @test_unsigned_v4f64_v4i32(<4 x double> %f) { ; CHECK-LABEL: test_unsigned_v4f64_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcvtzu w8, d0 ; CHECK-NEXT: mov d2, v0.d[1] +; CHECK-NEXT: fcvtzu w8, d0 ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: fcvtzu w8, d2 -; CHECK-NEXT: fcvtzu w9, d1 -; CHECK-NEXT: mov d1, v1.d[1] ; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: mov v0.s[2], w9 +; CHECK-NEXT: fcvtzu w8, d1 +; CHECK-NEXT: mov d1, v1.d[1] +; CHECK-NEXT: mov v0.s[2], w8 ; CHECK-NEXT: fcvtzu w8, d1 ; CHECK-NEXT: mov v0.s[3], w8 ; CHECK-NEXT: ret @@ -253,17 +253,17 @@ ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] ; CHECK-NEXT: bl __getf2 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov w19, w0 ; CHECK-NEXT: bl __fixunstfsi ; CHECK-NEXT: adrp x8, .LCPI14_1 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_1] ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: cmp w19, #0 ; CHECK-NEXT: csel w19, wzr, w0, lt +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_1] ; CHECK-NEXT: bl __gttf2 ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: csinv w8, w19, wzr, le @@ -285,8 +285,8 @@ ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: adrp x8, .LCPI15_0 ; CHECK-NEXT: mov v2.16b, v1.16b +; CHECK-NEXT: adrp x8, .LCPI15_0 ; CHECK-NEXT: stp q1, q0, [sp, #32] // 32-byte Folded Spill ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] ; CHECK-NEXT: mov v0.16b, v2.16b @@ -296,31 +296,31 @@ ; CHECK-NEXT: mov w19, w0 ; CHECK-NEXT: bl __fixunstfsi ; CHECK-NEXT: adrp x8, .LCPI15_1 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1] ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: cmp w19, #0 ; CHECK-NEXT: csel w19, wzr, w0, lt +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1] ; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-NEXT: bl __gttf2 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: csinv w20, w19, wzr, le ; CHECK-NEXT: bl __getf2 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: mov w19, w0 ; CHECK-NEXT: bl __fixunstfsi ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: cmp w19, #0 +; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: csel w19, wzr, w0, lt ; CHECK-NEXT: bl __gttf2 ; CHECK-NEXT: 
cmp w0, #0
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: csinv w8, w19, wzr, le
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov v0.s[1], w20
; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
@@ -338,8 +338,8 @@
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w30, -32
-; CHECK-NEXT: adrp x8, .LCPI16_0
; CHECK-NEXT: stp q0, q2, [sp, #48] // 32-byte Folded Spill
+; CHECK-NEXT: adrp x8, .LCPI16_0
; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
@@ -350,10 +350,10 @@
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixunstfsi
; CHECK-NEXT: adrp x8, .LCPI16_1
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_1]
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
; CHECK-NEXT: csel w19, wzr, w0, lt
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_1]
; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload
@@ -364,31 +364,31 @@
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixunstfsi
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csel w19, wzr, w0, lt
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: csinv w8, w19, wzr, le
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov v0.s[1], w20
; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixunstfsi
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
+; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csel w19, wzr, w0, lt
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: csinv w8, w19, wzr, le
-; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: mov v0.s[2], w8
; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
@@ -406,45 +406,45 @@
; CHECK-NEXT: .cfi_offset w19, -8
; CHECK-NEXT: .cfi_offset w20, -16
; CHECK-NEXT: .cfi_offset w30, -32
-; CHECK-NEXT: adrp x8, .LCPI17_0
; CHECK-NEXT: stp q0, q2, [sp, #16] // 32-byte Folded Spill
+; CHECK-NEXT: adrp x8, .LCPI17_0
; CHECK-NEXT: mov v2.16b, v1.16b
; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str q3, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: str q3, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: str q1, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixunstfsi
; CHECK-NEXT: adrp x8, .LCPI17_1
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_1]
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
; CHECK-NEXT: csel w19, wzr, w0, lt
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_1]
; CHECK-NEXT: str q1, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: csinv w20, w19, wzr, le
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixunstfsi
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
+; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: csel w19, wzr, w0, lt
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: csinv w8, w19, wzr, le
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov v0.s[1], w20
; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: mov w19, w0
@@ -453,8 +453,8 @@
; CHECK-NEXT: cmp w19, #0
; CHECK-NEXT: csel w19, wzr, w0, lt
; CHECK-NEXT: bl __gttf2
-; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csinv w8, w19, wzr, le
; CHECK-NEXT: mov v0.s[2], w8
; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
@@ -464,15 +464,15 @@
; CHECK-NEXT: mov w19, w0
; CHECK-NEXT: bl __fixunstfsi
; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: cmp w19, #0
+; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: csel w19, wzr, w0, lt
; CHECK-NEXT: bl __gttf2
; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csinv w8, w19, wzr, le
-; CHECK-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload
; CHECK-NEXT: mov v0.s[3], w8
; CHECK-NEXT: add sp, sp, #128
; CHECK-NEXT: ret
@@ -521,8 +521,8 @@
; CHECK-FP16-LABEL: test_unsigned_v2f16_v2i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-NEXT: fcvtzu w8, h0
; CHECK-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-FP16-NEXT: fcvtzu w8, h0
; CHECK-FP16-NEXT: fmov s0, w8
; CHECK-FP16-NEXT: fcvtzu w8, h1
; CHECK-FP16-NEXT: mov v0.s[1], w8
@@ -542,17 +542,18 @@
; CHECK-FP16-LABEL: test_unsigned_v3f16_v3i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP16-NEXT: mov h2, v0.h[1]
; CHECK-FP16-NEXT: fcvtzu w8, h0
-; CHECK-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-FP16-NEXT: fmov s1, w8
+; CHECK-FP16-NEXT: fcvtzu w8, h2
; CHECK-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-NEXT: fmov s0, w8
-; CHECK-FP16-NEXT: fcvtzu w8, h1
-; CHECK-FP16-NEXT: fcvtzu w9, h2
-; CHECK-FP16-NEXT: mov v0.s[1], w8
-; CHECK-FP16-NEXT: mov v0.s[2], w9
-; CHECK-FP16-NEXT: fcvtzu w8, h3
-; CHECK-FP16-NEXT: mov v0.s[3], w8
+; CHECK-FP16-NEXT: mov h0, v0.h[3]
+; CHECK-FP16-NEXT: mov v1.s[1], w8
+; CHECK-FP16-NEXT: fcvtzu w8, h2
+; CHECK-FP16-NEXT: mov v1.s[2], w8
+; CHECK-FP16-NEXT: fcvtzu w8, h0
+; CHECK-FP16-NEXT: mov v1.s[3], w8
+; CHECK-FP16-NEXT: mov v0.16b, v1.16b
; CHECK-FP16-NEXT: ret
%x = call <3 x i32> @llvm.fptoui.sat.v3f16.v3i32(<3 x half> %f)
ret <3 x i32> %x
@@ -568,17 +569,18 @@
; CHECK-FP16-LABEL: test_unsigned_v4f16_v4i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP16-NEXT: mov h2, v0.h[1]
; CHECK-FP16-NEXT: fcvtzu w8, h0
-; CHECK-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-FP16-NEXT: fmov s1, w8
+; CHECK-FP16-NEXT: fcvtzu w8, h2
; CHECK-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-NEXT: fmov s0, w8
-; CHECK-FP16-NEXT: fcvtzu w8, h1
-; CHECK-FP16-NEXT: fcvtzu w9, h2
-; CHECK-FP16-NEXT: mov v0.s[1], w8
-; CHECK-FP16-NEXT: mov v0.s[2], w9
-; CHECK-FP16-NEXT: fcvtzu w8, h3
-; CHECK-FP16-NEXT: mov v0.s[3], w8
+; CHECK-FP16-NEXT: mov h0, v0.h[3]
+; CHECK-FP16-NEXT: mov v1.s[1], w8
+; CHECK-FP16-NEXT: fcvtzu w8, h2
+; CHECK-FP16-NEXT: mov v1.s[2], w8
+; CHECK-FP16-NEXT: fcvtzu w8, h0
+; CHECK-FP16-NEXT: mov v1.s[3], w8
+; CHECK-FP16-NEXT: mov v0.16b, v1.16b
; CHECK-FP16-NEXT: ret
%x = call <4 x i32> @llvm.fptoui.sat.v4f16.v4i32(<4 x half> %f)
ret <4 x i32> %x
@@ -594,21 +596,21 @@
; CHECK-CVT-NEXT: mov w1, v1.s[1]
; CHECK-CVT-NEXT: mov w2, v1.s[2]
; CHECK-CVT-NEXT: mov w3, v1.s[3]
-; CHECK-CVT-NEXT: fmov w4, s0
; CHECK-CVT-NEXT: fmov w0, s1
+; CHECK-CVT-NEXT: fmov w4, s0
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_unsigned_v5f16_v5i32:
; CHECK-FP16: // %bb.0:
-; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-FP16-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-FP16-NEXT: mov h4, v0.h[3]
; CHECK-FP16-NEXT: fcvtzu w0, h0
-; CHECK-FP16-NEXT: mov h2, v0.h[1]
-; CHECK-FP16-NEXT: mov h3, v0.h[2]
-; CHECK-FP16-NEXT: mov h0, v0.h[3]
-; CHECK-FP16-NEXT: fcvtzu w4, h1
-; CHECK-FP16-NEXT: fcvtzu w1, h2
-; CHECK-FP16-NEXT: fcvtzu w2, h3
-; CHECK-FP16-NEXT: fcvtzu w3, h0
+; CHECK-FP16-NEXT: fcvtzu w1, h1
+; CHECK-FP16-NEXT: fcvtzu w2, h2
+; CHECK-FP16-NEXT: fcvtzu w4, h3
+; CHECK-FP16-NEXT: fcvtzu w3, h4
; CHECK-FP16-NEXT: ret
%x = call <5 x i32> @llvm.fptoui.sat.v5f16.v5i32(<5 x half> %f)
ret <5 x i32> %x
@@ -621,30 +623,30 @@
; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s
; CHECK-CVT-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-CVT-NEXT: mov w5, v0.s[1]
; CHECK-CVT-NEXT: mov w1, v1.s[1]
; CHECK-CVT-NEXT: mov w2, v1.s[2]
; CHECK-CVT-NEXT: mov w3, v1.s[3]
-; CHECK-CVT-NEXT: fmov w4, s0
+; CHECK-CVT-NEXT: mov w5, v0.s[1]
; CHECK-CVT-NEXT: fmov w0, s1
+; CHECK-CVT-NEXT: fmov w4, s0
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_unsigned_v6f16_v6i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-FP16-NEXT: mov h3, v0.h[2]
+; CHECK-FP16-NEXT: mov h4, v0.h[3]
+; CHECK-FP16-NEXT: fcvtzu w0, h0
+; CHECK-FP16-NEXT: mov h2, v1.h[1]
+; CHECK-FP16-NEXT: fcvtzu w8, h1
+; CHECK-FP16-NEXT: fcvtzu w2, h3
+; CHECK-FP16-NEXT: fcvtzu w3, h4
+; CHECK-FP16-NEXT: fmov s1, w8
+; CHECK-FP16-NEXT: fcvtzu w5, h2
; CHECK-FP16-NEXT: mov h2, v0.h[1]
+; CHECK-FP16-NEXT: mov v1.s[1], w5
; CHECK-FP16-NEXT: fcvtzu w1, h2
-; CHECK-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-NEXT: fcvtzu w8, h1
-; CHECK-FP16-NEXT: mov h1, v1.h[1]
-; CHECK-FP16-NEXT: fcvtzu w2, h2
-; CHECK-FP16-NEXT: fmov s2, w8
-; CHECK-FP16-NEXT: fcvtzu w5, h1
-; CHECK-FP16-NEXT: fcvtzu w0, h0
-; CHECK-FP16-NEXT: mov h0, v0.h[3]
-; CHECK-FP16-NEXT: mov v2.s[1], w5
-; CHECK-FP16-NEXT: fcvtzu w3, h0
-; CHECK-FP16-NEXT: fmov w4, s2
+; CHECK-FP16-NEXT: fmov w4, s1
; CHECK-FP16-NEXT: ret
%x = call <6 x i32> @llvm.fptoui.sat.v6f16.v6i32(<6 x half> %f)
ret <6 x i32> %x
@@ -657,33 +659,33 @@
; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s
; CHECK-CVT-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-CVT-NEXT: mov w5, v0.s[1]
-; CHECK-CVT-NEXT: mov w6, v0.s[2]
; CHECK-CVT-NEXT: mov w1, v1.s[1]
; CHECK-CVT-NEXT: mov w2, v1.s[2]
; CHECK-CVT-NEXT: mov w3, v1.s[3]
-; CHECK-CVT-NEXT: fmov w4, s0
+; CHECK-CVT-NEXT: mov w5, v0.s[1]
+; CHECK-CVT-NEXT: mov w6, v0.s[2]
; CHECK-CVT-NEXT: fmov w0, s1
+; CHECK-CVT-NEXT: fmov w4, s0
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_unsigned_v7f16_v7i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-FP16-NEXT: mov h2, v0.h[1]
-; CHECK-FP16-NEXT: mov h3, v0.h[2]
-; CHECK-FP16-NEXT: fcvtzu w8, h1
-; CHECK-FP16-NEXT: fcvtzu w1, h2
+; CHECK-FP16-NEXT: mov h4, v0.h[3]
+; CHECK-FP16-NEXT: fcvtzu w0, h0
; CHECK-FP16-NEXT: mov h2, v1.h[1]
+; CHECK-FP16-NEXT: fcvtzu w8, h1
; CHECK-FP16-NEXT: mov h1, v1.h[2]
-; CHECK-FP16-NEXT: fcvtzu w2, h3
+; CHECK-FP16-NEXT: fcvtzu w3, h4
; CHECK-FP16-NEXT: fmov s3, w8
; CHECK-FP16-NEXT: fcvtzu w8, h2
+; CHECK-FP16-NEXT: mov h2, v0.h[2]
; CHECK-FP16-NEXT: fcvtzu w6, h1
+; CHECK-FP16-NEXT: mov h1, v0.h[1]
; CHECK-FP16-NEXT: mov v3.s[1], w8
-; CHECK-FP16-NEXT: fcvtzu w0, h0
-; CHECK-FP16-NEXT: mov h0, v0.h[3]
+; CHECK-FP16-NEXT: fcvtzu w2, h2
+; CHECK-FP16-NEXT: fcvtzu w1, h1
; CHECK-FP16-NEXT: mov v3.s[2], w6
-; CHECK-FP16-NEXT: fcvtzu w3, h0
; CHECK-FP16-NEXT: mov w5, v3.s[1]
; CHECK-FP16-NEXT: fmov w4, s3
; CHECK-FP16-NEXT: ret
@@ -703,28 +705,28 @@
; CHECK-FP16-LABEL: test_unsigned_v8f16_v8i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-NEXT: fcvtzu w9, h3
-; CHECK-FP16-NEXT: fcvtzu w10, h1
-; CHECK-FP16-NEXT: mov h1, v3.h[1]
-; CHECK-FP16-NEXT: fcvtzu w8, h0
-; CHECK-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-NEXT: mov h4, v3.h[2]
-; CHECK-FP16-NEXT: fcvtzu w12, h1
-; CHECK-FP16-NEXT: fmov s1, w9
-; CHECK-FP16-NEXT: fcvtzu w11, h2
-; CHECK-FP16-NEXT: fmov s2, w8
-; CHECK-FP16-NEXT: fcvtzu w8, h4
-; CHECK-FP16-NEXT: mov v1.s[1], w12
+; CHECK-FP16-NEXT: mov h4, v0.h[1]
+; CHECK-FP16-NEXT: fcvtzu w9, h0
+; CHECK-FP16-NEXT: mov h2, v3.h[1]
+; CHECK-FP16-NEXT: fcvtzu w8, h3
+; CHECK-FP16-NEXT: mov h5, v3.h[2]
; CHECK-FP16-NEXT: mov h3, v3.h[3]
-; CHECK-FP16-NEXT: mov v2.s[1], w10
+; CHECK-FP16-NEXT: fmov s1, w8
+; CHECK-FP16-NEXT: fcvtzu w8, h2
+; CHECK-FP16-NEXT: fmov s2, w9
+; CHECK-FP16-NEXT: fcvtzu w9, h4
+; CHECK-FP16-NEXT: mov h4, v0.h[2]
; CHECK-FP16-NEXT: mov h0, v0.h[3]
+; CHECK-FP16-NEXT: mov v1.s[1], w8
+; CHECK-FP16-NEXT: fcvtzu w8, h5
+; CHECK-FP16-NEXT: mov v2.s[1], w9
+; CHECK-FP16-NEXT: fcvtzu w9, h4
; CHECK-FP16-NEXT: mov v1.s[2], w8
; CHECK-FP16-NEXT: fcvtzu w8, h3
-; CHECK-FP16-NEXT: mov v2.s[2], w11
+; CHECK-FP16-NEXT: mov v2.s[2], w9
+; CHECK-FP16-NEXT: fcvtzu w9, h0
; CHECK-FP16-NEXT: mov v1.s[3], w8
-; CHECK-FP16-NEXT: fcvtzu w8, h0
-; CHECK-FP16-NEXT: mov v2.s[3], w8
+; CHECK-FP16-NEXT: mov v2.s[3], w9
; CHECK-FP16-NEXT: mov v0.16b, v2.16b
; CHECK-FP16-NEXT: ret
%x = call <8 x i32> @llvm.fptoui.sat.v8f16.v8i32(<8 x half> %f)
@@ -748,15 +750,15 @@
define <2 x i1> @test_unsigned_v2f32_v2i1(<2 x float> %f) {
; CHECK-LABEL: test_unsigned_v2f32_v2i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: movi d1, #0000000000000000
-; CHECK-NEXT: fmov s2, #1.00000000
-; CHECK-NEXT: mov s3, v0.s[1]
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov s2, v0.s[1]
+; CHECK-NEXT: fmov s3, #1.00000000
; CHECK-NEXT: fmaxnm s0, s0, s1
-; CHECK-NEXT: fmaxnm s1, s3, s1
-; CHECK-NEXT: fminnm s0, s0, s2
+; CHECK-NEXT: fmaxnm s1, s2, s1
+; CHECK-NEXT: fminnm s0, s0, s3
+; CHECK-NEXT: fminnm s1, s1, s3
; CHECK-NEXT: fcvtzu w8, s0
-; CHECK-NEXT: fminnm s1, s1, s2
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzu w8, s1
; CHECK-NEXT: mov v0.s[1], w8
@@ -769,16 +771,16 @@
define <2 x i8> @test_unsigned_v2f32_v2i8(<2 x float> %f) {
; CHECK-LABEL: test_unsigned_v2f32_v2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: movi d1, #0000000000000000
; CHECK-NEXT: mov w8, #1132396544
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s2, v0.s[1]
-; CHECK-NEXT: fmaxnm s0, s0, s1
; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmaxnm s0, s0, s1
; CHECK-NEXT: fmaxnm s1, s2, s1
; CHECK-NEXT: fminnm s0, s0, s3
-; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: fminnm s1, s1, s3
+; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzu w8, s1
; CHECK-NEXT: mov v0.s[1], w8
@@ -791,17 +793,17 @@
define <2 x i13> @test_unsigned_v2f32_v2i13(<2 x float> %f) {
; CHECK-LABEL: test_unsigned_v2f32_v2i13:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #63488
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: mov w8, #63488
; CHECK-NEXT: movk w8, #17919, lsl #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s2, v0.s[1]
; CHECK-NEXT: fmaxnm s0, s0, s1
; CHECK-NEXT: fmov s3, w8
; CHECK-NEXT: fmaxnm s1, s2, s1
; CHECK-NEXT: fminnm s0, s0, s3
-; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: fminnm s1, s1, s3
+; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzu w8, s1
; CHECK-NEXT: mov v0.s[1], w8
@@ -814,17 +816,17 @@
define <2 x i16> @test_unsigned_v2f32_v2i16(<2 x float> %f) {
; CHECK-LABEL: test_unsigned_v2f32_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #65280
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: mov w8, #65280
; CHECK-NEXT: movk w8, #18303, lsl #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s2, v0.s[1]
; CHECK-NEXT: fmaxnm s0, s0, s1
; CHECK-NEXT: fmov s3, w8
; CHECK-NEXT: fmaxnm s1, s2, s1
; CHECK-NEXT: fminnm s0, s0, s3
-; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: fminnm s1, s1, s3
+; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzu w8, s1
; CHECK-NEXT: mov v0.s[1], w8
@@ -837,17 +839,17 @@
define <2 x i19> @test_unsigned_v2f32_v2i19(<2 x float> %f) {
; CHECK-LABEL: test_unsigned_v2f32_v2i19:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #65504
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: mov w8, #65504
; CHECK-NEXT: movk w8, #18687, lsl #16
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s2, v0.s[1]
; CHECK-NEXT: fmaxnm s0, s0, s1
; CHECK-NEXT: fmov s3, w8
; CHECK-NEXT: fmaxnm s1, s2, s1
; CHECK-NEXT: fminnm s0, s0, s3
-; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: fminnm s1, s1, s3
+; CHECK-NEXT: fcvtzu w8, s0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzu w8, s1
; CHECK-NEXT: mov v0.s[1], w8
@@ -872,13 +874,13 @@
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov s1, v0.s[1]
; CHECK-NEXT: mov w8, #1484783615
+; CHECK-NEXT: mov x9, #1125899906842623
+; CHECK-NEXT: fcvtzu x10, s0
; CHECK-NEXT: fmov s2, w8
; CHECK-NEXT: fcvtzu x8, s1
; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: mov x9, #1125899906842623
; CHECK-NEXT: csel x8, xzr, x8, lt
; CHECK-NEXT: fcmp s1, s2
-; CHECK-NEXT: fcvtzu x10, s0
; CHECK-NEXT: csel x8, x9, x8, gt
; CHECK-NEXT: fcmp s0, #0.0
; CHECK-NEXT: csel x10, xzr, x10, lt
@@ -895,8 +897,8 @@
; CHECK-LABEL: test_unsigned_v2f32_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fcvtzu x8, s0
; CHECK-NEXT: mov s1, v0.s[1]
+; CHECK-NEXT: fcvtzu x8, s0
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: fcvtzu x8, s1
; CHECK-NEXT: mov v0.d[1], x8
@@ -926,15 +928,15 @@
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: mov w8, #1904214015
; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: fmov s9, w8
; CHECK-NEXT: mov x21, #68719476735
-; CHECK-NEXT: csel x9, xzr, x0, lt
-; CHECK-NEXT: csel x10, xzr, x1, lt
-; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: csel x19, x21, x10, gt
-; CHECK-NEXT: csinv x20, x9, xzr, le
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: fmov s9, w8
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, xzr, x1, lt
+; CHECK-NEXT: fcmp s8, s9
+; CHECK-NEXT: csel x19, x21, x9, gt
+; CHECK-NEXT: csinv x20, x8, xzr, le
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: mov x2, x20
@@ -944,10 +946,10 @@
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s0, s9
+; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: csel x1, x21, x9, gt
; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: mov v0.d[1], x1
; CHECK-NEXT: fmov x0, d0
@@ -977,26 +979,26 @@
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: mov w8, #2139095039
; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: fmov s9, w8
-; CHECK-NEXT: csel x9, xzr, x1, lt
-; CHECK-NEXT: csel x10, xzr, x0, lt
-; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: csinv x19, x10, xzr, le
-; CHECK-NEXT: csinv x20, x9, xzr, le
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: fmov s9, w8
+; CHECK-NEXT: csel x8, xzr, x1, lt
+; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: fcmp s8, s9
+; CHECK-NEXT: csinv x19, x9, xzr, le
+; CHECK-NEXT: csinv x20, x8, xzr, le
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: mov x2, x19
; CHECK-NEXT: mov x3, x20
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s0, #0.0
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s0, s9
-; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: csinv x1, x9, xzr, le
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: mov v0.d[1], x1
@@ -1025,13 +1027,13 @@
; CHECK-LABEL: test_unsigned_v2f64_v2i1:
; CHECK: // %bb.0:
; CHECK-NEXT: movi d1, #0000000000000000
-; CHECK-NEXT: fmov d2, #1.00000000
-; CHECK-NEXT: mov d3, v0.d[1]
+; CHECK-NEXT: mov d2, v0.d[1]
+; CHECK-NEXT: fmov d3, #1.00000000
; CHECK-NEXT: fmaxnm d0, d0, d1
-; CHECK-NEXT: fmaxnm d1, d3, d1
-; CHECK-NEXT: fminnm d0, d0, d2
+; CHECK-NEXT: fmaxnm d1, d2, d1
+; CHECK-NEXT: fminnm d0, d0, d3
+; CHECK-NEXT: fminnm d1, d1, d3
; CHECK-NEXT: fcvtzu w8, d0
-; CHECK-NEXT: fminnm d1, d1, d2
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzu w8, d1
; CHECK-NEXT: mov v0.s[1], w8
@@ -1044,16 +1046,16 @@
define <2 x i8> @test_unsigned_v2f64_v2i8(<2 x double> %f) {
; CHECK-LABEL: test_unsigned_v2f64_v2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #246290604621824
; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: mov x8, #246290604621824
; CHECK-NEXT: movk x8, #16495, lsl #48
; CHECK-NEXT: mov d2, v0.d[1]
; CHECK-NEXT: fmaxnm d0, d0, d1
; CHECK-NEXT: fmov d3, x8
; CHECK-NEXT: fmaxnm d1, d2, d1
; CHECK-NEXT: fminnm d0, d0, d3
-; CHECK-NEXT: fcvtzu w8, d0
; CHECK-NEXT: fminnm d1, d1, d3
+; CHECK-NEXT: fcvtzu w8, d0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzu w8, d1
; CHECK-NEXT: mov v0.s[1], w8
@@ -1066,16 +1068,16 @@
define <2 x i13> @test_unsigned_v2f64_v2i13(<2 x double> %f) {
; CHECK-LABEL: test_unsigned_v2f64_v2i13:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #280375465082880
; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: mov x8, #280375465082880
; CHECK-NEXT: movk x8, #16575, lsl #48
; CHECK-NEXT: mov d2, v0.d[1]
; CHECK-NEXT: fmaxnm d0, d0, d1
; CHECK-NEXT: fmov d3, x8
; CHECK-NEXT: fmaxnm d1, d2, d1
; CHECK-NEXT: fminnm d0, d0, d3
-; CHECK-NEXT: fcvtzu w8, d0
; CHECK-NEXT: fminnm d1, d1, d3
+; CHECK-NEXT: fcvtzu w8, d0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzu w8, d1
; CHECK-NEXT: mov v0.s[1], w8
@@ -1088,16 +1090,16 @@
define <2 x i16> @test_unsigned_v2f64_v2i16(<2 x double> %f) {
; CHECK-LABEL: test_unsigned_v2f64_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #281337537757184
; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: mov x8, #281337537757184
; CHECK-NEXT: movk x8, #16623, lsl #48
; CHECK-NEXT: mov d2, v0.d[1]
; CHECK-NEXT: fmaxnm d0, d0, d1
; CHECK-NEXT: fmov d3, x8
; CHECK-NEXT: fmaxnm d1, d2, d1
; CHECK-NEXT: fminnm d0, d0, d3
-; CHECK-NEXT: fcvtzu w8, d0
; CHECK-NEXT: fminnm d1, d1, d3
+; CHECK-NEXT: fcvtzu w8, d0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzu w8, d1
; CHECK-NEXT: mov v0.s[1], w8
@@ -1110,16 +1112,16 @@
define <2 x i19> @test_unsigned_v2f64_v2i19(<2 x double> %f) {
; CHECK-LABEL: test_unsigned_v2f64_v2i19:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #281457796841472
; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: mov x8, #281457796841472
; CHECK-NEXT: movk x8, #16671, lsl #48
; CHECK-NEXT: mov d2, v0.d[1]
; CHECK-NEXT: fmaxnm d0, d0, d1
; CHECK-NEXT: fmov d3, x8
; CHECK-NEXT: fmaxnm d1, d2, d1
; CHECK-NEXT: fminnm d0, d0, d3
-; CHECK-NEXT: fcvtzu w8, d0
; CHECK-NEXT: fminnm d1, d1, d3
+; CHECK-NEXT: fcvtzu w8, d0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzu w8, d1
; CHECK-NEXT: mov v0.s[1], w8
@@ -1132,8 +1134,8 @@
define <2 x i32> @test_unsigned_v2f64_v2i32_duplicate(<2 x double> %f) {
; CHECK-LABEL: test_unsigned_v2f64_v2i32_duplicate:
; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzu w8, d0
; CHECK-NEXT: mov d1, v0.d[1]
+; CHECK-NEXT: fcvtzu w8, d0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: fcvtzu w8, d1
; CHECK-NEXT: mov v0.s[1], w8
@@ -1146,16 +1148,16 @@
define <2 x i50> @test_unsigned_v2f64_v2i50(<2 x double> %f) {
; CHECK-LABEL: test_unsigned_v2f64_v2i50:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-8
; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: mov x8, #-8
; CHECK-NEXT: movk x8, #17167, lsl #48
; CHECK-NEXT: mov d2, v0.d[1]
; CHECK-NEXT: fmaxnm d0, d0, d1
; CHECK-NEXT: fmov d3, x8
; CHECK-NEXT: fmaxnm d1, d2, d1
; CHECK-NEXT: fminnm d0, d0, d3
-; CHECK-NEXT: fcvtzu x8, d0
; CHECK-NEXT: fminnm d1, d1, d3
+; CHECK-NEXT: fcvtzu x8, d0
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: fcvtzu x8, d1
; CHECK-NEXT: mov v0.d[1], x8
@@ -1193,15 +1195,15 @@
; CHECK-NEXT: bl __fixunsdfti
; CHECK-NEXT: mov x8, #5057542381537067007
; CHECK-NEXT: fcmp d8, #0.0
-; CHECK-NEXT: fmov d9, x8
; CHECK-NEXT: mov x21, #68719476735
-; CHECK-NEXT: csel x9, xzr, x0, lt
-; CHECK-NEXT: csel x10, xzr, x1, lt
-; CHECK-NEXT: fcmp d8, d9
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: csel x19, x21, x10, gt
-; CHECK-NEXT: csinv x20, x9, xzr, le
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: fmov d9, x8
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, xzr, x1, lt
+; CHECK-NEXT: fcmp d8, d9
+; CHECK-NEXT: csel x19, x21, x9, gt
+; CHECK-NEXT: csinv x20, x8, xzr, le
; CHECK-NEXT: bl __fixunsdfti
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: mov x2, x20
@@ -1211,10 +1213,10 @@
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp d0, d9
+; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: csel x1, x21, x9, gt
; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: mov v0.d[1], x1
; CHECK-NEXT: fmov x0, d0
@@ -1243,26 +1245,26 @@
; CHECK-NEXT: bl __fixunsdfti
; CHECK-NEXT: mov x8, #5183643171103440895
; CHECK-NEXT: fcmp d8, #0.0
-; CHECK-NEXT: fmov d9, x8
-; CHECK-NEXT: csel x9, xzr, x1, lt
-; CHECK-NEXT: csel x10, xzr, x0, lt
-; CHECK-NEXT: fcmp d8, d9
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: csinv x19, x10, xzr, le
-; CHECK-NEXT: csinv x20, x9, xzr, le
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: fmov d9, x8
+; CHECK-NEXT: csel x8, xzr, x1, lt
+; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: fcmp d8, d9
+; CHECK-NEXT: csinv x19, x9, xzr, le
+; CHECK-NEXT: csinv x20, x8, xzr, le
; CHECK-NEXT: bl __fixunsdfti
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: mov x2, x19
; CHECK-NEXT: mov x3, x20
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: fcmp d0, #0.0
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp d0, d9
-; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: csinv x1, x9, xzr, le
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: mov v0.d[1], x1
@@ -1291,30 +1293,30 @@
; CHECK-LABEL: test_unsigned_v4f16_v4i1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: fcvt s2, h0
; CHECK-NEXT: mov h3, v0.h[1]
-; CHECK-NEXT: mov h4, v0.h[2]
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: fmov s4, #1.00000000
; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: movi d2, #0000000000000000
+; CHECK-NEXT: fmaxnm s2, s2, s1
; CHECK-NEXT: fcvt s3, h3
-; CHECK-NEXT: fcvt s4, h4
-; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fmaxnm s1, s1, s2
-; CHECK-NEXT: fmaxnm s3, s3, s2
-; CHECK-NEXT: fmaxnm s4, s4, s2
-; CHECK-NEXT: fmaxnm s0, s0, s2
-; CHECK-NEXT: fmov s2, #1.00000000
-; CHECK-NEXT: fminnm s1, s1, s2
-; CHECK-NEXT: fcvtzu w8, s1
-; CHECK-NEXT: fminnm s1, s3, s2
-; CHECK-NEXT: fminnm s3, s4, s2
-; CHECK-NEXT: fminnm s2, s0, s2
+; CHECK-NEXT: fcvt s5, h5
+; CHECK-NEXT: fminnm s2, s2, s4
+; CHECK-NEXT: fmaxnm s3, s3, s1
+; CHECK-NEXT: fmaxnm s5, s5, s1
+; CHECK-NEXT: fcvtzu w8, s2
+; CHECK-NEXT: fminnm s2, s3, s4
+; CHECK-NEXT: fcvt s3, h0
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fcvtzu w8, s1
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: fcvtzu w8, s3
-; CHECK-NEXT: mov v0.h[2], w8
+; CHECK-NEXT: fcvtzu w9, s2
+; CHECK-NEXT: fmaxnm s1, s3, s1
+; CHECK-NEXT: fminnm s2, s5, s4
+; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: fminnm s1, s1, s4
; CHECK-NEXT: fcvtzu w8, s2
+; CHECK-NEXT: mov v0.h[2], w8
+; CHECK-NEXT: fcvtzu w8, s1
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -1326,31 +1328,31 @@
; CHECK-LABEL: test_unsigned_v4f16_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: fcvt s2, h0
; CHECK-NEXT: mov h3, v0.h[1]
-; CHECK-NEXT: mov h4, v0.h[2]
-; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: movi d2, #0000000000000000
; CHECK-NEXT: mov w8, #1132396544
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h0, v0.h[3]
+; CHECK-NEXT: fmaxnm s2, s2, s1
+; CHECK-NEXT: fmov s4, w8
; CHECK-NEXT: fcvt s3, h3
-; CHECK-NEXT: fcvt s4, h4
-; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fmaxnm s1, s1, s2
-; CHECK-NEXT: fmaxnm s3, s3, s2
-; CHECK-NEXT: fmaxnm s4, s4, s2
-; CHECK-NEXT: fmaxnm s0, s0, s2
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: fminnm s1, s1, s2
-; CHECK-NEXT: fcvtzu w8, s1
-; CHECK-NEXT: fminnm s1, s3, s2
-; CHECK-NEXT: fminnm s3, s4, s2
-; CHECK-NEXT: fminnm s2, s0, s2
+; CHECK-NEXT: fcvt s5, h5
+; CHECK-NEXT: fminnm s2, s2, s4
+; CHECK-NEXT: fmaxnm s3, s3, s1
+; CHECK-NEXT: fmaxnm s5, s5, s1
+; CHECK-NEXT: fcvtzu w8, s2
+; CHECK-NEXT: fminnm s2, s3, s4
+; CHECK-NEXT: fcvt s3, h0
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fcvtzu w8, s1
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: fcvtzu w8, s3
-; CHECK-NEXT: mov v0.h[2], w8
+; CHECK-NEXT: fcvtzu w9, s2
+; CHECK-NEXT: fmaxnm s1, s3, s1
+; CHECK-NEXT: fminnm s2, s5, s4
+; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: fminnm s1, s1, s4
; CHECK-NEXT: fcvtzu w8, s2
+; CHECK-NEXT: mov v0.h[2], w8
+; CHECK-NEXT: fcvtzu w8, s1
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -1362,32 +1364,32 @@
; CHECK-LABEL: test_unsigned_v4f16_v4i13:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fcvt s1, h0
-; CHECK-NEXT: mov w8, #63488
+; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: fcvt s2, h0
; CHECK-NEXT: mov h3, v0.h[1]
-; CHECK-NEXT: mov h4, v0.h[2]
-; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: movi d2, #0000000000000000
+; CHECK-NEXT: mov w8, #63488
; CHECK-NEXT: movk w8, #17919, lsl #16
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h0, v0.h[3]
+; CHECK-NEXT: fmaxnm s2, s2, s1
; CHECK-NEXT: fcvt s3, h3
-; CHECK-NEXT: fcvt s4, h4
-; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fmaxnm s1, s1, s2
-; CHECK-NEXT: fmaxnm s3, s3, s2
-; CHECK-NEXT: fmaxnm s4, s4, s2
-; CHECK-NEXT: fmaxnm s0, s0, s2
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: fminnm s1, s1, s2
-; CHECK-NEXT: fcvtzu w8, s1
-; CHECK-NEXT: fminnm s1, s3, s2
-; CHECK-NEXT: fminnm s3, s4, s2
-; CHECK-NEXT: fminnm s2, s0, s2
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fcvt s5, h5
+; CHECK-NEXT: fminnm s2, s2, s4
+; CHECK-NEXT: fmaxnm s3, s3, s1
+; CHECK-NEXT: fmaxnm s5, s5, s1
+; CHECK-NEXT: fcvtzu w8, s2
+; CHECK-NEXT: fminnm s2, s3, s4
+; CHECK-NEXT: fcvt s3, h0
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fcvtzu w8, s1
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: fcvtzu w8, s3
-; CHECK-NEXT: mov v0.h[2], w8
+; CHECK-NEXT: fcvtzu w9, s2
+; CHECK-NEXT: fmaxnm s1, s3, s1
+; CHECK-NEXT: fminnm s2, s5, s4
+; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: fminnm s1, s1, s4
; CHECK-NEXT: fcvtzu w8, s2
+; CHECK-NEXT: mov v0.h[2], w8
+; CHECK-NEXT: fcvtzu w8, s1
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -1398,27 +1400,27 @@
define <4 x i16> @test_unsigned_v4f16_v4i16(<4 x half> %f) {
; CHECK-CVT-LABEL: test_unsigned_v4f16_v4i16:
; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
; CHECK-CVT-NEXT: movi d1, #0000000000000000
+; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
; CHECK-CVT-NEXT: mov w8, #65280
; CHECK-CVT-NEXT: movk w8, #18303, lsl #16
; CHECK-CVT-NEXT: fmaxnm s2, s0, s1
; CHECK-CVT-NEXT: mov s3, v0.s[1]
-; CHECK-CVT-NEXT: mov s4, v0.s[2]
-; CHECK-CVT-NEXT: mov s0, v0.s[3]
+; CHECK-CVT-NEXT: fmov s4, w8
+; CHECK-CVT-NEXT: mov s5, v0.s[2]
+; CHECK-CVT-NEXT: fminnm s2, s2, s4
; CHECK-CVT-NEXT: fmaxnm s3, s3, s1
-; CHECK-CVT-NEXT: fmaxnm s4, s4, s1
-; CHECK-CVT-NEXT: fmaxnm s0, s0, s1
-; CHECK-CVT-NEXT: fmov s1, w8
-; CHECK-CVT-NEXT: fminnm s2, s2, s1
+; CHECK-CVT-NEXT: fmaxnm s5, s5, s1
; CHECK-CVT-NEXT: fcvtzu w8, s2
-; CHECK-CVT-NEXT: fminnm s2, s3, s1
-; CHECK-CVT-NEXT: fminnm s3, s4, s1
-; CHECK-CVT-NEXT: fminnm s1, s0, s1
+; CHECK-CVT-NEXT: fminnm s2, s3, s4
+; CHECK-CVT-NEXT: mov s3, v0.s[3]
; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fcvtzu w9, s2
+; CHECK-CVT-NEXT: fminnm s2, s5, s4
+; CHECK-CVT-NEXT: fmaxnm s1, s3, s1
+; CHECK-CVT-NEXT: mov v0.h[1], w9
; CHECK-CVT-NEXT: fcvtzu w8, s2
-; CHECK-CVT-NEXT: mov v0.h[1], w8
-; CHECK-CVT-NEXT: fcvtzu w8, s3
+; CHECK-CVT-NEXT: fminnm s1, s1, s4
; CHECK-CVT-NEXT: mov v0.h[2], w8
; CHECK-CVT-NEXT: fcvtzu w8, s1
; CHECK-CVT-NEXT: mov v0.h[3], w8
@@ -1437,32 +1439,32 @@
; CHECK-LABEL: test_unsigned_v4f16_v4i19:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fcvt s1, h0
-; CHECK-NEXT: mov w8, #65504
+; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: fcvt s2, h0
; CHECK-NEXT: mov h3, v0.h[1]
-; CHECK-NEXT: mov h4, v0.h[2]
-; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: movi d2, #0000000000000000
+; CHECK-NEXT: mov w8, #65504
; CHECK-NEXT: movk w8, #18687, lsl #16
+; CHECK-NEXT: mov h5, v0.h[2]
+; CHECK-NEXT: mov h0, v0.h[3]
+; CHECK-NEXT: fmaxnm s2, s2, s1
; CHECK-NEXT: fcvt s3, h3
-; CHECK-NEXT: fcvt s4, h4
-; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fmaxnm s1, s1, s2
-; CHECK-NEXT: fmaxnm s3, s3, s2
-; CHECK-NEXT: fmaxnm s4, s4, s2
-; CHECK-NEXT: fmaxnm s0, s0, s2
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: fminnm s1, s1, s2
-; CHECK-NEXT: fcvtzu w8, s1
-; CHECK-NEXT: fminnm s1, s3, s2
-; CHECK-NEXT: fminnm s3, s4, s2
-; CHECK-NEXT: fminnm s2, s0, s2
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fcvt s5, h5
+; CHECK-NEXT: fminnm s2, s2, s4
+; CHECK-NEXT: fmaxnm s3, s3, s1
+; CHECK-NEXT: fmaxnm s5, s5, s1
+; CHECK-NEXT: fcvtzu w8, s2
+; CHECK-NEXT: fminnm s2, s3, s4
+; CHECK-NEXT: fcvt s3, h0
; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fcvtzu w8, s1
-; CHECK-NEXT: mov v0.s[1], w8
-; CHECK-NEXT: fcvtzu w8, s3
-; CHECK-NEXT: mov v0.s[2], w8
+; CHECK-NEXT: fcvtzu w9, s2
+; CHECK-NEXT: fmaxnm s1, s3, s1
+; CHECK-NEXT: fminnm s2, s5, s4
+; CHECK-NEXT: mov v0.s[1], w9
+; CHECK-NEXT: fminnm s1, s1, s4
; CHECK-NEXT: fcvtzu w8, s2
+; CHECK-NEXT: mov v0.s[2], w8
+; CHECK-NEXT: fcvtzu w8, s1
; CHECK-NEXT: mov v0.s[3], w8
; CHECK-NEXT: ret
%x = call <4 x i19> @llvm.fptoui.sat.v4f16.v4i19(<4 x half> %f)
@@ -1479,17 +1481,18 @@
; CHECK-FP16-LABEL: test_unsigned_v4f16_v4i32_duplicate:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP16-NEXT: mov h2, v0.h[1]
; CHECK-FP16-NEXT: fcvtzu w8, h0
-; CHECK-FP16-NEXT: mov h1, v0.h[1]
+; CHECK-FP16-NEXT: fmov s1, w8
+; CHECK-FP16-NEXT: fcvtzu w8, h2
; CHECK-FP16-NEXT: mov h2, v0.h[2]
-; CHECK-FP16-NEXT: mov h3, v0.h[3]
-; CHECK-FP16-NEXT: fmov s0, w8
-; CHECK-FP16-NEXT: fcvtzu w8, h1
-; CHECK-FP16-NEXT: fcvtzu w9, h2
-; CHECK-FP16-NEXT: mov v0.s[1], w8
-; CHECK-FP16-NEXT: mov v0.s[2], w9
-; CHECK-FP16-NEXT: fcvtzu w8, h3
-; CHECK-FP16-NEXT: mov v0.s[3], w8
+; CHECK-FP16-NEXT: mov h0, v0.h[3]
+; CHECK-FP16-NEXT: mov v1.s[1], w8
+; CHECK-FP16-NEXT: fcvtzu w8, h2
+; CHECK-FP16-NEXT: mov v1.s[2], w8
+; CHECK-FP16-NEXT: fcvtzu w8, h0
+; CHECK-FP16-NEXT: mov v1.s[3], w8
+; CHECK-FP16-NEXT: mov v0.16b, v1.16b
; CHECK-FP16-NEXT: ret
%x = call <4 x i32> @llvm.fptoui.sat.v4f16.v4i32(<4 x half> %f)
ret <4 x i32> %x
@@ -1501,33 +1504,33 @@
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fcvt s1, h0
; CHECK-NEXT: mov w8, #1484783615
-; CHECK-NEXT: fcvtzu x10, s1
+; CHECK-NEXT: mov h2, v0.h[1]
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fcvtzu x9, s1
; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: fmov s2, w8
-; CHECK-NEXT: csel x8, xzr, x10, lt
-; CHECK-NEXT: fcmp s1, s2
-; CHECK-NEXT: mov h1, v0.h[1]
+; CHECK-NEXT: fcvt s2, h2
+; CHECK-NEXT: csel x8, xzr, x9, lt
+; CHECK-NEXT: fcmp s1, s3
+; CHECK-NEXT: mov h1, v0.h[2]
; CHECK-NEXT: mov x9, #1125899906842623
-; CHECK-NEXT: fcvt s1, h1
-; CHECK-NEXT: fcvtzu x10, s1
+; CHECK-NEXT: fcvtzu x10, s2
+; CHECK-NEXT: mov h0, v0.h[3]
; CHECK-NEXT: csel x0, x9, x8, gt
-; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: csel x8, xzr, x10, lt
-; CHECK-NEXT: fcmp s1, s2
-; CHECK-NEXT: mov h1, v0.h[2]
+; CHECK-NEXT: fcmp s2, #0.0
; CHECK-NEXT: fcvt s1, h1
-; CHECK-NEXT: mov h0, v0.h[3]
+; CHECK-NEXT: csel x8, xzr, x10, lt
+; CHECK-NEXT: fcmp s2, s3
+; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: fcvtzu x10, s1
; CHECK-NEXT: csel x1, x9, x8, gt
; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: fcvt s0, h0
; CHECK-NEXT: csel x8, xzr, x10, lt
-; CHECK-NEXT: fcmp s1, s2
-; CHECK-NEXT: fcvtzu x11, s0
+; CHECK-NEXT: fcmp s1, s3
; CHECK-NEXT: csel x2, x9, x8, gt
+; CHECK-NEXT: fcvtzu x8, s0
; CHECK-NEXT: fcmp s0, #0.0
-; CHECK-NEXT: csel x8, xzr, x11, lt
-; CHECK-NEXT: fcmp s0, s2
+; CHECK-NEXT: csel x8, xzr, x8, lt
+; CHECK-NEXT: fcmp s0, s3
; CHECK-NEXT: csel x3, x9, x8, gt
; CHECK-NEXT: ret
%x = call <4 x i50> @llvm.fptoui.sat.v4f16.v4i50(<4 x half> %f)
@@ -1538,38 +1541,37 @@
; CHECK-CVT-LABEL: test_unsigned_v4f16_v4i64:
; CHECK-CVT: // %bb.0:
; CHECK-CVT-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-CVT-NEXT: fcvt s1, h0
-; CHECK-CVT-NEXT: mov h2, v0.h[1]
-; CHECK-CVT-NEXT: fcvtzu x8, s1
-; CHECK-CVT-NEXT: fcvt s1, h2
-; CHECK-CVT-NEXT: fmov d2, x8
-; CHECK-CVT-NEXT: fcvtzu x8, s1
; CHECK-CVT-NEXT: mov h1, v0.h[2]
+; CHECK-CVT-NEXT: mov h2, v0.h[1]
+; CHECK-CVT-NEXT: fcvt s3, h0
; CHECK-CVT-NEXT: mov h0, v0.h[3]
; CHECK-CVT-NEXT: fcvt s1, h1
-; CHECK-CVT-NEXT: mov v2.d[1], x8
-; CHECK-CVT-NEXT: fcvtzu x8, s1
-; CHECK-CVT-NEXT: fcvt s0, h0
-; CHECK-CVT-NEXT: fmov d1, x8
-; CHECK-CVT-NEXT: fcvtzu x8, s0
-; CHECK-CVT-NEXT: mov v1.d[1], x8
-; CHECK-CVT-NEXT: mov v0.16b, v2.16b
+; CHECK-CVT-NEXT: fcvt s2, h2
+; CHECK-CVT-NEXT: fcvtzu x8, s3
+; CHECK-CVT-NEXT: fcvt s3, h0
+; CHECK-CVT-NEXT: fcvtzu x9, s1
+; CHECK-CVT-NEXT: fmov d0, x8
+; CHECK-CVT-NEXT: fcvtzu x8, s2
+; CHECK-CVT-NEXT: fmov d1, x9
+; CHECK-CVT-NEXT: fcvtzu x9, s3
+; CHECK-CVT-NEXT: mov v0.d[1], x8
+; CHECK-CVT-NEXT: mov v1.d[1], x9
; CHECK-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_unsigned_v4f16_v4i64:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-FP16-NEXT: fcvtzu x8, h0
-; CHECK-FP16-NEXT: mov h1, v0.h[1]
-; CHECK-FP16-NEXT: mov h2, v0.h[2]
+; CHECK-FP16-NEXT: mov h1, v0.h[2]
+; CHECK-FP16-NEXT: mov h2, v0.h[1]
; CHECK-FP16-NEXT: mov h3, v0.h[3]
+; CHECK-FP16-NEXT: fcvtzu x8, h0
+; CHECK-FP16-NEXT: fcvtzu x9, h1
; CHECK-FP16-NEXT: fmov d0, x8
-; CHECK-FP16-NEXT: fcvtzu x8, h1
-; CHECK-FP16-NEXT: fcvtzu x9, h2
-; CHECK-FP16-NEXT: mov v0.d[1], x8
+; CHECK-FP16-NEXT: fcvtzu x8, h2
; CHECK-FP16-NEXT: fmov d1, x9
-; CHECK-FP16-NEXT: fcvtzu x8, h3
-; CHECK-FP16-NEXT: mov v1.d[1], x8
+; CHECK-FP16-NEXT: fcvtzu x9, h3
+; CHECK-FP16-NEXT: mov v0.d[1], x8
+; CHECK-FP16-NEXT: mov v1.d[1], x9
; CHECK-FP16-NEXT: ret
%x = call <4 x i64> @llvm.fptoui.sat.v4f16.v4i64(<4 x half> %f)
ret <4 x i64> %x
@@ -1597,64 +1599,64 @@
; CHECK-NEXT: .cfi_offset b9, -80
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov h1, v0.h[2]
-; CHECK-NEXT: fcvt s8, h1
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: fcvt s8, h1
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: mov w8, #1904214015
; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: fmov s9, w8
+; CHECK-NEXT: mov x25, #68719476735
; CHECK-NEXT: mov h0, v0.h[1]
-; CHECK-NEXT: csel x9, xzr, x0, lt
-; CHECK-NEXT: csel x10, xzr, x1, lt
+; CHECK-NEXT: fmov s9, w8
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: mov x25, #68719476735
+; CHECK-NEXT: csel x19, x25, x9, gt
+; CHECK-NEXT: csinv x20, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
-; CHECK-NEXT: csel x19, x25, x10, gt
-; CHECK-NEXT: csinv x20, x9, xzr, le
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s8, #0.0
+; CHECK-NEXT: mov h0, v0.h[3]
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, xzr, x1, lt
-; CHECK-NEXT: mov h0, v0.h[3]
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: csel x21, x25, x9, gt
; CHECK-NEXT: csinv x22, x8, xzr, le
+; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s8, #0.0
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: csel x8, xzr, x0, lt
; CHECK-NEXT: csel x9, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: csel x23, x25, x9, gt
; CHECK-NEXT: csinv x24, x8, xzr, le
+; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: csel x8, xzr, x0, lt
-; CHECK-NEXT: csel x9, xzr, x1, lt
-; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: csel x1, x25, x9, gt
; CHECK-NEXT: mov x2, x22
; CHECK-NEXT: mov x3, x21
; CHECK-NEXT: mov x4, x20
; CHECK-NEXT: mov x5, x19
; CHECK-NEXT: mov x6, x24
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, xzr, x1, lt
+; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: mov x7, x23
; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: csinv x8, x8, xzr, le
+; CHECK-NEXT: csel x1, x25, x9, gt
; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: mov v0.d[1], x1
; CHECK-NEXT: ldp x30, x25, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x1
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
@@ -1683,63 +1685,63 @@
; CHECK-NEXT: .cfi_offset b9, -80
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: fcvt s8, h1
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: fcvt s8, h1
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: mov w8, #2139095039
; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: fmov s9, w8
; CHECK-NEXT: mov h0, v0.h[2]
-; CHECK-NEXT: csel x9, xzr, x1, lt
-; CHECK-NEXT: csel x10, xzr, x0, lt
+; CHECK-NEXT: fmov s9, w8
+; CHECK-NEXT: csel x8, xzr, x1, lt
+; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
+; CHECK-NEXT: csinv x19, x9, xzr, le
+; CHECK-NEXT: csinv x20, x8, xzr, le
; CHECK-NEXT: fmov s0, s8
-; CHECK-NEXT: csinv x19, x10, xzr, le
-; CHECK-NEXT: csinv x20, x9, xzr, le
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s8, #0.0
+; CHECK-NEXT: mov h0, v0.h[3]
; CHECK-NEXT: csel x8, xzr, x1, lt
; CHECK-NEXT: csel x9, xzr, x0, lt
-; CHECK-NEXT: mov h0, v0.h[3]
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: csinv x21, x9, xzr, le
; CHECK-NEXT: csinv x22, x8, xzr, le
+; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: fcmp s8, #0.0
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: csel x8, xzr, x1, lt
; CHECK-NEXT: csel x9, xzr, x0, lt
; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: fcvt s8, h0
-; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: csinv x23, x9, xzr, le
; CHECK-NEXT: csinv x24, x8, xzr, le
+; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: csel x8, xzr, x0, lt
-; CHECK-NEXT: csel x9, xzr, x1, lt
-; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: mov x2, x19
; CHECK-NEXT: mov x3, x20
; CHECK-NEXT: mov x4, x21
; CHECK-NEXT: mov x5, x22
; CHECK-NEXT: mov x6, x23
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, xzr, x1, lt
+; CHECK-NEXT: fcmp s8, s9
; CHECK-NEXT: mov x7, x24
; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: csinv x8, x8, xzr, le
; CHECK-NEXT: csinv x1, x9, xzr, le
+; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: mov v0.d[1], x1
+; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/framelayout-frame-record.mir b/llvm/test/CodeGen/AArch64/framelayout-frame-record.mir
--- a/llvm/test/CodeGen/AArch64/framelayout-frame-record.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-frame-record.mir
@@ -16,9 +16,8 @@
# CHECK: stp d9, d8, [sp, #-48]!
# CHECK: stp x29, x30, [sp, #16]
-# CHECK: str x19, [sp, #32]
-
# CHECK: add x29, sp, #16
+# CHECK: str x19, [sp, #32]
# CHECK: .cfi_def_cfa w29, 32
# CHECK: .cfi_offset w19, -16
diff --git a/llvm/test/CodeGen/AArch64/framelayout-unaligned-fp.ll b/llvm/test/CodeGen/AArch64/framelayout-unaligned-fp.ll
--- a/llvm/test/CodeGen/AArch64/framelayout-unaligned-fp.ll
+++ b/llvm/test/CodeGen/AArch64/framelayout-unaligned-fp.ll
@@ -28,12 +28,12 @@
; CHECK-LABEL: b:
; CHECK: str d8, [sp, #-32]!
; CHECK-NEXT: stp x29, x30, [sp, #8]
-; CHECK-NEXT: str x19, [sp, #24]
; CHECK-NEXT: add x29, sp, #8
+; CHECK-NEXT: str x19, [sp, #24]
; CHECK: sub sp, x29, #8
-; CHECK-NEXT: ldr x19, [sp, #24]
; CHECK-NEXT: ldp x29, x30, [sp, #8]
+; CHECK-NEXT: ldr x19, [sp, #24]
; CHECK-NEXT: ldr d8, [sp], #32
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/func-calls.ll b/llvm/test/CodeGen/AArch64/func-calls.ll
--- a/llvm/test/CodeGen/AArch64/func-calls.ll
+++ b/llvm/test/CodeGen/AArch64/func-calls.ll
@@ -139,8 +139,8 @@
; CHECK-NOT: mov x1
; CHECK-LE: mov x2, #{{0x2a|42}}
; CHECK-LE: mov x3, xzr
-; CHECK-BE: mov {{x|w}}3, #{{0x2a|42}}
; CHECK-BE: mov x2, xzr
+; CHECK-BE: mov {{x|w}}3, #{{0x2a|42}}
; CHECK: bl check_i128_regalign
ret void
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
--- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll
@@ -40,12 +40,12 @@
define i16 @rotl_i16(i16 %x, i16 %z) {
; CHECK-LABEL: rotl_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: neg w10, w1
-; CHECK-NEXT: and w8, w0, #0xffff
+; CHECK-NEXT: neg w8, w1
; CHECK-NEXT: and w9, w1, #0xf
-; CHECK-NEXT: and w10, w10, #0xf
+; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: and w10, w0, #0xffff
; CHECK-NEXT: lsl w9, w0, w9
-; CHECK-NEXT: lsr w8, w8, w10
+; CHECK-NEXT: lsr w8, w10, w8
; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%f = call i16 @llvm.fshl.i16(i16 %x, i16 %x, i16 %z)
@@ -81,8 +81,8 @@
; CHECK-NEXT: neg v3.4s, v1.4s
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-NEXT: and v2.16b, v3.16b, v2.16b
-; CHECK-NEXT: neg v2.4s, v2.4s
; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
+; CHECK-NEXT: neg v2.4s, v2.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
@@ -132,13 +132,13 @@
define i16 @rotr_i16(i16 %x, i16 %z) {
; CHECK-LABEL: rotr_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
+; CHECK-NEXT: neg w8, w1
; CHECK-NEXT: and w9, w1, #0xf
-; CHECK-NEXT: neg w10, w1
-; CHECK-NEXT: lsr w8, w8, w9
-; CHECK-NEXT: and w9, w10, #0xf
-; CHECK-NEXT: lsl w9, w0, w9
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: and w10, w0, #0xffff
+; CHECK-NEXT: lsr w9, w10, w9
+; CHECK-NEXT: lsl w8, w0, w8
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%f = call i16 @llvm.fshr.i16(i16 %x, i16 %x, i16 %z)
ret i16 %f
@@ -167,10 +167,10 @@
define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) {
; CHECK-LABEL: rotr_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v3.4s, #31
-; CHECK-NEXT: neg v2.4s, v1.4s
-; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: and v2.16b, v2.16b, v3.16b
+; CHECK-NEXT: movi v2.4s, #31
+; CHECK-NEXT: neg v3.4s, v1.4s
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v2.16b, v3.16b, v2.16b
; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll
--- a/llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -20,11 +20,11 @@
; CHECK-LABEL: fshl_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT: mvn w9, w2
-; CHECK-NEXT: lsr w10, w1, #1
-; CHECK-NEXT: lsl w8, w0, w2
-; CHECK-NEXT: lsr w9, w10, w9
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: mvn w8, w2
+; CHECK-NEXT: lsr w9, w1, #1
+; CHECK-NEXT: lsl w10, w0, w2
+; CHECK-NEXT: lsr w8, w9, w8
+; CHECK-NEXT: orr w0, w10, w8
; CHECK-NEXT: ret
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
@@ -33,11 +33,11 @@
define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) {
; CHECK-LABEL: fshl_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mvn w9, w2
-; CHECK-NEXT: lsr x10, x1, #1
-; CHECK-NEXT: lsl x8, x0, x2
-; CHECK-NEXT: lsr x9, x10, x9
-; CHECK-NEXT: orr x0, x8, x9
+; CHECK-NEXT: mvn w8, w2
+; CHECK-NEXT: lsr x9, x1, #1
+; CHECK-NEXT: lsl x10, x0, x2
+; CHECK-NEXT: lsr x8, x9, x8
+; CHECK-NEXT: orr x0, x10, x8
; CHECK-NEXT: ret
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
ret i64 %f
@@ -46,29 +46,29 @@
define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
; CHECK-LABEL: fshl_i128:
; CHECK: // %bb.0:
-; CHECK-NEXT: mvn w9, w4
-; CHECK-NEXT: and x12, x9, #0x7f
-; CHECK-NEXT: extr x8, x3, x2, #1
+; CHECK-NEXT: mvn w8, w4
+; CHECK-NEXT: extr x9, x3, x2, #1
; CHECK-NEXT: lsr x10, x3, #1
+; CHECK-NEXT: and x12, x8, #0x7f
+; CHECK-NEXT: lsl x11, x10, #1
; CHECK-NEXT: tst x12, #0x40
-; CHECK-NEXT: lsr x12, x0, #1
-; CHECK-NEXT: lsr x8, x8, x9
-; CHECK-NEXT: lsr x12, x12, x9
-; CHECK-NEXT: lsr x9, x10, x9
-; CHECK-NEXT: lsl x10, x10, #1
-; CHECK-NEXT: lsl x10, x10, x4
-; CHECK-NEXT: lsl x11, x1, x4
-; CHECK-NEXT: and x14, x4, #0x7f
-; CHECK-NEXT: orr x8, x10, x8
-; CHECK-NEXT: lsl x13, x0, x4
-; CHECK-NEXT: orr x11, x11, x12
-; CHECK-NEXT: csel x10, xzr, x9, ne
-; CHECK-NEXT: csel x8, x9, x8, ne
-; CHECK-NEXT: tst x14, #0x40
-; CHECK-NEXT: csel x9, x13, x11, ne
-; CHECK-NEXT: csel x11, xzr, x13, ne
-; CHECK-NEXT: orr x1, x9, x10
-; CHECK-NEXT: orr x0, x11, x8
+; CHECK-NEXT: lsl x11, x11, x4
+; CHECK-NEXT: lsr x9, x9, x8
+; CHECK-NEXT: orr x9, x11, x9
+; CHECK-NEXT: lsr x11, x0, #1
+; CHECK-NEXT: lsr x10, x10, x8
+; CHECK-NEXT: lsl x12, x1, x4
+; CHECK-NEXT: lsr x8, x11, x8
+; CHECK-NEXT: and x11, x4, #0x7f
+; CHECK-NEXT: csel x9, x10, x9, ne
+; CHECK-NEXT: csel x10, xzr, x10, ne
+; CHECK-NEXT: orr x8, x12, x8
+; CHECK-NEXT: lsl x12, x0, x4
+; CHECK-NEXT: tst x11, #0x40
+; CHECK-NEXT: csel x8, x12, x8, ne
+; CHECK-NEXT: csel x11, xzr, x12, ne
+; CHECK-NEXT: orr x1, x8, x10
+; CHECK-NEXT: orr x0, x11, x9
; CHECK-NEXT: ret
%f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z)
ret i128 %f
@@ -80,18 +80,18 @@
; CHECK-LABEL: fshl_i37:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #31883
+; CHECK-NEXT: mov w9, #37
; CHECK-NEXT: movk x8, #3542, lsl #16
+; CHECK-NEXT: ubfiz x10, x1, #26, #37
; CHECK-NEXT: movk x8, #51366, lsl #32
; CHECK-NEXT: movk x8, #56679, lsl #48
; CHECK-NEXT: umulh x8, x2, x8
-; CHECK-NEXT: mov w9, #37
; CHECK-NEXT: ubfx x8, x8, #5, #27
; CHECK-NEXT: msub w8, w8, w9, w2
-; CHECK-NEXT: lsl x9, x0, x8
-; CHECK-NEXT: mvn w8, w8
-; CHECK-NEXT: ubfiz x10, x1, #26, #37
-; CHECK-NEXT: lsr x8, x10, x8
-; CHECK-NEXT: orr x0, x9, x8
+; CHECK-NEXT: mvn w9, w8
+; CHECK-NEXT: lsl x8, x0, x8
+; CHECK-NEXT: lsr x9, x10, x9
+; CHECK-NEXT: orr x0, x8, x9
; CHECK-NEXT: ret
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
@@ -188,11 +188,11 @@
; CHECK-LABEL: fshr_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
-; CHECK-NEXT: mvn w9, w2
-; CHECK-NEXT: lsl w10, w0, #1
-; CHECK-NEXT: lsr w8, w1, w2
-; CHECK-NEXT: lsl w9, w10, w9
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: mvn w8, w2
+; CHECK-NEXT: lsl w9, w0, #1
+; CHECK-NEXT: lsr w10, w1, w2
+; CHECK-NEXT: lsl w8, w9, w8
+; CHECK-NEXT: orr w0, w8, w10
; CHECK-NEXT: ret
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
@@ -201,11 +201,11 @@
define i64 @fshr_i64(i64 %x, i64 %y, i64 %z) {
; CHECK-LABEL: fshr_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mvn w9, w2
-; CHECK-NEXT: lsl x10, x0, #1
-; CHECK-NEXT: lsr x8, x1, x2
-; CHECK-NEXT: lsl x9, x10, x9
-; CHECK-NEXT: orr x0, x9, x8
+; CHECK-NEXT: mvn w8, w2
+; CHECK-NEXT: lsl x9, x0, #1
+; CHECK-NEXT: lsr x10, x1, x2
+; CHECK-NEXT: lsl x8, x9, x8
+; CHECK-NEXT: orr x0, x8, x10
; CHECK-NEXT: ret
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 %z)
ret i64 %f
@@ -217,20 +217,20 @@
; CHECK-LABEL: fshr_i37:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #31883
+; CHECK-NEXT: mov w9, #37
; CHECK-NEXT: movk x8, #3542, lsl #16
+; CHECK-NEXT: lsl x10, x1, #27
; CHECK-NEXT: movk x8, #51366, lsl #32
+; CHECK-NEXT: lsl x11, x0, #1
; CHECK-NEXT: movk x8, #56679, lsl #48
; CHECK-NEXT: umulh x8, x2, x8
-; CHECK-NEXT: mov w9, #37
; CHECK-NEXT: lsr x8, x8, #5
; CHECK-NEXT: msub w8, w8, w9, w2
-; CHECK-NEXT: lsl x10, x1, #27
; CHECK-NEXT: add w8, w8, #27
-; CHECK-NEXT: lsr x9, x10, x8
-; CHECK-NEXT: mvn w8, w8
-; CHECK-NEXT: lsl x10, x0, #1
-; CHECK-NEXT: lsl x8, x10, x8
-; CHECK-NEXT: orr x0, x8, x9
+; CHECK-NEXT: mvn w9, w8
+; CHECK-NEXT: lsr x8, x10, x8
+; CHECK-NEXT: lsl x9, x11, x9
+; CHECK-NEXT: orr x0, x9, x8
; CHECK-NEXT: ret
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
diff --git a/llvm/test/CodeGen/AArch64/global-merge-3.ll b/llvm/test/CodeGen/AArch64/global-merge-3.ll
--- a/llvm/test/CodeGen/AArch64/global-merge-3.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-3.ll
@@ -7,19 +7,19 @@
@z = internal global i32 1, align 4
define dso_local void @f1(i32 %a1, i32 %a2, i32 %a3) {
-;CHECK-APPLE-IOS: adrp x8, __MergedGlobals_x@PAGE+12
+;CHECK-APPLE-IOS: adrp x8, _z@PAGE
+;CHECK-APPLE-IOS: adrp x9, __MergedGlobals_x@PAGE+12
;CHECK-APPLE-IOS-NOT: adrp
-;CHECK-APPLE-IOS: add x8, x8, __MergedGlobals_x@PAGEOFF+12
-;CHECK-APPLE-IOS: adrp x9, _z@PAGE
-;CHECK-APPLE-IOS: str w0, [x8]
-;CHECK-APPLE-IOS: str w1, [x8, #400]
-;CHECK-APPLE-IOS: str w2, [x9, _z@PAGEOFF]
-;CHECK: adrp x8, .L_MergedGlobals+12
-;CHECK: add x8, x8, :lo12:.L_MergedGlobals+12
-;CHECK: adrp x9, z
-;CHECK: str w0, [x8]
-;CHECK: str w1, [x8, #400]
-;CHECK: str w2, [x9, :lo12:z]
+;CHECK-APPLE-IOS: add x9, x9, __MergedGlobals_x@PAGEOFF+12
+;CHECK-APPLE-IOS: str w1, [x9, #400]
+;CHECK-APPLE-IOS: str w0, [x9]
+;CHECK-APPLE-IOS: str w2, [x8, _z@PAGEOFF]
+;CHECK: adrp x8, z
+;CHECK: adrp x9, .L_MergedGlobals+12
+;CHECK: add x9, x9, :lo12:.L_MergedGlobals+12
+;CHECK: str w1, [x9, #400]
+;CHECK: str w0, [x9]
+;CHECK: str w2, [x8, :lo12:z]
%x3 = getelementptr inbounds [100 x i32], [100 x i32]* @x, i32 0, i64 3
%y3 = getelementptr inbounds [100 x i32], [100 x i32]* @y, i32 0, i64 3
store i32 %a1, i32* %x3, align 4
diff --git a/llvm/test/CodeGen/AArch64/half.ll b/llvm/test/CodeGen/AArch64/half.ll
--- a/llvm/test/CodeGen/AArch64/half.ll
+++ b/llvm/test/CodeGen/AArch64/half.ll
@@ -102,15 +102,15 @@
; CHECK-NEXT: mov w8, #24576
; CHECK-NEXT: fmov s0, w1
; CHECK-NEXT: movk w8, #15974, lsl #16
-; CHECK-NEXT: mov w9, #16384
; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: movk w9, #15428, lsl #16
; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov w8, #16384
+; CHECK-NEXT: movk w8, #15428, lsl #16
; CHECK-NEXT: fcmp s0, s1
-; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: mov w10, #4
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: mov w8, #4
; CHECK-NEXT: fccmp s0, s2, #8, pl
-; CHECK-NEXT: csinc w8, w10, wzr, mi
+; CHECK-NEXT: csinc w8, w8, wzr, mi
; CHECK-NEXT: fcmp s0, s1
; CHECK-NEXT: cinc w0, w8, pl
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
--- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -203,8 +203,8 @@
; CHECK-LABEL: vec_4xi32_nonsplat_eq:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI13_0
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0]
; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0]
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
@@ -233,8 +233,8 @@
define <4 x i1> @vec_4xi32_nonsplat_undef1_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-LABEL: vec_4xi32_nonsplat_undef1_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
@@ -248,8 +248,8 @@
define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-LABEL: vec_4xi32_nonsplat_undef2_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: movi v2.4s, #1
+; CHECK-NEXT: neg v1.4s, v1.4s
; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
diff --git a/llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll b/llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll
--- a/llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll
+++ b/llvm/test/CodeGen/AArch64/hwasan-check-memaccess.ll
@@ -8,8 +8,8 @@
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: mov x9, x0
- ; CHECK-NEXT: bl __hwasan_check_x1_1
; CHECK-NEXT: mov x0, x1
+ ; CHECK-NEXT: bl __hwasan_check_x1_1
; CHECK-NEXT: ldr x30, [sp], #16
; CHECK-NEXT: ret
call void @llvm.hwasan.check.memaccess(i8* %x0, i8* %x1, i32 1)
diff --git a/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll b/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
--- a/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
+++ b/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
@@ -8,10 +8,10 @@
; CHECK-LABEL: test1:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, x
-; CHECK-NEXT: add x8, x8, :lo12:x
-; CHECK-NEXT: ldp x8, x9, [x8]
; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x8, x8, :lo12:x
; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: ldp x8, x9, [x8]
; CHECK-NEXT: stp x8, x9, [x10]
; CHECK-NEXT: ret
%tmp = load volatile i128, i128* @x
@@ -23,10 +23,10 @@
; CHECK-LABEL: test2:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, x
-; CHECK-NEXT: add x8, x8, :lo12:x
-; CHECK-NEXT: ldp x8, x9, [x8, #504]
; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x8, x8, :lo12:x
; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: ldp x8, x9, [x8, #504]
; CHECK-NEXT: stp x8, x9, [x10, #504]
; CHECK-NEXT: ret
%tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 504) to i128*)
@@ -38,12 +38,12 @@
; CHECK-LABEL: test3:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, x
-; CHECK-NEXT: add x8, x8, :lo12:x
-; CHECK-NEXT: add x8, x8, #512
-; CHECK-NEXT: ldp x8, x9, [x8]
; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x8, x8, :lo12:x
; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: add x8, x8, #512
; CHECK-NEXT: add x10, x10, #512
+; CHECK-NEXT: ldp x8, x9, [x8]
; CHECK-NEXT: stp x8, x9, [x10]
; CHECK-NEXT: ret
%tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 512) to i128*)
@@ -55,10 +55,10 @@
; CHECK-LABEL: test4:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, x
-; CHECK-NEXT: add x8, x8, :lo12:x
-; CHECK-NEXT: ldp x8, x9, [x8, #-512]
; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x8, x8, :lo12:x
; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: ldp x8, x9, [x8, #-512]
; CHECK-NEXT: stp x8, x9, [x10, #-512]
; CHECK-NEXT: ret
%tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -512) to i128*)
@@ -70,12 +70,12 @@
; CHECK-LABEL: test5:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, x
-; CHECK-NEXT: add x8, x8, :lo12:x
-; CHECK-NEXT: sub x8, x8, #520
-; CHECK-NEXT: ldp x8, x9, [x8]
; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x8, x8, :lo12:x
; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: sub x8, x8, #520
; CHECK-NEXT: sub x10, x10, #520
+; CHECK-NEXT: ldp x8, x9, [x8]
; CHECK-NEXT: stp x8, x9, [x10]
; CHECK-NEXT: ret
%tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -520) to i128*)
@@ -87,12 +87,12 @@
; CHECK-LABEL: test6:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, x
-; CHECK-NEXT: add x8, x8, :lo12:x
-; CHECK-NEXT: sub x8, x8, #520
-; CHECK-NEXT: ldp x8, x9, [x8]
; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x8, x8, :lo12:x
; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: sub x8, x8, #520
; CHECK-NEXT: sub x10, x10, #520
+; CHECK-NEXT: ldp x8, x9, [x8]
; CHECK-NEXT: stp x8, x9, [x10]
; CHECK-NEXT: ret
%tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -520) to i128*)
@@ -104,12 +104,12 @@
; CHECK-LABEL: test7:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, x
-; CHECK-NEXT: add x8, x8, :lo12:x
-; CHECK-NEXT: add x8, x8, #503
-; CHECK-NEXT: ldp x8, x9, [x8]
; CHECK-NEXT: adrp x10, y
+; CHECK-NEXT: add x8, x8, :lo12:x
; CHECK-NEXT: add x10, x10, :lo12:y
+; CHECK-NEXT: add x8, x8, #503
; CHECK-NEXT: add x10, x10, #503
+; CHECK-NEXT: ldp x8, x9, [x8]
; CHECK-NEXT: stp x8, x9, [x10]
; CHECK-NEXT: ret
%tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 503) to i128*)
diff --git a/llvm/test/CodeGen/AArch64/implicit-null-check.ll b/llvm/test/CodeGen/AArch64/implicit-null-check.ll
--- a/llvm/test/CodeGen/AArch64/implicit-null-check.ll
+++ b/llvm/test/CodeGen/AArch64/implicit-null-check.ll
@@ -224,11 +224,11 @@
define i32 @imp_null_check_hoist_over_udiv(i32* %x, i32 %a, i32 %b) {
; CHECK-LABEL: imp_null_check_hoist_over_udiv:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: .Ltmp6:
-; CHECK-NEXT: ldr w8, [x0] // on-fault: .LBB9_2
+; CHECK-NEXT: cbz x0, .LBB9_2
; CHECK-NEXT: // %bb.1: // %not_null
-; CHECK-NEXT: udiv w9, w1, w2
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: udiv w8, w1, w2
+; CHECK-NEXT: ldr w9, [x0]
+; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB9_2:
; CHECK-NEXT: mov w0, #42
@@ -279,7 +279,7 @@
define i32 @imp_null_check_gep_load_with_use_dep(i32* %x, i32 %a) {
; CHECK-LABEL: imp_null_check_gep_load_with_use_dep:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: .Ltmp7:
+; CHECK-NEXT: .Ltmp6:
; CHECK-NEXT: ldr w8, [x0] // on-fault: .LBB11_2
; CHECK-NEXT: // %bb.1: // %not_null
; CHECK-NEXT: add w9, w0, w1
@@ -404,7 +404,7 @@
define i32 @imp_null_check_neg_gep_load(i32* %x) {
; CHECK-LABEL: imp_null_check_neg_gep_load:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: .Ltmp8:
+; CHECK-NEXT: .Ltmp7:
; CHECK-NEXT: ldur w0, [x0, #-128] // on-fault: .LBB16_2
; CHECK-NEXT: // %bb.1: // %not_null
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -79,9 +79,9 @@
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: ushll v0.8h, v1.8b, #0
-; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -98,18 +98,18 @@
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: cnth x8
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: cnth x9
-; CHECK-NEXT: sub x9, x9, #8
-; CHECK-NEXT: mov w8, #8
-; CHECK-NEXT: cmp x9, #8
-; CHECK-NEXT: csel x8, x9, x8, lo
+; CHECK-NEXT: sub x8, x8, #8
+; CHECK-NEXT: mov w9, #8
+; CHECK-NEXT: cmp x8, #8
+; CHECK-NEXT: csel x8, x8, x9, lo
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: lsl x8, x8, #1
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: ushll v0.8h, v1.8b, #0
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str q0, [x9, x8]
+; CHECK-NEXT: st1h { z0.h }, p0, [sp]
+; CHECK-NEXT: str q1, [x9, x8]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -128,9 +128,9 @@
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: ushll v0.4s, v1.4h, #0
-; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -147,18 +147,18 @@
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cntw x8
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: cntw x9
-; CHECK-NEXT: sub x9, x9, #4
-; CHECK-NEXT: mov w8, #4
-; CHECK-NEXT: cmp x9, #4
-; CHECK-NEXT: csel x8, x9, x8, lo
+; CHECK-NEXT: sub x8, x8, #4
+; CHECK-NEXT: mov w9, #4
+; CHECK-NEXT: cmp x8, #4
+; CHECK-NEXT: csel x8, x8, x9, lo
+; CHECK-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-NEXT: lsl x8, x8, #2
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: ushll v0.4s, v1.4h, #0
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str q0, [x9, x8]
+; CHECK-NEXT: st1w { z0.s }, p0, [sp]
+; CHECK-NEXT: str q1, [x9, x8]
; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -177,9 +177,9 @@
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: ushll v0.2d, v1.2s, #0
-; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: str q1, [sp]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -196,18 +196,18 @@
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cntd x8
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: sub x9, x9, #2
-; CHECK-NEXT: mov w8, #2
-; CHECK-NEXT: cmp x9, #2
-; CHECK-NEXT: csel x8, x9, x8, lo
+; CHECK-NEXT: sub x8, x8, #2
+; CHECK-NEXT: mov w9, #2
+; CHECK-NEXT: cmp x8, #2
+; CHECK-NEXT: csel x8, x8, x9, lo
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-NEXT: lsl x8, x8, #3
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: ushll v0.2d, v1.2s, #0
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str q0, [x9, x8]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: str q1, [x9, x8]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -223,20 +223,20 @@
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: cntd x8
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ptrue p1.s, vl8
+; CHECK-NEXT: subs x8, x8, #8
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
-; CHECK-NEXT: cntd x8
; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
-; CHECK-NEXT: subs x8, x8, #8
; CHECK-NEXT: csel x8, xzr, x8, lo
; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: cmp x8, #8
; CHECK-NEXT: csel x8, x8, x9, lo
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: uunpklo z0.d, z1.s
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: st1d { z0.d }, p0, [x9, x8, lsl #3]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: st1d { z1.d }, p0, [x9, x8, lsl #3]
; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/isinf.ll b/llvm/test/CodeGen/AArch64/isinf.ll
--- a/llvm/test/CodeGen/AArch64/isinf.ll
+++ b/llvm/test/CodeGen/AArch64/isinf.ll
@@ -11,8 +11,8 @@
; CHECK-LABEL: replace_isinf_call_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
-; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: fabs h0, h0
+; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: fcmp h0, h1
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/known-never-nan.ll b/llvm/test/CodeGen/AArch64/known-never-nan.ll
--- a/llvm/test/CodeGen/AArch64/known-never-nan.ll
+++ b/llvm/test/CodeGen/AArch64/known-never-nan.ll
@@ -6,11 +6,11 @@
; CHECK-LABEL: fmaxnm:
; CHECK: // %bb.0:
; CHECK-NEXT: ucvtf s0, w0
-; CHECK-NEXT: fmov s1, #11.00000000
-; CHECK-NEXT: ucvtf s2, w1
+; CHECK-NEXT: ucvtf s1, w1
+; CHECK-NEXT: fmov s2, #11.00000000
; CHECK-NEXT: fmov s3, #17.00000000
-; CHECK-NEXT: fadd s0, s0, s1
-; CHECK-NEXT: fadd s1, s2, s3
+; CHECK-NEXT: fadd s0, s0, s2
+; CHECK-NEXT: fadd s1, s1, s3
; CHECK-NEXT: fcmp s0, s1
; CHECK-NEXT: fcsel s0, s0, s1, pl
; CHECK-NEXT: ret
@@ -31,10 +31,10 @@
; CHECK-NEXT: mov w8, #-8388608
; CHECK-NEXT: ucvtf s0, w0
; CHECK-NEXT: ucvtf s1, w1
-; CHECK-NEXT: fmov s2, #17.00000000
-; CHECK-NEXT: fmov s3, w8
-; CHECK-NEXT: fmul s0, s0, s3
-; CHECK-NEXT: fadd s1, s1, s2
+; CHECK-NEXT: fmov s3, #17.00000000
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fadd s1, s1, s3
+; CHECK-NEXT: fmul s0, s0, s2
; CHECK-NEXT: fcmp s0, s1
; CHECK-NEXT: fcsel s0, s0, s1, pl
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/ldst-opt.ll b/llvm/test/CodeGen/AArch64/ldst-opt.ll
--- a/llvm/test/CodeGen/AArch64/ldst-opt.ll
+++ b/llvm/test/CodeGen/AArch64/ldst-opt.ll
@@ -1671,8 +1671,9 @@
; CHECK-LABEL: bug34674:
; CHECK: // %entry
; CHECK-NEXT: mov [[ZREG:x[0-9]+]], xzr
-; CHECK-DAG: stp xzr, xzr, [x0]
-; CHECK-DAG: add x{{[0-9]+}}, [[ZREG]], #1
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: add x0, [[ZREG]], #1
+; CHECK-NEXT: stp xzr, xzr, [x8]
define i64 @bug34674(<2 x i64>* %p) {
entry:
store <2 x i64> zeroinitializer, <2 x i64>* %p
diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
--- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
+++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
@@ -12,18 +12,18 @@
; CHECK-NEXT: sunpkhi z3.h, z0.b
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sunpklo z1.h, z1.b
-; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sunpkhi z4.s, z2.h
; CHECK-NEXT: sunpkhi z5.s, z3.h
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT: sunpkhi z5.s, z1.h
;
CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: sunpkhi z3.s, z0.h
+; CHECK-NEXT: sunpkhi z3.s, z1.h
+; CHECK-NEXT: sunpkhi z5.s, z0.h
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s
; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h
@@ -36,9 +36,9 @@
define <vscale x 8 x i16> @sdiv_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sdiv_i16:
; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sunpkhi z2.s, z1.h
; CHECK-NEXT: sunpkhi z3.s, z0.h
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
@@ -113,24 +113,24 @@
; CHECK-NEXT: sunpkhi z2.h, z1.b
; CHECK-NEXT: sunpkhi z3.h, z0.b
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: sunpklo z4.h, z1.b
-; CHECK-NEXT: sunpklo z5.h, z0.b
-; CHECK-NEXT: sunpkhi z6.s, z2.h
-; CHECK-NEXT: sunpkhi z7.s, z3.h
+; CHECK-NEXT: sunpkhi z5.s, z2.h
+; CHECK-NEXT: sunpkhi z6.s, z3.h
; CHECK-NEXT: sunpklo z2.s, z2.h
; CHECK-NEXT: sunpklo z3.s, z3.h
-; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
-; CHECK-NEXT: sunpkhi z7.s, z4.h
+; CHECK-NEXT: sunpklo z4.h, z1.b
; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: sunpkhi z3.s, z5.h
+; CHECK-NEXT: sunpklo z3.h, z0.b
+; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s
+; CHECK-NEXT: sunpkhi z6.s, z4.h
+; CHECK-NEXT: sunpkhi z7.s, z3.h
; CHECK-NEXT: sunpklo z4.s, z4.h
-; CHECK-NEXT: sunpklo z5.s, z5.h
-; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z7.s
-; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h
-; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h
-; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h
+; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b
; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b
; CHECK-NEXT: ret
%div = srem <vscale x 16 x i8> %a, %b
@@ -140,11 +140,11 @@
define <vscale x 8 x i16> @srem_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: srem_i16:
; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sunpkhi z2.s, z1.h
; CHECK-NEXT: sunpkhi z3.s, z0.h
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: sunpklo z4.s, z1.h
+; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s
; CHECK-NEXT: sunpklo z5.s, z0.h
; CHECK-NEXT: movprfx z3, z5
; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s
@@ -191,18 +191,18 @@
; CHECK-NEXT: uunpkhi z2.h, z1.b
; CHECK-NEXT: uunpkhi z3.h, z0.b
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uunpklo z1.h, z1.b
-; CHECK-NEXT: uunpklo z0.h, z0.b
; CHECK-NEXT: uunpkhi z4.s, z2.h
; CHECK-NEXT: uunpkhi z5.s, z3.h
; CHECK-NEXT: uunpklo z2.s, z2.h
; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: uunpklo z0.h, z0.b
; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s
-; CHECK-NEXT: uunpkhi z5.s, z1.h
; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s
-; CHECK-NEXT: uunpkhi z3.s, z0.h
+; CHECK-NEXT: uunpkhi z3.s, z1.h
+; CHECK-NEXT: uunpkhi z5.s, z0.h
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s
; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: uzp1 z1.h, z2.h, z4.h
; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h
@@ -215,9 +215,9 @@
define <vscale x 8 x i16> @udiv_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: udiv_i16:
; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uunpkhi z2.s, z1.h
; CHECK-NEXT: uunpkhi z3.s, z0.h
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uunpklo z1.s, z1.h
; CHECK-NEXT: uunpklo z0.s,
z0.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s @@ -293,24 +293,24 @@ ; CHECK-NEXT: uunpkhi z2.h, z1.b ; CHECK-NEXT: uunpkhi z3.h, z0.b ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uunpklo z4.h, z1.b -; CHECK-NEXT: uunpklo z5.h, z0.b -; CHECK-NEXT: uunpkhi z6.s, z2.h -; CHECK-NEXT: uunpkhi z7.s, z3.h +; CHECK-NEXT: uunpkhi z5.s, z2.h +; CHECK-NEXT: uunpkhi z6.s, z3.h ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s -; CHECK-NEXT: uunpkhi z7.s, z4.h +; CHECK-NEXT: uunpklo z4.h, z1.b ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: uunpkhi z3.s, z5.h +; CHECK-NEXT: uunpklo z3.h, z0.b +; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s +; CHECK-NEXT: uunpkhi z6.s, z4.h +; CHECK-NEXT: uunpkhi z7.s, z3.h ; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z7.s -; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z6.h -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s +; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s +; CHECK-NEXT: uzp1 z2.h, z2.h, z5.h +; CHECK-NEXT: uzp1 z3.h, z3.h, z6.h ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b ; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: ret %div = urem %a, %b @@ -320,11 +320,11 @@ define @urem_i16( %a, %b) { ; CHECK-LABEL: urem_i16: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z2.s, z1.h ; CHECK-NEXT: uunpkhi z3.s, z0.h -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: uunpklo z4.s, z1.h +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: uunpklo z5.s, z0.h ; CHECK-NEXT: movprfx z3, z5 ; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s @@ -424,9 +424,9 @@ ; CHECK-LABEL: smin_split_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: smin z2.h, p0/m, z2.h, z6.h ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z4.h ; CHECK-NEXT: smin z1.h, p0/m, z1.h, z5.h -; CHECK-NEXT: smin z2.h, p0/m, z2.h, z6.h ; CHECK-NEXT: smin z3.h, p0/m, z3.h, z7.h ; CHECK-NEXT: ret %cmp = icmp slt %a, %b @@ -775,8 +775,8 @@ ; CHECK-LABEL: asr_promote_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %shr = ashr %a, %b @@ -1067,9 +1067,9 @@ ; CHECK-LABEL: cmp_split_64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p3.b +; CHECK-NEXT: cmpgt p2.b, p3/z, z2.b, z6.b ; CHECK-NEXT: cmpgt p0.b, p3/z, z0.b, z4.b ; CHECK-NEXT: cmpgt p1.b, p3/z, z1.b, z5.b -; CHECK-NEXT: cmpgt p2.b, p3/z, z2.b, z6.b ; CHECK-NEXT: cmpgt p3.b, p3/z, z3.b, z7.b ; CHECK-NEXT: ret %cmp = icmp sgt %a, %b @@ -1083,14 +1083,15 @@ define @fshl_i64( %a, %b, %c){ ; CHECK-LABEL: fshl_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z3.d, #-1 // =0xffffffffffffffff -; CHECK-NEXT: eor z3.d, z2.d, z3.d +; CHECK-NEXT: mov z4.d, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: eor z2.d, z2.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: lsr z1.d, z1.d, #1 -; CHECK-NEXT: and z2.d, z2.d, #0x3f ; CHECK-NEXT: and z3.d, z3.d, #0x3f -; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z2.d -; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: and z2.d, z2.d, #0x3f +; CHECK-NEXT: lsr z1.d, z1.d, #1 +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z3.d +; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z2.d ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %fshl = call 
<vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c)
@@ -1102,18 +1103,18 @@
; CHECK: // %bb.0:
; CHECK-NEXT: mov z6.d, #-1 // =0xffffffffffffffff
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: eor z7.d, z5.d, z6.d
-; CHECK-NEXT: and z5.d, z5.d, #0x3f
-; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z5.d
-; CHECK-NEXT: eor z5.d, z4.d, z6.d
+; CHECK-NEXT: eor z7.d, z4.d, z6.d
+; CHECK-NEXT: eor z6.d, z5.d, z6.d
+; CHECK-NEXT: and z7.d, z7.d, #0x3f
; CHECK-NEXT: lsr z2.d, z2.d, #1
-; CHECK-NEXT: lsr z3.d, z3.d, #1
; CHECK-NEXT: and z4.d, z4.d, #0x3f
; CHECK-NEXT: and z5.d, z5.d, #0x3f
-; CHECK-NEXT: and z7.d, z7.d, #0x3f
+; CHECK-NEXT: and z6.d, z6.d, #0x3f
+; CHECK-NEXT: lsr z3.d, z3.d, #1
+; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z7.d
; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z4.d
-; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z5.d
-; CHECK-NEXT: lsr z3.d, p0/m, z3.d, z7.d
+; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z5.d
+; CHECK-NEXT: lsr z3.d, p0/m, z3.d, z6.d
; CHECK-NEXT: orr z0.d, z0.d, z2.d
; CHECK-NEXT: orr z1.d, z1.d, z3.d
; CHECK-NEXT: ret
@@ -1127,11 +1128,11 @@
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: subr z1.d, z1.d, #0 // =0x0
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: and z2.d, z2.d, #0x3f
; CHECK-NEXT: and z1.d, z1.d, #0x3f
-; CHECK-NEXT: lslr z2.d, p0/m, z2.d, z0.d
-; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: orr z0.d, z2.d, z0.d
+; CHECK-NEXT: and z2.d, z2.d, #0x3f
+; CHECK-NEXT: lsrr z1.d, p0/m, z1.d, z0.d
+; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%fshl = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %fshl
@@ -1143,19 +1144,20 @@
; CHECK: // %bb.0:
; CHECK-NEXT: mov z4.d, z2.d
; CHECK-NEXT: subr z2.d, z2.d, #0 // =0x0
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: and z4.d, z4.d, #0x3f
-; CHECK-NEXT: and z2.d, z2.d, #0x3f
-; CHECK-NEXT: lslr z4.d, p0/m, z4.d, z0.d
-; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z2.d
-; CHECK-NEXT: mov z2.d, z3.d
+; CHECK-NEXT: mov z5.d, z3.d
; CHECK-NEXT: subr z3.d, z3.d, #0 // =0x0
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: and z2.d, z2.d, #0x3f
+; CHECK-NEXT: and z4.d, z4.d, #0x3f
; CHECK-NEXT: and z3.d, z3.d, #0x3f
-; CHECK-NEXT: lslr z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: lsrr z2.d, p0/m, z2.d, z0.d
+; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z4.d
+; CHECK-NEXT: and z5.d, z5.d, #0x3f
+; CHECK-NEXT: movprfx z4, z1
+; CHECK-NEXT: lsl z4.d, p0/m, z4.d, z5.d
; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z3.d
-; CHECK-NEXT: orr z0.d, z4.d, z0.d
-; CHECK-NEXT: orr z1.d, z2.d, z1.d
+; CHECK-NEXT: orr z0.d, z0.d, z2.d
+; CHECK-NEXT: orr z1.d, z4.d, z1.d
; CHECK-NEXT: ret
%fshl = call <vscale x 4 x i64> @llvm.fshl.nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %a, <vscale x 4 x i64> %b)
ret <vscale x 4 x i64> %fshl
@@ -1177,14 +1179,15 @@
define <vscale x 2 x i64> @fshr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c){
; CHECK-LABEL: fshr_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z3.d, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z3.d, z2.d, z3.d
+; CHECK-NEXT: mov z4.d, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: eor z2.d, z2.d, z4.d
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: lsl z0.d, z0.d, #1
-; CHECK-NEXT: and z2.d, z2.d, #0x3f
; CHECK-NEXT: and z3.d, z3.d, #0x3f
-; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z3.d
+; CHECK-NEXT: and z2.d, z2.d, #0x3f
+; CHECK-NEXT: lsl z0.d, z0.d, #1
+; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z3.d
+; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT: orr z0.d, z0.d, z1.d
; CHECK-NEXT: ret
%fshr = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c)
@@ -1197,11 +1200,11 @@
; CHECK-NEXT: mov z2.d, z1.d
; CHECK-NEXT: subr z1.d, z1.d, #0 // =0x0
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: and z2.d, z2.d, #0x3f
; CHECK-NEXT: and z1.d, z1.d, #0x3f
-; CHECK-NEXT:
lsrr z2.d, p0/m, z2.d, z0.d -; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: orr z0.d, z2.d, z0.d +; CHECK-NEXT: and z2.d, z2.d, #0x3f +; CHECK-NEXT: lslr z1.d, p0/m, z1.d, z0.d +; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: ret %fshr = call @llvm.fshr.nxv2i64( %a, %a, %b) ret %fshr diff --git a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll --- a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll +++ b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll @@ -10,9 +10,6 @@ define void @logical_32bit() minsize { ; CHECK-LABEL: logical_32bit: ; CHECK: // %bb.0: -; CHECK-NEXT: str x19, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w19, -16 ; CHECK-NEXT: adrp x8, :got:var1_32 ; CHECK-NEXT: adrp x9, :got:var2_32 ; CHECK-NEXT: ldr x8, [x8, :got_lo12:var1_32] @@ -22,40 +19,39 @@ ; CHECK-NEXT: and w11, w10, w9 ; CHECK-NEXT: bic w12, w10, w9 ; CHECK-NEXT: orr w13, w10, w9 -; CHECK-NEXT: orn w14, w10, w9 -; CHECK-NEXT: eor w15, w10, w9 -; CHECK-NEXT: eon w16, w9, w10 -; CHECK-NEXT: and w17, w10, w9, lsl #31 -; CHECK-NEXT: bic w18, w10, w9, lsl #31 -; CHECK-NEXT: orr w0, w10, w9, lsl #31 -; CHECK-NEXT: orn w1, w10, w9, lsl #31 -; CHECK-NEXT: eor w2, w10, w9, lsl #31 -; CHECK-NEXT: eon w3, w10, w9, lsl #31 -; CHECK-NEXT: bic w4, w10, w9, asr #10 -; CHECK-NEXT: eor w5, w10, w9, asr #10 -; CHECK-NEXT: orn w6, w10, w9, lsr #1 -; CHECK-NEXT: eor w7, w10, w9, lsr #1 -; CHECK-NEXT: eon w19, w10, w9, ror #20 -; CHECK-NEXT: and w9, w10, w9, ror #20 ; CHECK-NEXT: str w11, [x8] +; CHECK-NEXT: orn w11, w10, w9 ; CHECK-NEXT: str w12, [x8] +; CHECK-NEXT: eor w12, w10, w9 ; CHECK-NEXT: str w13, [x8] -; CHECK-NEXT: str w14, [x8] -; CHECK-NEXT: str w15, [x8] -; CHECK-NEXT: str w16, [x8] -; CHECK-NEXT: str w17, [x8] -; CHECK-NEXT: str w18, [x8] -; CHECK-NEXT: str w0, [x8] -; CHECK-NEXT: str w1, [x8] -; CHECK-NEXT: str w2, [x8] -; CHECK-NEXT: str w3, [x8] -; CHECK-NEXT: str w4, [x8] -; CHECK-NEXT: str w5, [x8] -; CHECK-NEXT: str w6, [x8] -; CHECK-NEXT: str w7, [x8] -; CHECK-NEXT: str w19, [x8] +; CHECK-NEXT: eon w13, w9, w10 +; CHECK-NEXT: str w11, [x8] +; CHECK-NEXT: and w11, w10, w9, lsl #31 +; CHECK-NEXT: str w12, [x8] +; CHECK-NEXT: bic w12, w10, w9, lsl #31 +; CHECK-NEXT: str w13, [x8] +; CHECK-NEXT: orr w13, w10, w9, lsl #31 +; CHECK-NEXT: str w11, [x8] +; CHECK-NEXT: orn w11, w10, w9, lsl #31 +; CHECK-NEXT: str w12, [x8] +; CHECK-NEXT: eor w12, w10, w9, lsl #31 +; CHECK-NEXT: str w13, [x8] +; CHECK-NEXT: eon w13, w10, w9, lsl #31 +; CHECK-NEXT: str w11, [x8] +; CHECK-NEXT: bic w11, w10, w9, asr #10 +; CHECK-NEXT: str w12, [x8] +; CHECK-NEXT: eor w12, w10, w9, asr #10 +; CHECK-NEXT: str w13, [x8] +; CHECK-NEXT: orn w13, w10, w9, lsr #1 +; CHECK-NEXT: str w11, [x8] +; CHECK-NEXT: eor w11, w10, w9, lsr #1 +; CHECK-NEXT: str w12, [x8] +; CHECK-NEXT: eon w12, w10, w9, ror #20 +; CHECK-NEXT: and w9, w10, w9, ror #20 +; CHECK-NEXT: str w13, [x8] +; CHECK-NEXT: str w11, [x8] +; CHECK-NEXT: str w12, [x8] ; CHECK-NEXT: str w9, [x8] -; CHECK-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %val1 = load i32, i32* @var1_32 %val2 = load i32, i32* @var2_32 @@ -130,9 +126,6 @@ define void @logical_64bit() minsize { ; CHECK-LABEL: logical_64bit: ; CHECK: // %bb.0: -; CHECK-NEXT: str x19, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w19, -16 ; CHECK-NEXT: adrp x8, :got:var1_64 ; CHECK-NEXT: adrp x9, :got:var2_64 ; CHECK-NEXT: ldr x8, [x8, :got_lo12:var1_64] @@ -142,40 +135,39 @@ ; CHECK-NEXT: and x11, x10, x9 ; CHECK-NEXT: bic x12, x10, x9 ; CHECK-NEXT: orr x13, x10, x9 -; CHECK-NEXT: orn x14, x10, x9 -; CHECK-NEXT: eor x15, x10, x9 -; CHECK-NEXT: eon x16, x9, x10 -; CHECK-NEXT: and x17, x10, x9, lsl #63 -; CHECK-NEXT: bic x18, x10, x9, lsl #63 -; CHECK-NEXT: orr x0, x10, x9, lsl #63 -; CHECK-NEXT: orn x1, x10, x9, lsl #63 -; CHECK-NEXT: eor x2, x10, x9, lsl #63 -; CHECK-NEXT: eon x3, x10, x9, lsl #63 -; CHECK-NEXT: bic x4, x10, x9, asr #10 -; CHECK-NEXT: eor x5, x10, x9, asr #10 -; CHECK-NEXT: orn x6, x10, x9, lsr #1 -; CHECK-NEXT: eor x7, x10, x9, lsr #1 -; CHECK-NEXT: eon x19, x10, x9, ror #20 -; CHECK-NEXT: and x9, x10, x9, ror #20 ; CHECK-NEXT: str x11, [x8] +; CHECK-NEXT: orn x11, x10, x9 ; CHECK-NEXT: str x12, [x8] +; CHECK-NEXT: eor x12, x10, x9 ; CHECK-NEXT: str x13, [x8] -; CHECK-NEXT: str x14, [x8] -; CHECK-NEXT: str x15, [x8] -; CHECK-NEXT: str x16, [x8] -; CHECK-NEXT: str x17, [x8] -; CHECK-NEXT: str x18, [x8] -; CHECK-NEXT: str x0, [x8] -; CHECK-NEXT: str x1, [x8] -; CHECK-NEXT: str x2, [x8] -; CHECK-NEXT: str x3, [x8] -; CHECK-NEXT: str x4, [x8] -; CHECK-NEXT: str x5, [x8] -; CHECK-NEXT: str x6, [x8] -; CHECK-NEXT: str x7, [x8] -; CHECK-NEXT: str x19, [x8] +; CHECK-NEXT: eon x13, x9, x10 +; CHECK-NEXT: str x11, [x8] +; CHECK-NEXT: and x11, x10, x9, lsl #63 +; CHECK-NEXT: str x12, [x8] +; CHECK-NEXT: bic x12, x10, x9, lsl #63 +; CHECK-NEXT: str x13, [x8] +; CHECK-NEXT: orr x13, x10, x9, lsl #63 +; CHECK-NEXT: str x11, [x8] +; CHECK-NEXT: orn x11, x10, x9, lsl #63 +; CHECK-NEXT: str x12, [x8] +; CHECK-NEXT: eor x12, x10, x9, lsl #63 +; CHECK-NEXT: str x13, [x8] +; CHECK-NEXT: eon x13, x10, x9, lsl #63 +; CHECK-NEXT: str x11, [x8] +; CHECK-NEXT: bic x11, x10, x9, asr #10 +; CHECK-NEXT: str x12, [x8] +; CHECK-NEXT: eor x12, x10, x9, asr #10 +; CHECK-NEXT: str x13, [x8] +; CHECK-NEXT: orn x13, x10, x9, lsr #1 +; CHECK-NEXT: str x11, [x8] +; CHECK-NEXT: eor x11, x10, x9, lsr #1 +; CHECK-NEXT: str x12, [x8] +; CHECK-NEXT: eon x12, x10, x9, ror #20 +; CHECK-NEXT: and x9, x10, x9, ror #20 +; CHECK-NEXT: str x13, [x8] +; CHECK-NEXT: str x11, [x8] +; CHECK-NEXT: str x12, [x8] ; CHECK-NEXT: str x9, [x8] -; CHECK-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %val1 = load i64, i64* @var1_64 %val2 = load i64, i64* @var2_64 @@ -260,17 +252,16 @@ ; CHECK-NEXT: ldr x9, [x8] ; CHECK-NEXT: ldr x10, [x10] ; CHECK-NEXT: tst x9, x10 -; CHECK-NEXT: b.gt .LBB2_4 +; CHECK-NEXT: b.gt .LBB2_2 ; CHECK-NEXT: // %bb.1: // %test2 ; CHECK-NEXT: tst x9, x10, lsl #63 -; CHECK-NEXT: b.lt .LBB2_4 -; CHECK-NEXT: // %bb.2: // %test3 ; CHECK-NEXT: and x10, x9, x10, asr #12 -; CHECK-NEXT: cmp x10, #1 -; CHECK-NEXT: b.ge .LBB2_4 -; CHECK-NEXT: // %bb.3: // %other_exit +; CHECK-NEXT: ccmp x10, #1, #0, ge +; CHECK-NEXT: b.lt .LBB2_3 +; CHECK-NEXT: .LBB2_2: // %common.ret +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_3: // %other_exit ; CHECK-NEXT: str x9, [x8] -; CHECK-NEXT: .LBB2_4: // %common.ret ; CHECK-NEXT: ret %val1 = load i64, i64* @var1_64 %val2 = load i64, i64* @var2_64 diff --git a/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll b/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll --- a/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll +++ b/llvm/test/CodeGen/AArch64/lowerMUL-newload.ll @@ -22,9 +22,9 @@ ; CHECK-LABEL: mlai16_and: ; CHECK: // 
%bb.0: // %entry ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h +; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: movi v1.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret entry: %v0 = sext <4 x i16> %vec0 to <4 x i32> @@ -41,9 +41,9 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr d0, [x0, #16] ; CHECK-NEXT: ldr d1, [x1, #16] -; CHECK-NEXT: ldr d2, [x2, #16] ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h +; CHECK-NEXT: ldr d1, [x2, #16] +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: str d0, [x0, #16] ; CHECK-NEXT: ret @@ -90,9 +90,9 @@ ; CHECK-LABEL: addmuli16_and: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: movi v3.2d, #0x00ffff0000ffff ; CHECK-NEXT: smlal v1.4s, v0.4h, v2.4h -; CHECK-NEXT: movi v0.2d, #0x00ffff0000ffff -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: and v0.16b, v1.16b, v3.16b ; CHECK-NEXT: ret entry: %v0 = sext <4 x i16> %vec0 to <4 x i32> @@ -158,9 +158,9 @@ ; CHECK-LABEL: mlai32_and: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s +; CHECK-NEXT: movi v3.2d, #0x000000ffffffff ; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret entry: %v0 = sext <2 x i32> %vec0 to <2 x i64> @@ -177,9 +177,9 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr d0, [x0, #32] ; CHECK-NEXT: ldr d1, [x1, #32] -; CHECK-NEXT: ldr d2, [x2, #32] ; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s -; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s +; CHECK-NEXT: ldr d1, [x2, #32] +; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: str d0, [x0, #32] ; CHECK-NEXT: ret @@ -226,9 +226,9 @@ ; CHECK-LABEL: addmuli32_and: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: smull v1.2d, v1.2s, v2.2s +; CHECK-NEXT: movi v3.2d, #0x000000ffffffff ; CHECK-NEXT: smlal v1.2d, v0.2s, v2.2s -; CHECK-NEXT: movi v0.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v1.16b, v0.16b +; CHECK-NEXT: and v0.16b, v1.16b, v3.16b ; CHECK-NEXT: ret entry: %v0 = sext <2 x i32> %vec0 to <2 x i64> diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll --- a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll @@ -20,14 +20,14 @@ ; CHECK-NEXT: b.lt .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: adrp x8, A -; CHECK-NEXT: ldr w21, [x8, :lo12:A] ; CHECK-NEXT: mov w20, w19 +; CHECK-NEXT: ldr w21, [x8, :lo12:A] ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: subs w19, w19, #1 ; CHECK-NEXT: sdiv w20, w20, w0 +; CHECK-NEXT: subs w19, w19, #1 ; CHECK-NEXT: b.ne .LBB0_2 ; CHECK-NEXT: b .LBB0_4 ; CHECK-NEXT: .LBB0_3: @@ -74,16 +74,16 @@ ; CHECK-NEXT: b.lt .LBB1_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: adrp x8, A -; CHECK-NEXT: ldr w20, [x8, :lo12:A] ; CHECK-NEXT: mov w0, w19 +; CHECK-NEXT: ldr w20, [x8, :lo12:A] ; CHECK-NEXT: bl _Z3usei ; CHECK-NEXT: mov w21, w19 ; CHECK-NEXT: .LBB1_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: subs w19, w19, #1 ; CHECK-NEXT: sdiv w21, w21, w0 +; CHECK-NEXT: subs 
w19, w19, #1 ; CHECK-NEXT: b.ne .LBB1_2 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_3: @@ -131,23 +131,23 @@ ; CHECK-NEXT: b.lt .LBB2_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: adrp x8, A -; CHECK-NEXT: ldr w20, [x8, :lo12:A] -; CHECK-NEXT: mov w8, #42 -; CHECK-NEXT: mov w21, w19 -; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: mov w9, #42 +; CHECK-NEXT: mov w20, w19 +; CHECK-NEXT: ldr w21, [x8, :lo12:A] +; CHECK-NEXT: str w9, [x0] ; CHECK-NEXT: .LBB2_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov w0, w20 +; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: bl _Z3usei +; CHECK-NEXT: sdiv w20, w20, w0 ; CHECK-NEXT: subs w19, w19, #1 -; CHECK-NEXT: sdiv w21, w21, w0 ; CHECK-NEXT: b.ne .LBB2_2 ; CHECK-NEXT: b .LBB2_4 ; CHECK-NEXT: .LBB2_3: -; CHECK-NEXT: mov w21, w19 +; CHECK-NEXT: mov w20, w19 ; CHECK-NEXT: .LBB2_4: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-throw.ll b/llvm/test/CodeGen/AArch64/machine-outliner-throw.ll --- a/llvm/test/CodeGen/AArch64/machine-outliner-throw.ll +++ b/llvm/test/CodeGen/AArch64/machine-outliner-throw.ll @@ -36,8 +36,8 @@ ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: orr w9, wzr, #0x1 ; CHECK-NEXT: mov w0, #4 +; CHECK-NEXT: orr w9, wzr, #0x1 ; CHECK-NEXT: madd w19, w8, w8, w9 ; CHECK-NEXT: bl __cxa_allocate_exception ; CHECK-NEXT: bl OUTLINED_FUNCTION_0 @@ -55,8 +55,8 @@ ; CHECK-LABEL: OUTLINED_FUNCTION_0: ; CHECK: .cfi_startproc ; CHECK: adrp x1, _ZTIi -; CHECK-NEXT: add x1, x1, :lo12:_ZTIi ; CHECK-NEXT: mov x2, xzr +; CHECK-NEXT: add x1, x1, :lo12:_ZTIi ; CHECK-NEXT: str w19, [x0] ; CHECK-NEXT: b __cxa_throw ; CHECK: .cfi_endproc diff --git a/llvm/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll b/llvm/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll --- a/llvm/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll +++ b/llvm/test/CodeGen/AArch64/machine_cse_impdef_killflags.ll @@ -9,11 +9,11 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: mov w8, #1 ; CHECK-NEXT: cmp x0, #0 +; CHECK-NEXT: csel w8, wzr, w8, ne ; CHECK-NEXT: mov x9, #2 ; CHECK-NEXT: mov x10, #3 -; CHECK-NEXT: csel w8, wzr, w8, ne -; CHECK-NEXT: csel x9, x9, x10, ne ; CHECK-NEXT: ubfx x8, x8, #0, #32 +; CHECK-NEXT: csel x9, x9, x10, ne ; CHECK-NEXT: add x0, x9, x8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/madd-lohi.ll b/llvm/test/CodeGen/AArch64/madd-lohi.ll --- a/llvm/test/CodeGen/AArch64/madd-lohi.ll +++ b/llvm/test/CodeGen/AArch64/madd-lohi.ll @@ -7,16 +7,16 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: umulh x8, x0, x2 ; CHECK-NEXT: madd x8, x0, x3, x8 -; CHECK-NEXT: madd x1, x1, x2, x8 ; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: madd x1, x1, x2, x8 ; CHECK-NEXT: ret ; ; CHECK-BE-LABEL: test_128bitmul: ; CHECK-BE: // %bb.0: ; CHECK-BE-NEXT: umulh x8, x1, x3 ; CHECK-BE-NEXT: madd x8, x1, x2, x8 -; CHECK-BE-NEXT: madd x0, x0, x3, x8 ; CHECK-BE-NEXT: mul x1, x1, x3 +; CHECK-BE-NEXT: madd x0, x0, x3, x8 ; CHECK-BE-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll b/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll --- a/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll +++ b/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll @@ -14,11 +14,11 @@ define i32 @test_memcpy(i32* nocapture %p, i32* nocapture readonly %q) { ; CHECK-LABEL: test_memcpy: ; 
CHECK: // %bb.0: -; CHECK-NEXT: ldp w8, w9, [x1] -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ldp w9, w10, [x1] +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: add w0, w9, w10 +; CHECK-NEXT: ldr q0, [x8, #16] +; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: ret %p0 = bitcast i32* %p to i8* %add.ptr = getelementptr inbounds i32, i32* %p, i64 4 @@ -37,11 +37,11 @@ define i32 @test_memcpy_inline(i32* nocapture %p, i32* nocapture readonly %q) { ; CHECK-LABEL: test_memcpy_inline: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp w8, w9, [x1] -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ldp w9, w10, [x1] +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: add w0, w9, w10 +; CHECK-NEXT: ldr q0, [x8, #16] +; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: ret %p0 = bitcast i32* %p to i8* %add.ptr = getelementptr inbounds i32, i32* %p, i64 4 @@ -60,11 +60,11 @@ define i32 @test_memmove(i32* nocapture %p, i32* nocapture readonly %q) { ; CHECK-LABEL: test_memmove: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp w8, w9, [x1] -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ldp w9, w10, [x1] +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: add w0, w9, w10 +; CHECK-NEXT: ldr q0, [x8, #16] +; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: ret %p0 = bitcast i32* %p to i8* %add.ptr = getelementptr inbounds i32, i32* %p, i64 4 @@ -84,11 +84,11 @@ define i32 @test_memset(i32* nocapture %p, i32* nocapture readonly %q) { ; CHECK-LABEL: test_memset: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp w8, w9, [x1] -; CHECK-NEXT: mov x10, #-6148914691236517206 -; CHECK-NEXT: stp x10, x10, [x0] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ldp w10, w11, [x1] +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: mov x9, #-6148914691236517206 +; CHECK-NEXT: add w0, w10, w11 +; CHECK-NEXT: stp x9, x9, [x8] ; CHECK-NEXT: ret %p0 = bitcast i32* %p to i8* tail call void @llvm.memset.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(16) %p0, i8 170, i64 16, i1 false), !alias.scope !2, !noalias !4 @@ -105,11 +105,11 @@ define i32 @test_mempcpy(i32* nocapture %p, i32* nocapture readonly %q) { ; CHECK-LABEL: test_mempcpy: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp w8, w9, [x1] -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: str q0, [x0] -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ldp w9, w10, [x1] +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: add w0, w9, w10 +; CHECK-NEXT: ldr q0, [x8, #16] +; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: ret %p0 = bitcast i32* %p to i8* %add.ptr = getelementptr inbounds i32, i32* %p, i64 4 diff --git a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll --- a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll +++ b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll @@ -497,12 +497,12 @@ ; BE-LABEL: le_i64_to_i16_order: ; BE: // %bb.0: ; BE-NEXT: lsr x8, x0, #16 -; BE-NEXT: lsr x9, x0, #32 -; BE-NEXT: lsr x10, x0, #48 +; BE-NEXT: lsr x9, x0, #48 +; BE-NEXT: lsr x10, x0, #32 ; BE-NEXT: strh w0, [x1] ; BE-NEXT: strh w8, [x1, #2] -; BE-NEXT: strh w10, [x1, #6] -; BE-NEXT: strh w9, [x1, #4] +; BE-NEXT: strh w9, [x1, #6] +; BE-NEXT: strh w10, [x1, #4] ; BE-NEXT: ret %sh1 = lshr i64 %x, 16 %sh2 = lshr i64 %x, 32 @@ -525,11 +525,11 @@ ; LE-LABEL: be_i64_to_i16: ; LE: // %bb.0: ; LE-NEXT: lsr x8, x0, #32 -; LE-NEXT: lsr x9, x0, #48 -; LE-NEXT: ror w10, w0, #16 -; 
LE-NEXT: str w10, [x1, #4] +; LE-NEXT: ror w9, w0, #16 +; LE-NEXT: lsr x10, x0, #48 ; LE-NEXT: strh w8, [x1, #2] -; LE-NEXT: strh w9, [x1] +; LE-NEXT: str w9, [x1, #4] +; LE-NEXT: strh w10, [x1] ; LE-NEXT: ret ; ; BE-LABEL: be_i64_to_i16: @@ -556,13 +556,13 @@ define void @be_i64_to_i16_order(i64 %x, i16* %p0) { ; LE-LABEL: be_i64_to_i16_order: ; LE: // %bb.0: -; LE-NEXT: lsr x8, x0, #16 +; LE-NEXT: lsr x8, x0, #48 ; LE-NEXT: lsr x9, x0, #32 -; LE-NEXT: lsr x10, x0, #48 +; LE-NEXT: lsr x10, x0, #16 ; LE-NEXT: strh w0, [x1, #6] -; LE-NEXT: strh w10, [x1] +; LE-NEXT: strh w8, [x1] ; LE-NEXT: strh w9, [x1, #2] -; LE-NEXT: strh w8, [x1, #4] +; LE-NEXT: strh w10, [x1, #4] ; LE-NEXT: ret ; ; BE-LABEL: be_i64_to_i16_order: @@ -672,8 +672,8 @@ ; CHECK-LABEL: i64_to_i32_wrong_addr: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr x8, x0, #32 -; CHECK-NEXT: str w8, [x1, #12] ; CHECK-NEXT: str w0, [x1] +; CHECK-NEXT: str w8, [x1, #12] ; CHECK-NEXT: ret %sh1 = lshr i64 %x, 32 %t0 = trunc i64 %x to i32 @@ -689,13 +689,13 @@ define void @i64_to_i16_wrong_order(i64 %x, i16* %p0) { ; CHECK-LABEL: i64_to_i16_wrong_order: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #16 -; CHECK-NEXT: lsr x9, x0, #32 -; CHECK-NEXT: lsr x10, x0, #48 -; CHECK-NEXT: strh w10, [x1, #6] -; CHECK-NEXT: strh w8, [x1, #4] -; CHECK-NEXT: strh w9, [x1, #2] +; CHECK-NEXT: lsr x8, x0, #48 +; CHECK-NEXT: lsr x9, x0, #16 +; CHECK-NEXT: lsr x10, x0, #32 ; CHECK-NEXT: strh w0, [x1] +; CHECK-NEXT: strh w8, [x1, #6] +; CHECK-NEXT: strh w9, [x1, #4] +; CHECK-NEXT: strh w10, [x1, #2] ; CHECK-NEXT: ret %sh1 = lshr i64 %x, 16 %sh2 = lshr i64 %x, 32 @@ -746,19 +746,19 @@ define void @i64_to_i8_incomplete(i64 %x, i8* %p0) { ; CHECK-LABEL: i64_to_i8_incomplete: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr x8, x0, #8 -; CHECK-NEXT: lsr x9, x0, #16 -; CHECK-NEXT: lsr x10, x0, #32 -; CHECK-NEXT: lsr x11, x0, #40 -; CHECK-NEXT: lsr x12, x0, #48 -; CHECK-NEXT: lsr x13, x0, #56 -; CHECK-NEXT: strb w13, [x1] -; CHECK-NEXT: strb w12, [x1, #1] -; CHECK-NEXT: strb w11, [x1, #2] -; CHECK-NEXT: strb w10, [x1, #3] -; CHECK-NEXT: strb w9, [x1, #5] -; CHECK-NEXT: strb w8, [x1, #6] +; CHECK-NEXT: lsr x8, x0, #56 +; CHECK-NEXT: lsr x9, x0, #48 +; CHECK-NEXT: lsr x10, x0, #40 +; CHECK-NEXT: lsr x11, x0, #32 ; CHECK-NEXT: strb w0, [x1, #7] +; CHECK-NEXT: strb w8, [x1] +; CHECK-NEXT: lsr x8, x0, #16 +; CHECK-NEXT: strb w9, [x1, #1] +; CHECK-NEXT: lsr x9, x0, #8 +; CHECK-NEXT: strb w10, [x1, #2] +; CHECK-NEXT: strb w11, [x1, #3] +; CHECK-NEXT: strb w8, [x1, #5] +; CHECK-NEXT: strb w9, [x1, #6] ; CHECK-NEXT: ret %sh1 = lshr i64 %x, 8 %sh2 = lshr i64 %x, 16 @@ -798,8 +798,8 @@ ; CHECK-LABEL: i32_to_i16_wrong_addr: ; CHECK: // %bb.0: ; CHECK-NEXT: lsr w8, w0, #16 -; CHECK-NEXT: strh w8, [x1, #4] ; CHECK-NEXT: strh w0, [x1] +; CHECK-NEXT: strh w8, [x1, #4] ; CHECK-NEXT: ret %sh1 = lshr i32 %x, 16 %t0 = trunc i32 %x to i16 @@ -815,13 +815,13 @@ define void @i32_to_i8_wrong_order(i32 %x, i8* %p0) { ; CHECK-LABEL: i32_to_i8_wrong_order: ; CHECK: // %bb.0: -; CHECK-NEXT: lsr w8, w0, #8 +; CHECK-NEXT: lsr w8, w0, #24 ; CHECK-NEXT: lsr w9, w0, #16 -; CHECK-NEXT: lsr w10, w0, #24 +; CHECK-NEXT: lsr w10, w0, #8 ; CHECK-NEXT: strb w0, [x1, #3] -; CHECK-NEXT: strb w10, [x1, #1] +; CHECK-NEXT: strb w8, [x1, #1] ; CHECK-NEXT: strb w9, [x1] -; CHECK-NEXT: strb w8, [x1, #2] +; CHECK-NEXT: strb w10, [x1, #2] ; CHECK-NEXT: ret %sh1 = lshr i32 %x, 8 %sh2 = lshr i32 %x, 16 diff --git a/llvm/test/CodeGen/AArch64/midpoint-int.ll b/llvm/test/CodeGen/AArch64/midpoint-int.ll --- 
a/llvm/test/CodeGen/AArch64/midpoint-int.ll +++ b/llvm/test/CodeGen/AArch64/midpoint-int.ll @@ -14,13 +14,13 @@ ; CHECK-LABEL: scalar_i32_signed_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: cmp w0, w1 -; CHECK-NEXT: csel w9, w1, w0, gt -; CHECK-NEXT: csel w10, w0, w1, gt -; CHECK-NEXT: mov w8, #-1 -; CHECK-NEXT: sub w9, w10, w9 -; CHECK-NEXT: cneg w8, w8, le -; CHECK-NEXT: lsr w9, w9, #1 -; CHECK-NEXT: madd w0, w9, w8, w0 +; CHECK-NEXT: mov w10, #-1 +; CHECK-NEXT: csel w8, w1, w0, gt +; CHECK-NEXT: csel w9, w0, w1, gt +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: cneg w9, w10, le +; CHECK-NEXT: lsr w8, w8, #1 +; CHECK-NEXT: madd w0, w8, w9, w0 ; CHECK-NEXT: ret %t3 = icmp sgt i32 %a1, %a2 ; signed %t4 = select i1 %t3, i32 -1, i32 1 @@ -37,13 +37,13 @@ ; CHECK-LABEL: scalar_i32_unsigned_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: cmp w0, w1 -; CHECK-NEXT: csel w9, w1, w0, hi -; CHECK-NEXT: csel w10, w0, w1, hi -; CHECK-NEXT: mov w8, #-1 -; CHECK-NEXT: sub w9, w10, w9 -; CHECK-NEXT: cneg w8, w8, ls -; CHECK-NEXT: lsr w9, w9, #1 -; CHECK-NEXT: madd w0, w9, w8, w0 +; CHECK-NEXT: mov w10, #-1 +; CHECK-NEXT: csel w8, w1, w0, hi +; CHECK-NEXT: csel w9, w0, w1, hi +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: cneg w9, w10, ls +; CHECK-NEXT: lsr w8, w8, #1 +; CHECK-NEXT: madd w0, w8, w9, w0 ; CHECK-NEXT: ret %t3 = icmp ugt i32 %a1, %a2 %t4 = select i1 %t3, i32 -1, i32 1 @@ -61,15 +61,15 @@ define i32 @scalar_i32_signed_mem_reg(i32* %a1_addr, i32 %a2) nounwind { ; CHECK-LABEL: scalar_i32_signed_mem_reg: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: mov w9, #-1 -; CHECK-NEXT: cmp w8, w1 -; CHECK-NEXT: csel w10, w1, w8, gt -; CHECK-NEXT: csel w11, w8, w1, gt +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: cmp w9, w1 +; CHECK-NEXT: csel w10, w1, w9, gt +; CHECK-NEXT: csel w11, w9, w1, gt ; CHECK-NEXT: sub w10, w11, w10 -; CHECK-NEXT: cneg w9, w9, le +; CHECK-NEXT: cneg w8, w8, le ; CHECK-NEXT: lsr w10, w10, #1 -; CHECK-NEXT: madd w0, w10, w9, w8 +; CHECK-NEXT: madd w0, w10, w8, w9 ; CHECK-NEXT: ret %a1 = load i32, i32* %a1_addr %t3 = icmp sgt i32 %a1, %a2 ; signed @@ -86,15 +86,15 @@ define i32 @scalar_i32_signed_reg_mem(i32 %a1, i32* %a2_addr) nounwind { ; CHECK-LABEL: scalar_i32_signed_reg_mem: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x1] -; CHECK-NEXT: mov w9, #-1 -; CHECK-NEXT: cmp w0, w8 -; CHECK-NEXT: csel w10, w8, w0, gt -; CHECK-NEXT: csel w8, w0, w8, gt -; CHECK-NEXT: sub w8, w8, w10 -; CHECK-NEXT: cneg w9, w9, le -; CHECK-NEXT: lsr w8, w8, #1 -; CHECK-NEXT: madd w0, w8, w9, w0 +; CHECK-NEXT: ldr w9, [x1] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: cmp w0, w9 +; CHECK-NEXT: csel w10, w9, w0, gt +; CHECK-NEXT: csel w9, w0, w9, gt +; CHECK-NEXT: sub w9, w9, w10 +; CHECK-NEXT: cneg w8, w8, le +; CHECK-NEXT: lsr w9, w9, #1 +; CHECK-NEXT: madd w0, w9, w8, w0 ; CHECK-NEXT: ret %a2 = load i32, i32* %a2_addr %t3 = icmp sgt i32 %a1, %a2 ; signed @@ -111,16 +111,16 @@ define i32 @scalar_i32_signed_mem_mem(i32* %a1_addr, i32* %a2_addr) nounwind { ; CHECK-LABEL: scalar_i32_signed_mem_mem: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: ldr w9, [x1] -; CHECK-NEXT: mov w10, #-1 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: csel w11, w9, w8, gt -; CHECK-NEXT: csel w9, w8, w9, gt -; CHECK-NEXT: sub w9, w9, w11 -; CHECK-NEXT: cneg w10, w10, le -; CHECK-NEXT: lsr w9, w9, #1 -; CHECK-NEXT: madd w0, w9, w10, w8 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: ldr w10, [x1] +; CHECK-NEXT: cmp w9, w10 +; CHECK-NEXT: csel w11, w10, w9, gt +; CHECK-NEXT: 
csel w10, w9, w10, gt +; CHECK-NEXT: sub w10, w10, w11 +; CHECK-NEXT: cneg w8, w8, le +; CHECK-NEXT: lsr w10, w10, #1 +; CHECK-NEXT: madd w0, w10, w8, w9 ; CHECK-NEXT: ret %a1 = load i32, i32* %a1_addr %a2 = load i32, i32* %a2_addr @@ -145,13 +145,13 @@ ; CHECK-LABEL: scalar_i64_signed_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: cmp x0, x1 -; CHECK-NEXT: csel x9, x1, x0, gt -; CHECK-NEXT: csel x10, x0, x1, gt -; CHECK-NEXT: mov x8, #-1 -; CHECK-NEXT: sub x9, x10, x9 -; CHECK-NEXT: cneg x8, x8, le -; CHECK-NEXT: lsr x9, x9, #1 -; CHECK-NEXT: madd x0, x9, x8, x0 +; CHECK-NEXT: mov x10, #-1 +; CHECK-NEXT: csel x8, x1, x0, gt +; CHECK-NEXT: csel x9, x0, x1, gt +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: cneg x9, x10, le +; CHECK-NEXT: lsr x8, x8, #1 +; CHECK-NEXT: madd x0, x8, x9, x0 ; CHECK-NEXT: ret %t3 = icmp sgt i64 %a1, %a2 ; signed %t4 = select i1 %t3, i64 -1, i64 1 @@ -168,13 +168,13 @@ ; CHECK-LABEL: scalar_i64_unsigned_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: cmp x0, x1 -; CHECK-NEXT: csel x9, x1, x0, hi -; CHECK-NEXT: csel x10, x0, x1, hi -; CHECK-NEXT: mov x8, #-1 -; CHECK-NEXT: sub x9, x10, x9 -; CHECK-NEXT: cneg x8, x8, ls -; CHECK-NEXT: lsr x9, x9, #1 -; CHECK-NEXT: madd x0, x9, x8, x0 +; CHECK-NEXT: mov x10, #-1 +; CHECK-NEXT: csel x8, x1, x0, hi +; CHECK-NEXT: csel x9, x0, x1, hi +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: cneg x9, x10, ls +; CHECK-NEXT: lsr x8, x8, #1 +; CHECK-NEXT: madd x0, x8, x9, x0 ; CHECK-NEXT: ret %t3 = icmp ugt i64 %a1, %a2 %t4 = select i1 %t3, i64 -1, i64 1 @@ -192,15 +192,15 @@ define i64 @scalar_i64_signed_mem_reg(i64* %a1_addr, i64 %a2) nounwind { ; CHECK-LABEL: scalar_i64_signed_mem_reg: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov x9, #-1 -; CHECK-NEXT: cmp x8, x1 -; CHECK-NEXT: csel x10, x1, x8, gt -; CHECK-NEXT: csel x11, x8, x1, gt +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: mov x8, #-1 +; CHECK-NEXT: cmp x9, x1 +; CHECK-NEXT: csel x10, x1, x9, gt +; CHECK-NEXT: csel x11, x9, x1, gt ; CHECK-NEXT: sub x10, x11, x10 -; CHECK-NEXT: cneg x9, x9, le +; CHECK-NEXT: cneg x8, x8, le ; CHECK-NEXT: lsr x10, x10, #1 -; CHECK-NEXT: madd x0, x10, x9, x8 +; CHECK-NEXT: madd x0, x10, x8, x9 ; CHECK-NEXT: ret %a1 = load i64, i64* %a1_addr %t3 = icmp sgt i64 %a1, %a2 ; signed @@ -217,15 +217,15 @@ define i64 @scalar_i64_signed_reg_mem(i64 %a1, i64* %a2_addr) nounwind { ; CHECK-LABEL: scalar_i64_signed_reg_mem: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x1] -; CHECK-NEXT: mov x9, #-1 -; CHECK-NEXT: cmp x0, x8 -; CHECK-NEXT: csel x10, x8, x0, gt -; CHECK-NEXT: csel x8, x0, x8, gt -; CHECK-NEXT: sub x8, x8, x10 -; CHECK-NEXT: cneg x9, x9, le -; CHECK-NEXT: lsr x8, x8, #1 -; CHECK-NEXT: madd x0, x8, x9, x0 +; CHECK-NEXT: ldr x9, [x1] +; CHECK-NEXT: mov x8, #-1 +; CHECK-NEXT: cmp x0, x9 +; CHECK-NEXT: csel x10, x9, x0, gt +; CHECK-NEXT: csel x9, x0, x9, gt +; CHECK-NEXT: sub x9, x9, x10 +; CHECK-NEXT: cneg x8, x8, le +; CHECK-NEXT: lsr x9, x9, #1 +; CHECK-NEXT: madd x0, x9, x8, x0 ; CHECK-NEXT: ret %a2 = load i64, i64* %a2_addr %t3 = icmp sgt i64 %a1, %a2 ; signed @@ -242,16 +242,16 @@ define i64 @scalar_i64_signed_mem_mem(i64* %a1_addr, i64* %a2_addr) nounwind { ; CHECK-LABEL: scalar_i64_signed_mem_mem: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ldr x9, [x1] -; CHECK-NEXT: mov x10, #-1 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: csel x11, x9, x8, gt -; CHECK-NEXT: csel x9, x8, x9, gt -; CHECK-NEXT: sub x9, x9, x11 -; CHECK-NEXT: cneg x10, x10, le -; CHECK-NEXT: lsr x9, x9, #1 -; CHECK-NEXT: madd x0, x9, x10, x8 +; CHECK-NEXT: 
ldr x9, [x0] +; CHECK-NEXT: mov x8, #-1 +; CHECK-NEXT: ldr x10, [x1] +; CHECK-NEXT: cmp x9, x10 +; CHECK-NEXT: csel x11, x10, x9, gt +; CHECK-NEXT: csel x10, x9, x10, gt +; CHECK-NEXT: sub x10, x10, x11 +; CHECK-NEXT: cneg x8, x8, le +; CHECK-NEXT: lsr x10, x10, #1 +; CHECK-NEXT: madd x0, x10, x8, x9 ; CHECK-NEXT: ret %a1 = load i64, i64* %a1_addr %a2 = load i64, i64* %a2_addr @@ -276,14 +276,14 @@ ; CHECK-LABEL: scalar_i16_signed_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: mov w9, #-1 +; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: cmp w8, w1, sxth -; CHECK-NEXT: cneg w8, w9, le -; CHECK-NEXT: csel w9, w1, w0, gt -; CHECK-NEXT: csel w10, w0, w1, gt -; CHECK-NEXT: sub w9, w10, w9 -; CHECK-NEXT: ubfx w9, w9, #1, #15 -; CHECK-NEXT: madd w0, w9, w8, w0 +; CHECK-NEXT: csel w8, w1, w0, gt +; CHECK-NEXT: csel w9, w0, w1, gt +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: cneg w9, w10, le +; CHECK-NEXT: ubfx w8, w8, #1, #15 +; CHECK-NEXT: madd w0, w8, w9, w0 ; CHECK-NEXT: ret %t3 = icmp sgt i16 %a1, %a2 ; signed %t4 = select i1 %t3, i16 -1, i16 1 @@ -300,14 +300,14 @@ ; CHECK-LABEL: scalar_i16_unsigned_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: mov w9, #-1 +; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: cmp w8, w1, uxth -; CHECK-NEXT: cneg w8, w9, ls -; CHECK-NEXT: csel w9, w1, w0, hi -; CHECK-NEXT: csel w10, w0, w1, hi -; CHECK-NEXT: sub w9, w10, w9 -; CHECK-NEXT: ubfx w9, w9, #1, #15 -; CHECK-NEXT: madd w0, w9, w8, w0 +; CHECK-NEXT: csel w8, w1, w0, hi +; CHECK-NEXT: csel w9, w0, w1, hi +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: cneg w9, w10, ls +; CHECK-NEXT: ubfx w8, w8, #1, #15 +; CHECK-NEXT: madd w0, w8, w9, w0 ; CHECK-NEXT: ret %t3 = icmp ugt i16 %a1, %a2 %t4 = select i1 %t3, i16 -1, i16 1 @@ -325,15 +325,15 @@ define i16 @scalar_i16_signed_mem_reg(i16* %a1_addr, i16 %a2) nounwind { ; CHECK-LABEL: scalar_i16_signed_mem_reg: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsh w8, [x0] -; CHECK-NEXT: mov w9, #-1 -; CHECK-NEXT: cmp w8, w1, sxth -; CHECK-NEXT: csel w10, w1, w8, gt -; CHECK-NEXT: csel w11, w8, w1, gt +; CHECK-NEXT: ldrsh w9, [x0] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: cmp w9, w1, sxth +; CHECK-NEXT: csel w10, w1, w9, gt +; CHECK-NEXT: csel w11, w9, w1, gt ; CHECK-NEXT: sub w10, w11, w10 -; CHECK-NEXT: cneg w9, w9, le +; CHECK-NEXT: cneg w8, w8, le ; CHECK-NEXT: ubfx w10, w10, #1, #15 -; CHECK-NEXT: madd w0, w10, w9, w8 +; CHECK-NEXT: madd w0, w10, w8, w9 ; CHECK-NEXT: ret %a1 = load i16, i16* %a1_addr %t3 = icmp sgt i16 %a1, %a2 ; signed @@ -350,14 +350,14 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, i16* %a2_addr) nounwind { ; CHECK-LABEL: scalar_i16_signed_reg_mem: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsh w8, [x1] -; CHECK-NEXT: sxth w9, w0 +; CHECK-NEXT: ldrsh w9, [x1] +; CHECK-NEXT: sxth w8, w0 ; CHECK-NEXT: mov w10, #-1 -; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: csel w8, w9, w0, gt +; CHECK-NEXT: csel w9, w0, w9, gt +; CHECK-NEXT: sub w8, w9, w8 ; CHECK-NEXT: cneg w9, w10, le -; CHECK-NEXT: csel w10, w8, w0, gt -; CHECK-NEXT: csel w8, w0, w8, gt -; CHECK-NEXT: sub w8, w8, w10 ; CHECK-NEXT: ubfx w8, w8, #1, #15 ; CHECK-NEXT: madd w0, w8, w9, w0 ; CHECK-NEXT: ret @@ -376,16 +376,16 @@ define i16 @scalar_i16_signed_mem_mem(i16* %a1_addr, i16* %a2_addr) nounwind { ; CHECK-LABEL: scalar_i16_signed_mem_mem: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsh w8, [x0] -; CHECK-NEXT: ldrsh w9, [x1] -; CHECK-NEXT: mov w10, #-1 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: csel w11, w9, w8, gt -; CHECK-NEXT: csel w9, w8, w9, gt -; 
CHECK-NEXT: sub w9, w9, w11 -; CHECK-NEXT: cneg w10, w10, le -; CHECK-NEXT: ubfx w9, w9, #1, #15 -; CHECK-NEXT: madd w0, w9, w10, w8 +; CHECK-NEXT: ldrsh w9, [x0] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: ldrsh w10, [x1] +; CHECK-NEXT: cmp w9, w10 +; CHECK-NEXT: csel w11, w10, w9, gt +; CHECK-NEXT: csel w10, w9, w10, gt +; CHECK-NEXT: sub w10, w10, w11 +; CHECK-NEXT: cneg w8, w8, le +; CHECK-NEXT: ubfx w10, w10, #1, #15 +; CHECK-NEXT: madd w0, w10, w8, w9 ; CHECK-NEXT: ret %a1 = load i16, i16* %a1_addr %a2 = load i16, i16* %a2_addr @@ -410,14 +410,14 @@ ; CHECK-LABEL: scalar_i8_signed_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: mov w9, #-1 +; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: cmp w8, w1, sxtb -; CHECK-NEXT: cneg w8, w9, le -; CHECK-NEXT: csel w9, w1, w0, gt -; CHECK-NEXT: csel w10, w0, w1, gt -; CHECK-NEXT: sub w9, w10, w9 -; CHECK-NEXT: ubfx w9, w9, #1, #7 -; CHECK-NEXT: madd w0, w9, w8, w0 +; CHECK-NEXT: csel w8, w1, w0, gt +; CHECK-NEXT: csel w9, w0, w1, gt +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: cneg w9, w10, le +; CHECK-NEXT: ubfx w8, w8, #1, #7 +; CHECK-NEXT: madd w0, w8, w9, w0 ; CHECK-NEXT: ret %t3 = icmp sgt i8 %a1, %a2 ; signed %t4 = select i1 %t3, i8 -1, i8 1 @@ -434,14 +434,14 @@ ; CHECK-LABEL: scalar_i8_unsigned_reg_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: mov w9, #-1 +; CHECK-NEXT: mov w10, #-1 ; CHECK-NEXT: cmp w8, w1, uxtb -; CHECK-NEXT: cneg w8, w9, ls -; CHECK-NEXT: csel w9, w1, w0, hi -; CHECK-NEXT: csel w10, w0, w1, hi -; CHECK-NEXT: sub w9, w10, w9 -; CHECK-NEXT: ubfx w9, w9, #1, #7 -; CHECK-NEXT: madd w0, w9, w8, w0 +; CHECK-NEXT: csel w8, w1, w0, hi +; CHECK-NEXT: csel w9, w0, w1, hi +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: cneg w9, w10, ls +; CHECK-NEXT: ubfx w8, w8, #1, #7 +; CHECK-NEXT: madd w0, w8, w9, w0 ; CHECK-NEXT: ret %t3 = icmp ugt i8 %a1, %a2 %t4 = select i1 %t3, i8 -1, i8 1 @@ -459,15 +459,15 @@ define i8 @scalar_i8_signed_mem_reg(i8* %a1_addr, i8 %a2) nounwind { ; CHECK-LABEL: scalar_i8_signed_mem_reg: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsb w8, [x0] -; CHECK-NEXT: mov w9, #-1 -; CHECK-NEXT: cmp w8, w1, sxtb -; CHECK-NEXT: csel w10, w1, w8, gt -; CHECK-NEXT: csel w11, w8, w1, gt +; CHECK-NEXT: ldrsb w9, [x0] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: cmp w9, w1, sxtb +; CHECK-NEXT: csel w10, w1, w9, gt +; CHECK-NEXT: csel w11, w9, w1, gt ; CHECK-NEXT: sub w10, w11, w10 -; CHECK-NEXT: cneg w9, w9, le +; CHECK-NEXT: cneg w8, w8, le ; CHECK-NEXT: ubfx w10, w10, #1, #7 -; CHECK-NEXT: madd w0, w10, w9, w8 +; CHECK-NEXT: madd w0, w10, w8, w9 ; CHECK-NEXT: ret %a1 = load i8, i8* %a1_addr %t3 = icmp sgt i8 %a1, %a2 ; signed @@ -484,14 +484,14 @@ define i8 @scalar_i8_signed_reg_mem(i8 %a1, i8* %a2_addr) nounwind { ; CHECK-LABEL: scalar_i8_signed_reg_mem: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsb w8, [x1] -; CHECK-NEXT: sxtb w9, w0 +; CHECK-NEXT: ldrsb w9, [x1] +; CHECK-NEXT: sxtb w8, w0 ; CHECK-NEXT: mov w10, #-1 -; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: csel w8, w9, w0, gt +; CHECK-NEXT: csel w9, w0, w9, gt +; CHECK-NEXT: sub w8, w9, w8 ; CHECK-NEXT: cneg w9, w10, le -; CHECK-NEXT: csel w10, w8, w0, gt -; CHECK-NEXT: csel w8, w0, w8, gt -; CHECK-NEXT: sub w8, w8, w10 ; CHECK-NEXT: ubfx w8, w8, #1, #7 ; CHECK-NEXT: madd w0, w8, w9, w0 ; CHECK-NEXT: ret @@ -510,16 +510,16 @@ define i8 @scalar_i8_signed_mem_mem(i8* %a1_addr, i8* %a2_addr) nounwind { ; CHECK-LABEL: scalar_i8_signed_mem_mem: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrsb w8, [x0] -; CHECK-NEXT: ldrsb w9, [x1] -; 
CHECK-NEXT: mov w10, #-1
-; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: csel w11, w9, w8, gt
-; CHECK-NEXT: csel w9, w8, w9, gt
-; CHECK-NEXT: sub w9, w9, w11
-; CHECK-NEXT: cneg w10, w10, le
-; CHECK-NEXT: ubfx w9, w9, #1, #7
-; CHECK-NEXT: madd w0, w9, w10, w8
+; CHECK-NEXT: ldrsb w9, [x0]
+; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: ldrsb w10, [x1]
+; CHECK-NEXT: cmp w9, w10
+; CHECK-NEXT: csel w11, w10, w9, gt
+; CHECK-NEXT: csel w10, w9, w10, gt
+; CHECK-NEXT: sub w10, w10, w11
+; CHECK-NEXT: cneg w8, w8, le
+; CHECK-NEXT: ubfx w10, w10, #1, #7
+; CHECK-NEXT: madd w0, w10, w8, w9
 ; CHECK-NEXT: ret
   %a1 = load i8, i8* %a1_addr
   %a2 = load i8, i8* %a2_addr
diff --git a/llvm/test/CodeGen/AArch64/min-max.ll b/llvm/test/CodeGen/AArch64/min-max.ll
--- a/llvm/test/CodeGen/AArch64/min-max.ll
+++ b/llvm/test/CodeGen/AArch64/min-max.ll
@@ -97,12 +97,19 @@
 declare <32 x i8> @llvm.smax.v32i8(<32 x i8> %a, <32 x i8> %b) readnone

 define void @smax32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %p) {
-; CHECK-LABEL: smax32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smax v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: smax v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: smax32i8:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: smax v1.16b, v1.16b, v3.16b
+; CHECK-ISEL-NEXT: smax v0.16b, v0.16b, v2.16b
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smax32i8:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: smax v0.16b, v0.16b, v2.16b
+; CHECK-GLOBAL-NEXT: smax v1.16b, v1.16b, v3.16b
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %a, <32 x i8> %b)
   store <32 x i8> %c, <32 x i8>* %p
   ret void
@@ -133,12 +140,19 @@
 declare <16 x i16> @llvm.smax.v16i16(<16 x i16> %a, <16 x i16> %b) readnone

 define void @smax16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %p) {
-; CHECK-LABEL: smax16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smax v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: smax v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: smax16i16:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: smax v1.8h, v1.8h, v3.8h
+; CHECK-ISEL-NEXT: smax v0.8h, v0.8h, v2.8h
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smax16i16:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: smax v0.8h, v0.8h, v2.8h
+; CHECK-GLOBAL-NEXT: smax v1.8h, v1.8h, v3.8h
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %a, <16 x i16> %b)
   store <16 x i16> %c, <16 x i16>* %p
   ret void
@@ -169,12 +183,19 @@
 declare <8 x i32> @llvm.smax.v8i32(<8 x i32> %a, <8 x i32> %b) readnone

 define void @smax8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %p) {
-; CHECK-LABEL: smax8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: smax v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: smax8i32:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: smax v1.4s, v1.4s, v3.4s
+; CHECK-ISEL-NEXT: smax v0.4s, v0.4s, v2.4s
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smax8i32:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: smax v0.4s, v0.4s, v2.4s
+; CHECK-GLOBAL-NEXT: smax v1.4s, v1.4s, v3.4s
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <8 x i32>@llvm.smax.v8i32(<8 x i32> %a, <8 x i32> %b)
   store <8 x i32> %c, <8 x i32>* %p
   ret void
@@ -225,10 +246,10 @@
 define void @smax4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) {
 ; CHECK-ISEL-LABEL: smax4i64:
 ; CHECK-ISEL: // %bb.0:
-; CHECK-ISEL-NEXT: cmgt v4.2d, v0.2d, v2.2d
-; CHECK-ISEL-NEXT: cmgt v5.2d, v1.2d, v3.2d
-; CHECK-ISEL-NEXT: bif v0.16b, v2.16b, v4.16b
-; CHECK-ISEL-NEXT: bif v1.16b, v3.16b, v5.16b
+; CHECK-ISEL-NEXT: cmgt v4.2d, v1.2d, v3.2d
+; CHECK-ISEL-NEXT: cmgt v5.2d, v0.2d, v2.2d
+; CHECK-ISEL-NEXT: bif v1.16b, v3.16b, v4.16b
+; CHECK-ISEL-NEXT: bif v0.16b, v2.16b, v5.16b
 ; CHECK-ISEL-NEXT: stp q0, q1, [x0]
 ; CHECK-ISEL-NEXT: ret
 ;
@@ -340,12 +361,19 @@
 declare <32 x i8> @llvm.umax.v32i8(<32 x i8> %a, <32 x i8> %b) readnone

 define void @umax32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %p) {
-; CHECK-LABEL: umax32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umax v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: umax v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: umax32i8:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: umax v1.16b, v1.16b, v3.16b
+; CHECK-ISEL-NEXT: umax v0.16b, v0.16b, v2.16b
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umax32i8:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: umax v0.16b, v0.16b, v2.16b
+; CHECK-GLOBAL-NEXT: umax v1.16b, v1.16b, v3.16b
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %a, <32 x i8> %b)
   store <32 x i8> %c, <32 x i8>* %p
   ret void
@@ -376,12 +404,19 @@
 declare <16 x i16> @llvm.umax.v16i16(<16 x i16> %a, <16 x i16> %b) readnone

 define void @umax16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %p) {
-; CHECK-LABEL: umax16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umax v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: umax v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: umax16i16:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: umax v1.8h, v1.8h, v3.8h
+; CHECK-ISEL-NEXT: umax v0.8h, v0.8h, v2.8h
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umax16i16:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: umax v0.8h, v0.8h, v2.8h
+; CHECK-GLOBAL-NEXT: umax v1.8h, v1.8h, v3.8h
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %a, <16 x i16> %b)
   store <16 x i16> %c, <16 x i16>* %p
   ret void
@@ -412,12 +447,19 @@
 declare <8 x i32> @llvm.umax.v8i32(<8 x i32> %a, <8 x i32> %b) readnone

 define void @umax8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %p) {
-; CHECK-LABEL: umax8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: umax v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: umax8i32:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: umax v1.4s, v1.4s, v3.4s
+; CHECK-ISEL-NEXT: umax v0.4s, v0.4s, v2.4s
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umax8i32:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: umax v0.4s, v0.4s, v2.4s
+; CHECK-GLOBAL-NEXT: umax v1.4s, v1.4s, v3.4s
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <8 x i32>@llvm.umax.v8i32(<8 x i32> %a, <8 x i32> %b)
   store <8 x i32> %c, <8 x i32>* %p
   ret void
@@ -468,10 +510,10 @@
 define void @umax4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) {
 ; CHECK-ISEL-LABEL: umax4i64:
 ; CHECK-ISEL: // %bb.0:
-; CHECK-ISEL-NEXT: cmhi v4.2d, v0.2d, v2.2d
-; CHECK-ISEL-NEXT: cmhi v5.2d, v1.2d, v3.2d
-; CHECK-ISEL-NEXT: bif v0.16b, v2.16b, v4.16b
-; CHECK-ISEL-NEXT: bif v1.16b, v3.16b, v5.16b
+; CHECK-ISEL-NEXT: cmhi v4.2d, v1.2d, v3.2d
+; CHECK-ISEL-NEXT: cmhi v5.2d, v0.2d, v2.2d
+; CHECK-ISEL-NEXT: bif v1.16b, v3.16b, v4.16b
+; CHECK-ISEL-NEXT: bif v0.16b, v2.16b, v5.16b
 ; CHECK-ISEL-NEXT: stp q0, q1, [x0]
 ; CHECK-ISEL-NEXT: ret
 ;
@@ -583,12 +625,19 @@
 declare <32 x i8> @llvm.smin.v32i8(<32 x i8> %a, <32 x i8> %b) readnone

 define void @smin32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %p) {
-; CHECK-LABEL: smin32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smin v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: smin v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: smin32i8:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: smin v1.16b, v1.16b, v3.16b
+; CHECK-ISEL-NEXT: smin v0.16b, v0.16b, v2.16b
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smin32i8:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: smin v0.16b, v0.16b, v2.16b
+; CHECK-GLOBAL-NEXT: smin v1.16b, v1.16b, v3.16b
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %a, <32 x i8> %b)
   store <32 x i8> %c, <32 x i8>* %p
   ret void
@@ -619,12 +668,19 @@
 declare <16 x i16> @llvm.smin.v16i16(<16 x i16> %a, <16 x i16> %b) readnone

 define void @smin16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %p) {
-; CHECK-LABEL: smin16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smin v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: smin v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: smin16i16:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: smin v1.8h, v1.8h, v3.8h
+; CHECK-ISEL-NEXT: smin v0.8h, v0.8h, v2.8h
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smin16i16:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: smin v0.8h, v0.8h, v2.8h
+; CHECK-GLOBAL-NEXT: smin v1.8h, v1.8h, v3.8h
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %a, <16 x i16> %b)
   store <16 x i16> %c, <16 x i16>* %p
   ret void
@@ -655,12 +711,19 @@
 declare <8 x i32> @llvm.smin.v8i32(<8 x i32> %a, <8 x i32> %b) readnone

 define void @smin8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %p) {
-; CHECK-LABEL: smin8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: smin v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: smin8i32:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: smin v1.4s, v1.4s, v3.4s
+; CHECK-ISEL-NEXT: smin v0.4s, v0.4s, v2.4s
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: smin8i32:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: smin v0.4s, v0.4s, v2.4s
+; CHECK-GLOBAL-NEXT: smin v1.4s, v1.4s, v3.4s
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <8 x i32>@llvm.smin.v8i32(<8 x i32> %a, <8 x i32> %b)
   store <8 x i32> %c, <8 x i32>* %p
   ret void
@@ -711,10 +774,10 @@
 define void @smin4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) {
 ; CHECK-ISEL-LABEL: smin4i64:
 ; CHECK-ISEL: // %bb.0:
-; CHECK-ISEL-NEXT: cmgt v4.2d, v2.2d, v0.2d
-; CHECK-ISEL-NEXT: cmgt v5.2d, v3.2d, v1.2d
-; CHECK-ISEL-NEXT: bif v0.16b, v2.16b, v4.16b
-; CHECK-ISEL-NEXT: bif v1.16b, v3.16b, v5.16b
+; CHECK-ISEL-NEXT: cmgt v4.2d, v3.2d, v1.2d
+; CHECK-ISEL-NEXT: cmgt v5.2d, v2.2d, v0.2d
+; CHECK-ISEL-NEXT: bif v1.16b, v3.16b, v4.16b
+; CHECK-ISEL-NEXT: bif v0.16b, v2.16b, v5.16b
 ; CHECK-ISEL-NEXT: stp q0, q1, [x0]
 ; CHECK-ISEL-NEXT: ret
 ;
@@ -826,12 +889,19 @@
 declare <32 x i8> @llvm.umin.v32i8(<32 x i8> %a, <32 x i8> %b) readnone

 define void @umin32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %p) {
-; CHECK-LABEL: umin32i8:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umin v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: umin v1.16b, v1.16b, v3.16b
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: umin32i8:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: umin v1.16b, v1.16b, v3.16b
+; CHECK-ISEL-NEXT: umin v0.16b, v0.16b, v2.16b
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umin32i8:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: umin v0.16b, v0.16b, v2.16b
+; CHECK-GLOBAL-NEXT: umin v1.16b, v1.16b, v3.16b
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %a, <32 x i8> %b)
   store <32 x i8> %c, <32 x i8>* %p
   ret void
@@ -862,12 +932,19 @@
 declare <16 x i16> @llvm.umin.v16i16(<16 x i16> %a, <16 x i16> %b) readnone

 define void @umin16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %p) {
-; CHECK-LABEL: umin16i16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umin v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: umin v1.8h, v1.8h, v3.8h
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: umin16i16:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: umin v1.8h, v1.8h, v3.8h
+; CHECK-ISEL-NEXT: umin v0.8h, v0.8h, v2.8h
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umin16i16:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: umin v0.8h, v0.8h, v2.8h
+; CHECK-GLOBAL-NEXT: umin v1.8h, v1.8h, v3.8h
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %a, <16 x i16> %b)
   store <16 x i16> %c, <16 x i16>* %p
   ret void
@@ -898,12 +975,19 @@
 declare <8 x i32> @llvm.umin.v8i32(<8 x i32> %a, <8 x i32> %b) readnone

 define void @umin8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %p) {
-; CHECK-LABEL: umin8i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: umin v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: stp q0, q1, [x0]
-; CHECK-NEXT: ret
+; CHECK-ISEL-LABEL: umin8i32:
+; CHECK-ISEL: // %bb.0:
+; CHECK-ISEL-NEXT: umin v1.4s, v1.4s, v3.4s
+; CHECK-ISEL-NEXT: umin v0.4s, v0.4s, v2.4s
+; CHECK-ISEL-NEXT: stp q0, q1, [x0]
+; CHECK-ISEL-NEXT: ret
+;
+; CHECK-GLOBAL-LABEL: umin8i32:
+; CHECK-GLOBAL: // %bb.0:
+; CHECK-GLOBAL-NEXT: umin v0.4s, v0.4s, v2.4s
+; CHECK-GLOBAL-NEXT: umin v1.4s, v1.4s, v3.4s
+; CHECK-GLOBAL-NEXT: stp q0, q1, [x0]
+; CHECK-GLOBAL-NEXT: ret
   %c = call <8 x i32>@llvm.umin.v8i32(<8 x i32> %a, <8 x i32> %b)
   store <8 x i32> %c, <8 x i32>* %p
   ret void
@@ -954,10 +1038,10 @@
 define void @umin4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %p) {
 ; CHECK-ISEL-LABEL: umin4i64:
 ; CHECK-ISEL: // %bb.0:
-; CHECK-ISEL-NEXT: cmhi v4.2d, v2.2d, v0.2d
-; CHECK-ISEL-NEXT: cmhi v5.2d, v3.2d, v1.2d
-; CHECK-ISEL-NEXT: bif v0.16b, v2.16b, v4.16b
-; CHECK-ISEL-NEXT: bif v1.16b, v3.16b, v5.16b
+; CHECK-ISEL-NEXT: cmhi v4.2d, v3.2d, v1.2d
+; CHECK-ISEL-NEXT: cmhi v5.2d, v2.2d, v0.2d
+; CHECK-ISEL-NEXT: bif v1.16b, v3.16b, v4.16b
+; CHECK-ISEL-NEXT: bif v0.16b, v2.16b, v5.16b
 ; CHECK-ISEL-NEXT: stp q0, q1, [x0]
 ; CHECK-ISEL-NEXT: ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll b/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll
--- a/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/minmax-of-minmax.ll
@@ -10,9 +10,9 @@
 define <4 x i32> @smin_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_bc:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -26,9 +26,9 @@
 define <4 x i32> @smin_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_cb:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -74,9 +74,9 @@
 define <4 x i32> @smin_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_bc_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -90,9 +90,9 @@
 define <4 x i32> @smin_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_cb_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -138,9 +138,9 @@
 define <4 x i32> @smin_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_bc_eq_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -154,9 +154,9 @@
 define <4 x i32> @smin_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_cb_eq_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -202,9 +202,9 @@
 define <4 x i32> @smin_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_bc_eq_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -218,9 +218,9 @@
 define <4 x i32> @smin_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smin_ab_cb_eq_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp slt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -266,9 +266,9 @@
 define <4 x i32> @smax_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_bc:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -282,9 +282,9 @@
 define <4 x i32> @smax_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_cb:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -330,9 +330,9 @@
 define <4 x i32> @smax_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_bc_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -346,9 +346,9 @@
 define <4 x i32> @smax_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_cb_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -394,9 +394,9 @@
 define <4 x i32> @smax_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_bc_eq_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -410,9 +410,9 @@
 define <4 x i32> @smax_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_cb_eq_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -458,9 +458,9 @@
 define <4 x i32> @smax_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_bc_eq_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -474,9 +474,9 @@
 define <4 x i32> @smax_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: smax_ab_cb_eq_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp sgt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -522,9 +522,9 @@
 define <4 x i32> @umin_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_bc:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -538,9 +538,9 @@
 define <4 x i32> @umin_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_cb:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -586,9 +586,9 @@
 define <4 x i32> @umin_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_bc_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -602,9 +602,9 @@
 define <4 x i32> @umin_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_cb_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -650,9 +650,9 @@
 define <4 x i32> @umin_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_bc_eq_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -666,9 +666,9 @@
 define <4 x i32> @umin_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_cb_eq_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -714,9 +714,9 @@
 define <4 x i32> @umin_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_bc_eq_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -730,9 +730,9 @@
 define <4 x i32> @umin_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umin_ab_cb_eq_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umin v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ult <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -778,9 +778,9 @@
 define <4 x i32> @umax_ab_bc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_bc:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -794,9 +794,9 @@
 define <4 x i32> @umax_ab_cb(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_cb:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -842,9 +842,9 @@
 define <4 x i32> @umax_ab_bc_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_bc_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -858,9 +858,9 @@
 define <4 x i32> @umax_ab_cb_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_cb_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -906,9 +906,9 @@
 define <4 x i32> @umax_ab_bc_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_bc_eq_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -922,9 +922,9 @@
 define <4 x i32> @umax_ab_cb_eq_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_cb_eq_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -970,9 +970,9 @@
 define <4 x i32> @umax_ab_bc_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_bc_eq_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -986,9 +986,9 @@
 define <4 x i32> @umax_ab_cb_eq_swap_pred(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: umax_ab_cb_eq_swap_pred:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: umax v2.4s, v2.4s, v1.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
   %cmp_ab = icmp ugt <4 x i32> %a, %b
   %min_ab = select <4 x i1> %cmp_ab, <4 x i32> %a, <4 x i32> %b
@@ -1034,8 +1034,8 @@
 define <4 x i32> @notted_smin_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_bc:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
@@ -1056,8 +1056,8 @@
 define <4 x i32> @notted_smin_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_cb:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s
@@ -1078,9 +1078,9 @@
 define <4 x i32> @notted_smin_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ab:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1100,9 +1100,9 @@
 define <4 x i32> @notted_smin_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ba:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1122,8 +1122,8 @@
 define <4 x i32> @notted_smin_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_bc_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
@@ -1144,8 +1144,8 @@
 define <4 x i32> @notted_smin_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_cb_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s
@@ -1166,9 +1166,9 @@
 define <4 x i32> @notted_smin_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ab_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1188,9 +1188,9 @@
 define <4 x i32> @notted_smin_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ba_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1210,8 +1210,8 @@
 define <4 x i32> @notted_smin_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_bc_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
@@ -1232,8 +1232,8 @@
 define <4 x i32> @notted_smin_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_cb_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s
@@ -1254,9 +1254,9 @@
 define <4 x i32> @notted_smin_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ab_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1276,9 +1276,9 @@
 define <4 x i32> @notted_smin_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ba_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1298,8 +1298,8 @@
 define <4 x i32> @notted_smin_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_bc_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v1.4s, v1.4s, v2.4s
@@ -1320,8 +1320,8 @@
 define <4 x i32> @notted_smin_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_ab_cb_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v1.4s, v2.4s, v1.4s
@@ -1342,9 +1342,9 @@
 define <4 x i32> @notted_smin_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ab_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1364,9 +1364,9 @@
 define <4 x i32> @notted_smin_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smin_bc_ba_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smin v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: smin v0.4s, v2.4s, v0.4s
@@ -1386,8 +1386,8 @@
 define <4 x i32> @notted_smax_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_bc:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s
@@ -1408,8 +1408,8 @@
 define <4 x i32> @notted_smax_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_cb:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s
@@ -1430,9 +1430,9 @@
 define <4 x i32> @notted_smax_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ab:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1452,9 +1452,9 @@
 define <4 x i32> @notted_smax_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ba:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1474,8 +1474,8 @@
 define <4 x i32> @notted_smax_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_bc_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s
@@ -1496,8 +1496,8 @@
 define <4 x i32> @notted_smax_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_cb_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s
@@ -1518,9 +1518,9 @@
 define <4 x i32> @notted_smax_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ab_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1540,9 +1540,9 @@
 define <4 x i32> @notted_smax_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ba_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1562,8 +1562,8 @@
 define <4 x i32> @notted_smax_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_bc_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s
@@ -1584,8 +1584,8 @@
 define <4 x i32> @notted_smax_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_cb_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s
@@ -1606,9 +1606,9 @@
 define <4 x i32> @notted_smax_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ab_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1628,9 +1628,9 @@
 define <4 x i32> @notted_smax_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ba_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1650,8 +1650,8 @@
 define <4 x i32> @notted_smax_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_bc_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s
@@ -1672,8 +1672,8 @@
 define <4 x i32> @notted_smax_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_ab_cb_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v1.4s, v2.4s, v1.4s
@@ -1694,9 +1694,9 @@
 define <4 x i32> @notted_smax_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ab_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1716,9 +1716,9 @@
 define <4 x i32> @notted_smax_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_smax_bc_ba_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: smax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: smax v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: smax v0.4s, v2.4s, v0.4s
@@ -1738,8 +1738,8 @@
 define <4 x i32> @notted_umin_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_bc:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
@@ -1760,8 +1760,8 @@
 define <4 x i32> @notted_umin_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_cb:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s
@@ -1782,9 +1782,9 @@
 define <4 x i32> @notted_umin_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ab:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -1804,9 +1804,9 @@
 define <4 x i32> @notted_umin_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ba:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -1826,8 +1826,8 @@
 define <4 x i32> @notted_umin_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_bc_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
@@ -1848,8 +1848,8 @@
 define <4 x i32> @notted_umin_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_cb_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s
@@ -1870,9 +1870,9 @@
 define <4 x i32> @notted_umin_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ab_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -1892,9 +1892,9 @@
 define <4 x i32> @notted_umin_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ba_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -1914,8 +1914,8 @@
 define <4 x i32> @notted_umin_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_bc_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
@@ -1936,8 +1936,8 @@
 define <4 x i32> @notted_umin_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_cb_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s
@@ -1958,9 +1958,9 @@
 define <4 x i32> @notted_umin_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ab_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -1980,9 +1980,9 @@
 define <4 x i32> @notted_umin_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ba_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -2002,8 +2002,8 @@
 define <4 x i32> @notted_umin_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_bc_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
@@ -2024,8 +2024,8 @@
 define <4 x i32> @notted_umin_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_ab_cb_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v1.4s, v2.4s, v1.4s
@@ -2046,9 +2046,9 @@
 define <4 x i32> @notted_umin_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ab_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -2068,9 +2068,9 @@
 define <4 x i32> @notted_umin_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umin_bc_ba_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umin v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umin v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: umin v0.4s, v2.4s, v0.4s
@@ -2090,8 +2090,8 @@
 define <4 x i32> @notted_umax_ab_bc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_bc:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s
@@ -2112,8 +2112,8 @@
 define <4 x i32> @notted_umax_ab_cb(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_cb:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s
@@ -2134,9 +2134,9 @@
 define <4 x i32> @notted_umax_bc_ab(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ab:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2156,9 +2156,9 @@
 define <4 x i32> @notted_umax_bc_ba(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ba:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2178,8 +2178,8 @@
 define <4 x i32> @notted_umax_ab_bc_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_bc_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s
@@ -2200,8 +2200,8 @@
 define <4 x i32> @notted_umax_ab_cb_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_cb_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s
@@ -2222,9 +2222,9 @@
 define <4 x i32> @notted_umax_bc_ab_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ab_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2244,9 +2244,9 @@
 define <4 x i32> @notted_umax_bc_ba_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ba_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2266,8 +2266,8 @@
 define <4 x i32> @notted_umax_ab_bc_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_bc_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s
@@ -2288,8 +2288,8 @@
 define <4 x i32> @notted_umax_ab_cb_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_cb_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s
@@ -2310,9 +2310,9 @@
 define <4 x i32> @notted_umax_bc_ab_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ab_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2332,9 +2332,9 @@
 define <4 x i32> @notted_umax_bc_ba_eq_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ba_eq_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2354,8 +2354,8 @@
 define <4 x i32> @notted_umax_ab_bc_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_bc_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v1.4s, v1.4s, v2.4s
@@ -2376,8 +2376,8 @@
 define <4 x i32> @notted_umax_ab_cb_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_ab_cb_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v1.4s, v2.4s, v1.4s
@@ -2398,9 +2398,9 @@
 define <4 x i32> @notted_umax_bc_ab_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ab_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
@@ -2420,9 +2420,9 @@
 define <4 x i32> @notted_umax_bc_ba_eq_swap_pred(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: notted_umax_bc_ba_eq_swap_pred:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: mvn v1.16b, v1.16b
 ; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: mvn v0.16b, v0.16b
 ; CHECK-NEXT: umax v2.4s, v1.4s, v2.4s
 ; CHECK-NEXT: umax v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: umax v0.4s, v2.4s, v0.4s
diff --git a/llvm/test/CodeGen/AArch64/minmax.ll b/llvm/test/CodeGen/AArch64/minmax.ll
--- a/llvm/test/CodeGen/AArch64/minmax.ll
+++ b/llvm/test/CodeGen/AArch64/minmax.ll
@@ -108,9 +108,9 @@
 define <16 x i32> @t11(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: t11:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: smin v2.4s, v2.4s, v6.4s
 ; CHECK-NEXT: smin v0.4s, v0.4s, v4.4s
 ; CHECK-NEXT: smin v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: smin v2.4s, v2.4s, v6.4s
 ; CHECK-NEXT: smin v3.4s, v3.4s, v7.4s
 ; CHECK-NEXT: ret
   %t1 = icmp sle <16 x i32> %a, %b
@@ -122,10 +122,10 @@
 define <16 x i8> @t12(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: t12:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cmhi v2.16b, v1.16b, v0.16b
-; CHECK-NEXT: movi v3.16b, #1
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v1.16b, v2.16b, v3.16b
+; CHECK-NEXT: movi v2.16b, #1
+; CHECK-NEXT: cmhi v3.16b, v1.16b, v0.16b
+; CHECK-NEXT: bif v0.16b, v1.16b, v3.16b
+; CHECK-NEXT: and v1.16b, v3.16b, v2.16b
 ; CHECK-NEXT: add v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
   %t1 = icmp ugt <16 x i8> %b, %a
diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-lit.ll b/llvm/test/CodeGen/AArch64/misched-fusion-lit.ll
--- a/llvm/test/CodeGen/AArch64/misched-fusion-lit.ll
+++ b/llvm/test/CodeGen/AArch64/misched-fusion-lit.ll
@@ -19,7 +19,6 @@

 ; CHECK-LABEL: litp:
 ; CHECK: adrp [[R:x[0-9]+]], litp
-; CHECKDONT-NEXT: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
 ; CHECKFUSE-NEXT: add {{x[0-9]+}}, [[R]], :lo12:litp
 }

@@ -44,10 +43,10 @@

 ; CHECK-LABEL: litl:
 ; CHECK: mov [[R:x[0-9]+]], {{#[0-9]+}}
+; CHECKDONT-NEXT: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
 ; CHECK-NEXT: movk [[R]], {{#[0-9]+}}, lsl #16
 ; CHECK: movk [[R]], {{#[0-9]+}}, lsl #32
-; CHECKDONT-NEXT: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-; CHECKFUSE-NEXT: movk [[R]], {{#[0-9]+}}, lsl #48
+; CHECK-NEXT: movk [[R]], {{#[0-9]+}}, lsl #48
 }

 ; Function Attrs: norecurse nounwind readnone
diff --git a/llvm/test/CodeGen/AArch64/misched-fusion.ll b/llvm/test/CodeGen/AArch64/misched-fusion.ll
--- a/llvm/test/CodeGen/AArch64/misched-fusion.ll
+++ b/llvm/test/CodeGen/AArch64/misched-fusion.ll
@@ -1,5 +1,5 @@
-; RUN: llc -o - %s -mtriple=aarch64-unknown -aarch64-enable-cond-br-tune=false -mattr=+arith-bcc-fusion | FileCheck %s --check-prefix=FUSEBCC
-; RUN: llc -o - %s -mtriple=aarch64-unknown -aarch64-enable-cond-br-tune=false -mattr=+arith-cbz-fusion | FileCheck %s --check-prefix=FUSECBZ
+; RUN: llc -o - %s -mtriple=aarch64-unknown -aarch64-enable-cond-br-tune=false -mcpu=cortex-a57 -mattr=+arith-bcc-fusion | FileCheck %s --check-prefix=FUSEBCC
+; RUN: llc -o - %s -mtriple=aarch64-unknown -aarch64-enable-cond-br-tune=false -mcpu=cortex-a57 -mattr=+arith-cbz-fusion | FileCheck %s --check-prefix=FUSECBZ
 ; RUN: llc -o - %s -mtriple=aarch64-unknown -aarch64-enable-cond-br-tune=false -mcpu=cyclone | FileCheck %s --check-prefix=FUSEBCC --check-prefix=FUSECBZ

 target triple = "aarch64-unknown"
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
@@ -39,8 +39,9 @@
 define <8 x i32> @splice_v8i32_idx(<8 x i32> %a, <8 x i32> %b) #0 {
 ; CHECK-LABEL: splice_v8i32_idx:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ext v3.16b, v2.16b, v3.16b, #4
 ; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #4
-; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #4
+; CHECK-NEXT: mov v1.16b, v3.16b
 ; CHECK-NEXT: ret
   %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 5)
   ret <8 x i32> %res
@@ -50,10 +51,12 @@
 define <16 x float> @splice_v16f32_idx(<16 x float> %a, <16 x float> %b) #0 {
 ; CHECK-LABEL: splice_v16f32_idx:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ext v5.16b, v4.16b, v5.16b, #12
+; CHECK-NEXT: ext v6.16b, v2.16b, v3.16b, #12
 ; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #12
-; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #12
 ; CHECK-NEXT: ext v2.16b, v3.16b, v4.16b, #12
-; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12
+; CHECK-NEXT: mov v3.16b, v5.16b
+; CHECK-NEXT: mov v1.16b, v6.16b
 ; CHECK-NEXT: ret
   %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7)
   ret <16 x float> %res
@@ -104,8 +107,9 @@
 define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 {
 ; CHECK-LABEL: splice_v8i32:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ext v3.16b, v2.16b, v3.16b, #4
 ; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #4
-; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #4
+; CHECK-NEXT: mov v1.16b, v3.16b
 ; CHECK-NEXT: ret
   %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 -3)
   ret <8 x i32> %res
@@ -115,10 +119,12 @@
 define <16 x float> @splice_v16f32(<16 x float> %a, <16 x float> %b) #0 {
 ; CHECK-LABEL: splice_v16f32:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ext v5.16b, v4.16b, v5.16b, #12
+; CHECK-NEXT: ext v6.16b, v2.16b, v3.16b, #12
 ; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #12
-; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #12
 ; CHECK-NEXT: ext v2.16b, v3.16b, v4.16b, #12
-; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12
+; CHECK-NEXT: mov v3.16b, v5.16b
+; CHECK-NEXT: mov v1.16b, v6.16b
 ; CHECK-NEXT: ret
   %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9)
   ret <16 x float> %res
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -39,16 +39,16 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: rdvl x9, #1
-; CHECK-NEXT: sub x9, x9, #1
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: sub x8, x8, #1
 ; CHECK-NEXT: mov w10, #256
-; CHECK-NEXT: cmp x9, #256
+; CHECK-NEXT: cmp x8, #256
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: csel x8, x8, x10, lo
 ; CHECK-NEXT: st1b { z0.b }, p0, [sp]
-; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: csel x9, x9, x10, lo
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9]
+; CHECK-NEXT: st1b { z1.b }, p0, [x9, #1, mul vl]
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x9, x8]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -80,16 +80,16 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: cnth x10
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov w10, #128
+; CHECK-NEXT: cmp x8, #128
 ; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: mov w9, #128
-; CHECK-NEXT: cmp x10, #128
+; CHECK-NEXT: csel x8, x8, x10, lo
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
+; CHECK-NEXT: st1h { z1.h }, p0, [x9, #1, mul vl]
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x9, x8, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -121,16 +121,16 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: cntw x10
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov w10, #64
+; CHECK-NEXT: cmp x8, #64
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: mov w9, #64
-; CHECK-NEXT: cmp x10, #64
+; CHECK-NEXT: csel x8, x8, x10, lo
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
+; CHECK-NEXT: st1w { z1.s }, p0, [x9, #1, mul vl]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -162,16 +162,16 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: cntd x10
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov w10, #32
+; CHECK-NEXT: cmp x8, #32
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: mov w9, #32
-; CHECK-NEXT: cmp x10, #32
+; CHECK-NEXT: csel x8, x8, x10, lo
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
+; CHECK-NEXT: st1d { z1.d }, p0, [x9, #1, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x9, x8, lsl #3]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -196,12 +196,12 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: mov x9, #-8
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT: addvl x8, x8, #1
-; CHECK-NEXT: mov x9, #-8
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -234,18 +234,18 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: cntd x10
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: cntd x8
 ; CHECK-NEXT: mov w9, #32
-; CHECK-NEXT: cmp x10, #32
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x8, #32
 ; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: lsl x9, x9, #3
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
+; CHECK-NEXT: lsl x8, x8, #3
+; CHECK-NEXT: st1h { z1.h }, p0, [x10, #1, mul vl]
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x10, x8]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -270,12 +270,12 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: mov x9, #-6
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT: addvl x8, x8, #1
-; CHECK-NEXT: mov x9, #-6
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -308,18 +308,18 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: cntw x10
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: cntw x8
 ; CHECK-NEXT: mov w9, #64
-; CHECK-NEXT: cmp x10, #64
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x8, #64
 ; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: lsl x9, x9, #2
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
+; CHECK-NEXT: lsl x8, x8, #2
+; CHECK-NEXT: st1h { z1.h }, p0, [x10, #1, mul vl]
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x10, x8]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -351,16 +351,16 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: cnth x10
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov w10, #128
+; CHECK-NEXT: cmp x8, #128
 ; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: mov w9, #128
-; CHECK-NEXT: cmp x10, #128
+; CHECK-NEXT: csel x8, x8, x10, lo
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1]
+; CHECK-NEXT: st1h { z1.h }, p0, [x9, #1, mul vl]
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x9, x8, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -385,12 +385,12 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: mov x9, #-4
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT: addvl x8, x8, #1
-; CHECK-NEXT: mov x9, #-4
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -423,18 +423,18 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: cntd x10
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: cntd x8
 ; CHECK-NEXT: mov w9, #32
-; CHECK-NEXT: cmp x10, #32
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x8, #32
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: lsl x9, x9, #3
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x8, x9]
+; CHECK-NEXT: lsl x8, x8, #3
+; CHECK-NEXT: st1w { z1.s }, p0, [x10, #1, mul vl]
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x10, x8]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -466,16 +466,16 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: cntw x10
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov w10, #64
+; CHECK-NEXT: cmp x8, #64
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: mov w9, #64
-; CHECK-NEXT: cmp x10, #64
+; CHECK-NEXT: csel x8, x8, x10, lo
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
+; CHECK-NEXT: st1w { z1.s }, p0, [x9, #1, mul vl]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -507,16 +507,16 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: cntd x10
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov w10, #32
+; CHECK-NEXT: cmp x8, #32
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: mov w9, #32
-; CHECK-NEXT: cmp x10, #32
+; CHECK-NEXT: csel x8, x8, x10, lo
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3]
+; CHECK-NEXT: st1d { z1.d }, p0, [x9, #1, mul vl]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x9, x8, lsl #3]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -531,9 +531,9 @@
 ; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1
 ; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1
 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT: ptrue p2.d
 ; CHECK-NEXT: and z1.d, z1.d, #0x1
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: cmpne p0.d, p2/z, z1.d, #0
 ; CHECK-NEXT: ret
   %res = call <vscale x 2 x i1> @llvm.experimental.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
   ret <vscale x 2 x i1> %res
@@ -546,9 +546,9 @@
 ; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1
 ; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1
 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT: ptrue p2.s
 ; CHECK-NEXT: and z1.s, z1.s, #0x1
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT: cmpne p0.s, p2/z, z1.s, #0
 ; CHECK-NEXT: ret
   %res = call <vscale x 4 x i1> @llvm.experimental.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
   ret <vscale x 4 x i1> %res
@@ -561,9 +561,9 @@
 ; CHECK-NEXT: mov z0.h, p1/z, #1 // =0x1
 ; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1
 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT: ptrue p2.h
 ; CHECK-NEXT: and z1.h, z1.h, #0x1
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0
+; CHECK-NEXT: cmpne p0.h, p2/z, z1.h, #0
 ; CHECK-NEXT: ret
   %res = call <vscale x 8 x i1> @llvm.experimental.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
   ret <vscale x 8 x i1> %res
@@ -576,9 +576,9 @@
 ; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1
 ; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
 ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT: ptrue p2.b
 ; CHECK-NEXT: and z1.b, z1.b, #0x1
-; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
+; CHECK-NEXT: cmpne p0.b, p2/z, z1.b, #0
 ; CHECK-NEXT: ret
   %res = call <vscale x 16 x i1> @llvm.experimental.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
   ret <vscale x 16 x i1> %res
@@ -600,15 +600,15 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-4
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: orr x9, x8, #0x8
 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT: st1w { z3.s }, p0, [x8, #3, mul vl]
 ; CHECK-NEXT: st1w { z2.s }, p0, [x8, #2, mul vl]
-; CHECK-NEXT: orr x8, x8, #0x8
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x9, #1, mul vl]
 ; CHECK-NEXT: addvl sp, sp, #4
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -622,26 +622,26 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-8 -; CHECK-NEXT: rdvl x10, #1 -; CHECK-NEXT: sub x10, x10, #1 -; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: rdvl x9, #1 +; CHECK-NEXT: mov w10, #16 +; CHECK-NEXT: sub x9, x9, #1 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov w9, #16 -; CHECK-NEXT: cmp x10, #16 +; CHECK-NEXT: cmp x9, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: csel x9, x9, x10, lo ; CHECK-NEXT: st1w { z3.s }, p0, [x8, #3, mul vl] ; CHECK-NEXT: st1w { z2.s }, p0, [x8, #2, mul vl] +; CHECK-NEXT: add x10, x8, x9, lsl #2 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: st1w { z7.s }, p0, [x8, #7, mul vl] ; CHECK-NEXT: st1w { z4.s }, p0, [x8, #4, mul vl] ; CHECK-NEXT: st1w { z5.s }, p0, [x8, #5, mul vl] ; CHECK-NEXT: st1w { z6.s }, p0, [x8, #6, mul vl] -; CHECK-NEXT: csel x9, x10, x9, lo ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2] -; CHECK-NEXT: add x8, x8, x9, lsl #2 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8, #2, mul vl] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8, #3, mul vl] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x10, #1, mul vl] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x10, #2, mul vl] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x10, #3, mul vl] ; CHECK-NEXT: addvl sp, sp, #8 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -658,12 +658,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x9, #-16 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl] ; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: mov x9, #-16 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8, x9] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -690,17 +690,17 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov w10, #17 +; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #17 +; CHECK-NEXT: mov w10, #17 +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: addvl x10, x8, #1 ; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: sub x9, x10, x9 ; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl] -; CHECK-NEXT: csel x9, x9, x10, lo -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: sub x8, x8, x9 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x9] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -713,12 +713,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x9, #-8 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl] ; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: mov x9, #-8 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -745,17 +745,17 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov w10, #18 +; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #18 +; CHECK-NEXT: mov w10, #18 +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: addvl x10, x8, #1 ; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: sub x9, x10, x9 ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl] -; CHECK-NEXT: csel x9, x9, x10, lo -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: sub x8, x8, x9 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x9] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -768,12 +768,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x9, #-4 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] ; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: mov x9, #-4 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -800,17 +800,17 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov w10, #20 +; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #20 +; CHECK-NEXT: mov w10, #20 +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: addvl x10, x8, #1 ; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: sub x9, x10, x9 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] -; CHECK-NEXT: csel x9, x9, x10, lo -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: sub x8, x8, x9 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -823,12 +823,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x9, #-2 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] ; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: mov x9, #-2 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -855,17 +855,17 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov w10, #24 +; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #24 +; CHECK-NEXT: mov w10, #24 +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: addvl x10, x8, #1 ; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: sub x9, x10, x9 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: csel x9, x9, x10, lo -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: sub x8, x8, x9 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x9] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -878,12 +878,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x9, #-8 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl] ; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: mov x9, #-8 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8, x9, lsl #1] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -910,17 +910,17 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov w10, #18 +; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #18 +; CHECK-NEXT: mov w10, #18 +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: addvl x10, x8, #1 ; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: sub x9, x10, x9 ; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl] -; CHECK-NEXT: csel x9, x9, x10, lo -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: sub x8, x8, x9 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x9] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -933,12 +933,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x9, #-4 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] ; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: mov x9, #-4 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -965,17 +965,17 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov w10, #20 +; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #20 +; CHECK-NEXT: mov w10, #20 +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: addvl x10, x8, #1 ; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: sub x9, x10, x9 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] -; CHECK-NEXT: csel x9, x9, x10, lo -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: sub x8, x8, x9 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -988,12 +988,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x9, #-2 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] ; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: mov x9, #-2 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1020,17 +1020,17 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov w10, #24 +; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmp x9, #24 +; CHECK-NEXT: mov w10, #24 +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: addvl x10, x8, #1 ; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: sub x9, x10, x9 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] -; CHECK-NEXT: csel x9, x9, x10, lo -; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: sub x8, x8, x9 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x9] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1042,13 +1042,13 @@ define @splice_nxv2i1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: lastb d0, p0, z0.d +; CHECK-NEXT: lastb d0, p2, z0.d ; CHECK-NEXT: mov z1.d, p1/z, #1 // =0x1 ; CHECK-NEXT: insr z1.d, d0 ; CHECK-NEXT: and z1.d, z1.d, #0x1 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; CHECK-NEXT: cmpne p0.d, p2/z, z1.d, #0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv2i1( %a, %b, i32 -1) ret %res @@ -1058,13 +1058,13 @@ define @splice_nxv4i1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: lastb s0, p0, z0.s +; CHECK-NEXT: lastb s0, p2, z0.s ; CHECK-NEXT: mov z1.s, p1/z, #1 // =0x1 ; CHECK-NEXT: insr z1.s, s0 ; CHECK-NEXT: and z1.s, z1.s, #0x1 -; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; CHECK-NEXT: cmpne p0.s, p2/z, z1.s, #0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv4i1( %a, %b, i32 -1) ret %res @@ -1074,13 +1074,13 @@ define @splice_nxv8i1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.h ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: lastb h0, p0, z0.h +; CHECK-NEXT: lastb h0, p2, z0.h ; CHECK-NEXT: mov z1.h, p1/z, #1 // =0x1 ; CHECK-NEXT: insr z1.h, h0 ; CHECK-NEXT: and z1.h, z1.h, #0x1 -; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0 +; CHECK-NEXT: cmpne p0.h, p2/z, z1.h, #0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv8i1( %a, %b, i32 -1) ret %res @@ -1090,13 +1090,13 @@ define @splice_nxv16i1( %a, %b) #0 { ; CHECK-LABEL: splice_nxv16i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: lastb b0, p0, z0.b +; CHECK-NEXT: lastb b0, p2, z0.b ; CHECK-NEXT: mov z1.b, p1/z, #1 // =0x1 ; CHECK-NEXT: insr z1.b, b0 ; CHECK-NEXT: and z1.b, z1.b, #0x1 -; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; CHECK-NEXT: cmpne p0.b, p2/z, z1.b, #0 ; CHECK-NEXT: ret %res = call @llvm.experimental.vector.splice.nxv16i1( %a, %b, i32 -1) ret %res @@ -1108,12 +1108,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x9, #-2 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] ; CHECK-NEXT: addvl x8, x8, #1 -; CHECK-NEXT: mov x9, #-2 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, x9, lsl #3] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1128,17 +1128,17 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-4 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov x9, #-8 +; CHECK-NEXT: mov x10, #-8 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: addvl x9, x8, #2 ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] +; CHECK-NEXT: sub x11, x9, #32 ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: st1w { z3.s }, p0, [x8, #3, mul vl] ; CHECK-NEXT: st1w { z2.s }, p0, [x8, #2, mul vl] -; CHECK-NEXT: addvl x8, x8, #2 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8, x9, lsl #2] -; CHECK-NEXT: sub x8, x8, #32 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9, x10, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x11, #1, mul vl] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -1152,12 +1152,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-8 -; CHECK-NEXT: rdvl x9, #4 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov w10, #68 +; CHECK-NEXT: rdvl x9, #4 ; CHECK-NEXT: cmp x9, #68 +; CHECK-NEXT: mov w10, #68 +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: addvl x10, x8, #4 ; CHECK-NEXT: st1w { z3.s }, p0, [x8, #3, mul vl] +; CHECK-NEXT: sub x9, x10, x9 ; CHECK-NEXT: st1w { z2.s }, p0, [x8, #2, mul vl] ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] @@ -1165,13 +1168,10 @@ ; CHECK-NEXT: st1w { z4.s }, p0, [x8, #4, mul vl] ; CHECK-NEXT: st1w { z5.s }, p0, [x8, #5, mul vl] ; CHECK-NEXT: st1w { z6.s }, p0, [x8, #6, mul vl] -; CHECK-NEXT: addvl x8, x8, #4 -; CHECK-NEXT: csel x9, x9, x10, lo -; CHECK-NEXT: sub x8, x8, x9 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x8, #2, mul vl] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8, #3, mul vl] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x9, #1, mul vl] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x9, #2, mul vl] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x9, #3, mul vl] ; CHECK-NEXT: addvl sp, sp, #8 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/neg-abs.ll b/llvm/test/CodeGen/AArch64/neg-abs.ll --- a/llvm/test/CodeGen/AArch64/neg-abs.ll +++ b/llvm/test/CodeGen/AArch64/neg-abs.ll @@ -51,10 +51,10 @@ ; CHECK-LABEL: neg_abs128: ; CHECK: // %bb.0: ; CHECK-NEXT: asr x8, x1, #63 -; CHECK-NEXT: eor x10, x0, x8 -; CHECK-NEXT: eor x9, x1, x8 -; CHECK-NEXT: subs x0, x8, x10 -; CHECK-NEXT: sbcs x1, x8, x9 +; CHECK-NEXT: eor x9, x0, x8 +; CHECK-NEXT: eor x10, x1, x8 +; CHECK-NEXT: subs x0, x8, x9 +; CHECK-NEXT: sbcs x1, x8, x10 ; CHECK-NEXT: ret %abs = tail call i128 @llvm.abs.i128(i128 %x, i1 true) %neg = sub nsw i128 0, %abs diff --git a/llvm/test/CodeGen/AArch64/neg-imm.ll b/llvm/test/CodeGen/AArch64/neg-imm.ll --- a/llvm/test/CodeGen/AArch64/neg-imm.ll +++ b/llvm/test/CodeGen/AArch64/neg-imm.ll @@ 
-20,8 +20,9 @@ ; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_1: // %for.inc ; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: add w8, w20, #1 ; CHECK-NEXT: cmp w20, w19 -; CHECK-NEXT: add w20, w20, #1 +; CHECK-NEXT: mov w20, w8 ; CHECK-NEXT: b.gt .LBB0_4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -892,11 +892,11 @@ ; CHECK-LABEL: vselect_equivalent_shuffle_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI89_0 -; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI89_0] ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NEXT: tbl v0.8b, { v0.16b }, v2.8b +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI89_0] +; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b ; CHECK-NEXT: ret %c = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> ret <8 x i8> %c @@ -906,9 +906,9 @@ ; CHECK-LABEL: vselect_equivalent_shuffle_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI90_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI90_0] ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI90_0] ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/neon-dotpattern.ll b/llvm/test/CodeGen/AArch64/neon-dotpattern.ll --- a/llvm/test/CodeGen/AArch64/neon-dotpattern.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotpattern.ll @@ -5,8 +5,8 @@ ; CHECK-LABEL: test_sdot_v4i8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr w8, [x2] -; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: dup v0.2s, wzr +; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: sdot v0.2s, v1.8b, v2.8b @@ -52,8 +52,8 @@ ; CHECK-LABEL: test_udot_v4i8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr w8, [x2] -; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: dup v0.2s, wzr +; CHECK-NEXT: ldr w9, [x1] ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: udot v0.2s, v1.8b, v2.8b diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -7,11 +7,11 @@ define i32 @test_udot_v8i8(i8* nocapture readonly %a, i8* nocapture readonly %b) { ; CHECK-LABEL: test_udot_v8i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: udot v2.2s, v1.8b, v0.8b -; CHECK-NEXT: addp v0.2s, v2.2s, v2.2s +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: udot v0.2s, v2.8b, v1.8b +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: @@ -29,10 +29,10 @@ define i32 @test_udot_v8i8_nomla(i8* nocapture readonly %a1) { ; CHECK-LABEL: test_udot_v8i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: movi v0.8b, #1 +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: movi v2.8b, #1 -; CHECK-NEXT: udot v1.2s, v0.8b, v2.8b +; CHECK-NEXT: udot v1.2s, v2.8b, v0.8b ; CHECK-NEXT: addp v0.2s, 
v1.2s, v1.2s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -47,11 +47,11 @@ define i32 @test_sdot_v8i8(i8* nocapture readonly %a, i8* nocapture readonly %b) { ; CHECK-LABEL: test_sdot_v8i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: sdot v2.2s, v1.8b, v0.8b -; CHECK-NEXT: addp v0.2s, v2.2s, v2.2s +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: sdot v0.2s, v2.8b, v1.8b +; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: @@ -69,10 +69,10 @@ define i32 @test_sdot_v8i8_nomla(i8* nocapture readonly %a1) { ; CHECK-LABEL: test_sdot_v8i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: movi v0.8b, #1 +; CHECK-NEXT: ldr d2, [x0] ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: movi v2.8b, #1 -; CHECK-NEXT: sdot v1.2s, v0.8b, v2.8b +; CHECK-NEXT: sdot v1.2s, v2.8b, v0.8b ; CHECK-NEXT: addp v0.2s, v1.2s, v1.2s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -88,11 +88,11 @@ define i32 @test_udot_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: udot v2.4s, v1.16b, v0.16b -; CHECK-NEXT: addv s0, v2.4s +; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: udot v0.4s, v1.16b, v2.16b +; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 ; CHECK-NEXT: ret @@ -112,11 +112,11 @@ define i32 @test_udot_v16i8_nomla(i8* nocapture readonly %a1) { ; CHECK-LABEL: test_udot_v16i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: movi v1.16b, #1 -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: udot v2.4s, v0.16b, v1.16b -; CHECK-NEXT: addv s0, v2.4s +; CHECK-NEXT: movi v0.16b, #1 +; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: udot v1.4s, v2.16b, v0.16b +; CHECK-NEXT: addv s0, v1.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: @@ -130,11 +130,11 @@ define i32 @test_sdot_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: sdot v2.4s, v1.16b, v0.16b -; CHECK-NEXT: addv s0, v2.4s +; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: sdot v0.4s, v1.16b, v2.16b +; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 ; CHECK-NEXT: ret @@ -154,11 +154,11 @@ define i32 @test_sdot_v16i8_nomla(i8* nocapture readonly %a1) { ; CHECK-LABEL: test_sdot_v16i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: movi v1.16b, #1 -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: sdot v2.4s, v0.16b, v1.16b -; CHECK-NEXT: addv s0, v2.4s +; CHECK-NEXT: movi v0.16b, #1 +; CHECK-NEXT: ldr q2, [x0] +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sdot v1.4s, v2.16b, v0.16b +; CHECK-NEXT: addv s0, v1.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: @@ -195,11 +195,11 @@ define i32 @test_udot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; CHECK-LABEL: test_udot_v8i8_double_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: 
movi v1.2d, #0000000000000000 -; CHECK-NEXT: movi v3.8b, #1 -; CHECK-NEXT: udot v1.2s, v2.8b, v3.8b -; CHECK-NEXT: udot v1.2s, v0.8b, v3.8b -; CHECK-NEXT: addp v0.2s, v1.2s, v1.2s +; CHECK-NEXT: movi v1.8b, #1 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: udot v3.2s, v2.8b, v1.8b +; CHECK-NEXT: udot v3.2s, v0.8b, v1.8b +; CHECK-NEXT: addp v0.2s, v3.2s, v3.2s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: @@ -277,11 +277,11 @@ define i32 @test_sdot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; CHECK-LABEL: test_sdot_v8i8_double_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: movi v3.8b, #1 -; CHECK-NEXT: sdot v1.2s, v2.8b, v3.8b -; CHECK-NEXT: sdot v1.2s, v0.8b, v3.8b -; CHECK-NEXT: addp v0.2s, v1.2s, v1.2s +; CHECK-NEXT: movi v1.8b, #1 +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: sdot v3.2s, v2.8b, v1.8b +; CHECK-NEXT: sdot v3.2s, v0.8b, v1.8b +; CHECK-NEXT: addp v0.2s, v3.2s, v3.2s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/neon-mla-mls.ll b/llvm/test/CodeGen/AArch64/neon-mla-mls.ll --- a/llvm/test/CodeGen/AArch64/neon-mla-mls.ll +++ b/llvm/test/CodeGen/AArch64/neon-mla-mls.ll @@ -138,9 +138,8 @@ define <8 x i8> @mls2v8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { ; CHECK-LABEL: mls2v8xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.8b, v2.8b -; CHECK-NEXT: mla v2.8b, v0.8b, v1.8b -; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: mul v0.8b, v0.8b, v1.8b +; CHECK-NEXT: sub v0.8b, v0.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = mul <8 x i8> %A, %B; %tmp2 = sub <8 x i8> %tmp1, %C; @@ -150,9 +149,8 @@ define <16 x i8> @mls2v16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { ; CHECK-LABEL: mls2v16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.16b, v2.16b -; CHECK-NEXT: mla v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b +; CHECK-NEXT: sub v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = mul <16 x i8> %A, %B; %tmp2 = sub <16 x i8> %tmp1, %C; @@ -162,9 +160,8 @@ define <4 x i16> @mls2v4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { ; CHECK-LABEL: mls2v4xi16: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.4h, v2.4h -; CHECK-NEXT: mla v2.4h, v0.4h, v1.4h -; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h +; CHECK-NEXT: sub v0.4h, v0.4h, v2.4h ; CHECK-NEXT: ret %tmp1 = mul <4 x i16> %A, %B; %tmp2 = sub <4 x i16> %tmp1, %C; @@ -174,9 +171,8 @@ define <8 x i16> @mls2v8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: mls2v8xi16: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.8h, v2.8h -; CHECK-NEXT: mla v2.8h, v0.8h, v1.8h -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-NEXT: sub v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret %tmp1 = mul <8 x i16> %A, %B; %tmp2 = sub <8 x i16> %tmp1, %C; @@ -186,9 +182,8 @@ define <2 x i32> @mls2v2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { ; CHECK-LABEL: mls2v2xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.2s, v2.2s -; CHECK-NEXT: mla v2.2s, v0.2s, v1.2s -; CHECK-NEXT: fmov d0, d2 +; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret %tmp1 = mul <2 x i32> %A, %B; %tmp2 = sub <2 x i32> %tmp1, %C; @@ -198,9 +193,8 @@ define <4 x i32> @mls2v4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { ; CHECK-LABEL: mls2v4xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: sub 
v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret %tmp1 = mul <4 x i32> %A, %B; %tmp2 = sub <4 x i32> %tmp1, %C; diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll --- a/llvm/test/CodeGen/AArch64/neon-mov.ll +++ b/llvm/test/CodeGen/AArch64/neon-mov.ll @@ -334,8 +334,8 @@ ; CHECK-LABEL: movi1d: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI40_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI40_0] ; CHECK-NEXT: movi d1, #0x00ffffffff0000 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI40_0] ; CHECK-NEXT: b test_movi1d %1 = tail call <2 x i32> @test_movi1d(<2 x i32> , <2 x i32> ) ret <2 x i32> %1 diff --git a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll --- a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll +++ b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll @@ -80,9 +80,9 @@ ; CHECK-LABEL: v16i8_2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, .LCPI7_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_0] ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_0] ; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/neon-shift-neg.ll b/llvm/test/CodeGen/AArch64/neon-shift-neg.ll --- a/llvm/test/CodeGen/AArch64/neon-shift-neg.ll +++ b/llvm/test/CodeGen/AArch64/neon-shift-neg.ll @@ -376,8 +376,8 @@ ; CHECK-LABEL: shrn64x2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg x8, x0 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret entry: @@ -392,8 +392,8 @@ ; CHECK-LABEL: shrn32x4: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg w8, w0 -; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: @@ -408,8 +408,8 @@ ; CHECK-LABEL: shrn16x8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg w8, w0 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret entry: @@ -424,8 +424,8 @@ ; CHECK-LABEL: shrn8x16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg w8, w0 -; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret entry: @@ -440,8 +440,8 @@ ; CHECK-LABEL: lshrn64x2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg x8, x0 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret entry: @@ -456,8 +456,8 @@ ; CHECK-LABEL: lshrn32x4: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg w8, w0 -; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: @@ -472,8 +472,8 @@ ; CHECK-LABEL: lshrn16x8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg w8, w0 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret entry: @@ -488,8 +488,8 @@ ; CHECK-LABEL: lshrn8x16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg w8, w0 -; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret entry: @@ -504,8 +504,8 @@ ; CHECK-LABEL: shln64x2: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg x8, x0 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: 
ptrue p0.d +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret entry: @@ -520,8 +520,8 @@ ; CHECK-LABEL: shln32x4: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg w8, w0 -; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: @@ -536,8 +536,8 @@ ; CHECK-LABEL: shln16x8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg w8, w0 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret entry: @@ -552,8 +552,8 @@ ; CHECK-LABEL: shln8x16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: neg w8, w0 -; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll --- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll +++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll @@ -30,10 +30,10 @@ define void @v8i64_v8i32(<8 x i64> %a, <8 x i32>* %result) { ; CHECK-LABEL: v8i64_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: xtn v2.2s, v2.2d -; CHECK-NEXT: xtn2 v0.4s, v1.2d +; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: xtn2 v2.4s, v3.2d +; CHECK-NEXT: xtn2 v0.4s, v1.2d ; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %b = trunc <8 x i64> %a to <8 x i32> @@ -47,8 +47,8 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strh w8, [x0, #2] ; CHECK-NEXT: strh w9, [x0] +; CHECK-NEXT: strh w8, [x0, #2] ; CHECK-NEXT: ret %b = trunc <2 x i32> %a to <2 x i16> store <2 x i16> %b, <2 x i16>* %result @@ -81,10 +81,10 @@ define void @v16i32_v16i16(<16 x i32> %a, <16 x i16>* %result) { ; CHECK-LABEL: v16i32_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: xtn v2.4h, v2.4s -; CHECK-NEXT: xtn2 v0.8h, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: xtn2 v2.8h, v3.4s +; CHECK-NEXT: xtn2 v0.8h, v1.4s ; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %b = trunc <16 x i32> %a to <16 x i16> @@ -98,8 +98,8 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w8, [x0, #1] ; CHECK-NEXT: strb w9, [x0] +; CHECK-NEXT: strb w8, [x0, #1] ; CHECK-NEXT: ret %b = trunc <2 x i32> %a to <2 x i8> store <2 x i8> %b, <2 x i8>* %result @@ -150,18 +150,18 @@ define void @v32i32_v32i8(<32 x i32> %a, <32 x i8>* %result) { ; CHECK-LABEL: v32i32_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: xtn v4.4h, v4.4s -; CHECK-NEXT: xtn v2.4h, v2.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: xtn v6.4h, v6.4s -; CHECK-NEXT: xtn2 v0.8h, v1.4s +; CHECK-NEXT: xtn v2.4h, v2.4s ; CHECK-NEXT: xtn2 v4.8h, v5.4s -; CHECK-NEXT: xtn2 v2.8h, v3.4s +; CHECK-NEXT: xtn2 v0.8h, v1.4s ; CHECK-NEXT: xtn2 v6.8h, v7.4s -; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: xtn2 v2.8h, v3.4s ; CHECK-NEXT: xtn v1.8b, v4.8h -; CHECK-NEXT: xtn2 v0.16b, v2.8h +; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: xtn2 v1.16b, v6.8h +; CHECK-NEXT: xtn2 v0.16b, v2.8h ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret %b = trunc <32 x i32> %a to <32 x i8> @@ -175,8 +175,8 @@ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w8, [x0, #1] ; CHECK-NEXT: strb w9, [x0] +; CHECK-NEXT: strb w8, [x0, #1] ; CHECK-NEXT: ret %b = trunc <2 x i16> 
%a to <2 x i8> store <2 x i8> %b, <2 x i8>* %result @@ -221,10 +221,10 @@ define void @v32i16_v32i8(<32 x i16> %a, <32 x i8>* %result) { ; CHECK-LABEL: v32i16_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: xtn v2.8b, v2.8h -; CHECK-NEXT: xtn2 v0.16b, v1.8h +; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: xtn2 v2.16b, v3.8h +; CHECK-NEXT: xtn2 v0.16b, v1.8h ; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret %b = trunc <32 x i16> %a to <32 x i8> diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll --- a/llvm/test/CodeGen/AArch64/nontemporal.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal.ll @@ -208,8 +208,8 @@ define void @test_stnp_v4f32_invalid_offset_4(i8* %p, <4 x float> %v) #0 { ; CHECK-LABEL: test_stnp_v4f32_invalid_offset_4: ; CHECK: ; %bb.0: -; CHECK-NEXT: add x8, x0, #4 ; CHECK-NEXT: mov d1, v0[1] +; CHECK-NEXT: add x8, x0, #4 ; CHECK-NEXT: stnp d0, d1, [x8] ; CHECK-NEXT: ret %tmp0 = getelementptr i8, i8* %p, i32 4 @@ -221,8 +221,8 @@ define void @test_stnp_v4f32_invalid_offset_neg_4(i8* %p, <4 x float> %v) #0 { ; CHECK-LABEL: test_stnp_v4f32_invalid_offset_neg_4: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub x8, x0, #4 ; CHECK-NEXT: mov d1, v0[1] +; CHECK-NEXT: sub x8, x0, #4 ; CHECK-NEXT: stnp d0, d1, [x8] ; CHECK-NEXT: ret %tmp0 = getelementptr i8, i8* %p, i32 -4 @@ -234,8 +234,8 @@ define void @test_stnp_v4f32_invalid_offset_512(i8* %p, <4 x float> %v) #0 { ; CHECK-LABEL: test_stnp_v4f32_invalid_offset_512: ; CHECK: ; %bb.0: -; CHECK-NEXT: add x8, x0, #512 ; CHECK-NEXT: mov d1, v0[1] +; CHECK-NEXT: add x8, x0, #512 ; CHECK-NEXT: stnp d0, d1, [x8] ; CHECK-NEXT: ret %tmp0 = getelementptr i8, i8* %p, i32 512 @@ -259,8 +259,8 @@ define void @test_stnp_v4f32_invalid_offset_508(i8* %p, <4 x float> %v) #0 { ; CHECK-LABEL: test_stnp_v4f32_invalid_offset_508: ; CHECK: ; %bb.0: -; CHECK-NEXT: add x8, x0, #508 ; CHECK-NEXT: mov d1, v0[1] +; CHECK-NEXT: add x8, x0, #508 ; CHECK-NEXT: stnp d0, d1, [x8] ; CHECK-NEXT: ret %tmp0 = getelementptr i8, i8* %p, i32 508 @@ -272,8 +272,8 @@ define void @test_stnp_v4f32_invalid_offset_neg_520(i8* %p, <4 x float> %v) #0 { ; CHECK-LABEL: test_stnp_v4f32_invalid_offset_neg_520: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub x8, x0, #520 ; CHECK-NEXT: mov d1, v0[1] +; CHECK-NEXT: sub x8, x0, #520 ; CHECK-NEXT: stnp d0, d1, [x8] ; CHECK-NEXT: ret %tmp0 = getelementptr i8, i8* %p, i32 -520 @@ -299,8 +299,8 @@ ; CHECK-LABEL: test_stnp_v2f32_invalid_offset_256: ; CHECK: ; %bb.0: ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: add x8, x0, #256 ; CHECK-NEXT: mov s1, v0[1] +; CHECK-NEXT: add x8, x0, #256 ; CHECK-NEXT: stnp s0, s1, [x8] ; CHECK-NEXT: ret %tmp0 = getelementptr i8, i8* %p, i32 256 @@ -326,8 +326,8 @@ ; CHECK-LABEL: test_stnp_v2f32_invalid_offset_neg_260: ; CHECK: ; %bb.0: ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: sub x8, x0, #260 ; CHECK-NEXT: mov s1, v0[1] +; CHECK-NEXT: sub x8, x0, #260 ; CHECK-NEXT: stnp s0, s1, [x8] ; CHECK-NEXT: ret %tmp0 = getelementptr i8, i8* %p, i32 -260 @@ -450,44 +450,44 @@ define void @test_stnp_v17f32(<17 x float> %v, <17 x float>* %ptr) { ; CHECK-LABEL: test_stnp_v17f32: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ; kill: def $s1 killed $s1 def $q1 -; CHECK-NEXT: ; kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: ldr s16, [sp, #16] -; CHECK-NEXT: mov.s v0[1], v1[0] -; CHECK-NEXT: ldr s1, [sp] ; CHECK-NEXT: add x8, sp, #20 -; CHECK-NEXT: ld1.s { v16 }[1], [x8] -; CHECK-NEXT: add x8, sp, #4 -; CHECK-NEXT: ld1.s { v1 }[1], [x8] -; CHECK-NEXT: 
add x8, sp, #24 -; CHECK-NEXT: ld1.s { v16 }[2], [x8] -; CHECK-NEXT: add x8, sp, #8 -; CHECK-NEXT: ld1.s { v1 }[2], [x8] -; CHECK-NEXT: add x8, sp, #28 +; CHECK-NEXT: ldr s17, [sp] +; CHECK-NEXT: add x9, sp, #4 ; CHECK-NEXT: ; kill: def $s4 killed $s4 def $q4 -; CHECK-NEXT: ld1.s { v16 }[3], [x8] -; CHECK-NEXT: add x8, sp, #12 +; CHECK-NEXT: ; kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: ; kill: def $s5 killed $s5 def $q5 -; CHECK-NEXT: ; kill: def $s2 killed $s2 def $q2 -; CHECK-NEXT: mov.s v4[1], v5[0] -; CHECK-NEXT: ld1.s { v1 }[3], [x8] +; CHECK-NEXT: ; kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: ; kill: def $s6 killed $s6 def $q6 -; CHECK-NEXT: mov.s v0[2], v2[0] -; CHECK-NEXT: ldr s2, [sp, #32] -; CHECK-NEXT: mov.s v4[2], v6[0] +; CHECK-NEXT: ; kill: def $s2 killed $s2 def $q2 ; CHECK-NEXT: ; kill: def $s7 killed $s7 def $q7 ; CHECK-NEXT: ; kill: def $s3 killed $s3 def $q3 -; CHECK-NEXT: mov.s v0[3], v3[0] +; CHECK-NEXT: ld1.s { v16 }[1], [x8] +; CHECK-NEXT: add x8, sp, #24 +; CHECK-NEXT: ld1.s { v17 }[1], [x9] +; CHECK-NEXT: add x9, sp, #8 +; CHECK-NEXT: mov.s v4[1], v5[0] +; CHECK-NEXT: mov.s v0[1], v1[0] +; CHECK-NEXT: ld1.s { v16 }[2], [x8] +; CHECK-NEXT: add x8, sp, #28 +; CHECK-NEXT: ld1.s { v17 }[2], [x9] +; CHECK-NEXT: add x9, sp, #12 +; CHECK-NEXT: mov.s v4[2], v6[0] +; CHECK-NEXT: mov.s v0[2], v2[0] +; CHECK-NEXT: ld1.s { v16 }[3], [x8] +; CHECK-NEXT: ld1.s { v17 }[3], [x9] ; CHECK-NEXT: mov.s v4[3], v7[0] +; CHECK-NEXT: mov.s v0[3], v3[0] +; CHECK-NEXT: mov d1, v16[1] +; CHECK-NEXT: mov d2, v17[1] ; CHECK-NEXT: mov d3, v4[1] ; CHECK-NEXT: mov d5, v0[1] +; CHECK-NEXT: stnp d16, d1, [x0, #48] +; CHECK-NEXT: ldr s1, [sp, #32] +; CHECK-NEXT: stnp d17, d2, [x0, #32] ; CHECK-NEXT: stnp d4, d3, [x0, #16] ; CHECK-NEXT: stnp d0, d5, [x0] -; CHECK-NEXT: mov d0, v16[1] -; CHECK-NEXT: mov d3, v1[1] -; CHECK-NEXT: stnp d16, d0, [x0, #48] -; CHECK-NEXT: stnp d1, d3, [x0, #32] -; CHECK-NEXT: str s2, [x0, #64] +; CHECK-NEXT: str s1, [x0, #64] ; CHECK-NEXT: ret entry: @@ -497,12 +497,12 @@ define void @test_stnp_v16i32_invalid_offset(<16 x i32> %v, <16 x i32>* %ptr) { ; CHECK-LABEL: test_stnp_v16i32_invalid_offset: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mov w8, #32000 -; CHECK-NEXT: mov w9, #32032 +; CHECK-NEXT: mov w8, #32032 +; CHECK-NEXT: mov w9, #32000 ; CHECK-NEXT: add x8, x0, x8 ; CHECK-NEXT: add x9, x0, x9 -; CHECK-NEXT: stnp q2, q3, [x9] -; CHECK-NEXT: stnp q0, q1, [x8] +; CHECK-NEXT: stnp q2, q3, [x8] +; CHECK-NEXT: stnp q0, q1, [x9] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll --- a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll +++ b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll @@ -5,18 +5,18 @@ ; CHECK-LABEL: jsimd_idct_ifast_neon_intrinsic: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldr q0, [x1, #32] +; CHECK-NEXT: mov w8, w3 ; CHECK-NEXT: ldr q1, [x1, #96] ; CHECK-NEXT: ldr q2, [x0, #32] ; CHECK-NEXT: ldr q3, [x0, #96] -; CHECK-NEXT: ldr x8, [x2, #48] -; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: ldr x9, [x2, #48] ; CHECK-NEXT: mul v0.8h, v2.8h, v0.8h ; CHECK-NEXT: mul v1.8h, v3.8h, v1.8h ; CHECK-NEXT: add v2.8h, v0.8h, v1.8h -; CHECK-NEXT: str q2, [x8, x9] -; CHECK-NEXT: ldr x8, [x2, #56] ; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h -; CHECK-NEXT: str q0, [x8, x9] +; CHECK-NEXT: str q2, [x9, x8] +; CHECK-NEXT: ldr x9, [x2, #56] +; CHECK-NEXT: str q0, [x9, x8] ; CHECK-NEXT: ret entry: %add.ptr5 = getelementptr inbounds i16, i16* %coef_block, i64 16 diff --git 
a/llvm/test/CodeGen/AArch64/pow.ll b/llvm/test/CodeGen/AArch64/pow.ll --- a/llvm/test/CodeGen/AArch64/pow.ll +++ b/llvm/test/CodeGen/AArch64/pow.ll @@ -74,30 +74,30 @@ ; CHECK-NEXT: fmov s1, #0.25000000 ; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: bl powf -; CHECK-NEXT: str d0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: fmov s1, #0.25000000 +; CHECK-NEXT: str d0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bl powf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: mov v0.s[1], v1.s[0] +; CHECK-NEXT: fmov s1, #0.25000000 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: fmov s1, #0.25000000 ; CHECK-NEXT: mov s0, v0.s[2] ; CHECK-NEXT: bl powf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: mov s0, v0.s[3] ; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-NEXT: fmov s1, #0.25000000 -; CHECK-NEXT: mov s0, v0.s[3] ; CHECK-NEXT: bl powf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v1.s[3], v0.s[0] ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: add sp, sp, #48 @@ -115,14 +115,14 @@ ; CHECK-NEXT: fmov d1, #0.25000000 ; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: bl pow -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: fmov d1, #0.25000000 +; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bl pow ; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/pull-conditional-binop-through-shift.ll b/llvm/test/CodeGen/AArch64/pull-conditional-binop-through-shift.ll --- a/llvm/test/CodeGen/AArch64/pull-conditional-binop-through-shift.ll +++ b/llvm/test/CodeGen/AArch64/pull-conditional-binop-through-shift.ll @@ -115,8 +115,8 @@ ; CHECK-LABEL: add_nosignbit_select_shl: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #2147418112 -; CHECK-NEXT: add w8, w0, w8 ; CHECK-NEXT: tst w1, #0x1 +; CHECK-NEXT: add w8, w0, w8 ; CHECK-NEXT: csel w8, w8, w0, ne ; CHECK-NEXT: lsl w0, w8, #8 ; CHECK-NEXT: str w0, [x2] @@ -242,8 +242,8 @@ ; CHECK-LABEL: add_nosignbit_select_lshr: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #2147418112 -; CHECK-NEXT: add w8, w0, w8 ; CHECK-NEXT: tst w1, #0x1 +; CHECK-NEXT: add w8, w0, w8 ; CHECK-NEXT: csel w8, w8, w0, ne ; CHECK-NEXT: lsr w0, w8, #8 ; CHECK-NEXT: str w0, [x2] @@ -369,8 +369,8 @@ ; CHECK-LABEL: add_nosignbit_select_ashr: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #2147418112 -; CHECK-NEXT: add w8, w0, w8 ; CHECK-NEXT: tst w1, #0x1 +; CHECK-NEXT: add w8, w0, w8 ; CHECK-NEXT: csel w8, w8, w0, ne ; CHECK-NEXT: asr w0, w8, #8 ; CHECK-NEXT: str w0, [x2] diff --git a/llvm/test/CodeGen/AArch64/qmovn.ll 
b/llvm/test/CodeGen/AArch64/qmovn.ll
--- a/llvm/test/CodeGen/AArch64/qmovn.ll
+++ b/llvm/test/CodeGen/AArch64/qmovn.ll
@@ -86,10 +86,10 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: mov w8, #2147483647
 ; CHECK-NEXT: dup v1.2d, x8
-; CHECK-NEXT: mov x9, #-2147483648
+; CHECK-NEXT: mov x8, #-2147483648
 ; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d
 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: dup v1.2d, x9
+; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmgt v2.2d, v0.2d, v1.2d
 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT: xtn v0.2s, v0.2d
@@ -108,10 +108,10 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: mov x8, #-2147483648
 ; CHECK-NEXT: dup v1.2d, x8
-; CHECK-NEXT: mov w9, #2147483647
+; CHECK-NEXT: mov w8, #2147483647
 ; CHECK-NEXT: cmgt v2.2d, v0.2d, v1.2d
 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: dup v1.2d, x9
+; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmgt v2.2d, v1.2d, v0.2d
 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT: xtn v0.2s, v0.2d
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
--- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -8,11 +8,12 @@
 define dso_local void @run_test() local_unnamed_addr #0 {
 ; CHECK-LABEL: run_test:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
 ; CHECK-NEXT: .cfi_offset b8, -8
 ; CHECK-NEXT: .cfi_offset b9, -16
 ; CHECK-NEXT: .cfi_offset b10, -24
@@ -21,14 +22,13 @@
 ; CHECK-NEXT: .cfi_offset b13, -48
 ; CHECK-NEXT: .cfi_offset b14, -56
 ; CHECK-NEXT: .cfi_offset b15, -64
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: adrp x10, B+48
 ; CHECK-NEXT: adrp x11, A
 ; CHECK-NEXT: mov x8, xzr
 ; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: add x10, x10, :lo12:B+48
 ; CHECK-NEXT: add x11, x11, :lo12:A
-; CHECK-NEXT: // implicit-def: $q1
 ; CHECK-NEXT: // implicit-def: $q2
 ; CHECK-NEXT: // implicit-def: $q3
 ; CHECK-NEXT: // implicit-def: $q4
@@ -57,103 +57,113 @@
 ; CHECK-NEXT: // implicit-def: $q11
 ; CHECK-NEXT: // implicit-def: $q12
 ; CHECK-NEXT: // implicit-def: $q13
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: // implicit-def: $q0
+; CHECK-NEXT: // kill: killed $q0
 ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: mov x12, xzr
-; CHECK-NEXT: ldr q14, [x12]
-; CHECK-NEXT: ldr q15, [x8]
-; CHECK-NEXT: ldr x18, [x12]
-; CHECK-NEXT: ldr x0, [x8]
-; CHECK-NEXT: mov x12, v14.d[1]
+; CHECK-NEXT: ldr q14, [x8]
+; CHECK-NEXT: ldr q15, [x10], #64
+; CHECK-NEXT: add x15, x11, x8
+; CHECK-NEXT: add x9, x9, #1
+; CHECK-NEXT: ldr q0, [x12]
 ; CHECK-NEXT: fmov x13, d14
-; CHECK-NEXT: ldr q14, [x10], #64
-; CHECK-NEXT: fmov x15, d15
-; CHECK-NEXT: mov x14, v15.d[1]
-; CHECK-NEXT: mul x1, x15, x18
-; CHECK-NEXT: mov x16, v14.d[1]
-; CHECK-NEXT: fmov x17, d14
+; CHECK-NEXT: ldr x12, [x12]
+; CHECK-NEXT: fmov x0, d15
+; CHECK-NEXT: mov x14, v14.d[1]
+; CHECK-NEXT: ldr x15, [x15, #128]
+; CHECK-NEXT: fmov x16, d0
+; CHECK-NEXT: mul x17, x13, x12
+; CHECK-NEXT: mov x18, v0.d[1]
+; CHECK-NEXT: mul x4, x0, x12
+; CHECK-NEXT: mul x1, x16, x12
+; CHECK-NEXT: mul x3, x14, x12
+; CHECK-NEXT: fmov d0, x17
+; CHECK-NEXT: mul x5, x13, x15
+; CHECK-NEXT: mov x17, v15.d[1]
+; CHECK-NEXT: fmov d15, x4
 ; CHECK-NEXT: fmov d14, x1
-; CHECK-NEXT: mul x1, x14, x18
-; CHECK-NEXT: mov v14.d[1], x1
-; CHECK-NEXT: mul x1, x13, x18
-; CHECK-NEXT: add v12.2d, v12.2d, v14.2d
-; CHECK-NEXT: fmov d14, x1
-; CHECK-NEXT: mul x1, x12, x18
+; CHECK-NEXT: mul x1, x18, x12
+; CHECK-NEXT: ldr x2, [x8], #8
+; CHECK-NEXT: mov v0.d[1], x3
+; CHECK-NEXT: mul x3, x16, x15
+; CHECK-NEXT: mul x12, x17, x12
+; CHECK-NEXT: fmov d1, x5
+; CHECK-NEXT: mul x13, x13, x2
+; CHECK-NEXT: cmp x8, #64
 ; CHECK-NEXT: mov v14.d[1], x1
-; CHECK-NEXT: mul x1, x17, x18
+; CHECK-NEXT: mul x1, x14, x15
+; CHECK-NEXT: add v12.2d, v12.2d, v0.2d
+; CHECK-NEXT: mul x14, x14, x2
+; CHECK-NEXT: mov v15.d[1], x12
+; CHECK-NEXT: mul x12, x18, x2
+; CHECK-NEXT: mul x18, x18, x15
+; CHECK-NEXT: fmov d0, x3
+; CHECK-NEXT: mov v1.d[1], x1
+; CHECK-NEXT: mul x16, x16, x2
+; CHECK-NEXT: mul x3, x0, x15
+; CHECK-NEXT: add v10.2d, v10.2d, v15.2d
+; CHECK-NEXT: fmov d15, x13
+; CHECK-NEXT: mov v0.d[1], x18
+; CHECK-NEXT: mul x13, x0, x2
+; CHECK-NEXT: add v29.2d, v29.2d, v1.2d
+; CHECK-NEXT: mul x15, x17, x15
+; CHECK-NEXT: mov v15.d[1], x14
+; CHECK-NEXT: fmov d1, x16
+; CHECK-NEXT: add v28.2d, v28.2d, v0.2d
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT: add v13.2d, v13.2d, v14.2d
+; CHECK-NEXT: mov v1.d[1], x12
+; CHECK-NEXT: mul x12, x17, x2
+; CHECK-NEXT: add v0.2d, v0.2d, v15.2d
 ; CHECK-NEXT: add v11.2d, v11.2d, v14.2d
-; CHECK-NEXT: fmov d14, x1
-; CHECK-NEXT: mul x18, x16, x18
-; CHECK-NEXT: mov v14.d[1], x18
-; CHECK-NEXT: mul x18, x15, x0
-; CHECK-NEXT: add x1, x11, x8
-; CHECK-NEXT: add v10.2d, v10.2d, v14.2d
-; CHECK-NEXT: fmov d14, x18
-; CHECK-NEXT: mul x18, x14, x0
-; CHECK-NEXT: ldr x1, [x1, #128]
-; CHECK-NEXT: mov v14.d[1], x18
-; CHECK-NEXT: mul x18, x13, x0
-; CHECK-NEXT: add v8.2d, v8.2d, v14.2d
-; CHECK-NEXT: add v25.2d, v25.2d, v14.2d
-; CHECK-NEXT: add v22.2d, v22.2d, v14.2d
-; CHECK-NEXT: add v18.2d, v18.2d, v14.2d
-; CHECK-NEXT: add v6.2d, v6.2d, v14.2d
-; CHECK-NEXT: add v0.2d, v0.2d, v14.2d
-; CHECK-NEXT: fmov d14, x18
-; CHECK-NEXT: mul x18, x12, x0
-; CHECK-NEXT: mov v14.d[1], x18
-; CHECK-NEXT: mul x18, x17, x0
-; CHECK-NEXT: mul x0, x16, x0
-; CHECK-NEXT: add v9.2d, v9.2d, v14.2d
-; CHECK-NEXT: add v31.2d, v31.2d, v14.2d
-; CHECK-NEXT: add v26.2d, v26.2d, v14.2d
-; CHECK-NEXT: add v23.2d, v23.2d, v14.2d
-; CHECK-NEXT: add v21.2d, v21.2d, v14.2d
-; CHECK-NEXT: add v19.2d, v19.2d, v14.2d
-; CHECK-NEXT: add v17.2d, v17.2d, v14.2d
-; CHECK-NEXT: add v7.2d, v7.2d, v14.2d
-; CHECK-NEXT: add v5.2d, v5.2d, v14.2d
-; CHECK-NEXT: add v3.2d, v3.2d, v14.2d
-; CHECK-NEXT: add v2.2d, v2.2d, v14.2d
-; CHECK-NEXT: fmov d14, x18
-; CHECK-NEXT: mul x15, x15, x1
-; CHECK-NEXT: mov v14.d[1], x0
-; CHECK-NEXT: mul x14, x14, x1
-; CHECK-NEXT: add v30.2d, v30.2d, v14.2d
-; CHECK-NEXT: add v24.2d, v24.2d, v14.2d
-; CHECK-NEXT: add v20.2d, v20.2d, v14.2d
-; CHECK-NEXT: add v16.2d, v16.2d, v14.2d
-; CHECK-NEXT: add v4.2d, v4.2d, v14.2d
-; CHECK-NEXT: add v1.2d, v1.2d, v14.2d
-; CHECK-NEXT: fmov d14, x15
-; CHECK-NEXT: mul x13, x13, x1
-; CHECK-NEXT: mov v14.d[1], x14
-; CHECK-NEXT: mul x12, x12, x1
-; CHECK-NEXT: add v29.2d, v29.2d, v14.2d
-; CHECK-NEXT: fmov d14, x13
-; CHECK-NEXT: mul x17, x17, x1
-; CHECK-NEXT: mov v14.d[1], x12
-; CHECK-NEXT: mul x16, x16, x1
-; CHECK-NEXT: add v28.2d, v28.2d, v14.2d
-; CHECK-NEXT: fmov d14, x17
-; CHECK-NEXT: mov v14.d[1], x16
-; CHECK-NEXT: add x8, x8, #8
+; CHECK-NEXT: fmov d14, x3
+; CHECK-NEXT: add v9.2d, v9.2d, v1.2d
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: fmov d0, x13
+; CHECK-NEXT: mov v14.d[1], x15
+; CHECK-NEXT: add v31.2d, v31.2d, v1.2d
+; CHECK-NEXT: mov v0.d[1], x12
+; CHECK-NEXT: add v26.2d, v26.2d, v1.2d
+; CHECK-NEXT: add v23.2d, v23.2d, v1.2d
+; CHECK-NEXT: add v21.2d, v21.2d, v1.2d
+; CHECK-NEXT: add v19.2d, v19.2d, v1.2d
+; CHECK-NEXT: add v17.2d, v17.2d, v1.2d
+; CHECK-NEXT: add v7.2d, v7.2d, v1.2d
+; CHECK-NEXT: add v5.2d, v5.2d, v1.2d
+; CHECK-NEXT: add v3.2d, v3.2d, v1.2d
+; CHECK-NEXT: add v2.2d, v2.2d, v1.2d
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: add v27.2d, v27.2d, v14.2d
-; CHECK-NEXT: cmp x8, #64
-; CHECK-NEXT: add x9, x9, #1
+; CHECK-NEXT: add v8.2d, v8.2d, v15.2d
+; CHECK-NEXT: add v25.2d, v25.2d, v15.2d
+; CHECK-NEXT: add v22.2d, v22.2d, v15.2d
+; CHECK-NEXT: add v18.2d, v18.2d, v15.2d
+; CHECK-NEXT: add v6.2d, v6.2d, v15.2d
+; CHECK-NEXT: add v30.2d, v30.2d, v0.2d
+; CHECK-NEXT: add v24.2d, v24.2d, v0.2d
+; CHECK-NEXT: add v20.2d, v20.2d, v0.2d
+; CHECK-NEXT: add v16.2d, v16.2d, v0.2d
+; CHECK-NEXT: add v4.2d, v4.2d, v0.2d
+; CHECK-NEXT: add v1.2d, v1.2d, v0.2d
+; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: b.ne .LBB0_1
 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
 ; CHECK-NEXT: adrp x8, C
+; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT: add x8, x8, :lo12:C
+; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT: stp q13, q12, [x8]
 ; CHECK-NEXT: stp q11, q10, [x8, #32]
 ; CHECK-NEXT: stp q9, q8, [x8, #64]
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: stp q0, q2, [x8, #464]
+; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT: stp q31, q30, [x8, #96]
+; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT: stp q29, q28, [x8, #144]
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: stp q27, q26, [x8, #176]
 ; CHECK-NEXT: str q25, [x8, #208]
 ; CHECK-NEXT: stp q24, q23, [x8, #240]
@@ -163,9 +173,8 @@
 ; CHECK-NEXT: stp q16, q7, [x8, #368]
 ; CHECK-NEXT: stp q6, q5, [x8, #400]
 ; CHECK-NEXT: stp q4, q3, [x8, #432]
-; CHECK-NEXT: stp q0, q2, [x8, #464]
-; CHECK-NEXT: str q1, [x8, #496]
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: str q0, [x8, #496]
+; CHECK-NEXT: add sp, sp, #96
 ; CHECK-NEXT: ret
 entry:
 br label %for.cond1.preheader
diff --git a/llvm/test/CodeGen/AArch64/rand.ll b/llvm/test/CodeGen/AArch64/rand.ll
--- a/llvm/test/CodeGen/AArch64/rand.ll
+++ b/llvm/test/CodeGen/AArch64/rand.ll
@@ -4,11 +4,12 @@
 define i32 @rndr(i64* %__addr) {
 ; CHECK-LABEL: rndr:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mrs x9, RNDR
+; CHECK-NEXT: mrs x10, RNDR
+; CHECK-NEXT: mov x9, x0
 ; CHECK-NEXT: cset w8, eq
 ; CHECK-NEXT: and w8, w8, #0x1
-; CHECK-NEXT: str x9, [x0]
 ; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: str x10, [x9]
 ; CHECK-NEXT: ret
 %1 = tail call { i64, i1 } @llvm.aarch64.rndr()
 %2 = extractvalue { i64, i1 } %1, 0
@@ -22,11 +23,12 @@
 define i32 @rndrrs(i64* %__addr) {
 ; CHECK-LABEL: rndrrs:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mrs x9, RNDRRS
+; CHECK-NEXT: mrs x10, RNDRRS
+; CHECK-NEXT: mov x9, x0
 ; CHECK-NEXT: cset w8, eq
 ; CHECK-NEXT: and w8, w8, #0x1
-; CHECK-NEXT: str x9, [x0]
 ; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: str x10, [x9]
 ; CHECK-NEXT: ret
 %1 = tail call { i64, i1 } @llvm.aarch64.rndrrs()
 %2 = extractvalue { i64, i1 } %1, 0
diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll
--- a/llvm/test/CodeGen/AArch64/reduce-and.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-and.ll
@@ -44,13 +44,13 @@
 ; CHECK-LABEL: test_redand_v4i1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: and w10, w11, w10
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: and w9, w10, w9
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
 ; CHECK-NEXT: and w8, w9, w8
+; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: and w8, w8, w11
 ; CHECK-NEXT: and w0, w8, #0x1
 ; CHECK-NEXT: ret
 ;
@@ -62,10 +62,10 @@
 ; GISEL-NEXT: mov h3, v0.h[3]
 ; GISEL-NEXT: fmov w8, s0
 ; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
-; GISEL-NEXT: and w9, w9, w10
+; GISEL-NEXT: and w9, w10, w11
 ; GISEL-NEXT: and w8, w8, w9
 ; GISEL-NEXT: and w0, w8, #0x1
 ; GISEL-NEXT: ret
@@ -77,21 +77,21 @@
 ; CHECK-LABEL: test_redand_v8i1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w14, v0.b[1]
-; CHECK-NEXT: umov w15, v0.b[0]
-; CHECK-NEXT: umov w13, v0.b[2]
-; CHECK-NEXT: and w14, w15, w14
-; CHECK-NEXT: umov w12, v0.b[3]
-; CHECK-NEXT: and w13, w14, w13
-; CHECK-NEXT: umov w11, v0.b[4]
-; CHECK-NEXT: and w12, w13, w12
-; CHECK-NEXT: umov w10, v0.b[5]
-; CHECK-NEXT: and w11, w12, w11
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: and w10, w11, w10
-; CHECK-NEXT: umov w8, v0.b[7]
-; CHECK-NEXT: and w9, w10, w9
+; CHECK-NEXT: umov w8, v0.b[1]
+; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
+; CHECK-NEXT: umov w13, v0.b[5]
 ; CHECK-NEXT: and w8, w9, w8
+; CHECK-NEXT: umov w9, v0.b[6]
+; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[7]
+; CHECK-NEXT: and w8, w8, w11
+; CHECK-NEXT: and w8, w8, w12
+; CHECK-NEXT: and w8, w8, w13
+; CHECK-NEXT: and w8, w8, w9
+; CHECK-NEXT: and w8, w8, w10
 ; CHECK-NEXT: and w0, w8, #0x1
 ; CHECK-NEXT: ret
 ;
@@ -101,24 +101,24 @@
 ; GISEL-NEXT: mov b1, v0.b[1]
 ; GISEL-NEXT: mov b2, v0.b[2]
 ; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
 ; GISEL-NEXT: mov b4, v0.b[4]
 ; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
 ; GISEL-NEXT: mov b6, v0.b[6]
 ; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: and w9, w9, w10
-; GISEL-NEXT: fmov w10, s4
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: and w10, w10, w11
-; GISEL-NEXT: fmov w11, s6
-; GISEL-NEXT: fmov w12, s7
-; GISEL-NEXT: and w11, w11, w12
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: fmov w12, s4
+; GISEL-NEXT: fmov w13, s5
 ; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: and w9, w10, w11
+; GISEL-NEXT: fmov w9, s6
+; GISEL-NEXT: and w10, w10, w11
+; GISEL-NEXT: fmov w11, s7
+; GISEL-NEXT: and w12, w12, w13
+; GISEL-NEXT: and w8, w8, w10
+; GISEL-NEXT: and w9, w9, w11
+; GISEL-NEXT: and w9, w12, w9
 ; GISEL-NEXT: and w8, w8, w9
 ; GISEL-NEXT: and w0, w8, #0x1
 ; GISEL-NEXT: ret
@@ -133,19 +133,19 @@
 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: umov w8, v0.b[1]
 ; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
 ; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[2]
-; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[3]
-; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: and w8, w8, w9
 ; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: and w8, w8, w11
+; CHECK-NEXT: umov w11, v0.b[7]
+; CHECK-NEXT: and w8, w8, w12
 ; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: and w8, w8, w9
+; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: and w8, w8, w11
 ; CHECK-NEXT: and w0, w8, #0x1
 ; CHECK-NEXT: ret
 ;
@@ -154,47 +154,47 @@
 ; GISEL-NEXT: mov b1, v0.b[1]
 ; GISEL-NEXT: mov b2, v0.b[2]
 ; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
 ; GISEL-NEXT: mov b4, v0.b[4]
 ; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
 ; GISEL-NEXT: mov b6, v0.b[6]
 ; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: and w9, w9, w10
-; GISEL-NEXT: fmov w10, s4
-; GISEL-NEXT: fmov w11, s5
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: mov b16, v0.b[8]
 ; GISEL-NEXT: mov b17, v0.b[9]
-; GISEL-NEXT: and w10, w10, w11
-; GISEL-NEXT: fmov w11, s6
-; GISEL-NEXT: fmov w12, s7
 ; GISEL-NEXT: mov b18, v0.b[10]
 ; GISEL-NEXT: mov b19, v0.b[11]
-; GISEL-NEXT: and w11, w11, w12
-; GISEL-NEXT: fmov w12, s16
-; GISEL-NEXT: fmov w13, s17
+; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: and w9, w10, w11
+; GISEL-NEXT: fmov w10, s4
+; GISEL-NEXT: fmov w11, s5
+; GISEL-NEXT: fmov w12, s6
+; GISEL-NEXT: fmov w13, s7
 ; GISEL-NEXT: mov b20, v0.b[12]
 ; GISEL-NEXT: mov b21, v0.b[13]
-; GISEL-NEXT: and w12, w12, w13
-; GISEL-NEXT: fmov w13, s18
-; GISEL-NEXT: fmov w14, s19
 ; GISEL-NEXT: mov b22, v0.b[14]
 ; GISEL-NEXT: mov b23, v0.b[15]
-; GISEL-NEXT: and w13, w13, w14
-; GISEL-NEXT: fmov w14, s20
-; GISEL-NEXT: fmov w15, s21
-; GISEL-NEXT: and w14, w14, w15
-; GISEL-NEXT: fmov w15, s22
-; GISEL-NEXT: fmov w16, s23
-; GISEL-NEXT: and w15, w15, w16
+; GISEL-NEXT: and w10, w10, w11
+; GISEL-NEXT: and w11, w12, w13
+; GISEL-NEXT: fmov w12, s16
+; GISEL-NEXT: fmov w13, s17
+; GISEL-NEXT: fmov w14, s18
+; GISEL-NEXT: fmov w15, s19
+; GISEL-NEXT: fmov w16, s22
+; GISEL-NEXT: fmov w17, s23
 ; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: and w12, w12, w13
 ; GISEL-NEXT: and w9, w10, w11
+; GISEL-NEXT: and w13, w14, w15
+; GISEL-NEXT: fmov w14, s20
+; GISEL-NEXT: fmov w15, s21
 ; GISEL-NEXT: and w10, w12, w13
-; GISEL-NEXT: and w11, w14, w15
 ; GISEL-NEXT: and w8, w8, w9
+; GISEL-NEXT: and w14, w14, w15
+; GISEL-NEXT: and w15, w16, w17
+; GISEL-NEXT: and w11, w14, w15
 ; GISEL-NEXT: and w9, w10, w11
 ; GISEL-NEXT: and w8, w8, w9
 ; GISEL-NEXT: and w0, w8, #0x1
@@ -240,13 +240,13 @@
 ; CHECK-LABEL: test_redand_v4i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: and w10, w11, w10
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: and w9, w10, w9
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
+; CHECK-NEXT: and w8, w9, w8
+; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: and w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redand_v4i8:
@@ -257,10 +257,10 @@
 ; GISEL-NEXT: mov h3, v0.h[3]
 ; GISEL-NEXT: fmov w8, s0
 ; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
-; GISEL-NEXT: and w9, w9, w10
+; GISEL-NEXT: and w9, w10, w11
 ; GISEL-NEXT: and w0, w8, w9
 ; GISEL-NEXT: ret
 %and_result = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a)
@@ -271,21 +271,21 @@
 ; CHECK-LABEL: test_redand_v8i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w14, v0.b[1]
-; CHECK-NEXT: umov w15, v0.b[0]
-; CHECK-NEXT: umov w13, v0.b[2]
-; CHECK-NEXT: and w14, w15, w14
-; CHECK-NEXT: umov w12, v0.b[3]
-; CHECK-NEXT: and w13, w14, w13
-; CHECK-NEXT: umov w11, v0.b[4]
-; CHECK-NEXT: and w12, w13, w12
-; CHECK-NEXT: umov w10, v0.b[5]
-; CHECK-NEXT: and w11, w12, w11
+; CHECK-NEXT: umov w8, v0.b[1]
+; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
+; CHECK-NEXT: umov w13, v0.b[5]
+; CHECK-NEXT: and w8, w9, w8
 ; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: and w10, w11, w10
-; CHECK-NEXT: umov w8, v0.b[7]
-; CHECK-NEXT: and w9, w10, w9
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[7]
+; CHECK-NEXT: and w8, w8, w11
+; CHECK-NEXT: and w8, w8, w12
+; CHECK-NEXT: and w8, w8, w13
+; CHECK-NEXT: and w8, w8, w9
+; CHECK-NEXT: and w0, w8, w10
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redand_v8i8:
@@ -294,24 +294,24 @@
 ; GISEL-NEXT: mov b1, v0.b[1]
 ; GISEL-NEXT: mov b2, v0.b[2]
 ; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
 ; GISEL-NEXT: mov b4, v0.b[4]
 ; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
 ; GISEL-NEXT: mov b6, v0.b[6]
 ; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: and w9, w9, w10
-; GISEL-NEXT: fmov w10, s4
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: and w10, w10, w11
-; GISEL-NEXT: fmov w11, s6
-; GISEL-NEXT: fmov w12, s7
-; GISEL-NEXT: and w11, w11, w12
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: fmov w12, s4
+; GISEL-NEXT: fmov w13, s5
 ; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: and w9, w10, w11
+; GISEL-NEXT: fmov w9, s6
+; GISEL-NEXT: and w10, w10, w11
+; GISEL-NEXT: fmov w11, s7
+; GISEL-NEXT: and w12, w12, w13
+; GISEL-NEXT: and w8, w8, w10
+; GISEL-NEXT: and w9, w9, w11
+; GISEL-NEXT: and w9, w12, w9
 ; GISEL-NEXT: and w0, w8, w9
 ; GISEL-NEXT: ret
 %and_result = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
@@ -325,19 +325,19 @@
 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: umov w8, v0.b[1]
 ; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
 ; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[2]
-; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[3]
-; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: and w8, w8, w9
 ; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: and w8, w8, w11
+; CHECK-NEXT: umov w11, v0.b[7]
+; CHECK-NEXT: and w8, w8, w12
 ; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: and w0, w8, w9
+; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: and w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redand_v16i8:
@@ -347,24 +347,24 @@
 ; GISEL-NEXT: mov b1, v0.b[1]
 ; GISEL-NEXT: mov b2, v0.b[2]
 ; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
 ; GISEL-NEXT: mov b4, v0.b[4]
 ; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
 ; GISEL-NEXT: mov b6, v0.b[6]
 ; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: and w9, w9, w10
-; GISEL-NEXT: fmov w10, s4
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: and w10, w10, w11
-; GISEL-NEXT: fmov w11, s6
-; GISEL-NEXT: fmov w12, s7
-; GISEL-NEXT: and w11, w11, w12
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: fmov w12, s4
+; GISEL-NEXT: fmov w13, s5
 ; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: and w9, w10, w11
+; GISEL-NEXT: fmov w9, s6
+; GISEL-NEXT: and w10, w10, w11
+; GISEL-NEXT: fmov w11, s7
+; GISEL-NEXT: and w12, w12, w13
+; GISEL-NEXT: and w8, w8, w10
+; GISEL-NEXT: and w9, w9, w11
+; GISEL-NEXT: and w9, w12, w9
 ; GISEL-NEXT: and w0, w8, w9
 ; GISEL-NEXT: ret
 %and_result = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
@@ -379,19 +379,19 @@
 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: umov w8, v0.b[1]
 ; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
 ; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[2]
-; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[3]
-; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: and w8, w8, w9
 ; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: and w8, w8, w11
+; CHECK-NEXT: umov w11, v0.b[7]
+; CHECK-NEXT: and w8, w8, w12
 ; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: and w0, w8, w9
+; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: and w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redand_v32i8:
@@ -402,24 +402,24 @@
 ; GISEL-NEXT: mov b1, v0.b[1]
 ; GISEL-NEXT: mov b2, v0.b[2]
 ; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
 ; GISEL-NEXT: mov b4, v0.b[4]
 ; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
 ; GISEL-NEXT: mov b6, v0.b[6]
 ; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: and w9, w9, w10
-; GISEL-NEXT: fmov w10, s4
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: and w10, w10, w11
-; GISEL-NEXT: fmov w11, s6
-; GISEL-NEXT: fmov w12, s7
-; GISEL-NEXT: and w11, w11, w12
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: fmov w12, s4
+; GISEL-NEXT: fmov w13, s5
 ; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: and w9, w10, w11
+; GISEL-NEXT: fmov w9, s6
+; GISEL-NEXT: and w10, w10, w11
+; GISEL-NEXT: fmov w11, s7
+; GISEL-NEXT: and w12, w12, w13
+; GISEL-NEXT: and w8, w8, w10
+; GISEL-NEXT: and w9, w9, w11
+; GISEL-NEXT: and w9, w12, w9
 ; GISEL-NEXT: and w0, w8, w9
 ; GISEL-NEXT: ret
 %and_result = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a)
@@ -430,13 +430,13 @@
 ; CHECK-LABEL: test_redand_v4i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: and w10, w11, w10
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: and w9, w10, w9
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
+; CHECK-NEXT: and w8, w9, w8
+; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: and w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redand_v4i16:
@@ -447,10 +447,10 @@
 ; GISEL-NEXT: mov h3, v0.h[3]
 ; GISEL-NEXT: fmov w8, s0
 ; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
-; GISEL-NEXT: and w9, w9, w10
+; GISEL-NEXT: and w9, w10, w11
 ; GISEL-NEXT: and w0, w8, w9
 ; GISEL-NEXT: ret
 %and_result = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
@@ -465,10 +465,10 @@
 ; CHECK-NEXT: umov w8, v0.h[1]
 ; CHECK-NEXT: umov w9, v0.h[0]
 ; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
 ; CHECK-NEXT: and w8, w9, w8
 ; CHECK-NEXT: and w8, w8, w10
-; CHECK-NEXT: umov w9, v0.h[3]
-; CHECK-NEXT: and w0, w8, w9
+; CHECK-NEXT: and w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redand_v8i16:
@@ -480,10 +480,10 @@
 ; GISEL-NEXT: mov h3, v0.h[3]
 ; GISEL-NEXT: fmov w8, s0
 ; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
-; GISEL-NEXT: and w9, w9, w10
+; GISEL-NEXT: and w9, w10, w11
 ; GISEL-NEXT: and w0, w8, w9
 ; GISEL-NEXT: ret
 %and_result = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
@@ -499,10 +499,10 @@
 ; CHECK-NEXT: umov w8, v0.h[1]
 ; CHECK-NEXT: umov w9, v0.h[0]
 ; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
 ; CHECK-NEXT: and w8, w9, w8
 ; CHECK-NEXT: and w8, w8, w10
-; CHECK-NEXT: umov w9, v0.h[3]
-; CHECK-NEXT: and w0, w8, w9
+; CHECK-NEXT: and w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redand_v16i16:
@@ -515,10 +515,10 @@
 ; GISEL-NEXT: mov h3, v0.h[3]
 ; GISEL-NEXT: fmov w8, s0
 ; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: and w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
-; GISEL-NEXT: and w9, w9, w10
+; GISEL-NEXT: and w9, w10, w11
 ; GISEL-NEXT: and w0, w8, w9
 ; GISEL-NEXT: ret
 %and_result = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %a)
diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll
--- a/llvm/test/CodeGen/AArch64/reduce-or.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or.ll
@@ -44,13 +44,13 @@
 ; CHECK-LABEL: test_redor_v4i1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: orr w10, w11, w10
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: orr w9, w10, w9
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
 ; CHECK-NEXT: orr w8, w9, w8
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: orr w8, w8, w11
 ; CHECK-NEXT: and w0, w8, #0x1
 ; CHECK-NEXT: ret
 ;
@@ -62,10 +62,10 @@
 ; GISEL-NEXT: mov h3, v0.h[3]
 ; GISEL-NEXT: fmov w8, s0
 ; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
-; GISEL-NEXT: orr w9, w9, w10
+; GISEL-NEXT: orr w9, w10, w11
 ; GISEL-NEXT: orr w8, w8, w9
 ; GISEL-NEXT: and w0, w8, #0x1
 ; GISEL-NEXT: ret
@@ -77,21 +77,21 @@
 ; CHECK-LABEL: test_redor_v8i1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w14, v0.b[1]
-; CHECK-NEXT: umov w15, v0.b[0]
-; CHECK-NEXT: umov w13, v0.b[2]
-; CHECK-NEXT: orr w14, w15, w14
-; CHECK-NEXT: umov w12, v0.b[3]
-; CHECK-NEXT: orr w13, w14, w13
-; CHECK-NEXT: umov w11, v0.b[4]
-; CHECK-NEXT: orr w12, w13, w12
-; CHECK-NEXT: umov w10, v0.b[5]
-; CHECK-NEXT: orr w11, w12, w11
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: orr w10, w11, w10
-; CHECK-NEXT: umov w8, v0.b[7]
-; CHECK-NEXT: orr w9, w10, w9
+; CHECK-NEXT: umov w8, v0.b[1]
+; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
+; CHECK-NEXT: umov w13, v0.b[5]
 ; CHECK-NEXT: orr w8, w9, w8
+; CHECK-NEXT: umov w9, v0.b[6]
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[7]
+; CHECK-NEXT: orr w8, w8, w11
+; CHECK-NEXT: orr w8, w8, w12
+; CHECK-NEXT: orr w8, w8, w13
+; CHECK-NEXT: orr w8, w8, w9
+; CHECK-NEXT: orr w8, w8, w10
 ; CHECK-NEXT: and w0, w8, #0x1
 ; CHECK-NEXT: ret
 ;
@@ -101,24 +101,24 @@
 ; GISEL-NEXT: mov b1, v0.b[1]
 ; GISEL-NEXT: mov b2, v0.b[2]
 ; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
 ; GISEL-NEXT: mov b4, v0.b[4]
 ; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
 ; GISEL-NEXT: mov b6, v0.b[6]
 ; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: orr w9, w9, w10
-; GISEL-NEXT: fmov w10, s4
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: orr w10, w10, w11
-; GISEL-NEXT: fmov w11, s6
-; GISEL-NEXT: fmov w12, s7
-; GISEL-NEXT: orr w11, w11, w12
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: fmov w12, s4
+; GISEL-NEXT: fmov w13, s5
 ; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: orr w9, w10, w11
+; GISEL-NEXT: fmov w9, s6
+; GISEL-NEXT: orr w10, w10, w11
+; GISEL-NEXT: fmov w11, s7
+; GISEL-NEXT: orr w12, w12, w13
+; GISEL-NEXT: orr w8, w8, w10
+; GISEL-NEXT: orr w9, w9, w11
+; GISEL-NEXT: orr w9, w12, w9
 ; GISEL-NEXT: orr w8, w8, w9
 ; GISEL-NEXT: and w0, w8, #0x1
 ; GISEL-NEXT: ret
@@ -133,19 +133,19 @@
 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: umov w8, v0.b[1]
 ; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
 ; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[2]
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[3]
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: orr w8, w8, w9
 ; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: orr w8, w8, w11
+; CHECK-NEXT: umov w11, v0.b[7]
+; CHECK-NEXT: orr w8, w8, w12
 ; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: orr w8, w8, w9
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: orr w8, w8, w11
 ; CHECK-NEXT: and w0, w8, #0x1
 ; CHECK-NEXT: ret
 ;
@@ -154,47 +154,47 @@
 ; GISEL-NEXT: mov b1, v0.b[1]
 ; GISEL-NEXT: mov b2, v0.b[2]
 ; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
 ; GISEL-NEXT: mov b4, v0.b[4]
 ; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
 ; GISEL-NEXT: mov b6, v0.b[6]
 ; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: orr w9, w9, w10
-; GISEL-NEXT: fmov w10, s4
-; GISEL-NEXT: fmov w11, s5
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: mov b16, v0.b[8]
 ; GISEL-NEXT: mov b17, v0.b[9]
-; GISEL-NEXT: orr w10, w10, w11
-; GISEL-NEXT: fmov w11, s6
-; GISEL-NEXT: fmov w12, s7
 ; GISEL-NEXT: mov b18, v0.b[10]
 ; GISEL-NEXT: mov b19, v0.b[11]
-; GISEL-NEXT: orr w11, w11, w12
-; GISEL-NEXT: fmov w12, s16
-; GISEL-NEXT: fmov w13, s17
+; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: orr w9, w10, w11
+; GISEL-NEXT: fmov w10, s4
+; GISEL-NEXT: fmov w11, s5
+; GISEL-NEXT: fmov w12, s6
+; GISEL-NEXT: fmov w13, s7
 ; GISEL-NEXT: mov b20, v0.b[12]
 ; GISEL-NEXT: mov b21, v0.b[13]
-; GISEL-NEXT: orr w12, w12, w13
-; GISEL-NEXT: fmov w13, s18
-; GISEL-NEXT: fmov w14, s19
 ; GISEL-NEXT: mov b22, v0.b[14]
 ; GISEL-NEXT: mov b23, v0.b[15]
-; GISEL-NEXT: orr w13, w13, w14
-; GISEL-NEXT: fmov w14, s20
-; GISEL-NEXT: fmov w15, s21
-; GISEL-NEXT: orr w14, w14, w15
-; GISEL-NEXT: fmov w15, s22
-; GISEL-NEXT: fmov w16, s23
-; GISEL-NEXT: orr w15, w15, w16
+; GISEL-NEXT: orr w10, w10, w11
+; GISEL-NEXT: orr w11, w12, w13
+; GISEL-NEXT: fmov w12, s16
+; GISEL-NEXT: fmov w13, s17
+; GISEL-NEXT: fmov w14, s18
+; GISEL-NEXT: fmov w15, s19
+; GISEL-NEXT: fmov w16, s22
+; GISEL-NEXT: fmov w17, s23
 ; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: orr w12, w12, w13
 ; GISEL-NEXT: orr w9, w10, w11
+; GISEL-NEXT: orr w13, w14, w15
+; GISEL-NEXT: fmov w14, s20
+; GISEL-NEXT: fmov w15, s21
 ; GISEL-NEXT: orr w10, w12, w13
-; GISEL-NEXT: orr w11, w14, w15
 ; GISEL-NEXT: orr w8, w8, w9
+; GISEL-NEXT: orr w14, w14, w15
+; GISEL-NEXT: orr w15, w16, w17
+; GISEL-NEXT: orr w11, w14, w15
 ; GISEL-NEXT: orr w9, w10, w11
 ; GISEL-NEXT: orr w8, w8, w9
 ; GISEL-NEXT: and w0, w8, #0x1
@@ -239,13 +239,13 @@
 ; CHECK-LABEL: test_redor_v4i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: orr w10, w11, w10
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: orr w9, w10, w9
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
+; CHECK-NEXT: orr w8, w9, w8
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: orr w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redor_v4i8:
@@ -256,10 +256,10 @@
 ; GISEL-NEXT: mov h3, v0.h[3]
 ; GISEL-NEXT: fmov w8, s0
 ; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
-; GISEL-NEXT: orr w9, w9, w10
+; GISEL-NEXT: orr w9, w10, w11
 ; GISEL-NEXT: orr w0, w8, w9
 ; GISEL-NEXT: ret
 %or_result = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a)
@@ -270,21 +270,21 @@
 ; CHECK-LABEL: test_redor_v8i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w14, v0.b[1]
-; CHECK-NEXT: umov w15, v0.b[0]
-; CHECK-NEXT: umov w13, v0.b[2]
-; CHECK-NEXT: orr w14, w15, w14
-; CHECK-NEXT: umov w12, v0.b[3]
-; CHECK-NEXT: orr w13, w14, w13
-; CHECK-NEXT: umov w11, v0.b[4]
-; CHECK-NEXT: orr w12, w13, w12
-; CHECK-NEXT: umov w10, v0.b[5]
-; CHECK-NEXT: orr w11, w12, w11
+; CHECK-NEXT: umov w8, v0.b[1]
+; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
+; CHECK-NEXT: umov w13, v0.b[5]
+; CHECK-NEXT: orr w8, w9, w8
 ; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: orr w10, w11, w10
-; CHECK-NEXT: umov w8, v0.b[7]
-; CHECK-NEXT: orr w9, w10, w9
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[7]
+; CHECK-NEXT: orr w8, w8, w11
+; CHECK-NEXT: orr w8, w8, w12
+; CHECK-NEXT: orr w8, w8, w13
+; CHECK-NEXT: orr w8, w8, w9
+; CHECK-NEXT: orr w0, w8, w10
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redor_v8i8:
@@ -293,24 +293,24 @@
 ; GISEL-NEXT: mov b1, v0.b[1]
 ; GISEL-NEXT: mov b2, v0.b[2]
 ; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
 ; GISEL-NEXT: mov b4, v0.b[4]
 ; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
 ; GISEL-NEXT: mov b6, v0.b[6]
 ; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: orr w9, w9, w10
-; GISEL-NEXT: fmov w10, s4
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: orr w10, w10, w11
-; GISEL-NEXT: fmov w11, s6
-; GISEL-NEXT: fmov w12, s7
-; GISEL-NEXT: orr w11, w11, w12
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: fmov w12, s4
+; GISEL-NEXT: fmov w13, s5
 ; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: orr w9, w10, w11
+; GISEL-NEXT: fmov w9, s6
+; GISEL-NEXT: orr w10, w10, w11
+; GISEL-NEXT: fmov w11, s7
+; GISEL-NEXT: orr w12, w12, w13
+; GISEL-NEXT: orr w8, w8, w10
+; GISEL-NEXT: orr w9, w9, w11
+; GISEL-NEXT: orr w9, w12, w9
 ; GISEL-NEXT: orr w0, w8, w9
 ; GISEL-NEXT: ret
 %or_result = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
@@ -324,19 +324,19 @@
 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: umov w8, v0.b[1]
 ; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
 ; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[2]
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[3]
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: orr w8, w8, w9
 ; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: orr w8, w8, w11
+; CHECK-NEXT: umov w11, v0.b[7]
+; CHECK-NEXT: orr w8, w8, w12
 ; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: orr w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redor_v16i8:
@@ -346,24 +346,24 @@
 ; GISEL-NEXT: mov b1, v0.b[1]
 ; GISEL-NEXT: mov b2, v0.b[2]
 ; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
 ; GISEL-NEXT: mov b4, v0.b[4]
 ; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
 ; GISEL-NEXT: mov b6, v0.b[6]
 ; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: orr w9, w9, w10
-; GISEL-NEXT: fmov w10, s4
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: orr w10, w10, w11
-; GISEL-NEXT: fmov w11, s6
-; GISEL-NEXT: fmov w12, s7
-; GISEL-NEXT: orr w11, w11, w12
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: fmov w12, s4
+; GISEL-NEXT: fmov w13, s5
 ; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: orr w9, w10, w11
+; GISEL-NEXT: fmov w9, s6
+; GISEL-NEXT: orr w10, w10, w11
+; GISEL-NEXT: fmov w11, s7
+; GISEL-NEXT: orr w12, w12, w13
+; GISEL-NEXT: orr w8, w8, w10
+; GISEL-NEXT: orr w9, w9, w11
+; GISEL-NEXT: orr w9, w12, w9
 ; GISEL-NEXT: orr w0, w8, w9
 ; GISEL-NEXT: ret
 %or_result = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
@@ -378,19 +378,19 @@
 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: umov w8, v0.b[1]
 ; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
 ; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[2]
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[3]
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: orr w8, w8, w9
 ; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: orr w8, w8, w11
+; CHECK-NEXT: umov w11, v0.b[7]
+; CHECK-NEXT: orr w8, w8, w12
 ; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: orr w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redor_v32i8:
@@ -401,24 +401,24 @@
 ; GISEL-NEXT: mov b1, v0.b[1]
 ; GISEL-NEXT: mov b2, v0.b[2]
 ; GISEL-NEXT: mov b3, v0.b[3]
-; GISEL-NEXT: fmov w8, s0
-; GISEL-NEXT: fmov w9, s1
 ; GISEL-NEXT: mov b4, v0.b[4]
 ; GISEL-NEXT: mov b5, v0.b[5]
-; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
 ; GISEL-NEXT: mov b6, v0.b[6]
 ; GISEL-NEXT: mov b7, v0.b[7]
-; GISEL-NEXT: orr w9, w9, w10
-; GISEL-NEXT: fmov w10, s4
-; GISEL-NEXT: fmov w11, s5
-; GISEL-NEXT: orr w10, w10, w11
-; GISEL-NEXT: fmov w11, s6
-; GISEL-NEXT: fmov w12, s7
-; GISEL-NEXT: orr w11, w11, w12
+; GISEL-NEXT: fmov w8, s0
+; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
+; GISEL-NEXT: fmov w12, s4
+; GISEL-NEXT: fmov w13, s5
 ; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: orr w9, w10, w11
+; GISEL-NEXT: fmov w9, s6
+; GISEL-NEXT: orr w10, w10, w11
+; GISEL-NEXT: fmov w11, s7
+; GISEL-NEXT: orr w12, w12, w13
+; GISEL-NEXT: orr w8, w8, w10
+; GISEL-NEXT: orr w9, w9, w11
+; GISEL-NEXT: orr w9, w12, w9
 ; GISEL-NEXT: orr w0, w8, w9
 ; GISEL-NEXT: ret
 %or_result = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a)
@@ -429,13 +429,13 @@
 ; CHECK-LABEL: test_redor_v4i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: orr w10, w11, w10
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: orr w9, w10, w9
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
+; CHECK-NEXT: orr w8, w9, w8
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: orr w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redor_v4i16:
@@ -446,10 +446,10 @@
 ; GISEL-NEXT: mov h3, v0.h[3]
 ; GISEL-NEXT: fmov w8, s0
 ; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
-; GISEL-NEXT: orr w9, w9, w10
+; GISEL-NEXT: orr w9, w10, w11
 ; GISEL-NEXT: orr w0, w8, w9
 ; GISEL-NEXT: ret
 %or_result = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
@@ -464,10 +464,10 @@
 ; CHECK-NEXT: umov w8, v0.h[1]
 ; CHECK-NEXT: umov w9, v0.h[0]
 ; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
 ; CHECK-NEXT: orr w8, w9, w8
 ; CHECK-NEXT: orr w8, w8, w10
-; CHECK-NEXT: umov w9, v0.h[3]
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: orr w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redor_v8i16:
@@ -479,10 +479,10 @@
 ; GISEL-NEXT: mov h3, v0.h[3]
 ; GISEL-NEXT: fmov w8, s0
 ; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
-; GISEL-NEXT: orr w9, w9, w10
+; GISEL-NEXT: orr w9, w10, w11
 ; GISEL-NEXT: orr w0, w8, w9
 ; GISEL-NEXT: ret
 %or_result = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
@@ -498,10 +498,10 @@
 ; CHECK-NEXT: umov w8, v0.h[1]
 ; CHECK-NEXT: umov w9, v0.h[0]
 ; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
 ; CHECK-NEXT: orr w8, w9, w8
 ; CHECK-NEXT: orr w8, w8, w10
-; CHECK-NEXT: umov w9, v0.h[3]
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: orr w0, w8, w11
 ; CHECK-NEXT: ret
 ;
 ; GISEL-LABEL: test_redor_v16i16:
@@ -514,10 +514,10 @@
 ; GISEL-NEXT: mov h3, v0.h[3]
 ; GISEL-NEXT: fmov w8, s0
 ; GISEL-NEXT: fmov w9, s1
+; GISEL-NEXT: fmov w10, s2
+; GISEL-NEXT: fmov w11, s3
 ; GISEL-NEXT: orr w8, w8, w9
-; GISEL-NEXT: fmov w9, s2
-; GISEL-NEXT: fmov w10, s3
-; GISEL-NEXT: orr w9, w9, w10
+; GISEL-NEXT: orr w9, w10, w11
 ; GISEL-NEXT: orr w0, w8, w9
 ; GISEL-NEXT: ret
 %or_result = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %a)
diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll
--- a/llvm/test/CodeGen/AArch64/reduce-xor.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll
@@ -27,13 +27,13 @@
 ; CHECK-LABEL: test_redxor_v4i1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: eor w10, w11, w10
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: eor w9, w10, w9
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
 ; CHECK-NEXT: eor w8, w9, w8
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: eor w8, w8, w11
 ; CHECK-NEXT: and w0, w8, #0x1
 ; CHECK-NEXT: ret
 %or_result = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
@@ -44,21 +44,21 @@
 ; CHECK-LABEL: test_redxor_v8i1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w14, v0.b[1]
-; CHECK-NEXT: umov w15, v0.b[0]
-; CHECK-NEXT: umov w13, v0.b[2]
-; CHECK-NEXT: eor w14, w15, w14
-; CHECK-NEXT: umov w12, v0.b[3]
-; CHECK-NEXT: eor w13, w14, w13
-; CHECK-NEXT: umov w11, v0.b[4]
-; CHECK-NEXT: eor w12, w13, w12
-; CHECK-NEXT: umov w10, v0.b[5]
-; CHECK-NEXT: eor w11, w12, w11
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: eor w10, w11, w10
-; CHECK-NEXT: umov w8, v0.b[7]
-; CHECK-NEXT: eor w9, w10, w9
+; CHECK-NEXT: umov w8, v0.b[1]
+; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
+; CHECK-NEXT: umov w13, v0.b[5]
 ; CHECK-NEXT: eor w8, w9, w8
+; CHECK-NEXT: umov w9, v0.b[6]
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[7]
+; CHECK-NEXT: eor w8, w8, w11
+; CHECK-NEXT: eor w8, w8, w12
+; CHECK-NEXT: eor w8, w8, w13
+; CHECK-NEXT: eor w8, w8, w9
+; CHECK-NEXT: eor w8, w8, w10
 ; CHECK-NEXT: and w0, w8, #0x1
 ; CHECK-NEXT: ret
 %or_result = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
@@ -72,19 +72,19 @@
 ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: umov w8, v0.b[1]
 ; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
 ; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[2]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[3]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: eor w8, w8, w9
 ; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: eor w8, w8, w11
+; CHECK-NEXT: umov w11, v0.b[7]
+; CHECK-NEXT: eor w8, w8, w12
 ; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: eor w8, w8, w9
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: eor w8, w8, w11
 ; CHECK-NEXT: and w0, w8, #0x1
 ; CHECK-NEXT: ret
 %or_result = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
@@ -115,13 +115,13 @@
 ; CHECK-LABEL: test_redxor_v4i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: eor w10, w11, w10
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: eor w9, w10, w9
-; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
+; CHECK-NEXT: eor w8, w9, w8
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: eor w0, w8, w11
 ; CHECK-NEXT: ret
 %xor_result = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a)
 ret i8 %xor_result
@@ -131,21 +131,21 @@
 ; CHECK-LABEL: test_redxor_v8i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w14, v0.b[1]
-; CHECK-NEXT: umov w15, v0.b[0]
-; CHECK-NEXT: umov w13, v0.b[2]
-; CHECK-NEXT: eor w14, w15, w14
-; CHECK-NEXT: umov w12, v0.b[3]
-; CHECK-NEXT: eor w13, w14, w13
-; CHECK-NEXT: umov w11, v0.b[4]
-; CHECK-NEXT: eor w12, w13, w12
-; CHECK-NEXT: umov w10, v0.b[5]
-; CHECK-NEXT: eor w11, w12, w11
+; CHECK-NEXT: umov w8, v0.b[1]
+; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
+; CHECK-NEXT: umov w13, v0.b[5]
+; CHECK-NEXT: eor w8, w9, w8
 ; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: eor w10, w11, w10
-; CHECK-NEXT: umov w8, v0.b[7]
-; CHECK-NEXT: eor w9, w10, w9
-; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[7]
+; CHECK-NEXT: eor w8, w8, w11
+; CHECK-NEXT: eor w8, w8, w12
+; CHECK-NEXT: eor w8, w8, w13
+; CHECK-NEXT: eor w8, w8, w9
+; CHECK-NEXT: eor w0, w8, w10
 ; CHECK-NEXT: ret
 %xor_result = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
 ret i8 %xor_result
@@ -158,19 +158,19 @@
 ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: umov w8, v0.b[1]
 ; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
 ; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[2]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[3]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: eor w8, w8, w9
 ; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: eor w8, w8, w11
+; CHECK-NEXT: umov w11, v0.b[7]
+; CHECK-NEXT: eor w8, w8, w12
 ; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: eor w0, w8, w9
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: eor w0, w8, w11
 ; CHECK-NEXT: ret
 %xor_result = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
 ret i8 %xor_result
@@ -184,19 +184,19 @@
 ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: umov w8, v0.b[1]
 ; CHECK-NEXT: umov w9, v0.b[0]
+; CHECK-NEXT: umov w10, v0.b[2]
+; CHECK-NEXT: umov w11, v0.b[3]
+; CHECK-NEXT: umov w12, v0.b[4]
 ; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[2]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[3]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: eor w8, w8, w9
 ; CHECK-NEXT: umov w9, v0.b[5]
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: umov w10, v0.b[6]
+; CHECK-NEXT: eor w8, w8, w11
+; CHECK-NEXT: umov w11, v0.b[7]
+; CHECK-NEXT: eor w8, w8, w12
 ; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[6]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: eor w0, w8, w9
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: eor w0, w8, w11
 ; CHECK-NEXT: ret
 %xor_result = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %a)
 ret i8 %xor_result
@@ -206,13 +206,13 @@
 ; CHECK-LABEL: test_redxor_v4i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: eor w10, w11, w10
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: eor w9, w10, w9
-; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
+; CHECK-NEXT: eor w8, w9, w8
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: eor w0, w8, w11
 ; CHECK-NEXT: ret
 %xor_result = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
 ret i16 %xor_result
@@ -226,10 +226,10 @@
 ; CHECK-NEXT: umov w8, v0.h[1]
 ; CHECK-NEXT: umov w9, v0.h[0]
 ; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
 ; CHECK-NEXT: eor w8, w9, w8
 ; CHECK-NEXT: eor w8, w8, w10
-; CHECK-NEXT: umov w9, v0.h[3]
-; CHECK-NEXT: eor w0, w8, w9
+; CHECK-NEXT: eor w0, w8, w11
 ; CHECK-NEXT: ret
 %xor_result = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
 ret i16 %xor_result
@@ -244,10 +244,10 @@
 ; CHECK-NEXT: umov w8, v0.h[1]
 ; CHECK-NEXT: umov w9, v0.h[0]
 ; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
 ; CHECK-NEXT: eor w8, w9, w8
 ; CHECK-NEXT: eor w8, w8, w10
-; CHECK-NEXT: umov w9, v0.h[3]
-; CHECK-NEXT: eor w0, w8, w9
+; CHECK-NEXT: eor w0, w8, w11
 ; CHECK-NEXT: ret
 %xor_result = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %a)
 ret i16 %xor_result
diff --git a/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll b/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll
--- a/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll
+++ b/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -27,8 +27,8 @@
 ; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]]
 ; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #1
-; CHECK: sturb w[[STRVAL:[0-9]+]], [x29, [[LOCADDR]]]
-; CHECK: and x0, x[[STRVAL]], #0xff
+; CHECK: and x0, x[[STRVAL:[0-9]+]], #0xff
+; CHECK: sturb w[[STRVAL]], [x29, [[LOCADDR]]]
 %ret.1 = load i8, i8* %locvar
 %ret.2 = zext i8 %ret.1 to i64
diff --git a/llvm/test/CodeGen/AArch64/rotate-extract.ll b/llvm/test/CodeGen/AArch64/rotate-extract.ll
--- a/llvm/test/CodeGen/AArch64/rotate-extract.ll
+++ b/llvm/test/CodeGen/AArch64/rotate-extract.ll
@@ -127,15 +127,15 @@
 define i32 @no_extract_udiv(i32 %i) nounwind {
 ; CHECK-LABEL: no_extract_udiv:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #43691
-; CHECK-NEXT: mov w9, #33437
-; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: movk w9, #21399, lsl #16
+; CHECK-NEXT: mov w8, #33437
+; CHECK-NEXT: mov w9, #43691
+; CHECK-NEXT: movk w8, #21399, lsl #16
+; CHECK-NEXT: movk w9, #43690, lsl #16
 ; CHECK-NEXT: umull x8, w0, w8
 ; CHECK-NEXT: umull x9, w0, w9
-; CHECK-NEXT: lsr x8, x8, #33
-; CHECK-NEXT: lsr x9, x9, #32
-; CHECK-NEXT: extr w0, w8, w9, #4
+; CHECK-NEXT: lsr x8, x8, #32
+; CHECK-NEXT: lsr x9, x9, #33
+; CHECK-NEXT: extr w0, w9, w8, #4
 ; CHECK-NEXT: ret
 %lhs_div = udiv i32 %i, 3
 %rhs_div = udiv i32 %i, 49
diff --git a/llvm/test/CodeGen/AArch64/rvmarker-pseudo-expansion-and-outlining.mir b/llvm/test/CodeGen/AArch64/rvmarker-pseudo-expansion-and-outlining.mir
--- a/llvm/test/CodeGen/AArch64/rvmarker-pseudo-expansion-and-outlining.mir
+++ b/llvm/test/CodeGen/AArch64/rvmarker-pseudo-expansion-and-outlining.mir
@@ -7,13 +7,13 @@
 # CHECK: bb.0:
 # CHECK: bl _cb1
 # CHECK-NEXT: mov x29, x29
-# CHECK-NEXT: bl _OUTLINED_FUNCTION_0
+# CHECK: b _OUTLINED_FUNCTION_0
 #
 # CHECK-LABEL: _fn2:
 # CHECK: bb.0:
 # CHECK: bl _cb2
 # CHECK-NEXT: mov x29, x29
-# CHECK-NEXT: bl _OUTLINED_FUNCTION_0
+# CHECK: b _OUTLINED_FUNCTION_0
 #
 # CHECK-LABEL: _OUTLINED_FUNCTION_0:
 # CHECK: bb.0:
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat.ll b/llvm/test/CodeGen/AArch64/sadd_sat.ll
--- a/llvm/test/CodeGen/AArch64/sadd_sat.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat.ll
@@ -40,8 +40,8 @@
 ; CHECK-NEXT: add w8, w8, w1, sxth
 ; CHECK-NEXT: cmp w8, w9
 ; CHECK-NEXT: csel w8, w8, w9, lt
-; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768
 ; CHECK-NEXT: mov w9, #-32768
+; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768
 ; CHECK-NEXT: csel w0, w8, w9, gt
 ; CHECK-NEXT: ret
 %tmp = call i16 @llvm.sadd.sat.i16(i16 %x, i16 %y);
@@ -52,12 +52,12 @@
 ; CHECK-LABEL: func8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: add w8, w8, w1, sxtb
 ; CHECK-NEXT: mov w9, #127
+; CHECK-NEXT: add w8, w8, w1, sxtb
 ; CHECK-NEXT: cmp w8, #127
 ; CHECK-NEXT: csel w8, w8, w9, lt
-; CHECK-NEXT: cmn w8, #128
 ; CHECK-NEXT: mov w9, #-128
+; CHECK-NEXT: cmn w8, #128
 ; CHECK-NEXT: csel w0, w8, w9, gt
 ; CHECK-NEXT: ret
 %tmp = call i8 @llvm.sadd.sat.i8(i8 %x, i8 %y);
@@ -70,11 +70,11 @@
 ; CHECK-NEXT: lsl w8, w1, #28
 ; CHECK-NEXT: sbfx w9, w0, #0, #4
 ; CHECK-NEXT: add w8, w9, w8, asr #28
-; CHECK-NEXT: mov w10, #7
+; CHECK-NEXT: mov w9, #7
 ; CHECK-NEXT: cmp w8, #7
-; CHECK-NEXT: csel w8, w8, w10, lt
-; CHECK-NEXT: cmn w8, #8
+; CHECK-NEXT: csel w8, w8, w9, lt
 ; CHECK-NEXT: mov w9, #-8
+; CHECK-NEXT: cmn w8, #8
 ; CHECK-NEXT: csel w0, w8, w9, gt
 ; CHECK-NEXT: ret
 %tmp = call i4 @llvm.sadd.sat.i4(i4 %x, i4 %y);
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll b/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll
--- a/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll
@@ -37,14 +37,14 @@
 define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; CHECK-LABEL: func16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: sxth w8, w0
 ; CHECK-NEXT: mul w9, w1, w2
-; CHECK-NEXT: mov w10, #32767
-; CHECK-NEXT: add w8, w8, w9, sxth
-; CHECK-NEXT: cmp w8, w10
-; CHECK-NEXT: csel w8, w8, w10, lt
-; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768
+; CHECK-NEXT: sxth w10, w0
+; CHECK-NEXT: mov w8, #32767
+; CHECK-NEXT: add w9, w10, w9, sxth
+; CHECK-NEXT: cmp w9, w8
+; CHECK-NEXT: csel w8, w9, w8, lt
 ; CHECK-NEXT: mov w9, #-32768
+; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768
 ; CHECK-NEXT: csel w0, w8, w9, gt
 ; CHECK-NEXT: ret
 %a = mul i16 %y, %z
@@ -55,14 +55,14 @@
 define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; CHECK-LABEL: func8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: sxtb w8, w0
 ; CHECK-NEXT: mul w9, w1, w2
-; CHECK-NEXT: add w8, w8, w9, sxtb
-; CHECK-NEXT: mov w10, #127
-; CHECK-NEXT: cmp w8, #127
-; CHECK-NEXT: csel w8, w8, w10, lt
-; CHECK-NEXT: cmn w8, #128
+; CHECK-NEXT: sxtb w10, w0
+; CHECK-NEXT: mov w8, #127
+; CHECK-NEXT: add w9, w10, w9, sxtb
+; CHECK-NEXT: cmp w9, #127
+; CHECK-NEXT: csel w8, w9, w8, lt
 ; CHECK-NEXT: mov w9, #-128
+; CHECK-NEXT: cmn w8, #128
 ; CHECK-NEXT: csel w0, w8, w9, gt
 ; CHECK-NEXT: ret
 %a = mul i8 %y, %z
@@ -74,14 +74,14 @@
 ; CHECK-LABEL: func4:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mul w9, w1, w2
-; CHECK-NEXT: sbfx w8, w0, #0, #4
+; CHECK-NEXT: sbfx w10, w0, #0, #4
+; CHECK-NEXT: mov w8, #7
 ; CHECK-NEXT: lsl w9, w9, #28
-; CHECK-NEXT: add w8, w8, w9, asr #28
-; CHECK-NEXT: mov w10, #7
-; CHECK-NEXT: cmp w8, #7
-; CHECK-NEXT: csel w8, w8, w10, lt
-; CHECK-NEXT: cmn w8, #8
+; CHECK-NEXT: add w9, w10, w9, asr #28
+; CHECK-NEXT: cmp w9, #7
+; CHECK-NEXT: csel w8, w9, w8, lt
 ; CHECK-NEXT: mov w9, #-8
+; CHECK-NEXT: cmn w8, #8
 ; CHECK-NEXT: csel w0, w8, w9, gt
 ; CHECK-NEXT: ret
 %a = mul i4 %y, %z
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -54,9 +54,9 @@
 define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
 ; CHECK-LABEL: v64i8:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: sqadd v2.16b, v2.16b, v6.16b
 ; CHECK-NEXT: sqadd v0.16b, v0.16b, v4.16b
 ; CHECK-NEXT: sqadd v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: sqadd v2.16b, v2.16b, v6.16b
 ; CHECK-NEXT: sqadd v3.16b, v3.16b, v7.16b
 ; CHECK-NEXT: ret
 %z = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
@@ -85,9 +85,9 @@
 define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
 ; CHECK-LABEL: v32i16:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: sqadd v2.8h, v2.8h, v6.8h
 ; CHECK-NEXT: sqadd v0.8h, v0.8h, v4.8h
 ; CHECK-NEXT: sqadd v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: sqadd v2.8h, v2.8h, v6.8h
 ; CHECK-NEXT: sqadd v3.8h, v3.8h, v7.8h
 ; CHECK-NEXT: ret
 %z = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
@@ -116,8 +116,8 @@
 ; CHECK-NEXT: ldr s1, [x1]
 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: shl v1.4h, v1.4h, #8
 ; CHECK-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-NEXT: shl v1.4h, v1.4h, #8
 ; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: sshr v0.4h, v0.4h, #8
 ; CHECK-NEXT: xtn v0.8b, v0.8h
@@ -134,11 +134,11 @@
 ; CHECK-LABEL: v2i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ld1 { v0.b }[0], [x1]
+; CHECK-NEXT: add x8, x1, #1
 ; CHECK-NEXT: ld1 { v1.b }[0], [x0]
-; CHECK-NEXT: add x8, x0, #1
-; CHECK-NEXT: add x9, x1, #1
-; CHECK-NEXT: ld1 { v0.b }[4], [x9]
-; CHECK-NEXT: ld1 { v1.b }[4], [x8]
+; CHECK-NEXT: add x9, x0, #1
+; CHECK-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-NEXT: ld1 { v1.b }[4], [x9]
 ; CHECK-NEXT: shl v0.2s, v0.2s, #24
 ; CHECK-NEXT: shl v1.2s, v1.2s, #24
 ; CHECK-NEXT: sqadd v0.2s, v1.2s, v0.2s
@@ -174,11 +174,11 @@
 ; CHECK-LABEL: v2i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ld1 { v0.h }[0], [x1]
+; CHECK-NEXT: add x8, x1, #2
 ; CHECK-NEXT: ld1 { v1.h }[0], [x0]
-; CHECK-NEXT: add x8, x0, #2
-; CHECK-NEXT: add x9, x1, #2
-; CHECK-NEXT: ld1 { v0.h }[2], [x9]
-; CHECK-NEXT: ld1 { v1.h }[2], [x8]
+; CHECK-NEXT: add x9, x0, #2
+; CHECK-NEXT: ld1 { v0.h }[2], [x8]
+; CHECK-NEXT: ld1 { v1.h }[2], [x9]
 ; CHECK-NEXT: shl v0.2s, v0.2s, #16
 ; CHECK-NEXT: shl v1.2s, v1.2s, #16
 ; CHECK-NEXT: sqadd v0.2s, v1.2s, v0.2s
@@ -207,10 +207,10 @@
 define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind {
 ; CHECK-LABEL: v12i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: ldp q3, q2, [x1]
-; CHECK-NEXT: sqadd v1.8h, v1.8h, v2.8h
-; CHECK-NEXT: sqadd v0.8h, v0.8h, v3.8h
+; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: sqadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: sqadd v1.8h, v2.8h, v3.8h
 ; CHECK-NEXT: str q0, [x2]
 ; CHECK-NEXT: str d1, [x2, #16]
 ; CHECK-NEXT: ret
@@ -254,10 +254,10 @@
 define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; CHECK-LABEL: v16i4:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.16b, v0.16b, #4
 ; CHECK-NEXT: shl v1.16b, v1.16b, #4
-; CHECK-NEXT: sshr v0.16b, v0.16b, #4
+; CHECK-NEXT: shl v0.16b, v0.16b, #4
 ; CHECK-NEXT: sshr v1.16b, v1.16b, #4
+; CHECK-NEXT: sshr v0.16b, v0.16b, #4
 ; CHECK-NEXT: shl v1.16b, v1.16b, #4
 ; CHECK-NEXT: shl v0.16b, v0.16b, #4
 ; CHECK-NEXT: sqadd v0.16b, v0.16b, v1.16b
@@ -307,9 +307,9 @@
 define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
 ; CHECK-LABEL: v16i32:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: sqadd v2.4s, v2.4s, v6.4s
 ; CHECK-NEXT: sqadd v0.4s, v0.4s, v4.4s
 ; CHECK-NEXT: sqadd v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: sqadd v2.4s, v2.4s, v6.4s
 ; CHECK-NEXT: sqadd v3.4s, v3.4s, v7.4s
 ; CHECK-NEXT: ret
 %z = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
@@ -338,9 +338,9 @@
 define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
 ; CHECK-LABEL: v8i64:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: sqadd v2.2d, v2.2d, v6.2d
 ; CHECK-NEXT: sqadd v0.2d, v0.2d, v4.2d
 ; CHECK-NEXT: sqadd v1.2d, v1.2d, v5.2d
-; CHECK-NEXT: sqadd v2.2d, v2.2d, v6.2d
 ; CHECK-NEXT: sqadd v3.2d, v3.2d, v7.2d
 ; CHECK-NEXT: ret
 %z = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
@@ -351,23 +351,23 @@
 ; CHECK-LABEL: v2i128:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adds x8, x2, x6
-; CHECK-NEXT: adcs x11, x3, x7
-; CHECK-NEXT: eor x9, x3, x7
-; CHECK-NEXT: eor x12, x3, x11
-; CHECK-NEXT: bics xzr, x12, x9
-; CHECK-NEXT: asr x9, x11, #63
-; CHECK-NEXT: eor x12, x9, #0x8000000000000000
-; CHECK-NEXT: csel x2, x9, x8, lt
-; CHECK-NEXT: csel x3, x12, x11, lt
+; CHECK-NEXT: eor x10, x3, x7
+; CHECK-NEXT: adcs x9, x3, x7
+; CHECK-NEXT: eor x11, x3, x9
+; CHECK-NEXT: asr x12, x9, #63
+; CHECK-NEXT: bics xzr, x11, x10
+; CHECK-NEXT: eor x10, x1, x5
+; CHECK-NEXT: csel x2, x12, x8, lt
+; CHECK-NEXT: eor x8, x12, #0x8000000000000000
+; CHECK-NEXT: csel x3, x8, x9, lt
 ; CHECK-NEXT: adds x8, x0, x4
 ; CHECK-NEXT: adcs x9, x1, x5
-; CHECK-NEXT: eor x10, x1, x5
-; CHECK-NEXT: eor x12, x1, x9
-; CHECK-NEXT: asr x11, x9, #63
-; CHECK-NEXT: bics xzr, x12, x10
-; CHECK-NEXT: eor x13, x11, #0x8000000000000000
-; CHECK-NEXT: csel x8, x11, x8, lt
-; CHECK-NEXT: csel x1, x13, x9, lt
+; CHECK-NEXT: eor x11, x1, x9
+; CHECK-NEXT: asr x12, x9, #63
+; CHECK-NEXT: bics xzr, x11, x10
+; CHECK-NEXT: eor x10, x12, #0x8000000000000000
+; CHECK-NEXT: csel x8, x12, x8, lt
+; CHECK-NEXT: csel x1, x10, x9, lt
 ; CHECK-NEXT: fmov d0, x8
 ; CHECK-NEXT: mov v0.d[1], x1
 ; CHECK-NEXT: fmov x0, d0
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -9,9 +9,9 @@
 define i8 @unsigned_sat_constant_i8_using_min(i8 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i8_using_min:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: cmp w8, #213
+; CHECK-NEXT: and w9, w0, #0xff
 ; CHECK-NEXT: mov w8, #-43
+; CHECK-NEXT: cmp w9, #213
 ; CHECK-NEXT: csel w8, w0, w8, lo
 ; CHECK-NEXT: add w0, w8, #42
 ; CHECK-NEXT: ret
@@ -81,10 +81,10 @@
 define i16 @unsigned_sat_constant_i16_using_cmp_notval(i16 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_notval:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #65493
-; CHECK-NEXT: add w8, w0, #42
-; CHECK-NEXT: cmp w9, w0, uxth
-; CHECK-NEXT: csinv w0, w8, wzr, hs
+; CHECK-NEXT: mov w8, #65493
+; CHECK-NEXT: add w9, w0, #42
+; CHECK-NEXT: cmp w8, w0, uxth
+; CHECK-NEXT: csinv w0, w9, wzr, hs
 ; CHECK-NEXT: ret
 %a = add i16 %x, 42
 %c = icmp ugt i16 %x, -43
@@ -95,8 +95,8 @@
 define i32 @unsigned_sat_constant_i32_using_min(i32 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i32_using_min:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cmn w0, #43
 ; CHECK-NEXT: mov w8, #-43
+; CHECK-NEXT: cmn w0, #43
 ; CHECK-NEXT: csel w8, w0, w8, lo
 ; CHECK-NEXT: add w0, w8, #42
 ; CHECK-NEXT: ret
@@ -133,8 +133,8 @@
 define i64 @unsigned_sat_constant_i64_using_min(i64 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i64_using_min:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cmn x0, #43
 ; CHECK-NEXT: mov x8, #-43
+; CHECK-NEXT: cmn x0, #43
 ; CHECK-NEXT: csel x8, x0, x8, lo
 ; CHECK-NEXT: add x0, x8, #42
 ; CHECK-NEXT: ret
@@ -202,8 +202,8 @@
 ; CHECK-LABEL: unsigned_sat_variable_i8_using_cmp_notval:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: and w8, w1, #0xff
-; CHECK-NEXT: add w8, w8, w0, uxtb
 ; CHECK-NEXT: add w9, w0, w1
+; CHECK-NEXT: add w8, w8, w0, uxtb
 ; CHECK-NEXT: tst w8, #0x100
 ; CHECK-NEXT: csinv w0, w9, wzr, eq
 ; CHECK-NEXT: ret
@@ -217,9 +217,9 @@
 define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) {
 ; CHECK-LABEL: unsigned_sat_variable_i16_using_min:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: mvn w9, w1
-; CHECK-NEXT: cmp w8, w9, uxth
+; CHECK-NEXT: mvn w8, w1
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: cmp w9, w8, uxth
 ; CHECK-NEXT: csinv w8, w0, w1, lo
 ; CHECK-NEXT: add w0, w8, w1
 ; CHECK-NEXT: ret
@@ -248,8 +248,8 @@
 ; CHECK-LABEL: unsigned_sat_variable_i16_using_cmp_notval:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: and w8, w1, #0xffff
-; CHECK-NEXT: add w8, w8, w0, uxth
 ; CHECK-NEXT: add w9, w0, w1
+; CHECK-NEXT: add w8, w8, w0, uxth
 ; CHECK-NEXT: tst w8, #0x10000
 ; CHECK-NEXT: csinv w0, w9, wzr, eq
 ; CHECK-NEXT: ret
@@ -461,10 +461,10 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov x8, #-43
 ; CHECK-NEXT: dup v1.2d, x8
-; CHECK-NEXT: mov w9, #42
+; CHECK-NEXT: mov w8, #42
 ; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d
 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT: dup v1.2d, x9
+; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %c = icmp ult <2 x i64> %x,
diff --git a/llvm/test/CodeGen/AArch64/sdivpow2.ll b/llvm/test/CodeGen/AArch64/sdivpow2.ll
--- a/llvm/test/CodeGen/AArch64/sdivpow2.ll
+++ b/llvm/test/CodeGen/AArch64/sdivpow2.ll
@@ -78,8 +78,8 @@
 ; CHECK-LABEL: test7:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov x8, #281474976710655
-; CHECK-NEXT: add x8, x0, x8
 ; CHECK-NEXT: cmp x0, #0
+; CHECK-NEXT: add x8, x0, x8
 ; CHECK-NEXT: csel x8, x8, x0, lt
 ; CHECK-NEXT: asr x0, x8, #48
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/seh-finally.ll b/llvm/test/CodeGen/AArch64/seh-finally.ll
--- a/llvm/test/CodeGen/AArch64/seh-finally.ll
+++ b/llvm/test/CodeGen/AArch64/seh-finally.ll
@@ -66,8 +66,8 @@
 ; CHECK-LABEL: fin_simple_seh
 ; CHECK: movz x8, #:abs_g1_s:.Lsimple_seh$frame_escape_0
 ; CHECK: movk x8, #:abs_g0_nc:.Lsimple_seh$frame_escape_0
-; CHECK: ldr w8, [x1, x8]
 ; CHECK: strb w0, [sp, #15]
+; CHECK: ldr w8, [x1, x8]
 ; CHECK: bl foo
 %frame_pointer.addr = alloca i8*, align 8
@@ -120,8 +120,8 @@
 ; CHECK-LABEL: fin_stack_realign
 ; CHECK: movz x8, #:abs_g1_s:.Lstack_realign$frame_escape_0
 ; CHECK: movk x8, #:abs_g0_nc:.Lstack_realign$frame_escape_0
-; CHECK: ldr w8, [x1, x8]
 ; CHECK: strb w0, [sp, #15]
+; CHECK: ldr w8, [x1, x8]
 ; CHECK: bl foo
 %frame_pointer.addr = alloca i8*, align 8
@@ -186,8 +186,8 @@
 ; CHECK-LABEL: fin_vla_present
 ; CHECK: movz x8, #:abs_g1_s:.Lvla_present$frame_escape_0
 ; CHECK: movk x8, #:abs_g0_nc:.Lvla_present$frame_escape_0
-; CHECK: ldr w8, [x1, x8]
 ; CHECK: strb w0, [sp, #15]
+; CHECK: ldr w8, [x1, x8]
 ; CHECK: bl foo
 %frame_pointer.addr = alloca i8*, align 8
@@ -256,8 +256,8 @@
 ; CHECK-LABEL: fin_vla_and_realign
 ; CHECK: movz x8, #:abs_g1_s:.Lvla_and_realign$frame_escape_0
 ; CHECK: movk x8, #:abs_g0_nc:.Lvla_and_realign$frame_escape_0
-; CHECK: ldr w8, [x1, x8]
 ; CHECK: strb w0, [sp, #15]
+; CHECK: ldr w8, [x1, x8]
 ; CHECK: bl foo
 %frame_pointer.addr = alloca i8*, align 8
diff --git a/llvm/test/CodeGen/AArch64/select-with-and-or.ll b/llvm/test/CodeGen/AArch64/select-with-and-or.ll
--- a/llvm/test/CodeGen/AArch64/select-with-and-or.ll
+++ b/llvm/test/CodeGen/AArch64/select-with-and-or.ll
@@ -64,9 +64,9 @@
 define <4 x i1> @and_vec(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
 ; CHECK-LABEL: and_vec:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmgt v1.4s, v2.4s, v3.4s
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: ret
 %a = icmp eq <4 x i32> %x, %y
@@ -78,9 +78,9 @@
 define <4 x i1> @or_vec(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
 ; CHECK-LABEL: or_vec:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmgt v1.4s, v2.4s, v3.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: ret
 %a = icmp eq <4 x i32> %x, %y
@@ -92,9 +92,9 @@
 define <4 x i1> @and_not_vec(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
 ; CHECK-LABEL: and_not_vec:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmgt v1.4s, v2.4s, v3.4s
-; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: bic v0.16b, v2.16b, v0.16b
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: ret
 %a = icmp eq <4 x i32> %x, %y
@@ -106,9 +106,9 @@
 define <4 x i1> @or_not_vec(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
 ; CHECK-LABEL: or_not_vec:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmgt v1.4s, v2.4s, v3.4s
-; CHECK-NEXT: orn v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: orn v0.16b, v2.16b, v0.16b
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: ret
 %a = icmp eq <4 x i32> %x, %y
@@ -120,9 +120,9 @@
 define <4 x i1> @and_vec_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
 ; CHECK-LABEL: and_vec_undef:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmgt v1.4s, v2.4s, v3.4s
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: ret
 %a = icmp eq <4 x i32> %x, %y
@@ -134,9 +134,9 @@
 define <4 x i1> @or_vec_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
 ; CHECK-LABEL: or_vec_undef:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: cmgt v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmgt v1.4s, v2.4s, v3.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: ret
 %a = icmp eq <4 x i32> %x, %y
@@ -148,9 +148,9 @@
 define <4 x i1> @and_not_vec_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) {
 ; CHECK-LABEL: and_not_vec_undef:
; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v2.4s, v2.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s -; CHECK-NEXT: cmgt v1.4s, v2.4s, v3.4s -; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b +; CHECK-NEXT: bic v0.16b, v2.16b, v0.16b ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %a = icmp eq <4 x i32> %x, %y @@ -162,9 +162,9 @@ define <4 x i1> @or_not_vec_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z, <4 x i32> %w) { ; CHECK-LABEL: or_not_vec_undef: ; CHECK: // %bb.0: +; CHECK-NEXT: cmgt v2.4s, v2.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s -; CHECK-NEXT: cmgt v1.4s, v2.4s, v3.4s -; CHECK-NEXT: orn v0.16b, v1.16b, v0.16b +; CHECK-NEXT: orn v0.16b, v2.16b, v0.16b ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %a = icmp eq <4 x i32> %x, %y diff --git a/llvm/test/CodeGen/AArch64/select_const.ll b/llvm/test/CodeGen/AArch64/select_const.ll --- a/llvm/test/CodeGen/AArch64/select_const.ll +++ b/llvm/test/CodeGen/AArch64/select_const.ll @@ -126,8 +126,8 @@ define i32 @select_Cplus1_C(i1 %cond) { ; CHECK-LABEL: select_Cplus1_C: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: cinc w0, w8, ne ; CHECK-NEXT: ret %sel = select i1 %cond, i32 42, i32 41 @@ -137,8 +137,8 @@ define i32 @select_Cplus1_C_zeroext(i1 zeroext %cond) { ; CHECK-LABEL: select_Cplus1_C_zeroext: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, ne ; CHECK-NEXT: ret %sel = select i1 %cond, i32 42, i32 41 @@ -148,8 +148,8 @@ define i32 @select_Cplus1_C_signext(i1 signext %cond) { ; CHECK-LABEL: select_Cplus1_C_signext: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: cinc w0, w8, ne ; CHECK-NEXT: ret %sel = select i1 %cond, i32 42, i32 41 @@ -161,8 +161,8 @@ define i32 @select_C_Cplus1(i1 %cond) { ; CHECK-LABEL: select_C_Cplus1: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: cinc w0, w8, eq ; CHECK-NEXT: ret %sel = select i1 %cond, i32 41, i32 42 @@ -172,8 +172,8 @@ define i32 @select_C_Cplus1_zeroext(i1 zeroext %cond) { ; CHECK-LABEL: select_C_Cplus1_zeroext: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, eq ; CHECK-NEXT: ret %sel = select i1 %cond, i32 41, i32 42 @@ -183,8 +183,8 @@ define i32 @select_C_Cplus1_signext(i1 signext %cond) { ; CHECK-LABEL: select_C_Cplus1_signext: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: cinc w0, w8, eq ; CHECK-NEXT: ret %sel = select i1 %cond, i32 41, i32 42 @@ -197,8 +197,8 @@ define i32 @select_C1_C2(i1 %cond) { ; CHECK-LABEL: select_C1_C2: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #421 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -209,8 +209,8 @@ define i32 @select_C1_C2_zeroext(i1 zeroext %cond) { ; CHECK-LABEL: select_C1_C2_zeroext: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: mov w9, #421 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -221,8 +221,8 @@ define i32 @select_C1_C2_signext(i1 signext %cond) { ; CHECK-LABEL: select_C1_C2_signext: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #421 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ 
-235,8 +235,8 @@ define i8 @sel_constants_add_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_add_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #28 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csinc w0, w8, wzr, eq ; CHECK-NEXT: ret %sel = select i1 %cond, i8 -4, i8 23 @@ -247,8 +247,8 @@ define i8 @sel_constants_sub_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_sub_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #18 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #-9 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -260,8 +260,8 @@ define i8 @sel_constants_sub_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: sel_constants_sub_constant_sel_constants: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #9 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -273,8 +273,8 @@ define i8 @sel_constants_mul_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_mul_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #115 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #-20 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -286,8 +286,8 @@ define i8 @sel_constants_sdiv_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_sdiv_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csel w0, wzr, w8, ne ; CHECK-NEXT: ret %sel = select i1 %cond, i8 -4, i8 23 @@ -298,8 +298,8 @@ define i8 @sdiv_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: sdiv_constant_sel_constants: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #5 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csel w0, wzr, w8, ne ; CHECK-NEXT: ret %sel = select i1 %cond, i8 121, i8 23 @@ -310,8 +310,8 @@ define i8 @sel_constants_udiv_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_udiv_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #50 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -323,8 +323,8 @@ define i8 @udiv_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: udiv_constant_sel_constants: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #5 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csel w0, wzr, w8, ne ; CHECK-NEXT: ret %sel = select i1 %cond, i8 -4, i8 23 @@ -335,8 +335,8 @@ define i8 @sel_constants_srem_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_srem_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #-4 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: cinv w0, w8, eq ; CHECK-NEXT: ret %sel = select i1 %cond, i8 -4, i8 23 @@ -347,8 +347,8 @@ define i8 @srem_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: srem_constant_sel_constants: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #5 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #120 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -360,8 +360,8 @@ define i8 @sel_constants_urem_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_urem_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: cinc w0, w8, eq ; CHECK-NEXT: ret %sel = select i1 %cond, i8 -4, i8 23 @@ -372,8 +372,8 @@ define i8 @urem_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: urem_constant_sel_constants: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #5 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #120 ; 
CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -385,8 +385,8 @@ define i8 @sel_constants_and_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_and_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: cinc w0, w8, eq ; CHECK-NEXT: ret %sel = select i1 %cond, i8 -4, i8 23 @@ -397,8 +397,8 @@ define i8 @sel_constants_or_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_or_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #23 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #-3 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -410,8 +410,8 @@ define i8 @sel_constants_xor_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_xor_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #18 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #-7 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -423,8 +423,8 @@ define i8 @sel_constants_shl_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_shl_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #-32 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #-128 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -436,8 +436,8 @@ define i8 @shl_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: shl_constant_sel_constants: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #4 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -449,8 +449,8 @@ define i8 @sel_constants_lshr_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_lshr_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #7 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csel w0, w8, wzr, ne ; CHECK-NEXT: ret %sel = select i1 %cond, i8 -4, i8 23 @@ -461,8 +461,8 @@ define i8 @lshr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: lshr_constant_sel_constants: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #16 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -485,8 +485,8 @@ define i8 @ashr_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: ashr_constant_sel_constants: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #-16 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w9, #-32 ; CHECK-NEXT: csel w0, w9, w8, ne ; CHECK-NEXT: ret @@ -498,13 +498,13 @@ define double @sel_constants_fadd_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_fadd_constant: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI42_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI42_0] ; CHECK-NEXT: mov x8, #7378697629483820646 +; CHECK-NEXT: adrp x9, .LCPI42_0 ; CHECK-NEXT: movk x8, #16444, lsl #48 ; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: fcsel d0, d0, d1, ne +; CHECK-NEXT: ldr d1, [x9, :lo12:.LCPI42_0] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fcsel d0, d1, d0, ne ; CHECK-NEXT: ret %sel = select i1 %cond, double -4.0, double 23.3 %bo = fadd double %sel, 5.1 @@ -514,12 +514,12 @@ define double @sel_constants_fsub_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_fsub_constant: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #3689348814741910323 ; CHECK-NEXT: adrp x8, .LCPI43_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI43_0] -; CHECK-NEXT: mov x8, #3689348814741910323 -; CHECK-NEXT: movk x8, #49186, lsl #48 +; CHECK-NEXT: movk x9, #49186, lsl #48 ; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI43_0] +; CHECK-NEXT: fmov d1, x9 ; 
CHECK-NEXT: fcsel d0, d1, d0, ne ; CHECK-NEXT: ret %sel = select i1 %cond, double -4.0, double 23.3 @@ -530,12 +530,12 @@ define double @fsub_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: fsub_constant_sel_constants: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #3689348814741910323 ; CHECK-NEXT: adrp x8, .LCPI44_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI44_0] -; CHECK-NEXT: mov x8, #3689348814741910323 -; CHECK-NEXT: movk x8, #16418, lsl #48 +; CHECK-NEXT: movk x9, #16418, lsl #48 ; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI44_0] +; CHECK-NEXT: fmov d1, x9 ; CHECK-NEXT: fcsel d0, d1, d0, ne ; CHECK-NEXT: ret %sel = select i1 %cond, double -4.0, double 23.3 @@ -546,12 +546,12 @@ define double @sel_constants_fmul_constant(i1 %cond) { ; CHECK-LABEL: sel_constants_fmul_constant: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #7378697629483820646 ; CHECK-NEXT: adrp x8, .LCPI45_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI45_0] -; CHECK-NEXT: mov x8, #7378697629483820646 -; CHECK-NEXT: movk x8, #49204, lsl #48 +; CHECK-NEXT: movk x9, #49204, lsl #48 ; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI45_0] +; CHECK-NEXT: fmov d1, x9 ; CHECK-NEXT: fcsel d0, d1, d0, ne ; CHECK-NEXT: ret %sel = select i1 %cond, double -4.0, double 23.3 @@ -564,9 +564,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI46_0 ; CHECK-NEXT: adrp x9, .LCPI46_1 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI46_0] ; CHECK-NEXT: ldr d1, [x9, :lo12:.LCPI46_1] -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: fcsel d0, d1, d0, ne ; CHECK-NEXT: ret %sel = select i1 %cond, double -4.0, double 23.3 @@ -577,12 +577,12 @@ define double @fdiv_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: fdiv_constant_sel_constants: ; CHECK: // %bb.0: +; CHECK-NEXT: mov x9, #7378697629483820646 ; CHECK-NEXT: adrp x8, .LCPI47_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI47_0] -; CHECK-NEXT: mov x8, #7378697629483820646 -; CHECK-NEXT: movk x8, #49140, lsl #48 +; CHECK-NEXT: movk x9, #49140, lsl #48 ; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI47_0] +; CHECK-NEXT: fmov d1, x9 ; CHECK-NEXT: fcsel d0, d1, d0, ne ; CHECK-NEXT: ret %sel = select i1 %cond, double -4.0, double 23.3 @@ -594,9 +594,9 @@ ; CHECK-LABEL: sel_constants_frem_constant: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI48_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI48_0] -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: fmov d1, #-4.00000000 +; CHECK-NEXT: tst w0, #0x1 +; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI48_0] ; CHECK-NEXT: fcsel d0, d1, d0, ne ; CHECK-NEXT: ret %sel = select i1 %cond, double -4.0, double 23.3 @@ -607,13 +607,13 @@ define double @frem_constant_sel_constants(i1 %cond) { ; CHECK-LABEL: frem_constant_sel_constants: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI49_0 -; CHECK-NEXT: ldr d0, [x8, :lo12:.LCPI49_0] ; CHECK-NEXT: mov x8, #7378697629483820646 +; CHECK-NEXT: adrp x9, .LCPI49_0 ; CHECK-NEXT: movk x8, #16404, lsl #48 ; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: fcsel d0, d0, d1, ne +; CHECK-NEXT: ldr d1, [x9, :lo12:.LCPI49_0] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: fcsel d0, d1, d0, ne ; CHECK-NEXT: ret %sel = select i1 %cond, double -4.0, double 23.3 %bo = frem double 5.1, %sel diff --git a/llvm/test/CodeGen/AArch64/select_fmf.ll b/llvm/test/CodeGen/AArch64/select_fmf.ll --- a/llvm/test/CodeGen/AArch64/select_fmf.ll +++ b/llvm/test/CodeGen/AArch64/select_fmf.ll @@ -7,12 +7,12 @@ define float 
@select_select_fold_select_and(float %w, float %x, float %y, float %z) { ; CHECK-LABEL: select_select_fold_select_and: ; CHECK: // %bb.0: +; CHECK-NEXT: fminnm s5, s1, s2 ; CHECK-NEXT: fcmp s1, s2 -; CHECK-NEXT: fminnm s1, s1, s2 -; CHECK-NEXT: fmaxnm s2, s0, s3 +; CHECK-NEXT: fmaxnm s1, s0, s3 ; CHECK-NEXT: fmov s4, #0.50000000 -; CHECK-NEXT: fccmp s1, s0, #4, lt -; CHECK-NEXT: fcsel s2, s2, s0, gt +; CHECK-NEXT: fccmp s5, s0, #4, lt +; CHECK-NEXT: fcsel s2, s1, s0, gt ; CHECK-NEXT: fadd s1, s0, s4 ; CHECK-NEXT: fadd s4, s1, s2 ; CHECK-NEXT: fcmp s4, s1 @@ -23,14 +23,14 @@ ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_2: // %if.end.i159.i.i ; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: movk w8, #48844, lsl #16 ; CHECK-NEXT: mov w9, #13107 +; CHECK-NEXT: movk w8, #48844, lsl #16 ; CHECK-NEXT: movk w9, #48819, lsl #16 +; CHECK-NEXT: fcmp s1, #0.0 ; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: fmov s4, w9 ; CHECK-NEXT: fadd s0, s0, s2 -; CHECK-NEXT: fmov s2, w9 -; CHECK-NEXT: fadd s2, s3, s2 -; CHECK-NEXT: fcmp s1, #0.0 +; CHECK-NEXT: fadd s2, s3, s4 ; CHECK-NEXT: fcsel s0, s0, s2, gt ; CHECK-NEXT: ret %tmp21 = fcmp fast olt float %x, %y @@ -65,12 +65,12 @@ define float @select_select_fold_select_or(float %w, float %x, float %y, float %z) { ; CHECK-LABEL: select_select_fold_select_or: ; CHECK: // %bb.0: +; CHECK-NEXT: fminnm s5, s1, s2 ; CHECK-NEXT: fcmp s1, s2 -; CHECK-NEXT: fminnm s1, s1, s2 -; CHECK-NEXT: fmaxnm s2, s0, s3 +; CHECK-NEXT: fmaxnm s1, s0, s3 ; CHECK-NEXT: fmov s4, #0.50000000 -; CHECK-NEXT: fccmp s1, s0, #0, ge -; CHECK-NEXT: fcsel s2, s0, s2, gt +; CHECK-NEXT: fccmp s5, s0, #0, ge +; CHECK-NEXT: fcsel s2, s0, s1, gt ; CHECK-NEXT: fadd s1, s0, s4 ; CHECK-NEXT: fadd s4, s1, s2 ; CHECK-NEXT: fcmp s4, s1 @@ -81,14 +81,14 @@ ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_2: // %if.end.i159.i.i ; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: movk w8, #48844, lsl #16 ; CHECK-NEXT: mov w9, #13107 +; CHECK-NEXT: movk w8, #48844, lsl #16 ; CHECK-NEXT: movk w9, #48819, lsl #16 +; CHECK-NEXT: fcmp s1, #0.0 ; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: fmov s4, w9 ; CHECK-NEXT: fadd s0, s0, s2 -; CHECK-NEXT: fmov s2, w9 -; CHECK-NEXT: fadd s2, s3, s2 -; CHECK-NEXT: fcmp s1, #0.0 +; CHECK-NEXT: fadd s2, s3, s4 ; CHECK-NEXT: fcsel s0, s0, s2, gt ; CHECK-NEXT: ret %tmp21 = fcmp fast olt float %x, %y diff --git a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll --- a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll +++ b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll @@ -121,8 +121,8 @@ define i8 @sel_shift_bool_i8(i1 %t) { ; CHECK-LABEL: sel_shift_bool_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #-128 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csel w0, w8, wzr, ne ; CHECK-NEXT: ret %shl = select i1 %t, i8 128, i8 0 @@ -132,8 +132,8 @@ define i16 @sel_shift_bool_i16(i1 %t) { ; CHECK-LABEL: sel_shift_bool_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #128 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csel w0, w8, wzr, ne ; CHECK-NEXT: ret %shl = select i1 %t, i16 128, i16 0 @@ -143,8 +143,8 @@ define i32 @sel_shift_bool_i32(i1 %t) { ; CHECK-LABEL: sel_shift_bool_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, #64 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csel w0, w8, wzr, ne ; CHECK-NEXT: ret %shl = select i1 %t, i32 64, i32 0 @@ -154,8 +154,8 @@ define i64 @sel_shift_bool_i64(i1 %t) { ; CHECK-LABEL: sel_shift_bool_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: mov w8, 
#65536 +; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csel x0, x8, xzr, ne ; CHECK-NEXT: ret %shl = select i1 %t, i64 65536, i64 0 @@ -166,8 +166,8 @@ ; CHECK-LABEL: sel_shift_bool_v16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: shl v0.16b, v0.16b, #7 -; CHECK-NEXT: sshr v0.16b, v0.16b, #7 ; CHECK-NEXT: movi v1.16b, #128 +; CHECK-NEXT: sshr v0.16b, v0.16b, #7 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %shl = select <16 x i1> %t, <16 x i8> , <16 x i8> zeroinitializer @@ -178,9 +178,9 @@ ; CHECK-LABEL: sel_shift_bool_v8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: movi v1.8h, #128 ; CHECK-NEXT: shl v0.8h, v0.8h, #15 ; CHECK-NEXT: sshr v0.8h, v0.8h, #15 -; CHECK-NEXT: movi v1.8h, #128 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %shl= select <8 x i1> %t, <8 x i16> , <8 x i16> zeroinitializer @@ -191,9 +191,9 @@ ; CHECK-LABEL: sel_shift_bool_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v1.4s, #64 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-NEXT: sshr v0.4s, v0.4s, #31 -; CHECK-NEXT: movi v1.4s, #64 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %shl = select <4 x i1> %t, <4 x i32> , <4 x i32> zeroinitializer @@ -205,9 +205,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: mov w8, #65536 +; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: shl v0.2d, v0.2d, #63 ; CHECK-NEXT: sshr v0.2d, v0.2d, #63 -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %shl = select <2 x i1> %t, <2 x i64> , <2 x i64> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/settag-merge-order.ll b/llvm/test/CodeGen/AArch64/settag-merge-order.ll --- a/llvm/test/CodeGen/AArch64/settag-merge-order.ll +++ b/llvm/test/CodeGen/AArch64/settag-merge-order.ll @@ -40,8 +40,8 @@ if.then: ; CHECK: mov x8, #320 -; CHECK: st2g x9, [x9], #32 ; CHECK: sub x8, x8, #32 +; CHECK: st2g x9, [x9], #32 ; CHECK: cbnz x8, call void @llvm.aarch64.settag(i8* %a, i64 160) call void @llvm.aarch64.settag(i8* %a2, i64 160) @@ -49,8 +49,8 @@ if.else: ; CHECK: mov x8, #256 -; CHECK: st2g x9, [x9], #32 ; CHECK: sub x8, x8, #32 +; CHECK: st2g x9, [x9], #32 ; CHECK: cbnz x8, call void @llvm.aarch64.settag(i8* %c, i64 128) call void @llvm.aarch64.settag(i8* %c2, i64 128) diff --git a/llvm/test/CodeGen/AArch64/settag-merge.ll b/llvm/test/CodeGen/AArch64/settag-merge.ll --- a/llvm/test/CodeGen/AArch64/settag-merge.ll +++ b/llvm/test/CodeGen/AArch64/settag-merge.ll @@ -19,9 +19,9 @@ define i32 @stg16_16_16_16_ret() { entry: ; CHECK-LABEL: stg16_16_16_16_ret: +; CHECK: mov w0, wzr ; CHECK: st2g sp, [sp, #32] ; CHECK: st2g sp, [sp], #64 -; CHECK: mov w0, wzr ; CHECK: ret %a = alloca i8, i32 16, align 16 %b = alloca i8, i32 16, align 16 @@ -136,8 +136,8 @@ ; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] ; CHECK: add x9, sp, # ; CHECK: mov x8, #256 -; CHECK: st2g x9, [x9], #32 ; CHECK: sub x8, x8, #32 +; CHECK: st2g x9, [x9], #32 ; CHECK: cbnz x8, ; CHECK: [[LABEL]]: ; CHECK: stg sp, [sp, # @@ -164,8 +164,8 @@ ; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] ; CHECK: add x9, sp, # ; CHECK: mov x8, #1024 -; CHECK: st2g x9, [x9], #32 ; CHECK: sub x8, x8, #32 +; CHECK: st2g x9, [x9], #32 ; CHECK: cbnz x8, ; CHECK: [[LABEL]]: ; CHECK: stg sp, [sp, # @@ -192,8 +192,8 @@ ; CHECK-LABEL: stg128_128_gap_128_128: ; CHECK: mov x9, sp ; CHECK: mov x8, #256 -; CHECK: st2g x9, [x9], #32 ; CHECK: sub x8, x8, #32 +; CHECK: st2g x9, [x9], #32 ; CHECK: cbnz x8, ; CHECK: mov x8, #256 ; CHECK: st2g sp, [sp], #32 diff --git 
a/llvm/test/CodeGen/AArch64/settag.ll b/llvm/test/CodeGen/AArch64/settag.ll --- a/llvm/test/CodeGen/AArch64/settag.ll +++ b/llvm/test/CodeGen/AArch64/settag.ll @@ -61,8 +61,8 @@ ; CHECK-NEXT: mov x8, #256 ; CHECK-NEXT: .LBB5_1: // %entry ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: st2g x0, [x0], #32 ; CHECK-NEXT: sub x8, x8, #32 +; CHECK-NEXT: st2g x0, [x0], #32 ; CHECK-NEXT: cbnz x8, .LBB5_1 ; CHECK-NEXT: // %bb.2: // %entry ; CHECK-NEXT: ret @@ -74,12 +74,12 @@ define void @stg17(i8* %p) { ; CHECK-LABEL: stg17: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stg x0, [x0], #16 ; CHECK-NEXT: mov x8, #256 +; CHECK-NEXT: stg x0, [x0], #16 ; CHECK-NEXT: .LBB6_1: // %entry ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: st2g x0, [x0], #32 ; CHECK-NEXT: sub x8, x8, #32 +; CHECK-NEXT: st2g x0, [x0], #32 ; CHECK-NEXT: cbnz x8, .LBB6_1 ; CHECK-NEXT: // %bb.2: // %entry ; CHECK-NEXT: ret @@ -102,12 +102,12 @@ define void @stzg17(i8* %p) { ; CHECK-LABEL: stzg17: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stzg x0, [x0], #16 ; CHECK-NEXT: mov x8, #256 +; CHECK-NEXT: stzg x0, [x0], #16 ; CHECK-NEXT: .LBB8_1: // %entry ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: stz2g x0, [x0], #32 ; CHECK-NEXT: sub x8, x8, #32 +; CHECK-NEXT: stz2g x0, [x0], #32 ; CHECK-NEXT: cbnz x8, .LBB8_1 ; CHECK-NEXT: // %bb.2: // %entry ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll --- a/llvm/test/CodeGen/AArch64/shift-amount-mod.ll +++ b/llvm/test/CodeGen/AArch64/shift-amount-mod.ll @@ -21,9 +21,9 @@ define i32 @load32_shl_by_negated(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: load32_shl_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: neg w9, w1 -; CHECK-NEXT: lsl w0, w8, w9 +; CHECK-NEXT: neg w8, w1 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: lsl w0, w9, w8 ; CHECK-NEXT: ret %val = load i32, i32* %valptr %negshamt = sub i32 32, %shamt @@ -45,9 +45,9 @@ define void @modify32_shl_by_negated(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: modify32_shl_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: neg w9, w1 -; CHECK-NEXT: lsl w8, w8, w9 +; CHECK-NEXT: neg w8, w1 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: lsl w8, w9, w8 ; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret %val = load i32, i32* %valptr @@ -70,9 +70,9 @@ define i64 @load64_shl_by_negated(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: load64_shl_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: neg x9, x1 -; CHECK-NEXT: lsl x0, x8, x9 +; CHECK-NEXT: neg x8, x1 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: lsl x0, x9, x8 ; CHECK-NEXT: ret %val = load i64, i64* %valptr %negshamt = sub i64 64, %shamt @@ -94,9 +94,9 @@ define void @modify64_shl_by_negated(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: modify64_shl_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: neg x9, x1 -; CHECK-NEXT: lsl x8, x8, x9 +; CHECK-NEXT: neg x8, x1 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: lsl x8, x9, x8 ; CHECK-NEXT: str x8, [x0] ; CHECK-NEXT: ret %val = load i64, i64* %valptr @@ -122,9 +122,9 @@ define i32 @load32_lshr_by_negated(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: load32_lshr_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: neg w9, w1 -; CHECK-NEXT: lsr w0, w8, w9 +; CHECK-NEXT: neg w8, w1 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: lsr w0, w9, w8 ; CHECK-NEXT: ret %val = load i32, i32* %valptr %negshamt 
= sub i32 32, %shamt @@ -146,9 +146,9 @@ define void @modify32_lshr_by_negated(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: modify32_lshr_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: neg w9, w1 -; CHECK-NEXT: lsr w8, w8, w9 +; CHECK-NEXT: neg w8, w1 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: lsr w8, w9, w8 ; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret %val = load i32, i32* %valptr @@ -171,9 +171,9 @@ define i64 @load64_lshr_by_negated(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: load64_lshr_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: neg x9, x1 -; CHECK-NEXT: lsr x0, x8, x9 +; CHECK-NEXT: neg x8, x1 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: lsr x0, x9, x8 ; CHECK-NEXT: ret %val = load i64, i64* %valptr %negshamt = sub i64 64, %shamt @@ -195,9 +195,9 @@ define void @modify64_lshr_by_negated(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: modify64_lshr_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: neg x9, x1 -; CHECK-NEXT: lsr x8, x8, x9 +; CHECK-NEXT: neg x8, x1 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: lsr x8, x9, x8 ; CHECK-NEXT: str x8, [x0] ; CHECK-NEXT: ret %val = load i64, i64* %valptr @@ -223,9 +223,9 @@ define i32 @load32_ashr_by_negated(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: load32_ashr_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: neg w9, w1 -; CHECK-NEXT: asr w0, w8, w9 +; CHECK-NEXT: neg w8, w1 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: asr w0, w9, w8 ; CHECK-NEXT: ret %val = load i32, i32* %valptr %negshamt = sub i32 32, %shamt @@ -247,9 +247,9 @@ define void @modify32_ashr_by_negated(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: modify32_ashr_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: neg w9, w1 -; CHECK-NEXT: asr w8, w8, w9 +; CHECK-NEXT: neg w8, w1 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: asr w8, w9, w8 ; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret %val = load i32, i32* %valptr @@ -272,9 +272,9 @@ define i64 @load64_ashr_by_negated(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: load64_ashr_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: neg x9, x1 -; CHECK-NEXT: asr x0, x8, x9 +; CHECK-NEXT: neg x8, x1 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: asr x0, x9, x8 ; CHECK-NEXT: ret %val = load i64, i64* %valptr %negshamt = sub i64 64, %shamt @@ -296,9 +296,9 @@ define void @modify64_ashr_by_negated(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: modify64_ashr_by_negated: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: neg x9, x1 -; CHECK-NEXT: asr x8, x8, x9 +; CHECK-NEXT: neg x8, x1 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: asr x8, x9, x8 ; CHECK-NEXT: str x8, [x0] ; CHECK-NEXT: ret %val = load i64, i64* %valptr @@ -329,10 +329,10 @@ define i32 @load32_shl_by_complemented(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: load32_shl_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: mov w9, #31 -; CHECK-NEXT: sub w9, w9, w1 -; CHECK-NEXT: lsl w0, w8, w9 +; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: sub w8, w8, w1 +; CHECK-NEXT: lsl w0, w9, w8 ; CHECK-NEXT: ret %val = load i32, i32* %valptr %negshamt = sub i32 31, %shamt @@ -355,10 +355,10 @@ define void @modify32_shl_by_complemented(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: modify32_shl_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: mov w9, #31 -; CHECK-NEXT: sub w9, w9, w1 -; CHECK-NEXT: lsl w8, w8, w9 
+; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: sub w8, w8, w1 +; CHECK-NEXT: lsl w8, w9, w8 ; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret %val = load i32, i32* %valptr @@ -382,10 +382,10 @@ define i64 @load64_shl_by_complemented(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: load64_shl_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov w9, #63 -; CHECK-NEXT: sub x9, x9, x1 -; CHECK-NEXT: lsl x0, x8, x9 +; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: sub x8, x8, x1 +; CHECK-NEXT: lsl x0, x9, x8 ; CHECK-NEXT: ret %val = load i64, i64* %valptr %negshamt = sub i64 63, %shamt @@ -408,10 +408,10 @@ define void @modify64_shl_by_complemented(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: modify64_shl_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov w9, #63 -; CHECK-NEXT: sub x9, x9, x1 -; CHECK-NEXT: lsl x8, x8, x9 +; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: sub x8, x8, x1 +; CHECK-NEXT: lsl x8, x9, x8 ; CHECK-NEXT: str x8, [x0] ; CHECK-NEXT: ret %val = load i64, i64* %valptr @@ -438,10 +438,10 @@ define i32 @load32_lshr_by_complemented(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: load32_lshr_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: mov w9, #31 -; CHECK-NEXT: sub w9, w9, w1 -; CHECK-NEXT: lsr w0, w8, w9 +; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: sub w8, w8, w1 +; CHECK-NEXT: lsr w0, w9, w8 ; CHECK-NEXT: ret %val = load i32, i32* %valptr %negshamt = sub i32 31, %shamt @@ -464,10 +464,10 @@ define void @modify32_lshr_by_complemented(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: modify32_lshr_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: mov w9, #31 -; CHECK-NEXT: sub w9, w9, w1 -; CHECK-NEXT: lsr w8, w8, w9 +; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: sub w8, w8, w1 +; CHECK-NEXT: lsr w8, w9, w8 ; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret %val = load i32, i32* %valptr @@ -491,10 +491,10 @@ define i64 @load64_lshr_by_complemented(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: load64_lshr_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov w9, #63 -; CHECK-NEXT: sub x9, x9, x1 -; CHECK-NEXT: lsr x0, x8, x9 +; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: sub x8, x8, x1 +; CHECK-NEXT: lsr x0, x9, x8 ; CHECK-NEXT: ret %val = load i64, i64* %valptr %negshamt = sub i64 63, %shamt @@ -517,10 +517,10 @@ define void @modify64_lshr_by_complemented(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: modify64_lshr_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov w9, #63 -; CHECK-NEXT: sub x9, x9, x1 -; CHECK-NEXT: lsr x8, x8, x9 +; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: sub x8, x8, x1 +; CHECK-NEXT: lsr x8, x9, x8 ; CHECK-NEXT: str x8, [x0] ; CHECK-NEXT: ret %val = load i64, i64* %valptr @@ -547,10 +547,10 @@ define i32 @load32_ashr_by_complemented(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: load32_ashr_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: mov w9, #31 -; CHECK-NEXT: sub w9, w9, w1 -; CHECK-NEXT: asr w0, w8, w9 +; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: sub w8, w8, w1 +; CHECK-NEXT: asr w0, w9, w8 ; CHECK-NEXT: ret %val = load i32, i32* %valptr %negshamt = sub i32 31, %shamt @@ -573,10 +573,10 @@ define void 
@modify32_ashr_by_complemented(i32* %valptr, i32 %shamt) nounwind { ; CHECK-LABEL: modify32_ashr_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: mov w9, #31 -; CHECK-NEXT: sub w9, w9, w1 -; CHECK-NEXT: asr w8, w8, w9 +; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: sub w8, w8, w1 +; CHECK-NEXT: asr w8, w9, w8 ; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret %val = load i32, i32* %valptr @@ -600,10 +600,10 @@ define i64 @load64_ashr_by_complemented(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: load64_ashr_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov w9, #63 -; CHECK-NEXT: sub x9, x9, x1 -; CHECK-NEXT: asr x0, x8, x9 +; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: sub x8, x8, x1 +; CHECK-NEXT: asr x0, x9, x8 ; CHECK-NEXT: ret %val = load i64, i64* %valptr %negshamt = sub i64 63, %shamt @@ -626,10 +626,10 @@ define void @modify64_ashr_by_complemented(i64* %valptr, i64 %shamt) nounwind { ; CHECK-LABEL: modify64_ashr_by_complemented: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: mov w9, #63 -; CHECK-NEXT: sub x9, x9, x1 -; CHECK-NEXT: asr x8, x8, x9 +; CHECK-NEXT: mov w8, #63 +; CHECK-NEXT: ldr x9, [x0] +; CHECK-NEXT: sub x8, x8, x1 +; CHECK-NEXT: asr x8, x9, x8 ; CHECK-NEXT: str x8, [x0] ; CHECK-NEXT: ret %val = load i64, i64* %valptr diff --git a/llvm/test/CodeGen/AArch64/shift-by-signext.ll b/llvm/test/CodeGen/AArch64/shift-by-signext.ll --- a/llvm/test/CodeGen/AArch64/shift-by-signext.ll +++ b/llvm/test/CodeGen/AArch64/shift-by-signext.ll @@ -81,11 +81,11 @@ ; CHECK-LABEL: n6_fshl: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w9, w2 -; CHECK-NEXT: lsr w10, w1, #1 -; CHECK-NEXT: lsl w8, w0, w2 -; CHECK-NEXT: lsr w9, w10, w9 -; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: mvn w8, w2 +; CHECK-NEXT: lsr w9, w1, #1 +; CHECK-NEXT: lsl w10, w0, w2 +; CHECK-NEXT: lsr w8, w9, w8 +; CHECK-NEXT: orr w0, w10, w8 ; CHECK-NEXT: ret %shamt_wide = sext i8 %shamt to i32 %r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %shamt_wide) @@ -95,11 +95,11 @@ ; CHECK-LABEL: n7_fshr: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: mvn w9, w2 -; CHECK-NEXT: lsl w10, w0, #1 -; CHECK-NEXT: lsr w8, w1, w2 -; CHECK-NEXT: lsl w9, w10, w9 -; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: mvn w8, w2 +; CHECK-NEXT: lsl w9, w0, #1 +; CHECK-NEXT: lsr w10, w1, w2 +; CHECK-NEXT: lsl w8, w9, w8 +; CHECK-NEXT: orr w0, w8, w10 ; CHECK-NEXT: ret %shamt_wide = sext i8 %shamt to i32 %r = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt_wide) diff --git a/llvm/test/CodeGen/AArch64/shift-mod.ll b/llvm/test/CodeGen/AArch64/shift-mod.ll --- a/llvm/test/CodeGen/AArch64/shift-mod.ll +++ b/llvm/test/CodeGen/AArch64/shift-mod.ll @@ -102,8 +102,8 @@ define <4 x i32> @ashr_add_shl_v4i8(<4 x i32> %r) { ; CHECK-LABEL: ashr_add_shl_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v0.4s, v0.4s, #24 ; CHECK-NEXT: movi v1.4s, #1, lsl #24 +; CHECK-NEXT: shl v0.4s, v0.4s, #24 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: sshr v0.4s, v0.4s, #24 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll b/llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll --- a/llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll +++ b/llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll @@ -81,11 +81,11 @@ ; CHECK-NEXT: mov x29, sp ; VLA allocation -; CHECK: add [[X1:x[0-9]+]], [[X1]], #15 ; CHECK: mov [[X2:x[0-9]+]], sp +; CHECK: mov [[SAVE:x[0-9]+]], sp +; 
CHECK: add [[X1:x[0-9]+]], [[X1]], #15 ; CHECK: and [[X1]], [[X1]], #0x7fffffff0 ; Saving the SP via llvm.stacksave() -; CHECK: mov [[SAVE:x[0-9]+]], sp ; CHECK: sub [[X2]], [[X2]], [[X1]] ; The next instruction comes from llvm.stackrestore() diff --git a/llvm/test/CodeGen/AArch64/sibling-call.ll b/llvm/test/CodeGen/AArch64/sibling-call.ll --- a/llvm/test/CodeGen/AArch64/sibling-call.ll +++ b/llvm/test/CodeGen/AArch64/sibling-call.ll @@ -106,8 +106,8 @@ ; CHECK-LABEL: indirect_tail: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, func -; CHECK-NEXT: ldr x1, [x8, :lo12:func] ; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: ldr x1, [x8, :lo12:func] ; CHECK-NEXT: br x1 %fptr = load void(i32)*, void(i32)** @func diff --git a/llvm/test/CodeGen/AArch64/signbit-shift.ll b/llvm/test/CodeGen/AArch64/signbit-shift.ll --- a/llvm/test/CodeGen/AArch64/signbit-shift.ll +++ b/llvm/test/CodeGen/AArch64/signbit-shift.ll @@ -43,8 +43,8 @@ define i32 @sel_ifpos_tval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifpos_tval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, ge ; CHECK-NEXT: ret %c = icmp sgt i32 %x, -1 @@ -92,8 +92,8 @@ define i32 @sel_ifpos_fval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifpos_fval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, lt ; CHECK-NEXT: ret %c = icmp sgt i32 %x, -1 @@ -128,8 +128,8 @@ define i32 @sel_ifneg_tval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifneg_tval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, lt ; CHECK-NEXT: ret %c = icmp slt i32 %x, 0 @@ -162,8 +162,8 @@ define i32 @sel_ifneg_fval_bigger(i32 %x) { ; CHECK-LABEL: sel_ifneg_fval_bigger: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: cinc w0, w8, ge ; CHECK-NEXT: ret %c = icmp slt i32 %x, 0 diff --git a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll --- a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll +++ b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll @@ -159,9 +159,9 @@ ; CHECK-LABEL: vec_sink_add_of_const_to_add0: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI12_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = add <4 x i32> %a, ; constant always on RHS %r = add <4 x i32> %t0, %b @@ -171,9 +171,9 @@ ; CHECK-LABEL: vec_sink_add_of_const_to_add1: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI13_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = add <4 x i32> %a, ; constant always on RHS %r = add <4 x i32> %b, %t0 @@ -187,9 +187,9 @@ ; CHECK-LABEL: vec_sink_sub_of_const_to_add0: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %a, %r = add <4 x i32> %t0, %b @@ -199,9 +199,9 @@ ; CHECK-LABEL: vec_sink_sub_of_const_to_add1: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, 
.LCPI15_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %a, %r = add <4 x i32> %b, %t0 @@ -215,9 +215,9 @@ ; CHECK-LABEL: vec_sink_sub_from_const_to_add0: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> , %a %r = add <4 x i32> %t0, %b @@ -227,9 +227,9 @@ ; CHECK-LABEL: vec_sink_sub_from_const_to_add1: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI17_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_0] ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> , %a %r = add <4 x i32> %b, %t0 @@ -243,9 +243,9 @@ ; CHECK-LABEL: vec_sink_add_of_const_to_sub: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI18_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = add <4 x i32> %a, ; constant always on RHS %r = sub <4 x i32> %t0, %b @@ -255,9 +255,9 @@ ; CHECK-LABEL: vec_sink_add_of_const_to_sub2: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_0] ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = add <4 x i32> %a, ; constant always on RHS %r = sub <4 x i32> %b, %t0 @@ -271,9 +271,9 @@ ; CHECK-LABEL: vec_sink_sub_of_const_to_sub: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI20_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %a, %r = sub <4 x i32> %t0, %b @@ -283,9 +283,9 @@ ; CHECK-LABEL: vec_sink_sub_of_const_to_sub2: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI21_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_0] ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> %a, %r = sub <4 x i32> %b, %t0 @@ -299,9 +299,9 @@ ; CHECK-LABEL: vec_sink_sub_from_const_to_sub: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI22_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> , %a %r = sub <4 x i32> %t0, %b @@ -311,9 +311,9 @@ ; CHECK-LABEL: vec_sink_sub_from_const_to_sub2: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI23_0 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %t0 = sub <4 x i32> , %a %r = sub <4 x i32> %b, %t0 diff --git a/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll 
b/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll --- a/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll +++ b/llvm/test/CodeGen/AArch64/sitofp-fixed-legal.ll @@ -4,20 +4,20 @@ define <16 x double> @test_sitofp_fixed(<16 x i32> %in) { ; CHECK-LABEL: test_sitofp_fixed: ; CHECK: ; %bb.0: -; CHECK-NEXT: sshll2.2d v4, v0, #0 -; CHECK-NEXT: sshll2.2d v5, v1, #0 -; CHECK-NEXT: sshll2.2d v6, v2, #0 +; CHECK-NEXT: sshll2.2d v4, v2, #0 +; CHECK-NEXT: sshll.2d v16, v1, #0 +; CHECK-NEXT: sshll2.2d v5, v0, #0 +; CHECK-NEXT: sshll2.2d v6, v1, #0 ; CHECK-NEXT: sshll2.2d v7, v3, #0 ; CHECK-NEXT: sshll.2d v0, v0, #0 -; CHECK-NEXT: sshll.2d v16, v1, #0 ; CHECK-NEXT: sshll.2d v17, v2, #0 ; CHECK-NEXT: sshll.2d v18, v3, #0 -; CHECK-NEXT: scvtf.2d v1, v4, #6 -; CHECK-NEXT: scvtf.2d v3, v5, #6 -; CHECK-NEXT: scvtf.2d v5, v6, #6 -; CHECK-NEXT: scvtf.2d v7, v7, #6 -; CHECK-NEXT: scvtf.2d v0, v0, #6 +; CHECK-NEXT: scvtf.2d v1, v5, #6 +; CHECK-NEXT: scvtf.2d v3, v6, #6 ; CHECK-NEXT: scvtf.2d v2, v16, #6 +; CHECK-NEXT: scvtf.2d v5, v4, #6 +; CHECK-NEXT: scvtf.2d v0, v0, #6 +; CHECK-NEXT: scvtf.2d v7, v7, #6 ; CHECK-NEXT: scvtf.2d v4, v17, #6 ; CHECK-NEXT: scvtf.2d v6, v18, #6 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll --- a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll +++ b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll @@ -12,8 +12,8 @@ ; CHECK-NEXT: and x1, x1, x16 ; CHECK-NEXT: csdb ; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp -; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 ; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 ; CHECK-NEXT: mov sp, [[TMPREG]] ; CHECK-NEXT: ret } @@ -128,8 +128,8 @@ ; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: and x1, x1, x16 ; CHECK-NEXT: csdb -; CHECK-NEXT: ld1 { v0.d }[0], [x1] ; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp +; CHECK-NEXT: ld1 { v0.d }[0], [x1] ; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16 ; CHECK-NEXT: mov sp, [[TMPREG]] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening.mir b/llvm/test/CodeGen/AArch64/speculation-hardening.mir --- a/llvm/test/CodeGen/AArch64/speculation-hardening.mir +++ b/llvm/test/CodeGen/AArch64/speculation-hardening.mir @@ -167,7 +167,7 @@ bb.0: ; CHECK-LABEL: indirect_call_lr ; CHECK: mov x1, sp - ; CHECK-NEXT: and x1, x1, x16 + ; CHECK: and x1, x1, x16 ; CHECK-NEXT: mov sp, x1 ; CHECK-NEXT: blr x30 liveins: $x0, $lr diff --git a/llvm/test/CodeGen/AArch64/split-vector-insert.ll b/llvm/test/CodeGen/AArch64/split-vector-insert.ll --- a/llvm/test/CodeGen/AArch64/split-vector-insert.ll +++ b/llvm/test/CodeGen/AArch64/split-vector-insert.ll @@ -20,40 +20,41 @@ ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: sub x9, x9, #2 +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: cmp x8, #2 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: mov w8, #2 -; CHECK-NEXT: cmp x9, #2 -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: csel x9, x8, x9, lo ; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: cmp x8, #4 ; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: str q2, [x10, x8] +; CHECK-NEXT: str 
q2, [x10, x9] +; CHECK-NEXT: mov w9, #4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: mov w8, #4 -; CHECK-NEXT: cmp x9, #4 -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: lsl x9, x9, #3 ; CHECK-NEXT: addvl x10, sp, #2 -; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: cmp x8, #6 ; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: str q3, [x10, x8] +; CHECK-NEXT: str q3, [x10, x9] +; CHECK-NEXT: mov w9, #6 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: mov w8, #6 -; CHECK-NEXT: cmp x9, #6 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: addvl x10, sp, #3 +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: addvl x9, sp, #3 ; CHECK-NEXT: lsl x8, x8, #3 ; CHECK-NEXT: st1d { z0.d }, p0, [sp, #3, mul vl] -; CHECK-NEXT: str q4, [x10, x8] +; CHECK-NEXT: str q4, [x9, x8] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #3, mul vl] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret + %r = call @llvm.experimental.vector.insert.nxv2i64.v8i64( %a, <8 x i64> %b, i64 0) ret %r } @@ -70,40 +71,41 @@ ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: str q1, [sp] -; CHECK-NEXT: sub x9, x9, #2 +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: cmp x8, #2 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: mov w8, #2 -; CHECK-NEXT: cmp x9, #2 -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: csel x9, x8, x9, lo ; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: cmp x8, #4 ; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: str q2, [x10, x8] +; CHECK-NEXT: str q2, [x10, x9] +; CHECK-NEXT: mov w9, #4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: mov w8, #4 -; CHECK-NEXT: cmp x9, #4 -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: lsl x9, x9, #3 ; CHECK-NEXT: addvl x10, sp, #2 -; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: cmp x8, #6 ; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: str q3, [x10, x8] +; CHECK-NEXT: str q3, [x10, x9] +; CHECK-NEXT: mov w9, #6 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: mov w8, #6 -; CHECK-NEXT: cmp x9, #6 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: addvl x10, sp, #3 +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: addvl x9, sp, #3 ; CHECK-NEXT: lsl x8, x8, #3 ; CHECK-NEXT: st1d { z0.d }, p0, [sp, #3, mul vl] -; CHECK-NEXT: str q4, [x10, x8] +; CHECK-NEXT: str q4, [x9, x8] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #3, mul vl] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret + %r = call @llvm.experimental.vector.insert.nxv2f64.v8f64( %a, <8 x double> %b, i64 0) ret %r } diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll --- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -19,14 +19,14 @@ ; CHECK-LABEL: fsqrt: ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte s1, s0 +; CHECK-NEXT: fcmp s0, #0.0 ; CHECK-NEXT: fmul s2, s1, s1 ; CHECK-NEXT: frsqrts s2, s0, s2 ; CHECK-NEXT: fmul s1, s1, s2 ; CHECK-NEXT: fmul s2, s1, s1 +; CHECK-NEXT: fmul s1, s1, s0 ; CHECK-NEXT: 
frsqrts s2, s0, s2
-; CHECK-NEXT: fmul s2, s2, s0
-; CHECK-NEXT: fmul s1, s1, s2
-; CHECK-NEXT: fcmp s0, #0.0
+; CHECK-NEXT: fmul s1, s2, s1
; CHECK-NEXT: fcsel s0, s0, s1, eq
; CHECK-NEXT: ret
%1 = tail call fast float @llvm.sqrt.f32(float %a)
@@ -42,14 +42,14 @@
; CHECK-LABEL: fsqrt_ieee_denorms:
; CHECK: // %bb.0:
; CHECK-NEXT: frsqrte s1, s0
+; CHECK-NEXT: fcmp s0, #0.0
; CHECK-NEXT: fmul s2, s1, s1
; CHECK-NEXT: frsqrts s2, s0, s2
; CHECK-NEXT: fmul s1, s1, s2
; CHECK-NEXT: fmul s2, s1, s1
+; CHECK-NEXT: fmul s1, s1, s0
; CHECK-NEXT: frsqrts s2, s0, s2
-; CHECK-NEXT: fmul s2, s2, s0
-; CHECK-NEXT: fmul s1, s1, s2
-; CHECK-NEXT: fcmp s0, #0.0
+; CHECK-NEXT: fmul s1, s2, s1
; CHECK-NEXT: fcsel s0, s0, s1, eq
; CHECK-NEXT: ret
%1 = tail call fast float @llvm.sqrt.f32(float %a)
@@ -69,9 +69,9 @@
; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s
; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s
; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s
+; CHECK-NEXT: fmul v1.2s, v1.2s, v0.2s
; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s
-; CHECK-NEXT: fmul v2.2s, v2.2s, v0.2s
-; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s
+; CHECK-NEXT: fmul v1.2s, v2.2s, v1.2s
; CHECK-NEXT: fcmeq v2.2s, v0.2s, #0.0
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
@@ -92,9 +92,9 @@
; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s
; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s
; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s
+; CHECK-NEXT: fmul v1.4s, v1.4s, v0.4s
; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s
-; CHECK-NEXT: fmul v2.4s, v2.4s, v0.4s
-; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: fmul v1.4s, v2.4s, v1.4s
; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
@@ -112,25 +112,25 @@
; CHECK-LABEL: f8sqrt:
; CHECK: // %bb.0:
; CHECK-NEXT: frsqrte v2.4s, v0.4s
-; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s
-; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s
-; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s
-; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s
-; CHECK-NEXT: fmul v3.4s, v3.4s, v0.4s
-; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: fcmeq v3.4s, v0.4s, #0.0
-; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b
-; CHECK-NEXT: frsqrte v2.4s, v1.4s
-; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s
-; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s
-; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s
+; CHECK-NEXT: frsqrte v3.4s, v1.4s
+; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s
+; CHECK-NEXT: frsqrts v4.4s, v0.4s, v4.4s
+; CHECK-NEXT: fmul v5.4s, v3.4s, v3.4s
+; CHECK-NEXT: frsqrts v5.4s, v1.4s, v5.4s
+; CHECK-NEXT: fmul v2.4s, v2.4s, v4.4s
+; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s
+; CHECK-NEXT: fmul v2.4s, v2.4s, v0.4s
+; CHECK-NEXT: frsqrts v4.4s, v0.4s, v4.4s
+; CHECK-NEXT: fmul v3.4s, v3.4s, v5.4s
+; CHECK-NEXT: fmul v5.4s, v3.4s, v3.4s
; CHECK-NEXT: fmul v3.4s, v3.4s, v1.4s
+; CHECK-NEXT: frsqrts v5.4s, v1.4s, v5.4s
+; CHECK-NEXT: fmul v2.4s, v4.4s, v2.4s
+; CHECK-NEXT: fcmeq v4.4s, v0.4s, #0.0
+; CHECK-NEXT: bif v0.16b, v2.16b, v4.16b
+; CHECK-NEXT: fmul v3.4s, v5.4s, v3.4s
+; CHECK-NEXT: fcmeq v5.4s, v1.4s, #0.0
+; CHECK-NEXT: bif v1.16b, v3.16b, v5.16b
; CHECK-NEXT: ret
%1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a)
ret <8 x float> %1
@@ -145,6 +145,7 @@
; CHECK-LABEL: dsqrt:
; CHECK: // %bb.0:
; CHECK-NEXT: frsqrte d1, d0
+; CHECK-NEXT: fcmp d0, #0.0
; CHECK-NEXT: fmul d2, d1, d1
; CHECK-NEXT: frsqrts d2, d0, d2
; CHECK-NEXT: fmul d1, d1, d2
@@ -152,10 +153,9 @@
; CHECK-NEXT: frsqrts d2, d0, d2
; CHECK-NEXT: fmul d1, d1, d2
; CHECK-NEXT: fmul d2, d1, d1
+; CHECK-NEXT: fmul d1, d1, d0
; CHECK-NEXT: frsqrts d2, d0, d2
-; CHECK-NEXT: fmul d2, d2, d0
-; CHECK-NEXT: fmul d1, d1, d2
-; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: fmul d1, d2, d1
; CHECK-NEXT: fcsel d0, d0, d1, eq
; CHECK-NEXT: ret
%1 = tail call fast double @llvm.sqrt.f64(double %a)
@@ -171,6 +171,7 @@
; CHECK-LABEL: dsqrt_ieee_denorms:
; CHECK: // %bb.0:
; CHECK-NEXT: frsqrte d1, d0
+; CHECK-NEXT: fcmp d0, #0.0
; CHECK-NEXT: fmul d2, d1, d1
; CHECK-NEXT: frsqrts d2, d0, d2
; CHECK-NEXT: fmul d1, d1, d2
@@ -178,10 +179,9 @@
; CHECK-NEXT: frsqrts d2, d0, d2
; CHECK-NEXT: fmul d1, d1, d2
; CHECK-NEXT: fmul d2, d1, d1
+; CHECK-NEXT: fmul d1, d1, d0
; CHECK-NEXT: frsqrts d2, d0, d2
-; CHECK-NEXT: fmul d2, d2, d0
-; CHECK-NEXT: fmul d1, d1, d2
-; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: fmul d1, d2, d1
; CHECK-NEXT: fcsel d0, d0, d1, eq
; CHECK-NEXT: ret
%1 = tail call fast double @llvm.sqrt.f64(double %a)
@@ -204,9 +204,9 @@
; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d
; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d
; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d
+; CHECK-NEXT: fmul v1.2d, v1.2d, v0.2d
; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d
-; CHECK-NEXT: fmul v2.2d, v2.2d, v0.2d
-; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d
+; CHECK-NEXT: fmul v1.2d, v2.2d, v1.2d
; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
@@ -224,31 +224,31 @@
; CHECK-LABEL: d4sqrt:
; CHECK: // %bb.0:
; CHECK-NEXT: frsqrte v2.2d, v0.2d
-; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d
-; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d
-; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d
-; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d
-; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d
-; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d
-; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d
-; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d
-; CHECK-NEXT: fmul v3.2d, v3.2d, v0.2d
-; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d
-; CHECK-NEXT: fcmeq v3.2d, v0.2d, #0.0
-; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b
-; CHECK-NEXT: frsqrte v2.2d, v1.2d
-; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d
-; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d
-; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d
-; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d
-; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d
-; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d
-; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d
-; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d
+; CHECK-NEXT: frsqrte v3.2d, v1.2d
+; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d
+; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d
+; CHECK-NEXT: fmul v5.2d, v3.2d, v3.2d
+; CHECK-NEXT: frsqrts v5.2d, v1.2d, v5.2d
+; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d
+; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d
+; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d
+; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d
+; CHECK-NEXT: fmul v5.2d, v3.2d, v3.2d
+; CHECK-NEXT: frsqrts v5.2d, v1.2d, v5.2d
+; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d
+; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d
+; CHECK-NEXT: fmul v2.2d, v2.2d, v0.2d
+; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d
+; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d
+; CHECK-NEXT: fmul v5.2d, v3.2d, v3.2d
; CHECK-NEXT: fmul v3.2d, v3.2d, v1.2d
+; CHECK-NEXT: frsqrts v5.2d, v1.2d, v5.2d
+; CHECK-NEXT: fmul v2.2d, v4.2d, v2.2d
+; CHECK-NEXT: fcmeq v4.2d, v0.2d, #0.0
+; CHECK-NEXT: bif v0.16b, v2.16b, v4.16b
+; CHECK-NEXT: fmul v3.2d, v5.2d, v3.2d
+; CHECK-NEXT: fcmeq v5.2d, v1.2d, #0.0
+; CHECK-NEXT: bif v1.16b, v3.16b, v5.16b
; CHECK-NEXT: ret
%1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a)
ret <4 x double> %1
@@ -326,9 +326,9 @@
define <8 x float> @f8rsqrt(<8 x float> %a) #0 {
; FAULT-LABEL: f8rsqrt:
; FAULT: // %bb.0:
-; FAULT-NEXT: fsqrt v1.4s, v1.4s
; FAULT-NEXT: fsqrt v0.4s, v0.4s
; FAULT-NEXT: fmov v2.4s, #1.00000000
+; FAULT-NEXT: fsqrt v1.4s, v1.4s
; FAULT-NEXT: fdiv v0.4s, v2.4s, v0.4s
; FAULT-NEXT: fdiv v1.4s, v2.4s, v1.4s
; FAULT-NEXT: ret
@@ -336,15 +336,15 @@
; CHECK-LABEL: f8rsqrt:
; CHECK: // %bb.0:
; CHECK-NEXT: frsqrte v2.4s, v0.4s
-; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s
; CHECK-NEXT: frsqrte v3.4s, v1.4s
+; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s
; CHECK-NEXT: frsqrts v4.4s, v0.4s, v4.4s
+; CHECK-NEXT: fmul v5.4s, v3.4s, v3.4s
+; CHECK-NEXT: frsqrts v5.4s, v1.4s, v5.4s
; CHECK-NEXT: fmul v2.4s, v2.4s, v4.4s
-; CHECK-NEXT: fmul v4.4s, v3.4s, v3.4s
-; CHECK-NEXT: frsqrts v4.4s, v1.4s, v4.4s
-; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s
; CHECK-NEXT: fmul v4.4s, v2.4s, v2.4s
; CHECK-NEXT: frsqrts v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: fmul v3.4s, v3.4s, v5.4s
; CHECK-NEXT: fmul v4.4s, v3.4s, v3.4s
; CHECK-NEXT: frsqrts v1.4s, v1.4s, v4.4s
; CHECK-NEXT: fmul v0.4s, v2.4s, v0.4s
@@ -410,9 +410,9 @@
define <4 x double> @d4rsqrt(<4 x double> %a) #0 {
; FAULT-LABEL: d4rsqrt:
; FAULT: // %bb.0:
-; FAULT-NEXT: fsqrt v1.2d, v1.2d
; FAULT-NEXT: fsqrt v0.2d, v0.2d
; FAULT-NEXT: fmov v2.2d, #1.00000000
+; FAULT-NEXT: fsqrt v1.2d, v1.2d
; FAULT-NEXT: fdiv v0.2d, v2.2d, v0.2d
; FAULT-NEXT: fdiv v1.2d, v2.2d, v1.2d
; FAULT-NEXT: ret
@@ -420,21 +420,21 @@
; CHECK-LABEL: d4rsqrt:
; CHECK: // %bb.0:
; CHECK-NEXT: frsqrte v2.2d, v0.2d
-; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d
; CHECK-NEXT: frsqrte v3.2d, v1.2d
+; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d
; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d
+; CHECK-NEXT: fmul v5.2d, v3.2d, v3.2d
+; CHECK-NEXT: frsqrts v5.2d, v1.2d, v5.2d
; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d
-; CHECK-NEXT: fmul v4.2d, v3.2d, v3.2d
-; CHECK-NEXT: frsqrts v4.2d, v1.2d, v4.2d
-; CHECK-NEXT: fmul v3.2d, v3.2d, v4.2d
; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d
; CHECK-NEXT: frsqrts v4.2d, v0.2d, v4.2d
+; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d
+; CHECK-NEXT: fmul v5.2d, v3.2d, v3.2d
+; CHECK-NEXT: frsqrts v5.2d, v1.2d, v5.2d
; CHECK-NEXT: fmul v2.2d, v2.2d, v4.2d
-; CHECK-NEXT: fmul v4.2d, v3.2d, v3.2d
-; CHECK-NEXT: frsqrts v4.2d, v1.2d, v4.2d
-; CHECK-NEXT: fmul v3.2d, v3.2d, v4.2d
; CHECK-NEXT: fmul v4.2d, v2.2d, v2.2d
; CHECK-NEXT: frsqrts v0.2d, v0.2d, v4.2d
+; CHECK-NEXT: fmul v3.2d, v3.2d, v5.2d
; CHECK-NEXT: fmul v4.2d, v3.2d, v3.2d
; CHECK-NEXT: frsqrts v1.2d, v1.2d, v4.2d
; CHECK-NEXT: fmul v0.2d, v2.2d, v0.2d
@@ -505,6 +505,7 @@
; CHECK-LABEL: sqrt_fdiv_common_operand_extra_use:
; CHECK: // %bb.0:
; CHECK-NEXT: frsqrte d1, d0
+; CHECK-NEXT: fcmp d0, #0.0
; CHECK-NEXT: fmul d2, d1, d1
; CHECK-NEXT: frsqrts d2, d0, d2
; CHECK-NEXT: fmul d1, d1, d2
@@ -514,11 +515,10 @@
; CHECK-NEXT: fmul d2, d1, d1
; CHECK-NEXT: frsqrts d2, d0, d2
; CHECK-NEXT: fmul d1, d1, d2
-; CHECK-NEXT: fcmp d0, #0.0
; CHECK-NEXT: fmul d1, d0, d1
-; CHECK-NEXT: fcsel d0, d0, d1, eq
-; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: fcsel d2, d0, d1, eq
; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: str d2, [x0]
; CHECK-NEXT: ret
%sqrt = call fast double @llvm.sqrt.f64(double %x)
store double %sqrt, double* %p
@@ -529,8 +529,8 @@
define double @sqrt_simplify_before_recip_3_uses(double %x, double* %p1, double* %p2) nounwind {
; FAULT-LABEL: sqrt_simplify_before_recip_3_uses:
; FAULT: // %bb.0:
-; FAULT-NEXT: mov x8, #4631107791820423168
; FAULT-NEXT: fsqrt d0, d0
+; FAULT-NEXT: mov x8, #4631107791820423168
; FAULT-NEXT: fmov d1, #1.00000000
; FAULT-NEXT: fmov d2, x8
; FAULT-NEXT: fdiv d1, d1, d0
@@ -542,19 +542,19 @@
; CHECK-LABEL: sqrt_simplify_before_recip_3_uses:
; CHECK: // %bb.0:
; CHECK-NEXT: frsqrte d1, d0
-; CHECK-NEXT: fmul d2, d1, d1
-; CHECK-NEXT: frsqrts d2, d0, d2
-; CHECK-NEXT: fmul d1, d1, d2
-; CHECK-NEXT: fmul d2, d1, d1
-; CHECK-NEXT: frsqrts d2, d0, d2
-; CHECK-NEXT: fmul d1, d1, d2
-; CHECK-NEXT: fmul d2, d1, d1
; CHECK-NEXT: mov x8, #4631107791820423168
-; CHECK-NEXT: frsqrts d2, d0, d2
-; CHECK-NEXT: fmul d1, d1, d2
; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: fmul d2, d1, d2
+; CHECK-NEXT: fmul d3, d1, d1
+; CHECK-NEXT: frsqrts d3, d0, d3
+; CHECK-NEXT: fmul d1, d1, d3
+; CHECK-NEXT: fmul d3, d1, d1
+; CHECK-NEXT: frsqrts d3, d0, d3
+; CHECK-NEXT: fmul d1, d1, d3
+; CHECK-NEXT: fmul d3, d1, d1
+; CHECK-NEXT: frsqrts d3, d0, d3
+; CHECK-NEXT: fmul d1, d1, d3
; CHECK-NEXT: fmul d0, d0, d1
+; CHECK-NEXT: fmul d2, d1, d2
; CHECK-NEXT: str d1, [x0]
; CHECK-NEXT: str d2, [x1]
; CHECK-NEXT: ret
@@ -570,12 +570,12 @@
define double @sqrt_simplify_before_recip_3_uses_order(double %x, double* %p1, double* %p2) nounwind {
; FAULT-LABEL: sqrt_simplify_before_recip_3_uses_order:
; FAULT: // %bb.0:
-; FAULT-NEXT: mov x9, #140737488355328
-; FAULT-NEXT: mov x8, #4631107791820423168
-; FAULT-NEXT: movk x9, #16453, lsl #48
; FAULT-NEXT: fsqrt d0, d0
+; FAULT-NEXT: mov x8, #4631107791820423168
; FAULT-NEXT: fmov d1, x8
-; FAULT-NEXT: fmov d2, x9
+; FAULT-NEXT: mov x8, #140737488355328
+; FAULT-NEXT: movk x8, #16453, lsl #48
+; FAULT-NEXT: fmov d2, x8
; FAULT-NEXT: fdiv d1, d1, d0
; FAULT-NEXT: fdiv d2, d2, d0
; FAULT-NEXT: str d1, [x0]
@@ -585,20 +585,20 @@
; CHECK-LABEL: sqrt_simplify_before_recip_3_uses_order:
; CHECK: // %bb.0:
; CHECK-NEXT: frsqrte d1, d0
-; CHECK-NEXT: fmul d3, d1, d1
-; CHECK-NEXT: frsqrts d3, d0, d3
-; CHECK-NEXT: fmul d1, d1, d3
-; CHECK-NEXT: fmul d3, d1, d1
-; CHECK-NEXT: frsqrts d3, d0, d3
+; CHECK-NEXT: mov x9, #140737488355328
; CHECK-NEXT: mov x8, #4631107791820423168
-; CHECK-NEXT: fmul d1, d1, d3
+; CHECK-NEXT: movk x9, #16453, lsl #48
+; CHECK-NEXT: fmov d3, x9
+; CHECK-NEXT: fmul d2, d1, d1
+; CHECK-NEXT: frsqrts d2, d0, d2
+; CHECK-NEXT: fmul d1, d1, d2
+; CHECK-NEXT: fmul d2, d1, d1
+; CHECK-NEXT: frsqrts d2, d0, d2
+; CHECK-NEXT: fmul d1, d1, d2
+; CHECK-NEXT: fmul d2, d1, d1
+; CHECK-NEXT: frsqrts d2, d0, d2
+; CHECK-NEXT: fmul d1, d1, d2
; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: mov x8, #140737488355328
-; CHECK-NEXT: fmul d3, d1, d1
-; CHECK-NEXT: movk x8, #16453, lsl #48
-; CHECK-NEXT: frsqrts d3, d0, d3
-; CHECK-NEXT: fmul d1, d1, d3
-; CHECK-NEXT: fmov d3, x8
; CHECK-NEXT: fmul d0, d0, d1
; CHECK-NEXT: fmul d2, d1, d2
; CHECK-NEXT: fmul d1, d1, d3
@@ -618,14 +618,14 @@
define double @sqrt_simplify_before_recip_4_uses(double %x, double* %p1, double* %p2, double* %p3) nounwind {
; FAULT-LABEL: sqrt_simplify_before_recip_4_uses:
; FAULT: // %bb.0:
-; FAULT-NEXT: mov x8, #4631107791820423168
-; FAULT-NEXT: fmov d2, x8
-; FAULT-NEXT: mov x8, #140737488355328
; FAULT-NEXT: fsqrt d0, d0
; FAULT-NEXT: fmov d1, #1.00000000
-; FAULT-NEXT: movk x8, #16453, lsl #48
+; FAULT-NEXT: mov x9, #140737488355328
+; FAULT-NEXT: mov x8, #4631107791820423168
+; FAULT-NEXT: movk x9, #16453, lsl #48
+; FAULT-NEXT: fmov d2, x8
+; FAULT-NEXT: fmov d3, x9
; FAULT-NEXT: fdiv d1, d1, d0
-; FAULT-NEXT: fmov d3, x8
; FAULT-NEXT: fmul d2, d1, d2
; FAULT-NEXT: fmul d3, d1, d3
; FAULT-NEXT: str d1, [x0]
@@ -636,29 +636,29 @@
; CHECK-LABEL: sqrt_simplify_before_recip_4_uses:
; CHECK: // %bb.0:
; CHECK-NEXT: frsqrte d1, d0
-; CHECK-NEXT: fmul d3, d1, d1
-; CHECK-NEXT: frsqrts d3, d0, d3
-; CHECK-NEXT: fmul d1, d1, d3
-; CHECK-NEXT: fmul d3, d1, d1
-; CHECK-NEXT: frsqrts d3, d0, d3
-; CHECK-NEXT: fmul d1, d1, d3
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: mov x9, #140737488355328
; CHECK-NEXT: mov x8, #4631107791820423168
-; CHECK-NEXT: fmul d3, d1, d1
+; CHECK-NEXT: movk x9, #16453, lsl #48
+; CHECK-NEXT: fmov d3, x9
+; CHECK-NEXT: fmul d2, d1, d1
+; CHECK-NEXT: frsqrts d2, d0, d2
+; CHECK-NEXT: fmul d1, d1, d2
+; CHECK-NEXT: fmul d2, d1, d1
+; CHECK-NEXT: frsqrts d2, d0, d2
+; CHECK-NEXT: fmul d1, d1, d2
+; CHECK-NEXT: fmul d2, d1, d1
+; CHECK-NEXT: frsqrts d2, d0, d2
+; CHECK-NEXT: fmul d1, d1, d2
+; CHECK-NEXT: fmul d2, d0, d1
+; CHECK-NEXT: fmul d3, d1, d3
+; CHECK-NEXT: str d1, [x0]
+; CHECK-NEXT: fcsel d2, d0, d2, eq
+; CHECK-NEXT: fdiv d0, d0, d2
; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: mov x8, #140737488355328
-; CHECK-NEXT: frsqrts d3, d0, d3
-; CHECK-NEXT: movk x8, #16453, lsl #48
-; CHECK-NEXT: fmul d1, d1, d3
-; CHECK-NEXT: fcmp d0, #0.0
-; CHECK-NEXT: fmov d4, x8
-; CHECK-NEXT: fmul d3, d0, d1
; CHECK-NEXT: fmul d2, d1, d2
-; CHECK-NEXT: fmul d4, d1, d4
-; CHECK-NEXT: str d1, [x0]
-; CHECK-NEXT: fcsel d1, d0, d3, eq
-; CHECK-NEXT: fdiv d0, d0, d1
; CHECK-NEXT: str d2, [x1]
-; CHECK-NEXT: str d4, [x2]
+; CHECK-NEXT: str d3, [x2]
; CHECK-NEXT: ret
%sqrt = tail call fast double @llvm.sqrt.f64(double %x)
%rsqrt = fdiv fast double 1.0, %sqrt
diff --git a/llvm/test/CodeGen/AArch64/srem-lkk.ll b/llvm/test/CodeGen/AArch64/srem-lkk.ll
--- a/llvm/test/CodeGen/AArch64/srem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/srem-lkk.ll
@@ -120,8 +120,8 @@
; CHECK-LABEL: dont_fold_srem_i32_smax:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #2147483647
-; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: csel w8, w8, w0, lt
; CHECK-NEXT: and w8, w8, #0x80000000
; CHECK-NEXT: add w0, w0, w8
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
@@ -10,8 +10,8 @@
; CHECK-NEXT: movk w9, #41, lsl #16
; CHECK-NEXT: madd w8, w0, w8, w9
; CHECK-NEXT: mov w9, #48987
-; CHECK-NEXT: and w8, w8, #0x1fffffff
; CHECK-NEXT: movk w9, #82, lsl #16
+; CHECK-NEXT: and w8, w8, #0x1fffffff
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
@@ -23,15 +23,15 @@
define i1 @test_srem_even(i4 %X) nounwind {
; CHECK-LABEL: test_srem_even:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #43691
-; CHECK-NEXT: sbfx w8, w0, #0, #4
-; CHECK-NEXT: movk w9, #10922, lsl #16
-; CHECK-NEXT: smull x9, w8, w9
-; CHECK-NEXT: lsr x10, x9, #63
-; CHECK-NEXT: lsr x9, x9, #32
-; CHECK-NEXT: add w9, w9, w10
+; CHECK-NEXT: mov w8, #43691
+; CHECK-NEXT: sbfx w9, w0, #0, #4
+; CHECK-NEXT: movk w8, #10922, lsl #16
+; CHECK-NEXT: smull x8, w9, w8
+; CHECK-NEXT: lsr x10, x8, #63
+; CHECK-NEXT: lsr x8, x8, #32
+; CHECK-NEXT: add w8, w8, w10
; CHECK-NEXT: mov w10, #6
-; CHECK-NEXT: msub w8, w9, w10, w8
+; CHECK-NEXT: msub w8, w8, w10, w9
; CHECK-NEXT: cmp w8, #1
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
@@ -59,44 +59,44 @@
define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; CHECK-LABEL: test_srem_vec:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x10, #7281
-; CHECK-NEXT: movk x10, #29127, lsl #16
-; CHECK-NEXT: movk x10, #50972, lsl #32
-; CHECK-NEXT: sbfx x9, x2, #0, #33
-; CHECK-NEXT: movk x10, #7281, lsl #48
-; CHECK-NEXT: mov x11, #8589934591
-; CHECK-NEXT: mov x12, #7282
-; CHECK-NEXT: movk x12, #29127, lsl #16
-; CHECK-NEXT: dup v0.2d, x11
-; CHECK-NEXT: adrp x11, .LCPI3_0
-; CHECK-NEXT: smulh x10, x9, x10
-; CHECK-NEXT: movk x12, #50972, lsl #32
-; CHECK-NEXT: ldr q1, [x11, :lo12:.LCPI3_0]
-; CHECK-NEXT: adrp x11, .LCPI3_1
-; CHECK-NEXT: sub x10, x10, x9
-; CHECK-NEXT: sbfx x8, x1, #0, #33
-; CHECK-NEXT: movk x12, #7281, lsl #48
-; CHECK-NEXT: ldr q2, [x11, :lo12:.LCPI3_1]
-; CHECK-NEXT: asr x11, x10, #3
-; CHECK-NEXT: add x10, x11, x10, lsr #63
-; CHECK-NEXT: smulh x11, x8, x12
-; CHECK-NEXT: add x11, x11, x11, lsr #63
-; CHECK-NEXT: add x11, x11, x11, lsl #3
-; CHECK-NEXT: sub x8, x8, x11
-; CHECK-NEXT: sbfx x11, x0, #0, #33
-; CHECK-NEXT: smulh x12, x11, x12
+; CHECK-NEXT: mov x11, #7282
+; CHECK-NEXT: sbfx x10, x0, #0, #33
+; CHECK-NEXT: movk x11, #29127, lsl #16
+; CHECK-NEXT: mov x9, #7281
+; CHECK-NEXT: movk x11, #50972, lsl #32
+; CHECK-NEXT: movk x9, #29127, lsl #16
+; CHECK-NEXT: movk x11, #7281, lsl #48
+; CHECK-NEXT: movk x9, #50972, lsl #32
+; CHECK-NEXT: sbfx x13, x1, #0, #33
+; CHECK-NEXT: sbfx x8, x2, #0, #33
+; CHECK-NEXT: smulh x12, x10, x11
+; CHECK-NEXT: movk x9, #7281, lsl #48
+; CHECK-NEXT: smulh x11, x13, x11
+; CHECK-NEXT: smulh x9, x8, x9
; CHECK-NEXT: add x12, x12, x12, lsr #63
+; CHECK-NEXT: sub x9, x9, x8
+; CHECK-NEXT: add x11, x11, x11, lsr #63
; CHECK-NEXT: add x12, x12, x12, lsl #3
-; CHECK-NEXT: sub x11, x11, x12
-; CHECK-NEXT: add x10, x10, x10, lsl #3
-; CHECK-NEXT: fmov d3, x11
-; CHECK-NEXT: add x9, x9, x10
-; CHECK-NEXT: mov v3.d[1], x8
-; CHECK-NEXT: fmov d4, x9
-; CHECK-NEXT: and v4.16b, v4.16b, v0.16b
-; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
-; CHECK-NEXT: cmeq v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: cmeq v1.2d, v4.2d, v2.2d
+; CHECK-NEXT: asr x14, x9, #3
+; CHECK-NEXT: sub x10, x10, x12
+; CHECK-NEXT: add x9, x14, x9, lsr #63
+; CHECK-NEXT: add x11, x11, x11, lsl #3
+; CHECK-NEXT: sub x11, x13, x11
+; CHECK-NEXT: add x9, x9, x9, lsl #3
+; CHECK-NEXT: fmov d0, x10
+; CHECK-NEXT: add x8, x8, x9
+; CHECK-NEXT: mov x9, #8589934591
+; CHECK-NEXT: mov v0.d[1], x11
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: dup v2.2d, x9
+; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: adrp x9, .LCPI3_1
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI3_1]
+; CHECK-NEXT: cmeq v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: cmeq v1.2d, v1.2d, v3.2d
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: xtn v0.2s, v0.2d
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll b/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll
--- a/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-optsize.ll
@@ -5,12 +5,12 @@
; CHECK-LABEL: test_minsize:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #5
+; CHECK-NEXT: mov w9, #42
; CHECK-NEXT: sdiv w8, w0, w8
; CHECK-NEXT: add w8, w8, w8, lsl #2
-; CHECK-NEXT: mov w9, #-10
; CHECK-NEXT: cmp w0, w8
-; CHECK-NEXT: mov w8, #42
-; CHECK-NEXT: csel w0, w8, w9, eq
+; CHECK-NEXT: mov w8, #-10
+; CHECK-NEXT: csel w0, w9, w8, eq
; CHECK-NEXT: ret
%rem = srem i32 %X, 5
%cmp = icmp eq i32 %rem, 0
@@ -25,12 +25,12 @@
; CHECK-NEXT: mov w9, #39321
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: mov w10, #858993459
; CHECK-NEXT: madd w8, w0, w8, w9
-; CHECK-NEXT: mov w11, #-10
-; CHECK-NEXT: cmp w8, w10
-; CHECK-NEXT: mov w8, #42
-; CHECK-NEXT: csel w0, w8, w11, lo
+; CHECK-NEXT: mov w9, #858993459
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: mov w8, #-10
+; CHECK-NEXT: mov w9, #42
+; CHECK-NEXT: csel w0, w9, w8, lo
; CHECK-NEXT: ret
%rem = srem i32 %X, 5
%cmp = icmp eq i32 %rem, 0
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
@@ -6,19 +6,19 @@
; CHECK-LABEL: test_srem_odd_even:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: adrp x9, .LCPI0_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT: adrp x8, .LCPI0_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1]
; CHECK-NEXT: adrp x8, .LCPI0_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2]
-; CHECK-NEXT: adrp x8, .LCPI0_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1]
+; CHECK-NEXT: adrp x9, .LCPI0_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI0_3]
; CHECK-NEXT: adrp x8, .LCPI0_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -35,17 +35,17 @@
define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_allones_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x10, .LCPI1_0
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: mov w9, #39321
-; CHECK-NEXT: ldr q1, [x10, :lo12:.LCPI1_0]
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: dup v3.4s, w9
-; CHECK-NEXT: mla v3.4s, v0.4s, v2.4s
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v3.4s
+; CHECK-NEXT: adrp x10, .LCPI1_0
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI1_0]
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -56,17 +56,17 @@
define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_allones_ne:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x10, .LCPI2_0
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: mov w9, #39321
-; CHECK-NEXT: ldr q1, [x10, :lo12:.LCPI2_0]
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: dup v3.4s, w9
-; CHECK-NEXT: mla v3.4s, v0.4s, v2.4s
-; CHECK-NEXT: cmhi v0.4s, v3.4s, v1.4s
+; CHECK-NEXT: adrp x10, .LCPI2_0
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI2_0]
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: cmhi v0.4s, v2.4s, v0.4s
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -83,15 +83,15 @@
; CHECK-NEXT: mov w9, #9362
; CHECK-NEXT: movk w8, #46811, lsl #16
; CHECK-NEXT: movk w9, #4681, lsl #16
-; CHECK-NEXT: adrp x10, .LCPI3_0
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
-; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI3_0]
+; CHECK-NEXT: adrp x8, .LCPI3_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: shl v0.4s, v2.4s, #31
; CHECK-NEXT: ushr v1.4s, v2.4s, #1
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -107,15 +107,15 @@
; CHECK-NEXT: mov w9, #9362
; CHECK-NEXT: movk w8, #46811, lsl #16
; CHECK-NEXT: movk w9, #4681, lsl #16
-; CHECK-NEXT: adrp x10, .LCPI4_0
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
-; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI4_0]
+; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: shl v0.4s, v2.4s, #31
; CHECK-NEXT: ushr v1.4s, v2.4s, #1
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: cmhi v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -130,19 +130,19 @@
; CHECK-LABEL: test_srem_odd_even_allones_eq:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI5_0
+; CHECK-NEXT: adrp x9, .LCPI5_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
-; CHECK-NEXT: adrp x8, .LCPI5_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1]
; CHECK-NEXT: adrp x8, .LCPI5_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
-; CHECK-NEXT: adrp x8, .LCPI5_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_1]
+; CHECK-NEXT: adrp x9, .LCPI5_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI5_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI5_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI5_3]
; CHECK-NEXT: adrp x8, .LCPI5_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -156,19 +156,19 @@
; CHECK-LABEL: test_srem_odd_even_allones_ne:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI6_0
+; CHECK-NEXT: adrp x9, .LCPI6_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0]
-; CHECK-NEXT: adrp x8, .LCPI6_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1]
; CHECK-NEXT: adrp x8, .LCPI6_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2]
-; CHECK-NEXT: adrp x8, .LCPI6_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_1]
+; CHECK-NEXT: adrp x9, .LCPI6_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI6_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI6_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI6_3]
; CHECK-NEXT: adrp x8, .LCPI6_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_4]
; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -186,19 +186,19 @@
; CHECK-LABEL: test_srem_odd_poweroftwo:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI7_0
+; CHECK-NEXT: adrp x9, .LCPI7_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0]
-; CHECK-NEXT: adrp x8, .LCPI7_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1]
; CHECK-NEXT: adrp x8, .LCPI7_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2]
-; CHECK-NEXT: adrp x8, .LCPI7_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI7_1]
+; CHECK-NEXT: adrp x9, .LCPI7_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI7_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI7_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI7_3]
; CHECK-NEXT: adrp x8, .LCPI7_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -214,19 +214,19 @@
; CHECK-LABEL: test_srem_even_poweroftwo:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI8_0
+; CHECK-NEXT: adrp x9, .LCPI8_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT: adrp x8, .LCPI8_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_1]
; CHECK-NEXT: adrp x8, .LCPI8_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI8_2]
-; CHECK-NEXT: adrp x8, .LCPI8_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_1]
+; CHECK-NEXT: adrp x9, .LCPI8_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI8_3]
; CHECK-NEXT: adrp x8, .LCPI8_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -242,19 +242,19 @@
; CHECK-LABEL: test_srem_odd_even_poweroftwo:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI9_0
+; CHECK-NEXT: adrp x9, .LCPI9_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0]
-; CHECK-NEXT: adrp x8, .LCPI9_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1]
; CHECK-NEXT: adrp x8, .LCPI9_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2]
-; CHECK-NEXT: adrp x8, .LCPI9_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI9_1]
+; CHECK-NEXT: adrp x9, .LCPI9_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI9_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI9_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI9_3]
; CHECK-NEXT: adrp x8, .LCPI9_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -271,17 +271,17 @@
define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_one:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x10, .LCPI10_0
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: mov w9, #39321
-; CHECK-NEXT: ldr q1, [x10, :lo12:.LCPI10_0]
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: dup v3.4s, w9
-; CHECK-NEXT: mla v3.4s, v0.4s, v2.4s
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v3.4s
+; CHECK-NEXT: adrp x10, .LCPI10_0
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI10_0]
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -298,15 +298,15 @@
; CHECK-NEXT: mov w9, #9362
; CHECK-NEXT: movk w8, #46811, lsl #16
; CHECK-NEXT: movk w9, #4681, lsl #16
-; CHECK-NEXT: adrp x10, .LCPI11_0
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
-; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI11_0]
+; CHECK-NEXT: adrp x8, .LCPI11_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: shl v0.4s, v2.4s, #31
; CHECK-NEXT: ushr v1.4s, v2.4s, #1
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0]
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -321,19 +321,19 @@
; CHECK-LABEL: test_srem_odd_even_one:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI12_0
+; CHECK-NEXT: adrp x9, .LCPI12_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT: adrp x8, .LCPI12_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1]
; CHECK-NEXT: adrp x8, .LCPI12_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2]
-; CHECK-NEXT: adrp x8, .LCPI12_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI12_1]
+; CHECK-NEXT: adrp x9, .LCPI12_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI12_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI12_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI12_3]
; CHECK-NEXT: adrp x8, .LCPI12_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -353,20 +353,20 @@
; CHECK-NEXT: adrp x8, .LCPI13_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]
; CHECK-NEXT: adrp x8, .LCPI13_1
+; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1]
; CHECK-NEXT: adrp x8, .LCPI13_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2]
-; CHECK-NEXT: adrp x8, .LCPI13_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI13_3]
; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_2]
+; CHECK-NEXT: adrp x8, .LCPI13_3
+; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s
; CHECK-NEXT: usra v2.4s, v1.4s, #31
-; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_3]
+; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -382,20 +382,20 @@
; CHECK-NEXT: adrp x8, .LCPI14_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
; CHECK-NEXT: adrp x8, .LCPI14_1
+; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1]
; CHECK-NEXT: adrp x8, .LCPI14_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2]
-; CHECK-NEXT: adrp x8, .LCPI14_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_3]
; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_2]
+; CHECK-NEXT: adrp x8, .LCPI14_3
+; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s
; CHECK-NEXT: usra v2.4s, v1.4s, #31
-; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_3]
+; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -411,20 +411,20 @@
; CHECK-NEXT: adrp x8, .LCPI15_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
; CHECK-NEXT: adrp x8, .LCPI15_1
+; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1]
; CHECK-NEXT: adrp x8, .LCPI15_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2]
-; CHECK-NEXT: adrp x8, .LCPI15_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI15_3]
; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_2]
+; CHECK-NEXT: adrp x8, .LCPI15_3
+; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s
; CHECK-NEXT: usra v2.4s, v1.4s, #31
-; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_3]
+; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -440,19 +440,19 @@
; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI16_0
+; CHECK-NEXT: adrp x9, .LCPI16_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
-; CHECK-NEXT: adrp x8, .LCPI16_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1]
; CHECK-NEXT: adrp x8, .LCPI16_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2]
-; CHECK-NEXT: adrp x8, .LCPI16_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI16_1]
+; CHECK-NEXT: adrp x9, .LCPI16_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI16_3]
; CHECK-NEXT: adrp x8, .LCPI16_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -468,19 +468,19 @@
; CHECK-LABEL: test_srem_even_allones_and_poweroftwo:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI17_0
+; CHECK-NEXT: adrp x9, .LCPI17_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
-; CHECK-NEXT: adrp x8, .LCPI17_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1]
; CHECK-NEXT: adrp x8, .LCPI17_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2]
-; CHECK-NEXT: adrp x8, .LCPI17_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI17_1]
+; CHECK-NEXT: adrp x9, .LCPI17_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI17_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI17_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI17_3]
; CHECK-NEXT: adrp x8, .LCPI17_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -496,19 +496,19 @@
; CHECK-LABEL: test_srem_odd_even_allones_and_poweroftwo:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI18_0
+; CHECK-NEXT: adrp x9, .LCPI18_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0]
-; CHECK-NEXT: adrp x8, .LCPI18_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1]
; CHECK-NEXT: adrp x8, .LCPI18_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2]
-; CHECK-NEXT: adrp x8, .LCPI18_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_1]
+; CHECK-NEXT: adrp x9, .LCPI18_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI18_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI18_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI18_3]
; CHECK-NEXT: adrp x8, .LCPI18_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -525,17 +525,17 @@
define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_odd_allones_and_one:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x10, .LCPI19_0
; CHECK-NEXT: mov w8, #52429
; CHECK-NEXT: mov w9, #39321
-; CHECK-NEXT: ldr q1, [x10, :lo12:.LCPI19_0]
; CHECK-NEXT: movk w8, #52428, lsl #16
; CHECK-NEXT: movk w9, #6553, lsl #16
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: dup v3.4s, w9
-; CHECK-NEXT: mla v3.4s, v0.4s, v2.4s
-; CHECK-NEXT: cmhs v0.4s, v1.4s, v3.4s
+; CHECK-NEXT: adrp x10, .LCPI19_0
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI19_0]
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -552,15 +552,15 @@
; CHECK-NEXT: mov w9, #9362
; CHECK-NEXT: movk w8, #46811, lsl #16
; CHECK-NEXT: movk w9, #4681, lsl #16
-; CHECK-NEXT: adrp x10, .LCPI20_0
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
-; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI20_0]
+; CHECK-NEXT: adrp x8, .LCPI20_0
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-NEXT: shl v0.4s, v2.4s, #31
; CHECK-NEXT: ushr v1.4s, v2.4s, #1
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0]
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -575,19 +575,19 @@
; CHECK-LABEL: test_srem_odd_even_allones_and_one:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI21_0
+; CHECK-NEXT: adrp x9, .LCPI21_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0]
-; CHECK-NEXT: adrp x8, .LCPI21_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1]
; CHECK-NEXT: adrp x8, .LCPI21_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2]
-; CHECK-NEXT: adrp x8, .LCPI21_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI21_1]
+; CHECK-NEXT: adrp x9, .LCPI21_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI21_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI21_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI21_3]
; CHECK-NEXT: adrp x8, .LCPI21_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -605,19 +605,19 @@
; CHECK-LABEL: test_srem_odd_poweroftwo_and_one:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI22_0
+; CHECK-NEXT: adrp x9, .LCPI22_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0]
-; CHECK-NEXT: adrp x8, .LCPI22_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1]
; CHECK-NEXT: adrp x8, .LCPI22_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2]
-; CHECK-NEXT: adrp x8, .LCPI22_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI22_1]
+; CHECK-NEXT: adrp x9, .LCPI22_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI22_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI22_3]
; CHECK-NEXT: adrp x8, .LCPI22_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -633,19 +633,19 @@
; CHECK-LABEL: test_srem_even_poweroftwo_and_one:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI23_0
+; CHECK-NEXT: adrp x9, .LCPI23_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0]
-; CHECK-NEXT: adrp x8, .LCPI23_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1]
; CHECK-NEXT: adrp x8, .LCPI23_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2]
-; CHECK-NEXT: adrp x8, .LCPI23_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI23_1]
+; CHECK-NEXT: adrp x9, .LCPI23_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI23_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI23_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI23_3]
; CHECK-NEXT: adrp x8, .LCPI23_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -661,19 +661,19 @@
; CHECK-LABEL: test_srem_odd_even_poweroftwo_and_one:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI24_0
+; CHECK-NEXT: adrp x9, .LCPI24_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0]
-; CHECK-NEXT: adrp x8, .LCPI24_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_1]
; CHECK-NEXT: adrp x8, .LCPI24_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2]
-; CHECK-NEXT: adrp x8, .LCPI24_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI24_1]
+; CHECK-NEXT: adrp x9, .LCPI24_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI24_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI24_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI24_3]
; CHECK-NEXT: adrp x8, .LCPI24_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -690,19 +690,19 @@
; CHECK-LABEL: test_srem_odd_allones_and_poweroftwo_and_one:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI25_0
+; CHECK-NEXT: adrp x9, .LCPI25_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0]
-; CHECK-NEXT: adrp x8, .LCPI25_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_1]
; CHECK-NEXT: adrp x8, .LCPI25_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI25_2]
-; CHECK-NEXT: adrp x8, .LCPI25_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI25_1]
+; CHECK-NEXT: adrp x9, .LCPI25_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI25_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI25_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI25_3]
; CHECK-NEXT: adrp x8, .LCPI25_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -717,19 +717,19 @@
; CHECK-LABEL: test_srem_even_allones_and_poweroftwo_and_one:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI26_0
+; CHECK-NEXT: adrp x9, .LCPI26_1
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0]
-; CHECK-NEXT: adrp x8, .LCPI26_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1]
; CHECK-NEXT: adrp x8, .LCPI26_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI26_2]
-; CHECK-NEXT: adrp x8, .LCPI26_3
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI26_1]
+; CHECK-NEXT: adrp x9, .LCPI26_3
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI26_3]
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI26_2]
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI26_3]
; CHECK-NEXT: adrp x8, .LCPI26_4
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_4]
-; CHECK-NEXT: ushl v3.4s, v2.4s, v3.4s
; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_4]
; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll
@@ -9,14 +9,14 @@
; CHECK-NEXT: mov w9, #47185
; CHECK-NEXT: movk w8, #49807, lsl #16
; CHECK-NEXT: movk w9, #1310, lsl #16
-; CHECK-NEXT: mov w10, #28834
-; CHECK-NEXT: movk w10, #2621, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
-; CHECK-NEXT: dup v3.4s, w10
+; CHECK-NEXT: mov w8, #28834
+; CHECK-NEXT: movk w8, #2621, lsl #16
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmhs v0.4s, v3.4s, v2.4s
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -35,14 +35,14 @@
; CHECK-NEXT: movk w9, #1310, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
-; CHECK-NEXT: mov w10, #23592
+; CHECK-NEXT: mov w8, #23592
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: movk w10, #655, lsl #16
+; CHECK-NEXT: movk w8, #655, lsl #16
; CHECK-NEXT: shl v0.4s, v2.4s, #30
; CHECK-NEXT: ushr v1.4s, v2.4s, #2
-; CHECK-NEXT: dup v3.4s, w10
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -62,14 +62,14 @@
; CHECK-NEXT: mov w9, #47185
; CHECK-NEXT: movk w8, #49807, lsl #16
; CHECK-NEXT: movk w9, #1310, lsl #16
-; CHECK-NEXT: mov w10, #28834
-; CHECK-NEXT: movk w10, #2621, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
-; CHECK-NEXT: dup v3.4s, w10
+; CHECK-NEXT: mov w8, #28834
+; CHECK-NEXT: movk w8, #2621, lsl #16
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmhs v0.4s, v3.4s, v2.4s
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -88,14 +88,14 @@
; CHECK-NEXT: movk w9, #1310, lsl #16
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: dup v2.4s, w9
-; CHECK-NEXT: mov w10, #23592
+; CHECK-NEXT: mov w8, #23592
; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
-; CHECK-NEXT: movk w10, #655, lsl #16
+; CHECK-NEXT: movk w8, #655, lsl #16
; CHECK-NEXT: shl v0.4s, v2.4s, #30
; CHECK-NEXT: ushr v1.4s, v2.4s, #2
-; CHECK-NEXT: dup v3.4s, w10
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
@@ -114,16 +114,16 @@
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #34079
; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: movi v1.4s, #25
; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s
; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s
; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
; CHECK-NEXT: sshr v3.4s, v2.4s, #3
-; CHECK-NEXT: movi v1.4s, #25
; CHECK-NEXT: usra v3.4s, v2.4s, #31
; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -137,16 +137,16 @@
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #34079
; CHECK-NEXT: movk w8, #20971, lsl #16
+; CHECK-NEXT: movi v1.4s, #100
; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s
; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s
; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
; CHECK-NEXT: sshr v3.4s, v2.4s, #5
-; CHECK-NEXT: movi v1.4s, #100
; CHECK-NEXT: usra v3.4s, v2.4s, #31
; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -184,13 +184,13 @@
define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_pow2:
; CHECK: // %bb.0:
-; CHECK-NEXT: sshr v1.4s, v0.4s, #31
-; CHECK-NEXT: mov v2.16b, v0.16b
-; CHECK-NEXT: usra v2.4s, v1.4s, #28
-; CHECK-NEXT: bic v2.4s, #15
-; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: sshr v2.4s, v0.4s, #31
+; CHECK-NEXT: mov v3.16b, v0.16b
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: usra v3.4s, v2.4s, #28
+; CHECK-NEXT: bic v3.4s, #15
+; CHECK-NEXT: sub v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
@@ -203,14 +203,14 @@
define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
; CHECK-LABEL: test_srem_int_min:
; CHECK: // %bb.0:
-; CHECK-NEXT: sshr v1.4s, v0.4s, #31
-; CHECK-NEXT: mov v2.16b, v0.16b
-; CHECK-NEXT: movi v3.4s, #128, lsl #24
-; CHECK-NEXT: usra v2.4s, v1.4s, #1
-; CHECK-NEXT: and v1.16b, v2.16b, v3.16b
+; CHECK-NEXT: sshr v2.4s, v0.4s, #31
+; CHECK-NEXT: mov v3.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #128, lsl #24
+; CHECK-NEXT: usra v3.4s, v2.4s, #1
+; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%srem = srem <4 x i32> %X,
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll
--- a/llvm/test/CodeGen/AArch64/srem-seteq.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll
@@ -47,8 +47,8 @@
; CHECK-LABEL: test_srem_odd_bit30:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #43691
-; CHECK-NEXT: movk w8, #27306, lsl #16
; CHECK-NEXT: orr w9, wzr, #0x1
+; CHECK-NEXT: movk w8, #27306, lsl #16
; CHECK-NEXT: madd w8, w0, w8, w9
; CHECK-NEXT: cmp w8, #3
; CHECK-NEXT: cset w0, lo
@@ -64,8 +64,8 @@
; CHECK-LABEL: test_srem_odd_bit31:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #21845
-; CHECK-NEXT: movk w8, #54613, lsl #16
; CHECK-NEXT: orr w9, wzr, #0x1
+; CHECK-NEXT: movk w8, #54613, lsl #16
; CHECK-NEXT: madd w8, w0, w8, w9
; CHECK-NEXT: cmp w8, #3
; CHECK-NEXT: cset w0, lo
@@ -106,8 +106,8 @@
; CHECK-NEXT: movk w9, #1310, lsl #16
; CHECK-NEXT: madd w8, w0, w8, w9
; CHECK-NEXT: mov w9, #23593
-; CHECK-NEXT: ror w8, w8, #2
; CHECK-NEXT: movk w9, #655, lsl #16
+; CHECK-NEXT: ror w8, w8, #2
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
@@ -122,8 +122,8 @@
; CHECK-LABEL: test_srem_even_bit30:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #20165
-; CHECK-NEXT: movk w8, #64748, lsl #16
; CHECK-NEXT: orr w9, wzr, #0x8
+; CHECK-NEXT: movk w8, #64748, lsl #16
; CHECK-NEXT: madd w8, w0, w8, w9
; CHECK-NEXT: ror w8, w8, #3
; CHECK-NEXT: cmp w8, #3
@@ -140,8 +140,8 @@
; CHECK-LABEL: test_srem_even_bit31:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #1285
-; CHECK-NEXT: movk w8, #50437, lsl #16
; CHECK-NEXT: orr w9, wzr, #0x2
+; CHECK-NEXT: movk w8, #50437, lsl #16
; CHECK-NEXT: madd w8, w0, w8, w9
; CHECK-NEXT: ror w8, w8, #1
; CHECK-NEXT: cmp w8, #3
@@ -252,8 +252,8 @@
; CHECK-LABEL: test_srem_int_min:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #2147483647
-; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: csel w8, w8, w0, lt
; CHECK-NEXT: and w8, w8, #0x80000000
; CHECK-NEXT: cmn w0, w8
diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
--- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
@@ -4,48 +4,48 @@
define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; CHECK-LABEL: fold_srem_vec_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #63421
-; CHECK-NEXT: mov w12, #33437
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.h[1]
-; CHECK-NEXT: movk w9, #31710, lsl #16
-; CHECK-NEXT: smov w11, v0.h[2]
+; CHECK-NEXT: smov w9, v0.h[0]
+; CHECK-NEXT: mov w10, #63421
+; CHECK-NEXT: mov w11, #37253
+; CHECK-NEXT: movk w10, #31710, lsl #16
+; CHECK-NEXT: movk w11, #44150, lsl #16
+; CHECK-NEXT: smov w13, v0.h[2]
+; CHECK-NEXT: mov w12, #33437
+; CHECK-NEXT: smull x10, w8, w10
; CHECK-NEXT: movk w12, #21399, lsl #16
-; CHECK-NEXT: smull x12, w11, w12
-; CHECK-NEXT: smull x9, w8, w9
-; CHECK-NEXT: lsr x13, x12, #63
-; CHECK-NEXT: asr x12, x12, #37
-; CHECK-NEXT: lsr x9, x9, #32
-; CHECK-NEXT: add w12, w12, w13
-; CHECK-NEXT: mov w13, #98
-; CHECK-NEXT: sub w9, w9, w8
-; CHECK-NEXT: msub w11, w12, w13, w11
-; CHECK-NEXT: asr w13, w9, #6
-; CHECK-NEXT: add w9, w13, w9, lsr #31
-; CHECK-NEXT: mov w13, #37253
-; CHECK-NEXT: mov w10, #-124
-; CHECK-NEXT: smov w12, v0.h[0]
-; CHECK-NEXT: movk w13, #44150, lsl #16
-; CHECK-NEXT: msub w8, w9, w10, w8
-; CHECK-NEXT: smull x10, w12, w13
+; CHECK-NEXT: smull x11, w9, w11
; CHECK-NEXT: lsr x10, x10, #32
-; CHECK-NEXT: add w10, w10, w12
-; CHECK-NEXT: asr w13, w10, #6
-; CHECK-NEXT: mov w9, #95
-; CHECK-NEXT: add w10, w13, w10, lsr #31
-; CHECK-NEXT: msub w9, w10, w9, w12
-; CHECK-NEXT: mov w10, #63249
-; CHECK-NEXT: smov w13, v0.h[3]
-; CHECK-NEXT: movk w10, #48808, lsl #16
-; CHECK-NEXT: smull x10, w13, w10
-; CHECK-NEXT: lsr x12, x10, #63
-; CHECK-NEXT: asr x10, x10, #40
+; CHECK-NEXT: lsr x11, x11, #32
+; CHECK-NEXT: sub w10, w10, w8
+; CHECK-NEXT: add w11, w11, w9
+; CHECK-NEXT: asr w14, w10, #6
+; CHECK-NEXT: asr w15, w11, #6
+; CHECK-NEXT: add w10, w14, w10, lsr #31
+; CHECK-NEXT: add w11, w15, w11, lsr #31
+; CHECK-NEXT: mov w14, #95
+; CHECK-NEXT: mov w15, #-124
+; CHECK-NEXT: smull x12, w13, w12
+; CHECK-NEXT: msub w9, w11, w14, w9
+; CHECK-NEXT: msub w8, w10, w15, w8
+; CHECK-NEXT: lsr x10, x12, #63
+; CHECK-NEXT: asr x11, x12, #37
+; CHECK-NEXT: smov w12, v0.h[3]
+; CHECK-NEXT: add w10, w11, w10
+; CHECK-NEXT: mov w11, #98
; CHECK-NEXT: fmov s0, w9
-; CHECK-NEXT: add w10, w10, w12
+; CHECK-NEXT: mov w9, #63249
+; CHECK-NEXT: movk w9, #48808, lsl #16
+; CHECK-NEXT: msub w10, w10, w11, w13
+; CHECK-NEXT: smull x9, w12, w9
; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, #-1003
-; CHECK-NEXT: mov v0.h[2], w11
-; CHECK-NEXT: msub w8, w10, w8, w13
+; CHECK-NEXT: lsr x8, x9, #63
+; CHECK-NEXT: asr x9, x9, #40
+; CHECK-NEXT: add w8, w9, w8
+; CHECK-NEXT: mov w9, #-1003
+; CHECK-NEXT: mov v0.h[2], w10
+; CHECK-NEXT: msub w8, w8, w9, w12
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -56,41 +56,41 @@
define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; CHECK-LABEL: fold_srem_vec_2:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #37253
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.h[1]
-; CHECK-NEXT: movk w9, #44150, lsl #16
-; CHECK-NEXT: smov w10, v0.h[0]
-; CHECK-NEXT: smull x13, w8, w9
-; CHECK-NEXT: smov w11, v0.h[2]
-; CHECK-NEXT: smull x14, w10, w9
+; CHECK-NEXT: smov w9, v0.h[0]
+; CHECK-NEXT: mov w8, #37253
+; CHECK-NEXT: movk w8, #44150, lsl #16
+; CHECK-NEXT: smov w10, v0.h[1]
+; CHECK-NEXT: smov w14, v0.h[2]
+; CHECK-NEXT: mov w12, #95
+; CHECK-NEXT: smull x11, w9, w8
+; CHECK-NEXT: smull x13, w10, w8
+; CHECK-NEXT: lsr x11, x11, #32
+; CHECK-NEXT: add w11, w11, w9
; CHECK-NEXT: lsr x13, x13, #32
-; CHECK-NEXT: smov w12, v0.h[3]
-; CHECK-NEXT: smull x15, w11, w9
-; CHECK-NEXT: lsr x14, x14, #32
-; CHECK-NEXT: add w13, w13, w8
-; CHECK-NEXT: smull x9, w12, w9
-; CHECK-NEXT: lsr x15, x15, #32
-; CHECK-NEXT: add w14, w14, w10
+; CHECK-NEXT: asr w15, w11, #6
+; CHECK-NEXT: add w13, w13, w10
+; CHECK-NEXT: add w11, w15, w11, lsr #31
+; CHECK-NEXT: smov w15, v0.h[3]
; CHECK-NEXT: asr w16, w13, #6
-; CHECK-NEXT: lsr x9, x9, #32
-; CHECK-NEXT: add w15, w15, w11
+; CHECK-NEXT: msub w9, w11, w12, w9
; CHECK-NEXT: add w13, w16, w13, lsr #31
-; CHECK-NEXT: asr w16, w14, #6
-; CHECK-NEXT: add w9, w9, w12
-; CHECK-NEXT: add w14, w16, w14, lsr #31
-; CHECK-NEXT: asr w16, w15, #6
-; CHECK-NEXT: add w15, w16, w15, lsr #31
-; CHECK-NEXT: asr w16, w9, #6
-; CHECK-NEXT: add w9, w16, w9, lsr #31
-; CHECK-NEXT: mov w16, #95
-; CHECK-NEXT: msub w10, w14, w16, w10
-; CHECK-NEXT: msub w8, w13, w16, w8
-; CHECK-NEXT: fmov s0, w10
-; CHECK-NEXT: msub w11, w15, w16, w11
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov v0.h[2], w11
-; CHECK-NEXT: msub w8, w9, w16, w12
+; CHECK-NEXT: smull x11, w14, w8
+; CHECK-NEXT: msub w10, w13, w12, w10
+; CHECK-NEXT: lsr x11, x11, #32
+; CHECK-NEXT: smull x8, w15, w8
+; CHECK-NEXT: add w11, w11, w14
+; CHECK-NEXT: fmov s0, w9
+; CHECK-NEXT: asr w9, w11, #6
+; CHECK-NEXT: lsr x8, x8, #32
+; CHECK-NEXT: add w9, w9, w11, lsr #31
+; CHECK-NEXT: add w8, w8, w15
+; CHECK-NEXT: mov v0.h[1], w10
+; CHECK-NEXT: asr w10, w8, #6
+; CHECK-NEXT: msub w9, w9, w12, w14
+; CHECK-NEXT: add w8, w10, w8, lsr #31
+; CHECK-NEXT: msub w8, w8, w12, w15
+; CHECK-NEXT: mov v0.h[2], w9
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -103,47 +103,47 @@
define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; CHECK-LABEL: combine_srem_sdiv:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #37253
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: smov w9, v0.h[0]
+; CHECK-NEXT: mov w8, #37253
; CHECK-NEXT: movk w8, #44150, lsl #16
-; CHECK-NEXT: smov w9, v0.h[1]
-; CHECK-NEXT: smov w10, v0.h[0]
-; CHECK-NEXT: smull x13, w9, w8
+; CHECK-NEXT: smov w10, v0.h[1]
; CHECK-NEXT: smov w11, v0.h[2]
-; CHECK-NEXT: smull x14, w10, w8
-; CHECK-NEXT: lsr x13, x13, #32
; CHECK-NEXT: smov w12, v0.h[3]
-; CHECK-NEXT: smull x15, w11, w8
-; CHECK-NEXT: lsr x14, x14, #32
+; CHECK-NEXT: mov w14, #95
+; CHECK-NEXT: smull x13, w9, w8
+; CHECK-NEXT: smull x15, w10, w8
+; CHECK-NEXT: lsr x13, x13, #32
+; CHECK-NEXT: smull x16, w11, w8
; CHECK-NEXT: add w13, w13, w9
-; CHECK-NEXT: smull x8, w12, w8
; CHECK-NEXT: lsr x15, x15, #32
-; CHECK-NEXT: add w14, w14, w10
-; CHECK-NEXT: asr w16, w13, #6
+; CHECK-NEXT: asr w17, w13, #6
+; CHECK-NEXT: add w15, w15, w10
+; CHECK-NEXT: add w13, w17, w13, lsr #31
+; CHECK-NEXT: asr w17, w15, #6
+; CHECK-NEXT: add w15, w17, w15, lsr #31
+; CHECK-NEXT: smull x8, w12, w8
+; CHECK-NEXT: msub w9, w13, w14, w9
+; CHECK-NEXT: lsr x16, x16, #32
+; CHECK-NEXT: add w16, w16, w11
+; CHECK-NEXT: msub w10, w15, w14, w10
+; CHECK-NEXT: asr w17, w16, #6
; CHECK-NEXT: lsr x8, x8, #32
-; CHECK-NEXT: add w15, w15, w11
-; CHECK-NEXT: add w13, w16, w13, lsr #31
-; CHECK-NEXT: asr w16, w14, #6
+; CHECK-NEXT: fmov s1, w13
+; CHECK-NEXT: add w16, w17, w16, lsr #31
+; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: add w8, w8, w12
-; CHECK-NEXT: add w14, w16, w14, lsr #31
-; CHECK-NEXT: asr w16, w15, #6
-; CHECK-NEXT: add w15, w16, w15, lsr #31
-; CHECK-NEXT: asr w16, w8, #6
-; CHECK-NEXT: add w8, w16, w8, lsr #31
-; CHECK-NEXT: mov w16, #95
-; CHECK-NEXT: msub w10, w14, w16, w10
-; CHECK-NEXT: msub w9, w13, w16, w9
-; CHECK-NEXT: fmov s0, w14
-; CHECK-NEXT: fmov s1, w10
-; CHECK-NEXT: msub w11, w15, w16, w11
-; CHECK-NEXT: mov v0.h[1], w13
-; CHECK-NEXT: mov v1.h[1], w9
-; CHECK-NEXT: msub w12, w8, w16, w12
-; CHECK-NEXT: mov v0.h[2], w15
-; CHECK-NEXT: mov v1.h[2], w11
-; CHECK-NEXT: mov v1.h[3], w12
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: add v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: asr w9, w8, #6
+; CHECK-NEXT: add w8, w9, w8, lsr #31
+; CHECK-NEXT: msub w9, w16, w14, w11
+; CHECK-NEXT: mov v0.h[1], w10
+; CHECK-NEXT: mov v1.h[1], w15
+; CHECK-NEXT: msub w10, w8, w14, w12
+; CHECK-NEXT: mov v0.h[2], w9
+; CHECK-NEXT: mov v1.h[2], w16
+; CHECK-NEXT: mov v0.h[3], w10
+; CHECK-NEXT: mov v1.h[3], w8
+; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
%1 = srem <4 x i16> %x,
%2 = sdiv <4 x i16> %x,
@@ -156,39 +156,39 @@
; CHECK-LABEL: dont_fold_srem_power_of_two:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.h[1]
-; CHECK-NEXT: add w12, w8, #31
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: mov w11, #37253
-; CHECK-NEXT: csel w12, w12, w8, lt
-; CHECK-NEXT: smov w9, v0.h[0]
-; CHECK-NEXT: smov w10, v0.h[3]
-; CHECK-NEXT: movk w11, #44150, lsl #16
-; CHECK-NEXT: and w12, w12, #0xffffffe0
-; CHECK-NEXT: sub w8, w8, w12
-; CHECK-NEXT: add w12, w9, #63
-; CHECK-NEXT: smull x11, w10, w11
+; CHECK-NEXT: smov w9, v0.h[1]
+; CHECK-NEXT: smov w10, v0.h[0]
+; CHECK-NEXT: mov w8, #37253
+; CHECK-NEXT: movk w8, #44150, lsl #16
+; CHECK-NEXT: add w11, w9, #31
; CHECK-NEXT: cmp w9, #0
-; CHECK-NEXT: lsr x11, x11, #32
-; CHECK-NEXT: csel w12, w12, w9, lt
-; CHECK-NEXT: add w11, w11, w10
+; CHECK-NEXT: add w12, w10, #63
+; CHECK-NEXT: csel w11, w11, w9, lt
+; CHECK-NEXT: cmp w10, #0
+; CHECK-NEXT: and w11, w11, #0xffffffe0
+; CHECK-NEXT: csel w12, w12, w10, lt
+; CHECK-NEXT: sub w9, w9, w11
; CHECK-NEXT: and w12, w12, #0xffffffc0
-; CHECK-NEXT: sub w9, w9, w12
-; CHECK-NEXT: asr w12, w11, #6
-; CHECK-NEXT: add w11, w12, w11, lsr #31
-; CHECK-NEXT: smov w12, v0.h[2]
-; CHECK-NEXT: fmov s0, w9
-; CHECK-NEXT: add w9, w12, #7
-; CHECK-NEXT: cmp w12, #0
-; CHECK-NEXT: csel w9, w9, w12, lt
+; CHECK-NEXT: sub w10, w10, w12
+; CHECK-NEXT: smov w12, v0.h[3]
+; CHECK-NEXT: fmov s1, w10
+; CHECK-NEXT: smov w10, v0.h[2]
+; CHECK-NEXT: smull x8, w12, w8
+; CHECK-NEXT: mov v1.h[1], w9
+; CHECK-NEXT: lsr x8, x8, #32
+; CHECK-NEXT: add w9, w10, #7
+; CHECK-NEXT: cmp w10, #0
+; CHECK-NEXT: csel w9, w9, w10, lt
+; CHECK-NEXT: add w8, w8, w12
; CHECK-NEXT: and w9, w9, #0xfffffff8
-; CHECK-NEXT: sub w9, w12, w9
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, #95
-; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: msub w8, w11, w8, w10
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: sub w9, w10, w9
+; CHECK-NEXT: asr w10, w8, #6
+; CHECK-NEXT: add w8, w10, w8, lsr #31
+; CHECK-NEXT: mov w10, #95
+; CHECK-NEXT: mov v1.h[2], w9
+; CHECK-NEXT: msub w8, w8, w10, w12
+; CHECK-NEXT: mov v1.h[3], w8
+; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ret
%1 = srem <4 x i16> %x,
ret <4 x i16> %1
@@ -198,39 +198,39 @@
define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; CHECK-LABEL: dont_fold_srem_one:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #17097
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.h[2]
-; CHECK-NEXT: movk w9, #45590, lsl #16
-; CHECK-NEXT: smull x9, w8, w9
-; CHECK-NEXT: lsr x9, x9, #32
-; CHECK-NEXT: add w9, w9, w8
-; CHECK-NEXT: asr w12, w9, #4
-; CHECK-NEXT: add w9, w12, w9, lsr #31
-; CHECK-NEXT: mov w12, #30865
-; CHECK-NEXT: mov w10, #23
-; CHECK-NEXT: smov w11, v0.h[1]
-; CHECK-NEXT: movk w12, #51306, lsl #16
-; CHECK-NEXT: msub w8, w9, w10, w8
-; CHECK-NEXT: smull x10, w11, w12
+; CHECK-NEXT: smov w8, v0.h[1]
+; CHECK-NEXT: smov w9, v0.h[2]
+; CHECK-NEXT: mov w10, #30865
+; CHECK-NEXT: mov w11, #17097
+; CHECK-NEXT: movk w10, #51306, lsl #16
+; CHECK-NEXT: movk w11, #45590, lsl #16
+; CHECK-NEXT: mov w12, #654
+; CHECK-NEXT: smull x10, w8, w10
+; CHECK-NEXT: smull x11, w9, w11
; CHECK-NEXT: lsr x10, x10, #32
-; CHECK-NEXT: add w10, w10, w11
-; CHECK-NEXT: asr w12, w10, #9
-; CHECK-NEXT: mov w9, #654
-; CHECK-NEXT: add w10, w12, w10, lsr #31
-; CHECK-NEXT: msub w9, w10, w9, w11
-; CHECK-NEXT: mov w10, #47143
-; CHECK-NEXT: smov w12, v0.h[3]
-; CHECK-NEXT: movk w10, #24749, lsl #16
-; CHECK-NEXT: smull x10, w12, w10
-; CHECK-NEXT: lsr x11, x10, #63
-; CHECK-NEXT: asr x10, x10, #43
+; CHECK-NEXT: lsr x11, x11, #32
+; CHECK-NEXT: add w10, w10, w8
+; CHECK-NEXT: add w11, w11, w9
+; CHECK-NEXT: asr w13, w10, #9
+; CHECK-NEXT: add w10, w13, w10, lsr #31
+; CHECK-NEXT: asr w13, w11, #4
+; CHECK-NEXT: add w11, w13, w11, lsr #31
+; CHECK-NEXT: smov w13, v0.h[3]
+; CHECK-NEXT: msub w8, w10, w12, w8
; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: add w10, w10, w11
-; CHECK-NEXT: mov v0.h[1], w9
-; CHECK-NEXT: mov w9, #5423
-; CHECK-NEXT: mov v0.h[2], w8
-; CHECK-NEXT: msub w8, w10, w9, w12
+; CHECK-NEXT: mov w12, #47143
+; CHECK-NEXT: mov w10, #23
+; CHECK-NEXT: movk w12, #24749, lsl #16
+; CHECK-NEXT: msub w9, w11, w10, w9
+; CHECK-NEXT: smull x10, w13, w12
+; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: lsr x8, x10, #63
+; CHECK-NEXT: asr x10, x10, #43
+; CHECK-NEXT: add w8, w10, w8
+; CHECK-NEXT: mov w10, #5423
+; CHECK-NEXT: mov v0.h[2], w9
+; CHECK-NEXT: msub w8, w8, w10, w13
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
@@ -242,38 +242,38 @@
define <4 x i16> @dont_fold_srem_i16_smax(<4 x i16> %x) {
; CHECK-LABEL: dont_fold_srem_i16_smax:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w10, #17097
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w9, v0.h[2]
-; CHECK-NEXT: movk w10, #45590, lsl #16
-; CHECK-NEXT: smull x10, w9, w10
-; CHECK-NEXT: lsr x10, x10, #32
-; CHECK-NEXT: add w10, w10, w9
-; CHECK-NEXT: asr w12, w10, #4
-; CHECK-NEXT: mov w11, #23
-; CHECK-NEXT: add w10, w12, w10, lsr #31
-; CHECK-NEXT: msub w9, w10, w11, w9
-; CHECK-NEXT: mov w10, #47143
-; CHECK-NEXT: smov w12, v0.h[3]
-; CHECK-NEXT: movk w10, #24749, lsl #16
-; CHECK-NEXT: smull x10, w12, w10
-; CHECK-NEXT: lsr x11, x10, #63
-; CHECK-NEXT: asr x10, x10, #43
-; CHECK-NEXT: smov w8, v0.h[1]
-; CHECK-NEXT: add w10, w10, w11
+; CHECK-NEXT: smov w8, v0.h[2]
+; CHECK-NEXT: mov w9, #17097
+; CHECK-NEXT: smov w10, v0.h[1]
+; CHECK-NEXT: movk w9, #45590, lsl #16
; CHECK-NEXT: mov w11, #32767
-; CHECK-NEXT: add w11, w8, w11
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w11, w11, w8, lt
+; CHECK-NEXT: smov w12, v0.h[3]
+; CHECK-NEXT: movi d1, #0000000000000000
+; CHECK-NEXT: smull x9, w8, w9
+; CHECK-NEXT: add w11, w10, w11
+; CHECK-NEXT: cmp w10, #0
+; CHECK-NEXT: lsr x9, x9, #32
+; CHECK-NEXT: csel w11, w11, w10, lt
+; CHECK-NEXT: add w9, w9, w8
; CHECK-NEXT: and w11, w11, #0xffff8000
-; CHECK-NEXT: sub w8, w8, w11
-; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: mov v0.h[1], w8
-; CHECK-NEXT: mov w8, #5423
-; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: msub w8, w10, w8, w12
-; CHECK-NEXT: mov v0.h[3], w8
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: asr w13, w9, #4
+; CHECK-NEXT: sub w10, w10, w11
+; CHECK-NEXT: mov w11, #47143
+; CHECK-NEXT: add w9, w13, w9, lsr #31
+; CHECK-NEXT: mov w13, #23
+; CHECK-NEXT: movk w11, #24749, lsl #16
+; CHECK-NEXT: mov v1.h[1], w10
+; CHECK-NEXT: msub w8, w9, w13, w8
+; CHECK-NEXT: smull x9, w12, w11
+; CHECK-NEXT: lsr x10, x9, #63
+; CHECK-NEXT: asr x9, x9, #43
+; CHECK-NEXT: add w9, w9, w10
+; CHECK-NEXT: mov w10, #5423
+; CHECK-NEXT: mov v1.h[2], w8
+; CHECK-NEXT: msub w8, w9, w10, w12
+; CHECK-NEXT: mov v1.h[3], w8
+; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ret
%1 = srem <4 x i16> %x,
ret <4 x i16> %1
@@ -283,41 +283,41 @@
define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
; CHECK-LABEL: dont_fold_srem_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, #6055
-; CHECK-NEXT: movk x9, #58853, lsl #16
-; CHECK-NEXT: movk x9, #47142, lsl #32
-; CHECK-NEXT: mov x8, v1.d[1]
-; CHECK-NEXT: movk x9, #24749, lsl #48
-; CHECK-NEXT: smulh x9, x8, x9
-; CHECK-NEXT: asr x12, x9, #11
-; CHECK-NEXT: mov w10, #5423
-; CHECK-NEXT: add x9, x12, x9, lsr #63
-; CHECK-NEXT: msub x8, x9, x10, x8
-; CHECK-NEXT: mov x9, #21445
-; CHECK-NEXT: movk x9, #1603, lsl #16
-; CHECK-NEXT: movk x9, #15432, lsl #32
-; CHECK-NEXT: mov x12, v0.d[1]
-; CHECK-NEXT: movk x9, #25653, lsl #48
-; CHECK-NEXT: smulh x9, x12, x9
-; CHECK-NEXT: asr x10, x9, #8
-; CHECK-NEXT: add x9, x10, x9, lsr #63
-; CHECK-NEXT: mov w10, #654
-; CHECK-NEXT: msub x9, x9, x10, x12
-; CHECK-NEXT: mov x10, #8549
-; CHECK-NEXT: movk x10, #22795, lsl #16
-; CHECK-NEXT: movk x10, #17096, lsl #32
-; CHECK-NEXT: fmov x11, d1
-; CHECK-NEXT: movk x10, #45590, lsl #48
-; CHECK-NEXT: smulh x10, x11, x10
-; CHECK-NEXT: add x10, x10, x11
-; CHECK-NEXT: asr x12, x10, #4
-; CHECK-NEXT: add x10, x12, x10, lsr #63
-; CHECK-NEXT: mov w12, #23
-; CHECK-NEXT: msub x10, x10, x12, x11
+; CHECK-NEXT: mov x8, #8549
+; CHECK-NEXT: fmov x9, d1
+; CHECK-NEXT: movk x8, #22795, lsl #16
+; CHECK-NEXT: mov x12, #6055
+; CHECK-NEXT: movk x8, #17096, lsl #32
+; CHECK-NEXT: movk x12, #58853, lsl #16
+; CHECK-NEXT: movk x8, #45590, lsl #48
+; CHECK-NEXT: mov x14, #21445
+; CHECK-NEXT: mov x10, v1.d[1]
+; CHECK-NEXT: movk x12, #47142, lsl #32
+; CHECK-NEXT: smulh x8, x9, x8
+; CHECK-NEXT: movk x14, #1603, lsl #16
+; CHECK-NEXT: mov x11, v0.d[1]
+; CHECK-NEXT: movk x12, #24749, lsl #48
+; CHECK-NEXT: add x8, x8, x9
+; CHECK-NEXT: movk x14, #15432, lsl #32
+; CHECK-NEXT: asr x13, x8, #4
+; CHECK-NEXT: movk x14, #25653, lsl #48
+; CHECK-NEXT: add x8, x13, x8, lsr #63
+; CHECK-NEXT: mov w13, #23
+; CHECK-NEXT: smulh x12, x10, x12
+; CHECK-NEXT: smulh x14, x11, x14
+; CHECK-NEXT: msub x8, x8, x13, x9
+; CHECK-NEXT: asr x13, x12, #11
+; CHECK-NEXT: add x12, x13, x12, lsr #63
+; CHECK-NEXT: asr x13, x14, #8
+; CHECK-NEXT: mov w9, #5423
+; CHECK-NEXT: add x13, x13, x14, lsr #63
+; CHECK-NEXT: mov w14, #654
+; CHECK-NEXT: msub x9, x12, x9, x10
+; CHECK-NEXT: fmov d1, x8
+; CHECK-NEXT: msub x10, x13, x14, x11
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: fmov d1, x10
-; CHECK-NEXT: mov v1.d[1], x8
-; CHECK-NEXT: mov v0.d[1], x9
+; CHECK-NEXT: mov v1.d[1], x9
+; CHECK-NEXT: mov v0.d[1], x10
; CHECK-NEXT: ret
%1 = srem <4 x i64> %x,
ret <4 x i64> %1
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat.ll b/llvm/test/CodeGen/AArch64/ssub_sat.ll
--- a/llvm/test/CodeGen/AArch64/ssub_sat.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat.ll
@@ -40,8 +40,8 @@
; CHECK-NEXT: sub w8, w8, w1, sxth
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: csel w8, w8, w9, lt
-; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768
; CHECK-NEXT: mov w9, #-32768
+; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768
; CHECK-NEXT: csel w0, w8, w9, gt
; CHECK-NEXT: ret
%tmp = call i16 @llvm.ssub.sat.i16(i16 %x, i16 %y);
@@ -52,12 +52,12 @@
; CHECK-LABEL: func8:
; CHECK: // %bb.0:
; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxtb
; CHECK-NEXT: mov w9, #127
+; CHECK-NEXT: sub w8, w8, w1, sxtb
; CHECK-NEXT: cmp w8, #127
; CHECK-NEXT: csel w8, w8, w9, lt
-; CHECK-NEXT: cmn w8, #128
; CHECK-NEXT: mov w9, #-128
+; CHECK-NEXT: cmn w8, #128
; CHECK-NEXT: csel w0, w8, w9, gt
; CHECK-NEXT: ret
%tmp = call i8 @llvm.ssub.sat.i8(i8 %x, i8 %y);
@@ -70,11 +70,11 @@
; CHECK-NEXT: lsl w8, w1, #28
; CHECK-NEXT: sbfx w9, w0, #0, #4
; CHECK-NEXT:
sub w8, w9, w8, asr #28 -; CHECK-NEXT: mov w10, #7 +; CHECK-NEXT: mov w9, #7 ; CHECK-NEXT: cmp w8, #7 -; CHECK-NEXT: csel w8, w8, w10, lt -; CHECK-NEXT: cmn w8, #8 +; CHECK-NEXT: csel w8, w8, w9, lt ; CHECK-NEXT: mov w9, #-8 +; CHECK-NEXT: cmn w8, #8 ; CHECK-NEXT: csel w0, w8, w9, gt ; CHECK-NEXT: ret %tmp = call i4 @llvm.ssub.sat.i4(i4 %x, i4 %y); diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll b/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll --- a/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll @@ -37,14 +37,14 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; CHECK-LABEL: func16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 ; CHECK-NEXT: mul w9, w1, w2 -; CHECK-NEXT: mov w10, #32767 -; CHECK-NEXT: sub w8, w8, w9, sxth -; CHECK-NEXT: cmp w8, w10 -; CHECK-NEXT: csel w8, w8, w10, lt -; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768 +; CHECK-NEXT: sxth w10, w0 +; CHECK-NEXT: mov w8, #32767 +; CHECK-NEXT: sub w9, w10, w9, sxth +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: csel w8, w9, w8, lt ; CHECK-NEXT: mov w9, #-32768 +; CHECK-NEXT: cmn w8, #8, lsl #12 // =32768 ; CHECK-NEXT: csel w0, w8, w9, gt ; CHECK-NEXT: ret %a = mul i16 %y, %z @@ -55,14 +55,14 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind { ; CHECK-LABEL: func8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 ; CHECK-NEXT: mul w9, w1, w2 -; CHECK-NEXT: sub w8, w8, w9, sxtb -; CHECK-NEXT: mov w10, #127 -; CHECK-NEXT: cmp w8, #127 -; CHECK-NEXT: csel w8, w8, w10, lt -; CHECK-NEXT: cmn w8, #128 +; CHECK-NEXT: sxtb w10, w0 +; CHECK-NEXT: mov w8, #127 +; CHECK-NEXT: sub w9, w10, w9, sxtb +; CHECK-NEXT: cmp w9, #127 +; CHECK-NEXT: csel w8, w9, w8, lt ; CHECK-NEXT: mov w9, #-128 +; CHECK-NEXT: cmn w8, #128 ; CHECK-NEXT: csel w0, w8, w9, gt ; CHECK-NEXT: ret %a = mul i8 %y, %z @@ -74,14 +74,14 @@ ; CHECK-LABEL: func4: ; CHECK: // %bb.0: ; CHECK-NEXT: mul w9, w1, w2 -; CHECK-NEXT: sbfx w8, w0, #0, #4 +; CHECK-NEXT: sbfx w10, w0, #0, #4 +; CHECK-NEXT: mov w8, #7 ; CHECK-NEXT: lsl w9, w9, #28 -; CHECK-NEXT: sub w8, w8, w9, asr #28 -; CHECK-NEXT: mov w10, #7 -; CHECK-NEXT: cmp w8, #7 -; CHECK-NEXT: csel w8, w8, w10, lt -; CHECK-NEXT: cmn w8, #8 +; CHECK-NEXT: sub w9, w10, w9, asr #28 +; CHECK-NEXT: cmp w9, #7 +; CHECK-NEXT: csel w8, w9, w8, lt ; CHECK-NEXT: mov w9, #-8 +; CHECK-NEXT: cmn w8, #8 ; CHECK-NEXT: csel w0, w8, w9, gt ; CHECK-NEXT: ret %a = mul i4 %y, %z diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -55,9 +55,9 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; CHECK-LABEL: v64i8: ; CHECK: // %bb.0: +; CHECK-NEXT: sqsub v2.16b, v2.16b, v6.16b ; CHECK-NEXT: sqsub v0.16b, v0.16b, v4.16b ; CHECK-NEXT: sqsub v1.16b, v1.16b, v5.16b -; CHECK-NEXT: sqsub v2.16b, v2.16b, v6.16b ; CHECK-NEXT: sqsub v3.16b, v3.16b, v7.16b ; CHECK-NEXT: ret %z = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %x, <64 x i8> %y) @@ -86,9 +86,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; CHECK-LABEL: v32i16: ; CHECK: // %bb.0: +; CHECK-NEXT: sqsub v2.8h, v2.8h, v6.8h ; CHECK-NEXT: sqsub v0.8h, v0.8h, v4.8h ; CHECK-NEXT: sqsub v1.8h, v1.8h, v5.8h -; CHECK-NEXT: sqsub v2.8h, v2.8h, v6.8h ; CHECK-NEXT: sqsub v3.8h, v3.8h, v7.8h ; CHECK-NEXT: ret %z = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %x, <32 x i16> %y) @@ -117,8 +117,8 @@ ; CHECK-NEXT: ldr s1, [x1] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshll v1.8h, 
v1.8b, #0 -; CHECK-NEXT: shl v1.4h, v1.4h, #8 ; CHECK-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-NEXT: shl v1.4h, v1.4h, #8 ; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h ; CHECK-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-NEXT: xtn v0.8b, v0.8h @@ -135,11 +135,11 @@ ; CHECK-LABEL: v2i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1 { v0.b }[0], [x1] +; CHECK-NEXT: add x8, x1, #1 ; CHECK-NEXT: ld1 { v1.b }[0], [x0] -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: add x9, x1, #1 -; CHECK-NEXT: ld1 { v0.b }[4], [x9] -; CHECK-NEXT: ld1 { v1.b }[4], [x8] +; CHECK-NEXT: add x9, x0, #1 +; CHECK-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-NEXT: ld1 { v1.b }[4], [x9] ; CHECK-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NEXT: shl v1.2s, v1.2s, #24 ; CHECK-NEXT: sqsub v0.2s, v1.2s, v0.2s @@ -175,11 +175,11 @@ ; CHECK-LABEL: v2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1 { v0.h }[0], [x1] +; CHECK-NEXT: add x8, x1, #2 ; CHECK-NEXT: ld1 { v1.h }[0], [x0] -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: add x9, x1, #2 -; CHECK-NEXT: ld1 { v0.h }[2], [x9] -; CHECK-NEXT: ld1 { v1.h }[2], [x8] +; CHECK-NEXT: add x9, x0, #2 +; CHECK-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-NEXT: sqsub v0.2s, v1.2s, v0.2s @@ -208,10 +208,10 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind { ; CHECK-LABEL: v12i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: sqsub v1.8h, v1.8h, v2.8h -; CHECK-NEXT: sqsub v0.8h, v0.8h, v3.8h +; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: sqsub v0.8h, v1.8h, v0.8h +; CHECK-NEXT: sqsub v1.8h, v2.8h, v3.8h ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: str d1, [x2, #16] ; CHECK-NEXT: ret @@ -255,10 +255,10 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; CHECK-LABEL: v16i4: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v0.16b, v0.16b, #4 ; CHECK-NEXT: shl v1.16b, v1.16b, #4 -; CHECK-NEXT: sshr v0.16b, v0.16b, #4 +; CHECK-NEXT: shl v0.16b, v0.16b, #4 ; CHECK-NEXT: sshr v1.16b, v1.16b, #4 +; CHECK-NEXT: sshr v0.16b, v0.16b, #4 ; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v0.16b, v0.16b, #4 ; CHECK-NEXT: sqsub v0.16b, v0.16b, v1.16b @@ -310,9 +310,9 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; CHECK-LABEL: v16i32: ; CHECK: // %bb.0: +; CHECK-NEXT: sqsub v2.4s, v2.4s, v6.4s ; CHECK-NEXT: sqsub v0.4s, v0.4s, v4.4s ; CHECK-NEXT: sqsub v1.4s, v1.4s, v5.4s -; CHECK-NEXT: sqsub v2.4s, v2.4s, v6.4s ; CHECK-NEXT: sqsub v3.4s, v3.4s, v7.4s ; CHECK-NEXT: ret %z = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %x, <16 x i32> %y) @@ -341,9 +341,9 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; CHECK-LABEL: v8i64: ; CHECK: // %bb.0: +; CHECK-NEXT: sqsub v2.2d, v2.2d, v6.2d ; CHECK-NEXT: sqsub v0.2d, v0.2d, v4.2d ; CHECK-NEXT: sqsub v1.2d, v1.2d, v5.2d -; CHECK-NEXT: sqsub v2.2d, v2.2d, v6.2d ; CHECK-NEXT: sqsub v3.2d, v3.2d, v7.2d ; CHECK-NEXT: ret %z = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> %x, <8 x i64> %y) @@ -354,23 +354,23 @@ ; CHECK-LABEL: v2i128: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x2, x6 -; CHECK-NEXT: sbcs x11, x3, x7 -; CHECK-NEXT: eor x9, x3, x7 -; CHECK-NEXT: eor x12, x3, x11 -; CHECK-NEXT: tst x9, x12 -; CHECK-NEXT: asr x9, x11, #63 -; CHECK-NEXT: eor x12, x9, #0x8000000000000000 -; CHECK-NEXT: csel x2, x9, x8, lt -; CHECK-NEXT: csel x3, x12, x11, lt +; CHECK-NEXT: eor x10, x3, x7 +; CHECK-NEXT: sbcs x9, x3, x7 +; CHECK-NEXT: eor x11, x3, x9 +; CHECK-NEXT: asr 
x12, x9, #63 +; CHECK-NEXT: tst x10, x11 +; CHECK-NEXT: eor x10, x1, x5 +; CHECK-NEXT: csel x2, x12, x8, lt +; CHECK-NEXT: eor x8, x12, #0x8000000000000000 +; CHECK-NEXT: csel x3, x8, x9, lt ; CHECK-NEXT: subs x8, x0, x4 ; CHECK-NEXT: sbcs x9, x1, x5 -; CHECK-NEXT: eor x10, x1, x5 -; CHECK-NEXT: eor x12, x1, x9 -; CHECK-NEXT: asr x11, x9, #63 -; CHECK-NEXT: tst x10, x12 -; CHECK-NEXT: eor x13, x11, #0x8000000000000000 -; CHECK-NEXT: csel x8, x11, x8, lt -; CHECK-NEXT: csel x1, x13, x9, lt +; CHECK-NEXT: eor x11, x1, x9 +; CHECK-NEXT: asr x12, x9, #63 +; CHECK-NEXT: tst x10, x11 +; CHECK-NEXT: eor x10, x12, #0x8000000000000000 +; CHECK-NEXT: csel x8, x12, x8, lt +; CHECK-NEXT: csel x1, x10, x9, lt ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll b/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll --- a/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/stack-guard-remat-bitcast.ll @@ -28,15 +28,15 @@ ; CHECK-NEXT: Lloh5: ; CHECK-NEXT: ldr x9, [x9] ; CHECK-NEXT: str x8, [sp] -; CHECK-NEXT: stur x9, [x29, #-8] ; CHECK-NEXT: Lloh6: -; CHECK-NEXT: adrp x9, ___stack_chk_guard@GOTPAGE -; CHECK-NEXT: ldur x8, [x29, #-8] +; CHECK-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE +; CHECK-NEXT: stur x9, [x29, #-8] ; CHECK-NEXT: Lloh7: -; CHECK-NEXT: ldr x9, [x9, ___stack_chk_guard@GOTPAGEOFF] +; CHECK-NEXT: ldr x8, [x8, ___stack_chk_guard@GOTPAGEOFF] +; CHECK-NEXT: ldur x9, [x29, #-8] ; CHECK-NEXT: Lloh8: -; CHECK-NEXT: ldr x9, [x9] -; CHECK-NEXT: cmp x9, x8 +; CHECK-NEXT: ldr x8, [x8] +; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: b.ne LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %entry ; CHECK-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/stack-guard-sysreg.ll b/llvm/test/CodeGen/AArch64/stack-guard-sysreg.ll --- a/llvm/test/CodeGen/AArch64/stack-guard-sysreg.ll +++ b/llvm/test/CodeGen/AArch64/stack-guard-sysreg.ll @@ -48,35 +48,35 @@ ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: mrs x8, SP_EL0 +; CHECK-NEXT: lsl x9, x0, #2 ; CHECK-NO-OFFSET: ldr x8, [x8] ; CHECK-POSITIVE-OFFSET: ldr x8, [x8, #8] ; CHECK-NEGATIVE-OFFSET: ldur x8, [x8, #-8] ; CHECK-NPOT-OFFSET: ldur x8, [x8, #1] ; CHECK-NPOT-NEG-OFFSET: ldur x8, [x8, #-1] ; CHECK-257-OFFSET: add x8, x8, #257 -; CHECK-257-OFFSET-NEXT: ldr x8, [x8] ; CHECK-MINUS-257-OFFSET: sub x8, x8, #257 -; CHECK-MINUS-257-OFFSET-NEXT: ldr x8, [x8] -; CHECK-NEXT: lsl x9, x0, #2 ; CHECK-NEXT: add x9, x9, #15 ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-257-OFFSET-NEXT: ldr x8, [x8] +; CHECK-MINUS-257-OFFSET-NEXT: ldr x8, [x8] ; CHECK-NEXT: stur x8, [x29, #-8] ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: sub x0, x8, x9 ; CHECK-NEXT: mov sp, x0 ; CHECK-NEXT: bl baz -; CHECK-NEXT: ldur x8, [x29, #-8] -; CHECK-NEXT: mrs x9, SP_EL0 -; CHECK-NO-OFFSET: ldr x9, [x9] -; CHECK-POSITIVE-OFFSET: ldr x9, [x9, #8] -; CHECK-NEGATIVE-OFFSET: ldur x9, [x9, #-8] -; CHECK-NPOT-OFFSET: ldur x9, [x9, #1] -; CHECK-NPOT-NEG-OFFSET: ldur x9, [x9, #-1] -; CHECK-257-OFFSET: add x9, x9, #257 -; CHECK-257-OFFSET-NEXT: ldr x9, [x9] -; CHECK-MINUS-257-OFFSET: sub x9, x9, #257 -; CHECK-MINUS-257-OFFSET-NEXT: ldr x9, [x9] -; CHECK-NEXT: cmp x9, x8 +; CHECK-NEXT: mrs x8, SP_EL0 +; CHECK-NO-OFFSET: ldr x8, [x8] +; CHECK-POSITIVE-OFFSET: ldr x8, [x8, #8] +; CHECK-NEGATIVE-OFFSET: ldur x8, [x8, #-8] +; CHECK-NPOT-OFFSET: ldur x8, [x8, #1] +; CHECK-NPOT-NEG-OFFSET: ldur x8, [x8, #-1] +; 
CHECK-257-OFFSET: add x8, x8, #257 +; CHECK-257-OFFSET-NEXT: ldr x8, [x8] +; CHECK-MINUS-257-OFFSET: sub x8, x8, #257 +; CHECK-MINUS-257-OFFSET-NEXT: ldr x8, [x8] +; CHECK-NEXT: ldur x9, [x29, #-8] +; CHECK-NEXT: cmp x8, x9 ; CHECK-NEXT: b.ne .LBB0_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: mov sp, x29 diff --git a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll --- a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll +++ b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll @@ -178,13 +178,13 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ldr x8, [sp, #48] +; CHECK-NEXT: mov x18, xzr ; CHECK-NEXT: ldr q0, [sp, #32] +; CHECK-NEXT: ldr x8, [sp, #48] ; CHECK-NEXT: mov w0, #42 ; CHECK-NEXT: mov w1, #17 -; CHECK-NEXT: mov x18, xzr -; CHECK-NEXT: str x8, [sp, #16] ; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: str x8, [sp, #16] ; CHECK-NEXT: bl consume_attributes ; CHECK-NEXT: .Ltmp11: ; CHECK-NEXT: add sp, sp, #32 diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -12,13 +12,13 @@ ; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0] ; CHECK-NEXT: ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1] ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x2] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: fmov s0, #1.00000000 ; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z16.d }, p0, [sp] ; CHECK-NEXT: st1d { z17.d }, p0, [x8, #1, mul vl] ; CHECK-NEXT: st1d { z18.d }, p0, [x8, #2, mul vl] @@ -44,12 +44,11 @@ ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0] ; CHECK-NEXT: ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x8, sp, #16 -; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: fmov s0, #1.00000000 +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: mov w1, #1 ; CHECK-NEXT: mov w2, #2 ; CHECK-NEXT: mov w3, #3 @@ -57,7 +56,8 @@ ; CHECK-NEXT: mov w5, #5 ; CHECK-NEXT: mov w6, #6 ; CHECK-NEXT: mov w7, #7 -; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z16.d }, p0, [x9] ; CHECK-NEXT: st1d { z17.d }, p0, [x8, #1, mul vl] ; CHECK-NEXT: st1d { z18.d }, p0, [x8, #2, mul vl] @@ -83,14 +83,14 @@ ; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-3 ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: ld4d { z2.d, z3.d, z4.d, z5.d }, p0/z, [x0] ; CHECK-NEXT: ld3d { z16.d, z17.d, z18.d }, p0/z, [x1] ; CHECK-NEXT: ld1d { z6.d }, p0/z, [x2] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: fmov s0, #1.00000000 ; CHECK-NEXT: fmov s1, #2.00000000 ; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z16.d }, p0, [sp] ; CHECK-NEXT: st1d { z17.d }, p0, [x8, #1, mul vl] ; CHECK-NEXT: st1d { z18.d }, p0, [x8, #2, mul vl] diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll --- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll @@ -24,8 +24,8 @@ ; CHECK-LABEL: sdiv_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #21846 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: lsr z1.h, z0.h, #15 ; CHECK-NEXT: add z0.h, z0.h, z1.h @@ -38,9 +38,9 @@ ; CHECK-LABEL: sdiv_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #21846 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w8, #21845, lsl #16 ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: lsr z1.s, z0.s, #31 ; CHECK-NEXT: add z0.s, z0.s, z1.s @@ -53,9 +53,9 @@ ; CHECK-LABEL: sdiv_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #6148914691236517205 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movk x8, #21846 ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: lsr z1.d, z0.d, #63 ; CHECK-NEXT: add z0.d, z0.d, z1.d @@ -84,8 +84,8 @@ ; CHECK-LABEL: udiv_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-21845 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: lsr z0.h, z0.h, #1 ; CHECK-NEXT: ret @@ -97,9 +97,9 @@ ; CHECK-LABEL: udiv_i32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: lsr z0.s, z0.s, #1 ; CHECK-NEXT: ret @@ -111,9 +111,9 @@ ; CHECK-LABEL: udiv_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #-6148914691236517206 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movk x8, #43691 ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: lsr z0.d, z0.d, #1 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll --- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll @@ -506,9 +506,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: whilels p1.s, xzr, x8 ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: lastb w8, p1, z0.s +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb w8, p0, z0.s ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %b = extractelement %a, i32 %x diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -17,15 +17,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: sub x9, x9, #2 -; CHECK-NEXT: mov w8, #2 -; CHECK-NEXT: cmp x9, #2 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: cmp x8, #2 ; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: lsl x8, x8, #3 ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -50,15 +50,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: cntw x9 -; CHECK-NEXT: sub x9, x9, #4 -; CHECK-NEXT: mov w8, #4 -; CHECK-NEXT: cmp x9, #4 +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: mov w9, #4 +; CHECK-NEXT: sub x8, x8, #4 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: cmp x8, #4 ; CHECK-NEXT: st1w { z0.s }, p0, [sp] -; CHECK-NEXT: lsl x8, x8, #2 +; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: lsl x8, x8, #2 ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -83,15 +83,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: cnth x9 -; CHECK-NEXT: sub x9, x9, #8 -; CHECK-NEXT: mov w8, #8 -; CHECK-NEXT: cmp x9, #8 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: mov w9, #8 +; CHECK-NEXT: sub x8, x8, #8 ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: cmp x8, #8 ; CHECK-NEXT: st1h { z0.h }, p0, [sp] -; CHECK-NEXT: lsl x8, x8, #1 +; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: lsl x8, x8, #1 ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -116,13 +116,13 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: sub x9, x9, #16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #16 +; CHECK-NEXT: sub x8, x8, #16 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov w8, #16 -; CHECK-NEXT: cmp x9, #16 +; CHECK-NEXT: cmp x8, #16 ; CHECK-NEXT: st1b { z0.b }, p0, [sp] -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 @@ -140,15 +140,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: sub x9, x9, #2 -; CHECK-NEXT: mov w8, #2 -; CHECK-NEXT: cmp x9, #2 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: cmp x8, #2 ; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: lsl x8, x8, #3 ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -163,12 +163,12 @@ ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: cntd x9 +; CHECK-NEXT: mov w10, #4 ; CHECK-NEXT: subs x9, x9, #4 -; CHECK-NEXT: csel x9, xzr, x9, lo ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov w10, #4 -; CHECK-NEXT: cmp x9, #4 +; CHECK-NEXT: csel x9, xzr, x9, lo ; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: cmp x9, #4 ; CHECK-NEXT: csel x9, x9, x10, lo ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x10, x9, lsl #3] diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll @@ -67,20 +67,20 @@ ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: punpkhi p2.h, p1.b ; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpklo p2.h, p2.b -; CHECK-NEXT: punpkhi p3.h, p1.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpkhi p3.h, p1.b ; CHECK-NEXT: punpkhi p4.h, p2.b +; CHECK-NEXT: punpklo p1.h, p1.b ; CHECK-NEXT: punpklo p2.h, p2.b ; CHECK-NEXT: punpkhi p5.h, p3.b +; CHECK-NEXT: uzp1 p4.s, p4.s, p0.s ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpklo p3.h, p3.b ; CHECK-NEXT: uzp1 p2.s, p5.s, p2.s ; CHECK-NEXT: punpkhi p5.h, p1.b ; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: uzp1 p4.s, p4.s, p0.s +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uzp1 p3.s, p5.s, p3.s ; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s ; CHECK-NEXT: uzp1 p1.h, p2.h, p4.h @@ -551,69 +551,69 @@ ; CHECK-LABEL: extract_nxv14i8_nxv28i8_14: ; CHECK: // %bb.0: ; CHECK-NEXT: uunpkhi z0.h, z0.b +; CHECK-NEXT: uunpklo z2.h, z1.b ; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: uunpklo z4.s, z2.h ; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: uunpklo z5.d, z4.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uunpkhi z4.d, z4.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uunpkhi z2.s, z2.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uunpklo z2.h, z1.b +; CHECK-NEXT: uunpkhi z1.h, z1.b ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z3.s, z2.h -; CHECK-NEXT: uunpklo z5.s, z0.h -; CHECK-NEXT: uunpklo z4.d, z3.s -; CHECK-NEXT: uunpklo z5.d, z5.s -; CHECK-NEXT: uzp1 z4.s, z5.s, z4.s +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: uunpkhi z0.s, z0.h -; CHECK-NEXT: uzp1 z0.h, z4.h, z0.h +; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: uzp1 z3.s, z3.s, z5.s +; CHECK-NEXT: uzp1 z0.h, z3.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpkhi z4.s, z0.h -; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: uunpkhi z4.d, z4.s -; CHECK-NEXT: uzp1 z3.s, z3.s, z4.s +; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpkhi z3.d, z3.s +; CHECK-NEXT: uzp1 z3.s, z4.s, z3.s +; CHECK-NEXT: uunpklo z4.d, z2.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h +; CHECK-NEXT: uunpkhi z2.d, z2.s ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpkhi z2.s, z2.h ; CHECK-NEXT: uunpkhi z3.s, z0.h -; CHECK-NEXT: uunpklo z5.d, z2.s -; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uzp1 z3.s, z3.s, z5.s +; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: uzp1 z3.s, z3.s, z4.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z3.h ; CHECK-NEXT: uzp1 z3.b, z0.b, z0.b ; CHECK-NEXT: uunpkhi z3.h, z3.b -; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: uunpkhi z2.d, z2.s -; CHECK-NEXT: uunpkhi z5.d, z5.s +; CHECK-NEXT: 
uunpklo z4.s, z3.h ; CHECK-NEXT: uunpkhi z3.s, z3.h -; CHECK-NEXT: uzp1 z2.s, z2.s, z5.s +; CHECK-NEXT: uunpkhi z4.d, z4.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z4.s +; CHECK-NEXT: uunpklo z4.d, z1.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z3.h +; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: uzp1 z2.b, z0.b, z2.b -; CHECK-NEXT: uunpkhi z1.h, z1.b ; CHECK-NEXT: uunpkhi z2.h, z2.b -; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z2.h -; CHECK-NEXT: uunpklo z4.d, z1.s -; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z4.s ; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h ; CHECK-NEXT: uzp1 z2.b, z0.b, z2.b ; CHECK-NEXT: uunpkhi z2.h, z2.b ; CHECK-NEXT: uunpkhi z3.s, z2.h -; CHECK-NEXT: uunpkhi z1.d, z1.s -; CHECK-NEXT: uunpkhi z3.d, z3.s ; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpkhi z3.d, z3.s ; CHECK-NEXT: uzp1 z1.s, z1.s, z3.s ; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h ; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b ; CHECK-NEXT: uunpkhi z1.h, z1.b ; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z2.d, z2.s ; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s -; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll --- a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll @@ -20,9 +20,9 @@ ; CHECK-LABEL: test_copysign_v2f32_v2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-NEXT: fcvt z1.s, p0/m, z1.d ; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to @@ -50,11 +50,11 @@ ; CHECK-LABEL: test_copysign_v4f32_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-NEXT: fcvt z2.s, p0/m, z2.d ; CHECK-NEXT: fcvt z1.s, p0/m, z1.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s ; CHECK-NEXT: and z1.s, z1.s, #0x80000000 -; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to @@ -70,9 +70,9 @@ ; CHECK-LABEL: test_copysign_v2f64_v232: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; CHECK-NEXT: fcvt z1.d, p0/m, z1.s ; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 -; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: ret %tmp0 = fpext %b to @@ -99,15 +99,15 @@ define @test_copysign_v4f64_v4f32( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpkhi z3.d, z2.s ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z3.d, z2.s ; CHECK-NEXT: uunpklo z2.d, z2.s ; CHECK-NEXT: fcvt z3.d, p0/m, z3.s ; CHECK-NEXT: fcvt z2.d, p0/m, z2.s ; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff -; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff ; CHECK-NEXT: and z2.d, z2.d, #0x8000000000000000 ; CHECK-NEXT: and z3.d, z3.d, #0x8000000000000000 +; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff ; CHECK-NEXT: orr z0.d, z2.d, z0.d ; CHECK-NEXT: orr z1.d, z3.d, z1.d ; CHECK-NEXT: ret @@ -120,8 +120,8 @@ define @test_copysign_v4f64_v4f64( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; CHECK-NEXT: and z2.d, z2.d, #0x8000000000000000 +; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; 
CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff ; CHECK-NEXT: and z3.d, z3.d, #0x8000000000000000 ; CHECK-NEXT: orr z0.d, z2.d, z0.d @@ -150,9 +150,9 @@ ; CHECK-LABEL: test_copysign_v4f16_v4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: fcvt z1.h, p0/m, z1.s ; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to @@ -164,11 +164,11 @@ ; CHECK-LABEL: test_copysign_v4f16_v4f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: fcvt z2.h, p0/m, z2.d ; CHECK-NEXT: fcvt z1.h, p0/m, z1.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z2.s ; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to @@ -195,11 +195,11 @@ ; CHECK-LABEL: test_copysign_v8f16_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: fcvt z2.h, p0/m, z2.s ; CHECK-NEXT: fcvt z1.h, p0/m, z1.s ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h ; CHECK-NEXT: and z1.h, z1.h, #0x8000 -; CHECK-NEXT: and z0.h, z0.h, #0x7fff ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: ret %tmp0 = fptrunc %b to diff --git a/llvm/test/CodeGen/AArch64/sve-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-fcvt.ll --- a/llvm/test/CodeGen/AArch64/sve-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcvt.ll @@ -454,9 +454,9 @@ define @scvtf_h_nxv2i1( %a) { ; CHECK-LABEL: scvtf_h_nxv2i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: scvtf z0.h, p0/m, z0.d +; CHECK-NEXT: scvtf z0.h, p1/m, z0.d ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -495,9 +495,9 @@ define @scvtf_h_nxv3i1( %a) { ; CHECK-LABEL: scvtf_h_nxv3i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: scvtf z0.h, p0/m, z0.s +; CHECK-NEXT: scvtf z0.h, p1/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -516,9 +516,9 @@ define @scvtf_h_nxv4i1( %a) { ; CHECK-LABEL: scvtf_h_nxv4i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: scvtf z0.h, p0/m, z0.s +; CHECK-NEXT: scvtf z0.h, p1/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -547,9 +547,9 @@ define @scvtf_h_nxv7i1( %a) { ; CHECK-LABEL: scvtf_h_nxv7i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: scvtf z0.h, p0/m, z0.h +; CHECK-NEXT: scvtf z0.h, p1/m, z0.h ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -568,9 +568,9 @@ define @scvtf_h_nxv8i1( %a) { ; CHECK-LABEL: scvtf_h_nxv8i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: scvtf z0.h, p0/m, z0.h +; CHECK-NEXT: scvtf z0.h, p1/m, z0.h ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -589,9 +589,9 @@ define @scvtf_s_nxv2i1( %a) { ; CHECK-LABEL: scvtf_s_nxv2i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: scvtf z0.s, p0/m, z0.d +; CHECK-NEXT: scvtf z0.s, p1/m, z0.d ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -620,9 +620,9 @@ define @scvtf_s_nxv3i1( %a) { ; CHECK-LABEL: scvtf_s_nxv3i1: ; CHECK: // %bb.0: +; 
CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: scvtf z0.s, p0/m, z0.s +; CHECK-NEXT: scvtf z0.s, p1/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -641,9 +641,9 @@ define @scvtf_s_nxv4i1( %a) { ; CHECK-LABEL: scvtf_s_nxv4i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: scvtf z0.s, p0/m, z0.s +; CHECK-NEXT: scvtf z0.s, p1/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -662,9 +662,9 @@ define @scvtf_d_nxv2i1( %a) { ; CHECK-LABEL: scvtf_d_nxv2i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: scvtf z0.d, p1/m, z0.d ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -695,9 +695,9 @@ define @ucvtf_h_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv2i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d +; CHECK-NEXT: ucvtf z0.h, p1/m, z0.d ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -736,9 +736,9 @@ define @ucvtf_h_nxv3i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv3i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s +; CHECK-NEXT: ucvtf z0.h, p1/m, z0.s ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -767,9 +767,9 @@ define @ucvtf_h_nxv4i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv4i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s +; CHECK-NEXT: ucvtf z0.h, p1/m, z0.s ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -798,9 +798,9 @@ define @ucvtf_h_nxv8i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv8i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h +; CHECK-NEXT: ucvtf z0.h, p1/m, z0.h ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -819,9 +819,9 @@ define @ucvtf_s_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_s_nxv2i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d +; CHECK-NEXT: ucvtf z0.s, p1/m, z0.d ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -850,9 +850,9 @@ define @ucvtf_s_nxv4i1( %a) { ; CHECK-LABEL: ucvtf_s_nxv4i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s +; CHECK-NEXT: ucvtf z0.s, p1/m, z0.s ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -871,9 +871,9 @@ define @ucvtf_d_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_d_nxv2i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d +; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d ; CHECK-NEXT: ret %res = uitofp %a to ret %res diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll @@ -45,9 +45,9 @@ define void @concat_v32i8(<16 x i8>* %a, <16 x i8>* %b, <32 x i8>* %c) #0 { ; CHECK-LABEL: concat_v32i8: -; CHECK: ldr q[[OP1:[0-9]+]], [x0] -; CHECK-NEXT: ldr q[[OP2:[0-9]+]], [x1] +; CHECK: 
ldr q[[OP2:[0-9]+]], [x1] ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].b, vl16 +; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0] ; CHECK-NEXT: splice [[RES:z[0-9]+]].b, [[PG1]], z[[OP1]].b, z[[OP2]].b ; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].b, vl32 ; CHECK-NEXT: st1b { [[RES]].b }, [[PG2]], [x2] @@ -187,9 +187,9 @@ define void @concat_v16i16(<8 x i16>* %a, <8 x i16>* %b, <16 x i16>* %c) #0 { ; CHECK-LABEL: concat_v16i16: -; CHECK: ldr q[[OP1:[0-9]+]], [x0] -; CHECK-NEXT: ldr q[[OP2:[0-9]+]], [x1] +; CHECK: ldr q[[OP2:[0-9]+]], [x1] ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8 +; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0] ; CHECK-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], z[[OP1]].h, z[[OP2]].h ; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h, vl16 ; CHECK-NEXT: st1h { [[RES]].h }, [[PG2]], [x2] @@ -299,9 +299,9 @@ define void @concat_v8i32(<4 x i32>* %a, <4 x i32>* %b, <8 x i32>* %c) #0 { ; CHECK-LABEL: concat_v8i32: -; CHECK: ldr q[[OP1:[0-9]+]], [x0] -; CHECK-NEXT: ldr q[[OP2:[0-9]+]], [x1] +; CHECK: ldr q[[OP2:[0-9]+]], [x1] ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 +; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0] ; CHECK-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], z[[OP1]].s, z[[OP2]].s ; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 ; CHECK-NEXT: st1w { [[RES]].s }, [[PG2]], [x2] @@ -387,9 +387,9 @@ define void @concat_v4i64(<2 x i64>* %a, <2 x i64>* %b, <4 x i64>* %c) #0 { ; CHECK-LABEL: concat_v4i64: -; CHECK: ldr q[[OP1:[0-9]+]], [x0] -; CHECK-NEXT: ldr q[[OP2:[0-9]+]], [x1] +; CHECK: ldr q[[OP2:[0-9]+]], [x1] ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].d, vl2 +; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0] ; CHECK-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], z[[OP1]].d, z[[OP2]].d ; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d, vl4 ; CHECK-NEXT: st1d { [[RES]].d }, [[PG2]], [x2] @@ -478,9 +478,9 @@ define void @concat_v16f16(<8 x half>* %a, <8 x half>* %b, <16 x half>* %c) #0 { ; CHECK-LABEL: concat_v16f16: -; CHECK: ldr q[[OP1:[0-9]+]], [x0] -; CHECK-NEXT: ldr q[[OP2:[0-9]+]], [x1] +; CHECK: ldr q[[OP2:[0-9]+]], [x1] ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8 +; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0] ; CHECK-NEXT: splice [[RES:z[0-9]+]].h, [[PG1]], z[[OP1]].h, z[[OP2]].h ; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h, vl16 ; CHECK-NEXT: st1h { [[RES]].h }, [[PG2]], [x2] @@ -590,9 +590,9 @@ define void @concat_v8f32(<4 x float>* %a, <4 x float>* %b, <8 x float>* %c) #0 { ; CHECK-LABEL: concat_v8f32: -; CHECK: ldr q[[OP1:[0-9]+]], [x0] -; CHECK-NEXT: ldr q[[OP2:[0-9]+]], [x1] +; CHECK: ldr q[[OP2:[0-9]+]], [x1] ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 +; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0] ; CHECK-NEXT: splice [[RES:z[0-9]+]].s, [[PG1]], z[[OP1]].s, z[[OP2]].s ; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 ; CHECK-NEXT: st1w { [[RES]].s }, [[PG2]], [x2] @@ -678,9 +678,9 @@ define void @concat_v4f64(<2 x double>* %a, <2 x double>* %b, <4 x double>* %c) #0 { ; CHECK-LABEL: concat_v4f64: -; CHECK: ldr q[[OP1:[0-9]+]], [x0] -; CHECK-NEXT: ldr q[[OP2:[0-9]+]], [x1] +; CHECK: ldr q[[OP2:[0-9]+]], [x1] ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].d, vl2 +; CHECK-NEXT: ldr q[[OP1:[0-9]+]], [x0] ; CHECK-NEXT: splice [[RES:z[0-9]+]].d, [[PG1]], z[[OP1]].d, z[[OP2]].d ; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d, vl4 ; CHECK-NEXT: st1d { [[RES]].d }, [[PG2]], [x2] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll @@ -67,8 +67,8 @@ define half @extractelement_v64f16(<64 
x half>* %a) #0 { ; CHECK-LABEL: extractelement_v64f16: ; VBITS_GE_1024: ptrue p0.h, vl64 -; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_1024-NEXT: mov w8, #63 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_1024-NEXT: whilels p0.h, xzr, x8 ; VBITS_GE_1024-NEXT: lastb h0, p0, z0.h ; VBITS_GE_1024-NEXT: ret @@ -80,8 +80,8 @@ define half @extractelement_v128f16(<128 x half>* %a) #0 { ; CHECK-LABEL: extractelement_v128f16: ; VBITS_GE_2048: ptrue p0.h, vl128 -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_2048-NEXT: mov w8, #127 +; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_2048-NEXT: whilels p0.h, xzr, x8 ; VBITS_GE_2048-NEXT: lastb h0, p0, z0.h ; VBITS_GE_2048-NEXT: ret @@ -133,8 +133,8 @@ define float @extractelement_v32f32(<32 x float>* %a) #0 { ; CHECK-LABEL: extractelement_v32f32: ; VBITS_GE_1024: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_1024-NEXT: mov w8, #31 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_1024-NEXT: whilels p0.s, xzr, x8 ; VBITS_GE_1024-NEXT: lastb s0, p0, z0.s ; VBITS_GE_1024-NEXT: ret @@ -146,8 +146,8 @@ define float @extractelement_v64f32(<64 x float>* %a) #0 { ; CHECK-LABEL: extractelement_v64f32: ; VBITS_GE_2048: ptrue p0.s, vl64 -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: mov w8, #63 +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: whilels p0.s, xzr, x8 ; VBITS_GE_2048-NEXT: lastb s0, p0, z0.s ; VBITS_GE_2048-NEXT: ret @@ -198,8 +198,8 @@ define double @extractelement_v16f64(<16 x double>* %a) #0 { ; CHECK-LABEL: extractelement_v16f64: ; VBITS_GE_1024: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_1024-NEXT: mov w8, #15 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_1024-NEXT: whilels p0.d, xzr, x8 ; VBITS_GE_1024-NEXT: lastb d0, p0, z0.d ; VBITS_GE_1024-NEXT: ret @@ -211,8 +211,8 @@ define double @extractelement_v32f64(<32 x double>* %a) #0 { ; CHECK-LABEL: extractelement_v32f64: ; VBITS_GE_2048: ptrue p0.d, vl32 -; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_2048-NEXT: mov w8, #31 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_2048-NEXT: whilels p0.d, xzr, x8 ; VBITS_GE_2048-NEXT: lastb d0, p0, z0.d ; VBITS_GE_2048-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll @@ -365,12 +365,12 @@ ; CHECK-LABEL: fcmp_ueq_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: fcmne p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret @@ -412,12 +412,12 @@ ; CHECK-LABEL: fcmp_une_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: eor z0.d, 
z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret @@ -459,12 +459,12 @@ ; CHECK-LABEL: fcmp_ugt_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret @@ -506,12 +506,12 @@ ; CHECK-LABEL: fcmp_ult_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret @@ -553,12 +553,12 @@ ; CHECK-LABEL: fcmp_uge_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret @@ -600,12 +600,12 @@ ; CHECK-LABEL: fcmp_ule_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret @@ -647,12 +647,12 @@ ; CHECK-LABEL: fcmp_ord_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: mov w8, #65535 ; CHECK-NEXT: fcmuo p1.h, p0/z, z0.h, z1.h -; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll @@ -72,18 +72,18 @@ ; VBITS_EQ_256-NEXT: .cfi_offset w30, -8 ; VBITS_EQ_256-NEXT: .cfi_offset w29, -16 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_EQ_256-NEXT: mov x8, sp +; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x8] +; VBITS_EQ_256-NEXT: mov x8, #8 ; VBITS_EQ_256-NEXT: ldp q0, q1, [sp] ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: mov x8, #8 ; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h ; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.h +; VBITS_EQ_256-NEXT: 
uunpklo z1.s, z1.h
+; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.h
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_EQ_256-NEXT: mov sp, x29
; VBITS_EQ_256-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; VBITS_EQ_256-NEXT: ret
@@ -185,17 +185,17 @@
; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:
; VBITS_EQ_256: // %bb.0:
; VBITS_EQ_256-NEXT: ldr q0, [x0]
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: uunpklo z1.s, z0.h
-; VBITS_EQ_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.h
; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.h
-; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1]
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.h
+; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
; VBITS_EQ_256-NEXT: ret
;
; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:
@@ -299,18 +299,18 @@
; VBITS_EQ_256-NEXT: .cfi_offset w30, -8
; VBITS_EQ_256-NEXT: .cfi_offset w29, -16
; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_EQ_256-NEXT: mov x8, sp
+; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_EQ_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: ldp q0, q1, [sp]
; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: mov x8, #4
; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
; VBITS_EQ_256-NEXT: fcvt z0.d, p0/m, z0.s
+; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_EQ_256-NEXT: fcvt z1.d, p0/m, z1.s
; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]
; VBITS_EQ_256-NEXT: mov sp, x29
; VBITS_EQ_256-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; VBITS_EQ_256-NEXT: ret
@@ -411,12 +411,12 @@
; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_EQ_256-NEXT: ptrue p0.s
-; VBITS_EQ_256-NEXT: ptrue p1.h, vl8
; VBITS_EQ_256-NEXT: fcvt z0.h, p0/m, z0.s
; VBITS_EQ_256-NEXT: fcvt z1.h, p0/m, z1.s
; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_EQ_256-NEXT: splice z1.h, p1, z1.h, z0.h
+; VBITS_EQ_256-NEXT: ptrue p0.h, vl8
+; VBITS_EQ_256-NEXT: splice z1.h, p0, z1.h, z0.h
; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1]
; VBITS_EQ_256-NEXT: ret
@@ -427,8 +427,8 @@
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.s
; VBITS_GE_512-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
@@ -444,8 +444,8 @@
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.s
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
+; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
@@ -461,8 +461,8 @@
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
+; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
@@ -557,9 +557,9 @@
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d
; VBITS_GE_1024-NEXT: fcvt z0.h, p0/m, z0.d
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
@@ -575,9 +575,9 @@
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d
; VBITS_GE_2048-NEXT: fcvt z0.h, p0/m, z0.d
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
@@ -635,12 +635,12 @@
; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_EQ_256-NEXT: ptrue p0.d
-; VBITS_EQ_256-NEXT: ptrue p1.s, vl4
; VBITS_EQ_256-NEXT: fcvt z0.s, p0/m, z0.d
; VBITS_EQ_256-NEXT: fcvt z1.s, p0/m, z1.d
; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT: splice z1.s, p1, z1.s, z0.s
+; VBITS_EQ_256-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_256-NEXT: splice z1.s, p0, z1.s, z0.s
; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]
; VBITS_EQ_256-NEXT: ret
@@ -651,8 +651,8 @@
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d
; VBITS_GE_512-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
@@ -668,8 +668,8 @@
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ptrue p0.d
; VBITS_GE_1024-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
+; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
@@ -685,8 +685,8 @@
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d
; VBITS_GE_2048-NEXT: fcvt z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
+; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
@@ -49,13 +49,13 @@
define void @select_v16f16(<16 x half>* %a, <16 x half>* %b, i1 %mask) #0 {
; CHECK-LABEL: select_v16f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: mov z2.h, w8
; CHECK-NEXT: and z2.h, z2.h, #0x1
-; CHECK-NEXT: ptrue p1.h
; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0
; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
@@ -70,13 +70,13 @@
define void @select_v32f16(<32 x half>* %a, <32 x half>* %b, i1 %mask) #0 {
; VBITS_GE_512-LABEL: select_v32f16:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: and w8, w2, #0x1
+; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ptrue p1.h
; VBITS_GE_512-NEXT: mov z2.h, w8
; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1
-; VBITS_GE_512-NEXT: ptrue p1.h
; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
@@ -91,13 +91,13 @@
define void @select_v64f16(<64 x half>* %a, <64 x half>* %b, i1 %mask) #0 {
; VBITS_GE_1024-LABEL: select_v64f16:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: and w8, w2, #0x1
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_1024-NEXT: ptrue p1.h
; VBITS_GE_1024-NEXT: mov z2.h, w8
; VBITS_GE_1024-NEXT: and z2.h, z2.h, #0x1
-; VBITS_GE_1024-NEXT: ptrue p1.h
; VBITS_GE_1024-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_1024-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
@@ -112,13 +112,13 @@
define void @select_v128f16(<128 x half>* %a, <128 x half>* %b, i1 %mask) #0 {
; VBITS_GE_2048-LABEL: select_v128f16:
; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: and w8, w2, #0x1
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: ptrue p1.h
; VBITS_GE_2048-NEXT: mov z2.h, w8
; VBITS_GE_2048-NEXT: and z2.h, z2.h, #0x1
-; VBITS_GE_2048-NEXT: ptrue p1.h
; VBITS_GE_2048-NEXT: cmpne p1.h, p1/z, z2.h, #0
; VBITS_GE_2048-NEXT: sel z0.h, p1, z0.h, z1.h
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
@@ -159,13 +159,13 @@
define void @select_v8f32(<8 x float>* %a, <8 x float>* %b, i1 %mask) #0 {
; CHECK-LABEL: select_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: mov z2.s, w8
; CHECK-NEXT: and z2.s, z2.s, #0x1
-; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0
; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
@@ -180,13 +180,13 @@
define void @select_v16f32(<16 x float>* %a, <16 x float>* %b, i1 %mask) #0 {
; VBITS_GE_512-LABEL: select_v16f32:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: and w8, w2, #0x1
+; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ptrue p1.s
; VBITS_GE_512-NEXT: mov z2.s, w8
; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1
-; VBITS_GE_512-NEXT: ptrue p1.s
; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
@@ -201,13 +201,13 @@
define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, i1 %mask) #0 {
; VBITS_GE_1024-LABEL: select_v32f32:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: and w8, w2, #0x1
+; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_1024-NEXT: ptrue p1.s
; VBITS_GE_1024-NEXT: mov z2.s, w8
; VBITS_GE_1024-NEXT: and z2.s, z2.s, #0x1
-; VBITS_GE_1024-NEXT: ptrue p1.s
; VBITS_GE_1024-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_1024-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0]
@@ -222,13 +222,13 @@
define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, i1 %mask) #0 {
; VBITS_GE_2048-LABEL: select_v64f32:
; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: and w8, w2, #0x1
+; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: ptrue p1.s
; VBITS_GE_2048-NEXT: mov z2.s, w8
; VBITS_GE_2048-NEXT: and z2.s, z2.s, #0x1
-; VBITS_GE_2048-NEXT: ptrue p1.s
; VBITS_GE_2048-NEXT: cmpne p1.s, p1/z, z2.s, #0
; VBITS_GE_2048-NEXT: sel z0.s, p1, z0.s, z1.s
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
@@ -269,13 +269,13 @@
define void @select_v4f64(<4 x double>* %a, <4 x double>* %b, i1 %mask) #0 {
; CHECK-LABEL: select_v4f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: and w8, w2, #0x1
+; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
+; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: mov z2.d, x8
; CHECK-NEXT: and z2.d, z2.d, #0x1
-; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
@@ -290,13 +290,13 @@
define void @select_v8f64(<8 x double>* %a, <8 x double>* %b, i1 %mask) #0 {
; VBITS_GE_512-LABEL: select_v8f64:
; VBITS_GE_512: // %bb.0:
-; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: and w8, w2, #0x1
+; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_512-NEXT: ptrue p1.d
; VBITS_GE_512-NEXT: mov z2.d, x8
; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1
-; VBITS_GE_512-NEXT: ptrue p1.d
; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
@@ -311,13 +311,13 @@
define void @select_v16f64(<16 x double>* %a, <16 x double>* %b, i1 %mask) #0 {
; VBITS_GE_1024-LABEL: select_v16f64:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: and w8, w2, #0x1
+; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_1024-NEXT: ptrue p1.d
; VBITS_GE_1024-NEXT: mov z2.d, x8
; VBITS_GE_1024-NEXT: and z2.d, z2.d, #0x1
-; VBITS_GE_1024-NEXT: ptrue p1.d
; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_1024-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0]
@@ -332,13 +332,13 @@
define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, i1 %mask) #0 {
; VBITS_GE_2048-LABEL: select_v32f64:
; VBITS_GE_2048: // %bb.0:
-; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: and w8, w2, #0x1
+; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: ptrue p1.d
; VBITS_GE_2048-NEXT: mov z2.d, x8
; VBITS_GE_2048-NEXT: and z2.d, z2.d, #0x1
-; VBITS_GE_2048-NEXT: ptrue p1.d
; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z2.d, #0
; VBITS_GE_2048-NEXT: sel z0.d, p1, z0.d, z1.d
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
@@ -325,13 +325,14 @@
; Don't use SVE for 128-bit vectors.
define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) #0 {
; CHECK-LABEL: fcvtzu_v4f32_v4i16:
-; CHECK: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: mov w10, v0.s[3]
+; CHECK: fcvtzu v1.4s, v0.4s
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: mov w8, v1.s[3]
; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: mov v0.h[3], w10
+; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: ret
%res = fptoui <4 x float> %op1 to <4 x i16>
ret <4 x i16> %res
@@ -356,8 +357,8 @@
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_512-NEXT: fcvtzu [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
+; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
; VBITS_GE_512-NEXT: ret
@@ -388,8 +389,8 @@
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_1024-NEXT: fcvtzu [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
+; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
@@ -404,8 +405,8 @@
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_2048-NEXT: fcvtzu [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl64
+; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
@@ -673,9 +674,9 @@
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_1024-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
+; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
; VBITS_GE_1024-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
@@ -690,9 +691,9 @@
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_2048-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
+; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
@@ -744,8 +745,8 @@
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_512-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].s, vl8
+; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
; VBITS_GE_512-NEXT: ret
@@ -776,8 +777,8 @@
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_1024-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].s, vl16
+; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
@@ -792,8 +793,8 @@
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_2048-NEXT: fcvtzu [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].s, vl32
+; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
@@ -1193,13 +1194,14 @@
; Don't use SVE for 128-bit vectors.
define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) #0 {
; CHECK-LABEL: fcvtzs_v4f32_v4i16:
-; CHECK: fcvtzs v0.4s, v0.4s
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: mov w10, v0.s[3]
+; CHECK: fcvtzs v1.4s, v0.4s
+; CHECK-NEXT: mov w8, v1.s[1]
+; CHECK-NEXT: mov w9, v1.s[2]
+; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.h[1], w8
+; CHECK-NEXT: mov w8, v1.s[3]
; CHECK-NEXT: mov v0.h[2], w9
-; CHECK-NEXT: mov v0.h[3], w10
+; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: ret
%res = fptosi <4 x float> %op1 to <4 x i16>
ret <4 x i16> %res
@@ -1224,8 +1226,8 @@
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_512-NEXT: fcvtzs [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
+; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
; VBITS_GE_512-NEXT: ret
@@ -1256,8 +1258,8 @@
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_1024-NEXT: fcvtzs [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].s
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
+; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <32 x float>, <32 x float>* %a
@@ -1272,8 +1274,8 @@
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_2048-NEXT: fcvtzs [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].s
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl64
+; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <64 x float>, <64 x float>* %a
@@ -1541,9 +1543,9 @@
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_1024-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
+; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
; VBITS_GE_1024-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
@@ -1558,9 +1560,9 @@
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_2048-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
+; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
-; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
@@ -1612,8 +1614,8 @@
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_512-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].s, vl8
+; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
; VBITS_GE_512-NEXT: ret
@@ -1644,8 +1646,8 @@
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_1024-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].s, vl16
+; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
; VBITS_GE_1024-NEXT: ret
%op1 = load <16 x double>, <16 x double>* %a
@@ -1660,8 +1662,8 @@
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_2048-NEXT: fcvtzs [[CVT:z[0-9]+]].d, [[PG2]]/m, [[OP]].d
-; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].s, vl32
+; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
; VBITS_GE_2048-NEXT: ret
%op1 = load <32 x double>, <32 x double>* %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
@@ -55,43 +55,43 @@
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ldrh w8, [x2]
+; CHECK-NEXT: ldrh w9, [x2]
+; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: sbfx w9, w8, #15, #1
-; CHECK-NEXT: strh w9, [sp, #30]
-; CHECK-NEXT: sbfx w9, w8, #14, #1
-; CHECK-NEXT: strh w9, [sp, #28]
-; CHECK-NEXT: sbfx w9, w8, #13, #1
-; CHECK-NEXT: strh w9, [sp, #26]
-; CHECK-NEXT: sbfx w9, w8, #12, #1
-; CHECK-NEXT: strh w9, [sp, #24]
-; CHECK-NEXT: sbfx w9, w8, #11, #1
-; CHECK-NEXT: strh w9, [sp, #22]
-; CHECK-NEXT: sbfx w9, w8, #10, #1
-; CHECK-NEXT: strh w9, [sp, #20]
-; CHECK-NEXT: sbfx w9, w8, #9, #1
-; CHECK-NEXT: strh w9, [sp, #18]
-; CHECK-NEXT: sbfx w9, w8, #8, #1
-; CHECK-NEXT: strh w9, [sp, #16]
-; CHECK-NEXT: sbfx w9, w8, #7, #1
-; CHECK-NEXT: strh w9, [sp, #14]
-; CHECK-NEXT: sbfx w9, w8, #6, #1
-; CHECK-NEXT: strh w9, [sp, #12]
-; CHECK-NEXT: sbfx w9, w8, #5, #1
-; CHECK-NEXT: strh w9, [sp, #10]
-; CHECK-NEXT: sbfx w9, w8, #4, #1
-; CHECK-NEXT: strh w9, [sp, #8]
-; CHECK-NEXT: sbfx w9, w8, #3, #1
-; CHECK-NEXT: strh w9, [sp, #6]
-; CHECK-NEXT: sbfx w9, w8, #2, #1
-; CHECK-NEXT: strh w9, [sp, #4]
-; CHECK-NEXT: sbfx w9, w8, #1, #1
-; CHECK-NEXT: sbfx w8, w8, #0, #1
-; CHECK-NEXT: strh w9, [sp, #2]
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: strh w8, [sp]
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x9]
+; CHECK-NEXT: sbfx w10, w9, #15, #1
+; CHECK-NEXT: sbfx w11, w9, #14, #1
+; CHECK-NEXT: sbfx w12, w9, #13, #1
+; CHECK-NEXT: strh w10, [sp, #30]
+; CHECK-NEXT: sbfx w10, w9, #12, #1
+; CHECK-NEXT: strh w11, [sp, #28]
+; CHECK-NEXT: sbfx w11, w9, #11, #1
+; CHECK-NEXT: strh w12, [sp, #26]
+; CHECK-NEXT: sbfx w12, w9, #10, #1
+; CHECK-NEXT: strh w10, [sp, #24]
+; CHECK-NEXT: sbfx w10, w9, #9, #1
+; CHECK-NEXT: strh w11, [sp, #22]
+; CHECK-NEXT: sbfx w11, w9, #8, #1
+; CHECK-NEXT: strh w12, [sp, #20]
+; CHECK-NEXT: sbfx w12, w9, #7, #1
+; CHECK-NEXT: strh w10, [sp, #18]
+; CHECK-NEXT: sbfx w10, w9, #6, #1
+; CHECK-NEXT: strh w11, [sp, #16]
+; CHECK-NEXT: sbfx w11, w9, #5, #1
+; CHECK-NEXT: strh w12, [sp, #14]
+; CHECK-NEXT: sbfx w12, w9, #4, #1
+; CHECK-NEXT: strh w10, [sp, #12]
+; CHECK-NEXT: sbfx w10, w9, #3, #1
+; CHECK-NEXT: strh w11, [sp, #10]
+; CHECK-NEXT: sbfx w11, w9, #2, #1
+; CHECK-NEXT: strh w12, [sp, #8]
+; CHECK-NEXT: sbfx w12, w9, #1, #1
+; CHECK-NEXT: sbfx w9, w9, #0, #1
+; CHECK-NEXT: strh w10, [sp, #6]
+; CHECK-NEXT: strh w11, [sp, #4]
+; CHECK-NEXT: strh w12, [sp, #2]
+; CHECK-NEXT: strh w9, [sp]
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1]
; CHECK-NEXT: and z0.h, z0.h, #0x1
@@ -119,75 +119,75 @@
; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16
; VBITS_GE_512-NEXT: .cfi_offset w30, -8
; VBITS_GE_512-NEXT: .cfi_offset w29, -16
-; VBITS_GE_512-NEXT: ldr w8, [x2]
+; VBITS_GE_512-NEXT: ldr w9, [x2]
+; VBITS_GE_512-NEXT: mov x8, sp
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ptrue p1.h
-; VBITS_GE_512-NEXT: asr w9, w8, #31
-; VBITS_GE_512-NEXT: strh w9, [sp, #62]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #60]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #29, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #58]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #28, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #56]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #54]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #26, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #52]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #25, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #50]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #48]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #23, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #46]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #22, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #44]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #42]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #20, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #40]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #19, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #38]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #36]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #17, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #34]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #16, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #32]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #30]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #14, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #28]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #13, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #26]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #24]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #11, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #22]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #10, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #20]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #18]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #8, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #16]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #7, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #14]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #12]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #5, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #10]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #4, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #8]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #6]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #2, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #4]
-; VBITS_GE_512-NEXT: sbfx w9, w8, #1, #1
-; VBITS_GE_512-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_512-NEXT: strh w9, [sp, #2]
-; VBITS_GE_512-NEXT: mov x9, sp
-; VBITS_GE_512-NEXT: strh w8, [sp]
-; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x9]
+; VBITS_GE_512-NEXT: asr w10, w9, #31
+; VBITS_GE_512-NEXT: sbfx w11, w9, #30, #1
+; VBITS_GE_512-NEXT: sbfx w12, w9, #29, #1
+; VBITS_GE_512-NEXT: strh w10, [sp, #62]
+; VBITS_GE_512-NEXT: sbfx w10, w9, #28, #1
+; VBITS_GE_512-NEXT: strh w11, [sp, #60]
+; VBITS_GE_512-NEXT: sbfx w11, w9, #27, #1
+; VBITS_GE_512-NEXT: strh w12, [sp, #58]
+; VBITS_GE_512-NEXT: sbfx w12, w9, #26, #1
+; VBITS_GE_512-NEXT: strh w10, [sp, #56]
+; VBITS_GE_512-NEXT: sbfx w10, w9, #25, #1
+; VBITS_GE_512-NEXT: strh w11, [sp, #54]
+; VBITS_GE_512-NEXT: sbfx w11, w9, #24, #1
+; VBITS_GE_512-NEXT: strh w12, [sp, #52]
+; VBITS_GE_512-NEXT: sbfx w12, w9, #23, #1
+; VBITS_GE_512-NEXT: strh w10, [sp, #50]
+; VBITS_GE_512-NEXT: sbfx w10, w9, #22, #1
+; VBITS_GE_512-NEXT: strh w11, [sp, #48]
+; VBITS_GE_512-NEXT: sbfx w11, w9, #21, #1
+; VBITS_GE_512-NEXT: strh w12, [sp, #46]
+; VBITS_GE_512-NEXT: sbfx w12, w9, #20, #1
+; VBITS_GE_512-NEXT: strh w10, [sp, #44]
+; VBITS_GE_512-NEXT: sbfx w10, w9, #19, #1
+; VBITS_GE_512-NEXT: strh w11, [sp, #42]
+; VBITS_GE_512-NEXT: sbfx w11, w9, #18, #1
+; VBITS_GE_512-NEXT: strh w12, [sp, #40]
+; VBITS_GE_512-NEXT: sbfx w12, w9, #17, #1
+; VBITS_GE_512-NEXT: strh w10, [sp, #38]
+; VBITS_GE_512-NEXT: sbfx w10, w9, #16, #1
+; VBITS_GE_512-NEXT: strh w11, [sp, #36]
+; VBITS_GE_512-NEXT: sbfx w11, w9, #15, #1
+; VBITS_GE_512-NEXT: strh w12, [sp, #34]
+; VBITS_GE_512-NEXT: sbfx w12, w9, #14, #1
+; VBITS_GE_512-NEXT: strh w10, [sp, #32]
+; VBITS_GE_512-NEXT: sbfx w10, w9, #13, #1
+; VBITS_GE_512-NEXT: strh w11, [sp, #30]
+; VBITS_GE_512-NEXT: sbfx w11, w9, #12, #1
+; VBITS_GE_512-NEXT: strh w12, [sp, #28]
+; VBITS_GE_512-NEXT: sbfx w12, w9, #11, #1
+; VBITS_GE_512-NEXT: strh w10, [sp, #26]
+; VBITS_GE_512-NEXT: sbfx w10, w9, #10, #1
+; VBITS_GE_512-NEXT: strh w11, [sp, #24]
+; VBITS_GE_512-NEXT: sbfx w11, w9, #9, #1
+; VBITS_GE_512-NEXT: strh w12, [sp, #22]
+; VBITS_GE_512-NEXT: sbfx w12, w9, #8, #1
+; VBITS_GE_512-NEXT: strh w10, [sp, #20]
+; VBITS_GE_512-NEXT: sbfx w10, w9, #7, #1
+; VBITS_GE_512-NEXT: strh w11, [sp, #18]
+; VBITS_GE_512-NEXT: sbfx w11, w9, #6, #1
+; VBITS_GE_512-NEXT: strh w12, [sp, #16]
+; VBITS_GE_512-NEXT: sbfx w12, w9, #5, #1
+; VBITS_GE_512-NEXT: strh w10, [sp, #14]
+; VBITS_GE_512-NEXT: sbfx w10, w9, #4, #1
+; VBITS_GE_512-NEXT: strh w11, [sp, #12]
+; VBITS_GE_512-NEXT: sbfx w11, w9, #3, #1
+; VBITS_GE_512-NEXT: strh w12, [sp, #10]
+; VBITS_GE_512-NEXT: sbfx w12, w9, #2, #1
+; VBITS_GE_512-NEXT: strh w10, [sp, #8]
+; VBITS_GE_512-NEXT: sbfx w10, w9, #1, #1
+; VBITS_GE_512-NEXT: sbfx w9, w9, #0, #1
+; VBITS_GE_512-NEXT: strh w11, [sp, #6]
+; VBITS_GE_512-NEXT: strh w12, [sp, #4]
+; VBITS_GE_512-NEXT: strh w10, [sp, #2]
+; VBITS_GE_512-NEXT: strh w9, [sp]
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x8]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x1
@@ -215,139 +215,139 @@
; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16
; VBITS_GE_1024-NEXT: .cfi_offset w30, -8
; VBITS_GE_1024-NEXT: .cfi_offset w29, -16
-; VBITS_GE_1024-NEXT: ldr x8, [x2]
+; VBITS_GE_1024-NEXT: ldr x9, [x2]
+; VBITS_GE_1024-NEXT: mov x8, sp
; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
; VBITS_GE_1024-NEXT: ptrue p1.h
-; VBITS_GE_1024-NEXT: asr x9, x8, #63
-; VBITS_GE_1024-NEXT: strh w9, [sp, #126]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #62, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #124]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #61, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #122]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #120]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #59, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #118]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #58, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #116]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #114]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #56, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #112]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #55, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #110]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #108]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #53, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #106]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #52, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #104]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #102]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #50, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #100]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #49, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #98]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #96]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #47, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #94]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #46, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #92]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #90]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #44, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #88]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #43, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #86]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #84]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #41, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #82]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #40, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #80]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #78]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #38, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #76]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #37, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #74]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #72]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #35, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #70]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #34, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #68]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #66]
-; VBITS_GE_1024-NEXT: sbfx x9, x8, #32, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #64]
-; VBITS_GE_1024-NEXT: asr w9, w8, #31
-; VBITS_GE_1024-NEXT: strh w9, [sp, #62]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #60]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #29, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #58]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #28, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #56]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #54]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #26, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #52]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #25, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #50]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #48]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #23, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #46]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #22, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #44]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #42]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #20, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #40]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #19, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #38]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #36]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #17, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #34]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #16, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #32]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #30]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #14, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #28]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #13, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #26]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #24]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #11, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #22]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #10, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #20]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #18]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #8, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #16]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #7, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #14]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #12]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #5, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #10]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #4, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #8]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #6]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #2, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #4]
-; VBITS_GE_1024-NEXT: sbfx w9, w8, #1, #1
-; VBITS_GE_1024-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_1024-NEXT: strh w9, [sp, #2]
-; VBITS_GE_1024-NEXT: mov x9, sp
-; VBITS_GE_1024-NEXT: strh w8, [sp]
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x9]
+; VBITS_GE_1024-NEXT: asr x10, x9, #63
+; VBITS_GE_1024-NEXT: sbfx x11, x9, #62, #1
+; VBITS_GE_1024-NEXT: sbfx x12, x9, #61, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #126]
+; VBITS_GE_1024-NEXT: sbfx x10, x9, #60, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #124]
+; VBITS_GE_1024-NEXT: sbfx x11, x9, #59, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #122]
+; VBITS_GE_1024-NEXT: sbfx x12, x9, #58, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #120]
+; VBITS_GE_1024-NEXT: sbfx x10, x9, #57, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #118]
+; VBITS_GE_1024-NEXT: sbfx x11, x9, #56, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #116]
+; VBITS_GE_1024-NEXT: sbfx x12, x9, #55, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #114]
+; VBITS_GE_1024-NEXT: sbfx x10, x9, #54, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #112]
+; VBITS_GE_1024-NEXT: sbfx x11, x9, #53, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #110]
+; VBITS_GE_1024-NEXT: sbfx x12, x9, #52, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #108]
+; VBITS_GE_1024-NEXT: sbfx x10, x9, #51, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #106]
+; VBITS_GE_1024-NEXT: sbfx x11, x9, #50, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #104]
+; VBITS_GE_1024-NEXT: sbfx x12, x9, #49, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #102]
+; VBITS_GE_1024-NEXT: sbfx x10, x9, #48, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #100]
+; VBITS_GE_1024-NEXT: sbfx x11, x9, #47, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #98]
+; VBITS_GE_1024-NEXT: sbfx x12, x9, #46, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #96]
+; VBITS_GE_1024-NEXT: sbfx x10, x9, #45, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #94]
+; VBITS_GE_1024-NEXT: sbfx x11, x9, #44, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #92]
+; VBITS_GE_1024-NEXT: sbfx x12, x9, #43, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #90]
+; VBITS_GE_1024-NEXT: sbfx x10, x9, #42, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #88]
+; VBITS_GE_1024-NEXT: sbfx x11, x9, #41, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #86]
+; VBITS_GE_1024-NEXT: sbfx x12, x9, #40, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #84]
+; VBITS_GE_1024-NEXT: sbfx x10, x9, #39, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #82]
+; VBITS_GE_1024-NEXT: sbfx x11, x9, #38, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #80]
+; VBITS_GE_1024-NEXT: sbfx x12, x9, #37, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #78]
+; VBITS_GE_1024-NEXT: sbfx x10, x9, #36, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #76]
+; VBITS_GE_1024-NEXT: sbfx x11, x9, #35, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #74]
+; VBITS_GE_1024-NEXT: sbfx x12, x9, #34, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #72]
+; VBITS_GE_1024-NEXT: sbfx x10, x9, #33, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #70]
+; VBITS_GE_1024-NEXT: sbfx x11, x9, #32, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #68]
+; VBITS_GE_1024-NEXT: asr w12, w9, #31
+; VBITS_GE_1024-NEXT: strh w10, [sp, #66]
+; VBITS_GE_1024-NEXT: sbfx w10, w9, #30, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #64]
+; VBITS_GE_1024-NEXT: sbfx w11, w9, #29, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #62]
+; VBITS_GE_1024-NEXT: sbfx w12, w9, #28, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #60]
+; VBITS_GE_1024-NEXT: sbfx w10, w9, #27, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #58]
+; VBITS_GE_1024-NEXT: sbfx w11, w9, #26, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #56]
+; VBITS_GE_1024-NEXT: sbfx w12, w9, #25, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #54]
+; VBITS_GE_1024-NEXT: sbfx w10, w9, #24, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #52]
+; VBITS_GE_1024-NEXT: sbfx w11, w9, #23, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #50]
+; VBITS_GE_1024-NEXT: sbfx w12, w9, #22, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #48]
+; VBITS_GE_1024-NEXT: sbfx w10, w9, #21, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #46]
+; VBITS_GE_1024-NEXT: sbfx w11, w9, #20, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #44]
+; VBITS_GE_1024-NEXT: sbfx w12, w9, #19, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #42]
+; VBITS_GE_1024-NEXT: sbfx w10, w9, #18, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #40]
+; VBITS_GE_1024-NEXT: sbfx w11, w9, #17, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #38]
+; VBITS_GE_1024-NEXT: sbfx w12, w9, #16, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #36]
+; VBITS_GE_1024-NEXT: sbfx w10, w9, #15, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #34]
+; VBITS_GE_1024-NEXT: sbfx w11, w9, #14, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #32]
+; VBITS_GE_1024-NEXT: sbfx w12, w9, #13, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #30]
+; VBITS_GE_1024-NEXT: sbfx w10, w9, #12, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #28]
+; VBITS_GE_1024-NEXT: sbfx w11, w9, #11, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #26]
+; VBITS_GE_1024-NEXT: sbfx w12, w9, #10, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #24]
+; VBITS_GE_1024-NEXT: sbfx w10, w9, #9, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #22]
+; VBITS_GE_1024-NEXT: sbfx w11, w9, #8, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #20]
+; VBITS_GE_1024-NEXT: sbfx w12, w9, #7, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #18]
+; VBITS_GE_1024-NEXT: sbfx w10, w9, #6, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #16]
+; VBITS_GE_1024-NEXT: sbfx w11, w9, #5, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #14]
+; VBITS_GE_1024-NEXT: sbfx w12, w9, #4, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #12]
+; VBITS_GE_1024-NEXT: sbfx w10, w9, #3, #1
+; VBITS_GE_1024-NEXT: strh w11, [sp, #10]
+; VBITS_GE_1024-NEXT: sbfx w11, w9, #2, #1
+; VBITS_GE_1024-NEXT: strh w12, [sp, #8]
+; VBITS_GE_1024-NEXT: sbfx w12, w9, #1, #1
+; VBITS_GE_1024-NEXT: sbfx w9, w9, #0, #1
+; VBITS_GE_1024-NEXT: strh w10, [sp, #6]
+; VBITS_GE_1024-NEXT: strh w11, [sp, #4]
+; VBITS_GE_1024-NEXT: strh w12, [sp, #2]
+; VBITS_GE_1024-NEXT: strh w9, [sp]
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x8]
; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { z2.h }, p0/z, [x1]
; VBITS_GE_1024-NEXT: and z0.h, z0.h, #0x1
@@ -375,268 +375,268 @@
; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16
; VBITS_GE_2048-NEXT: .cfi_offset w30, -8
; VBITS_GE_2048-NEXT: .cfi_offset w29, -16
-; VBITS_GE_2048-NEXT: ldr x8, [x2, #8]
+; VBITS_GE_2048-NEXT: ldr x9, [x2, #8]
+; VBITS_GE_2048-NEXT: mov x8, sp
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
; VBITS_GE_2048-NEXT: ptrue p1.h
-; VBITS_GE_2048-NEXT: asr x9, x8, #63
-; VBITS_GE_2048-NEXT: strh w9, [sp, #254]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #62, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #252]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #61, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #250]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #60, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #248]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #59, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #246]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #58, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #244]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #57, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #242]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #56, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #240]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #55, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #238]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #54, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #236]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #53, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #234]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #52, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #232]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #230]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #50, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #228]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #49, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #226]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #48, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #224]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #47, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #222]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #46, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #220]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #45, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #218]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #44, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #216]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #43, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #214]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #42, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #212]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #41, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #210]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #40, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #208]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #206]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #38, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #204]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #37, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #202]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #36, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #200]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #35, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #198]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #34, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #196]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #33, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #194]
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #32, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #192]
-; VBITS_GE_2048-NEXT: asr w9, w8, #31
-; VBITS_GE_2048-NEXT: strh w9, [sp, #190]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #30, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #188]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #29, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #186]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #28, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #184]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #182]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #26, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #180]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #25, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #178]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #24, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #176]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #23, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #174]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #22, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #172]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #21, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #170]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #20, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #168]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #19, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #166]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #18, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #164]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #17, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #162]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #16, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #160]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #158]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #14, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #156]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #13, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #154]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #12, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #152]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #11, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #150]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #10, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #148]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #9, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #146]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #8, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #144]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #7, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #142]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #6, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #140]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #5, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #138]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #4, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #136]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #134]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #2, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #132]
-; VBITS_GE_2048-NEXT: sbfx w9, w8, #1, #1
-; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT: strh w9, [sp, #130]
-; VBITS_GE_2048-NEXT: strh w8, [sp, #128]
-; VBITS_GE_2048-NEXT: ldr x8, [x2]
-; VBITS_GE_2048-NEXT: mov x9, sp
-; VBITS_GE_2048-NEXT: asr x10, x8, #63
+; VBITS_GE_2048-NEXT: asr x10, x9, #63
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #62, #1
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #61, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #254]
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #60, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #252]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #59, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #250]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #58, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #248]
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #57, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #246]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #56, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #244]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #55, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #242]
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #54, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #240]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #53, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #238]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #52, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #236]
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #51, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #234]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #50, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #232]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #49, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #230]
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #48, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #228]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #47, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #226]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #46, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #224]
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #45, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #222]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #44, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #220]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #43, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #218]
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #42, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #216]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #41, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #214]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #40, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #212]
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #39, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #210]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #38, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #208]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #37, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #206]
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #36, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #204]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #35, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #202]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #34, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #200]
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #33, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #198]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #32, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #196]
+; VBITS_GE_2048-NEXT: asr w12, w9, #31
+; VBITS_GE_2048-NEXT: strh w10, [sp, #194]
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #30, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #192]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #29, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #190]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #28, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #188]
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #27, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #186]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #26, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #184]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #25, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #182]
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #24, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #180]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #23, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #178]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #22, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #176]
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #21, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #174]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #20, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #172]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #19, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #170]
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #18, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #168]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #17, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #166]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #16, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #164]
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #15, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #162]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #14, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #160]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #13, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #158]
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #12, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #156]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #11, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #154]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #10, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #152]
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #9, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #150]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #8, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #148]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #7, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #146]
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #6, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #144]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #5, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #142]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #4, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #140]
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #3, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #138]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #2, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #136]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #1, #1
+; VBITS_GE_2048-NEXT: sbfx w9, w9, #0, #1
+; VBITS_GE_2048-NEXT: strh w10, [sp, #134]
+; VBITS_GE_2048-NEXT: strh w11, [sp, #132]
+; VBITS_GE_2048-NEXT: strh w12, [sp, #130]
+; VBITS_GE_2048-NEXT: strh w9, [sp, #128]
+; VBITS_GE_2048-NEXT: ldr x9, [x2]
+; VBITS_GE_2048-NEXT: asr x10, x9, #63
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #62, #1
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #61, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #126]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #62, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #124]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #61, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #122]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #60, #1
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #60, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #124]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #59, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #122]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #58, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #120]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #59, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #118]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #58, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #116]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #57, #1
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #57, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #118]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #56, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #116]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #55, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #114]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #56, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #112]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #55, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #110]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #54, #1
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #54, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #112]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #53, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #110]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #52, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #108]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #53, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #106]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #52, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #104]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #51, #1
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #51, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #106]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #50, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #104]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #49, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #102]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #50, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #100]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #49, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #98]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #48, #1
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #48, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #100]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #47, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #98]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #46, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #96]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #47, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #94]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #46, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #92]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #45, #1
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #45, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #94]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #44, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #92]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #43, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #90]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #44, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #88]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #43, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #86]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #42, #1
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #42, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #88]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #41, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #86]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #40, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #84]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #41, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #82]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #40, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #80]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #39, #1
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #39, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #82]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #38, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #80]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #37, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #78]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #38, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #76]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #37, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #74]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #36, #1
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #36, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #76]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #35, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #74]
+; VBITS_GE_2048-NEXT: sbfx x12, x9, #34, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #72]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #35, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #70]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #34, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #68]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #33, #1
+; VBITS_GE_2048-NEXT: sbfx x10, x9, #33, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #70]
+; VBITS_GE_2048-NEXT: sbfx x11, x9, #32, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #68]
+; VBITS_GE_2048-NEXT: asr w12, w9, #31
; VBITS_GE_2048-NEXT: strh w10, [sp, #66]
-; VBITS_GE_2048-NEXT: sbfx x10, x8, #32, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #64]
-; VBITS_GE_2048-NEXT: asr w10, w8, #31
-; VBITS_GE_2048-NEXT: strh w10, [sp, #62]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #30, #1
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #30, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #64]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #29, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #62]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #28, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #60]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #29, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #58]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #28, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #56]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #27, #1
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #27, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #58]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #26, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #56]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #25, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #54]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #26, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #52]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #25, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #50]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #24, #1
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #24, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #52]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #23, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #50]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #22, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #48]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #23, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #46]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #22, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #44]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #21, #1
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #21, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #46]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #20, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #44]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #19, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #42]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #20, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #40]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #19, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #38]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #18, #1
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #18, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #40]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #17, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #38]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #16, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #36]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #17, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #34]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #16, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #32]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #15, #1
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #15, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #34]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #14, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #32]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #13, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #30]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #14, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #28]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #13, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #26]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #12, #1
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #12, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #28]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #11, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #26]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #10, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #24]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #11, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #22]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #10, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #20]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #9, #1
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #9, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #22]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #8, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #20]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #7, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #18]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #8, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #16]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #7, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #14]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #6, #1
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #6, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #16]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #5, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #14]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #4, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #12]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #5, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #10]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #4, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #8]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #3, #1
+; VBITS_GE_2048-NEXT: sbfx w10, w9, #3, #1
+; VBITS_GE_2048-NEXT: strh w11, [sp, #10]
+; VBITS_GE_2048-NEXT: sbfx w11, w9, #2, #1
+; VBITS_GE_2048-NEXT: strh w12, [sp, #8]
+; VBITS_GE_2048-NEXT: sbfx w12, w9, #1, #1
+; VBITS_GE_2048-NEXT: sbfx w9, w9, #0, #1
; VBITS_GE_2048-NEXT: strh w10, [sp, #6]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #4]
-; VBITS_GE_2048-NEXT: sbfx w10, w8, #1, #1
-; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1
-; VBITS_GE_2048-NEXT: strh w10, [sp, #2]
-; VBITS_GE_2048-NEXT: strh w8, [sp]
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x9]
+; VBITS_GE_2048-NEXT: strh w11, [sp, #4]
+; VBITS_GE_2048-NEXT: strh w12, [sp, #2]
+; VBITS_GE_2048-NEXT: strh w9, [sp]
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x8]
; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { z2.h }, p0/z, [x1]
; VBITS_GE_2048-NEXT: and z0.h, z0.h, #0x1
@@ -689,23 +689,23 @@
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ldrb w8, [x2]
+; CHECK-NEXT: ldrb w9, [x2]
+; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: ptrue p0.s, vl8
-; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: sbfx w10, w8, #7, #1
-; CHECK-NEXT: sbfx w11, w8, #6, #1
+; CHECK-NEXT: sbfx w10, w9, #7, #1
+; CHECK-NEXT: sbfx w11, w9, #6, #1
+; CHECK-NEXT: sbfx w12, w9, #5, #1
+; CHECK-NEXT: sbfx w13, w9, #4, #1
; CHECK-NEXT: stp w11, w10, [sp, #24]
-; CHECK-NEXT: sbfx w10, w8, #3, #1
-; CHECK-NEXT: sbfx w11, w8, #2, #1
-; CHECK-NEXT: sbfx w12, w8, #5, #1
-; CHECK-NEXT: sbfx w13, w8, #4, #1
-; CHECK-NEXT: stp w11, w10, [sp, #8]
-; CHECK-NEXT: sbfx w10, w8, #1, #1
-; CHECK-NEXT: sbfx w8, w8, #0, #1
+; CHECK-NEXT: sbfx w10, w9, #3, #1
+; CHECK-NEXT: sbfx w11, w9, #2, #1
; CHECK-NEXT: stp w13, w12, [sp, #16]
-; CHECK-NEXT: stp w8, w10, [sp]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9]
+; CHECK-NEXT: sbfx w12, w9, #1, #1
+; CHECK-NEXT: sbfx w9, w9, #0, #1
+; CHECK-NEXT: stp w11, w10, [sp, #8]
+; CHECK-NEXT: stp w9, w12, [sp]
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1]
; CHECK-NEXT: and z0.s, z0.s, #0x1
@@ -733,35 +733,35 @@
; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16
; VBITS_GE_512-NEXT: .cfi_offset w30, -8
; VBITS_GE_512-NEXT: .cfi_offset w29, -16
-; VBITS_GE_512-NEXT: ldrh w8, [x2]
+; VBITS_GE_512-NEXT: ldrh w9, [x2]
+; VBITS_GE_512-NEXT: mov x8, sp
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: mov x9, sp
; VBITS_GE_512-NEXT: ptrue p1.s
-; VBITS_GE_512-NEXT: sbfx w10, w8, #15, #1
-; VBITS_GE_512-NEXT: sbfx w11, w8, #14, #1
+; VBITS_GE_512-NEXT: sbfx w10, w9, #15, #1
+; VBITS_GE_512-NEXT: sbfx w11, w9, #14, #1
+; VBITS_GE_512-NEXT: sbfx w12, w9, #13, #1
+; VBITS_GE_512-NEXT: sbfx w13, w9, #12, #1
; VBITS_GE_512-NEXT: stp w11, w10, [sp, #56]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #7, #1
-; VBITS_GE_512-NEXT: sbfx w11, w8, #6, #1
-; VBITS_GE_512-NEXT: sbfx w12, w8, #13, #1
-; VBITS_GE_512-NEXT: sbfx w13, w8, #12, #1
-; VBITS_GE_512-NEXT: stp w11, w10, [sp, #24]
-; VBITS_GE_512-NEXT: sbfx w10, w8, #3, #1 -; VBITS_GE_512-NEXT: sbfx w11, w8, #2, #1 -; VBITS_GE_512-NEXT: sbfx w14, w8, #11, #1 -; VBITS_GE_512-NEXT: sbfx w15, w8, #10, #1 -; VBITS_GE_512-NEXT: sbfx w16, w8, #9, #1 -; VBITS_GE_512-NEXT: sbfx w17, w8, #8, #1 +; VBITS_GE_512-NEXT: sbfx w10, w9, #11, #1 +; VBITS_GE_512-NEXT: sbfx w11, w9, #10, #1 ; VBITS_GE_512-NEXT: stp w13, w12, [sp, #48] -; VBITS_GE_512-NEXT: sbfx w12, w8, #5, #1 -; VBITS_GE_512-NEXT: sbfx w13, w8, #4, #1 -; VBITS_GE_512-NEXT: stp w11, w10, [sp, #8] -; VBITS_GE_512-NEXT: sbfx w10, w8, #1, #1 -; VBITS_GE_512-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_512-NEXT: stp w15, w14, [sp, #40] -; VBITS_GE_512-NEXT: stp w17, w16, [sp, #32] +; VBITS_GE_512-NEXT: sbfx w12, w9, #9, #1 +; VBITS_GE_512-NEXT: sbfx w13, w9, #8, #1 +; VBITS_GE_512-NEXT: stp w11, w10, [sp, #40] +; VBITS_GE_512-NEXT: sbfx w10, w9, #7, #1 +; VBITS_GE_512-NEXT: sbfx w11, w9, #6, #1 +; VBITS_GE_512-NEXT: stp w13, w12, [sp, #32] +; VBITS_GE_512-NEXT: sbfx w12, w9, #5, #1 +; VBITS_GE_512-NEXT: sbfx w13, w9, #4, #1 +; VBITS_GE_512-NEXT: stp w11, w10, [sp, #24] +; VBITS_GE_512-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_512-NEXT: sbfx w11, w9, #2, #1 ; VBITS_GE_512-NEXT: stp w13, w12, [sp, #16] -; VBITS_GE_512-NEXT: stp w8, w10, [sp] -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x9] +; VBITS_GE_512-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_512-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_512-NEXT: stp w11, w10, [sp, #8] +; VBITS_GE_512-NEXT: stp w9, w12, [sp] +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x8] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: and z0.s, z0.s, #0x1 @@ -782,68 +782,66 @@ define void @select_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i1>* %c) #0 { ; VBITS_GE_1024-LABEL: select_v32f32: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; VBITS_GE_1024-NEXT: sub x9, sp, #224 -; VBITS_GE_1024-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; VBITS_GE_1024-NEXT: sub x9, sp, #240 ; VBITS_GE_1024-NEXT: mov x29, sp ; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 -; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 32 -; VBITS_GE_1024-NEXT: .cfi_offset w19, -16 -; VBITS_GE_1024-NEXT: .cfi_offset w30, -24 -; VBITS_GE_1024-NEXT: .cfi_offset w29, -32 -; VBITS_GE_1024-NEXT: ldr w8, [x2] +; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 +; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 +; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 +; VBITS_GE_1024-NEXT: ldr w9, [x2] +; VBITS_GE_1024-NEXT: mov x8, sp ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: mov x9, sp ; VBITS_GE_1024-NEXT: ptrue p1.s -; VBITS_GE_1024-NEXT: asr w10, w8, #31 -; VBITS_GE_1024-NEXT: sbfx w11, w8, #30, #1 +; VBITS_GE_1024-NEXT: asr w10, w9, #31 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #30, #1 +; VBITS_GE_1024-NEXT: sbfx w12, w9, #29, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #28, #1 ; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #120] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #15, #1 -; VBITS_GE_1024-NEXT: sbfx w11, w8, #14, #1 -; VBITS_GE_1024-NEXT: sbfx w12, w8, #29, #1 -; VBITS_GE_1024-NEXT: sbfx w13, w8, #28, #1 -; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #56] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #7, #1 -; VBITS_GE_1024-NEXT: sbfx w11, w8, #6, #1 -; VBITS_GE_1024-NEXT: sbfx w14, w8, #27, #1 -; VBITS_GE_1024-NEXT: sbfx w15, w8, #26, #1 -; VBITS_GE_1024-NEXT: sbfx w16, w8, #25, #1 -; VBITS_GE_1024-NEXT: sbfx w17, w8, #24, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #26, #1 ; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #112] -; VBITS_GE_1024-NEXT: sbfx w12, w8, #13, #1 -; VBITS_GE_1024-NEXT: sbfx w13, w8, #12, #1 -; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #24] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #3, #1 -; VBITS_GE_1024-NEXT: sbfx w11, w8, #2, #1 -; VBITS_GE_1024-NEXT: sbfx w18, w8, #23, #1 -; VBITS_GE_1024-NEXT: sbfx w2, w8, #22, #1 -; VBITS_GE_1024-NEXT: sbfx w3, w8, #21, #1 -; VBITS_GE_1024-NEXT: sbfx w4, w8, #20, #1 -; VBITS_GE_1024-NEXT: sbfx w5, w8, #19, #1 -; VBITS_GE_1024-NEXT: sbfx w6, w8, #18, #1 -; VBITS_GE_1024-NEXT: sbfx w7, w8, #17, #1 -; VBITS_GE_1024-NEXT: sbfx w19, w8, #16, #1 -; VBITS_GE_1024-NEXT: stp w15, w14, [sp, #104] -; VBITS_GE_1024-NEXT: stp w17, w16, [sp, #96] -; VBITS_GE_1024-NEXT: sbfx w14, w8, #11, #1 -; VBITS_GE_1024-NEXT: sbfx w15, w8, #10, #1 -; VBITS_GE_1024-NEXT: sbfx w16, w8, #9, #1 -; VBITS_GE_1024-NEXT: sbfx w17, w8, #8, #1 +; VBITS_GE_1024-NEXT: sbfx w12, w9, #25, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #24, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #104] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #23, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #22, #1 +; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #96] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #21, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #20, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #88] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #19, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #18, #1 +; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #80] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #17, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #16, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #72] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #64] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #12, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #56] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #11, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #10, #1 ; VBITS_GE_1024-NEXT: stp w13, w12, [sp, 
#48] -; VBITS_GE_1024-NEXT: sbfx w12, w8, #5, #1 -; VBITS_GE_1024-NEXT: sbfx w13, w8, #4, #1 -; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #8] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #1, #1 -; VBITS_GE_1024-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_1024-NEXT: stp w2, w18, [sp, #88] -; VBITS_GE_1024-NEXT: stp w4, w3, [sp, #80] -; VBITS_GE_1024-NEXT: stp w6, w5, [sp, #72] -; VBITS_GE_1024-NEXT: stp w19, w7, [sp, #64] -; VBITS_GE_1024-NEXT: stp w15, w14, [sp, #40] -; VBITS_GE_1024-NEXT: stp w17, w16, [sp, #32] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #9, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #8, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #40] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #7, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #6, #1 +; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #32] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #5, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #4, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #24] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #2, #1 ; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #16] -; VBITS_GE_1024-NEXT: stp w8, w10, [sp] -; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x9] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_1024-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #8] +; VBITS_GE_1024-NEXT: stp w9, w12, [sp] +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x8] ; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_1024-NEXT: ld1w { z2.s }, p0/z, [x1] ; VBITS_GE_1024-NEXT: and z0.s, z0.s, #0x1 @@ -851,8 +849,7 @@ ; VBITS_GE_1024-NEXT: sel z0.s, p1, z1.s, z2.s ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_1024-NEXT: mov sp, x29 -; VBITS_GE_1024-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; VBITS_GE_1024-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; VBITS_GE_1024-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; VBITS_GE_1024-NEXT: ret %mask = load <32 x i1>, <32 x i1>* %c %op1 = load <32 x float>, <32 x float>* %a @@ -865,161 +862,114 @@ define void @select_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i1>* %c) #0 { ; VBITS_GE_2048-LABEL: select_v64f32: ; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill -; VBITS_GE_2048-NEXT: sub x9, sp, #672 -; VBITS_GE_2048-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill -; VBITS_GE_2048-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill -; VBITS_GE_2048-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill -; VBITS_GE_2048-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill -; VBITS_GE_2048-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; VBITS_GE_2048-NEXT: sub x9, sp, #496 ; VBITS_GE_2048-NEXT: mov x29, sp ; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 -; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 96 -; VBITS_GE_2048-NEXT: .cfi_offset w19, -8 -; VBITS_GE_2048-NEXT: .cfi_offset w20, -16 -; VBITS_GE_2048-NEXT: .cfi_offset w21, -24 -; VBITS_GE_2048-NEXT: .cfi_offset w22, -32 -; VBITS_GE_2048-NEXT: .cfi_offset w23, -40 -; VBITS_GE_2048-NEXT: .cfi_offset w24, -48 -; VBITS_GE_2048-NEXT: .cfi_offset w25, -56 -; VBITS_GE_2048-NEXT: .cfi_offset w26, -64 -; VBITS_GE_2048-NEXT: .cfi_offset w27, -72 -; VBITS_GE_2048-NEXT: .cfi_offset w28, -80 -; VBITS_GE_2048-NEXT: .cfi_offset w30, -88 -; VBITS_GE_2048-NEXT: .cfi_offset w29, -96 -; VBITS_GE_2048-NEXT: ldr x8, [x2] +; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 +; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 +; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 +; VBITS_GE_2048-NEXT: ldr x9, [x2] +; VBITS_GE_2048-NEXT: mov x8, sp ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 -; VBITS_GE_2048-NEXT: add x9, sp, #256 ; VBITS_GE_2048-NEXT: ptrue p1.s -; VBITS_GE_2048-NEXT: asr x10, x8, #63 -; VBITS_GE_2048-NEXT: str w10, [sp, #508] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #37, #1 -; VBITS_GE_2048-NEXT: sbfx x11, x8, #62, #1 -; VBITS_GE_2048-NEXT: str w10, [sp, #404] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #18, #1 -; VBITS_GE_2048-NEXT: sbfx x12, x8, #61, #1 -; VBITS_GE_2048-NEXT: sbfx x13, x8, #60, #1 -; VBITS_GE_2048-NEXT: sbfx x14, x8, #59, #1 -; VBITS_GE_2048-NEXT: str w11, [sp, #504] -; VBITS_GE_2048-NEXT: sbfx x11, x8, #36, #1 -; VBITS_GE_2048-NEXT: str w10, [sp, #328] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #9, #1 -; VBITS_GE_2048-NEXT: sbfx x15, x8, #58, #1 -; VBITS_GE_2048-NEXT: sbfx x16, x8, #57, #1 -; VBITS_GE_2048-NEXT: sbfx x17, x8, #56, #1 -; VBITS_GE_2048-NEXT: sbfx x18, x8, #55, #1 -; VBITS_GE_2048-NEXT: str w12, [sp, #500] -; VBITS_GE_2048-NEXT: sbfx x12, x8, #35, #1 -; VBITS_GE_2048-NEXT: str w13, [sp, #496] -; VBITS_GE_2048-NEXT: sbfx x13, x8, #34, #1 -; VBITS_GE_2048-NEXT: str w14, [sp, #492] -; VBITS_GE_2048-NEXT: sbfx x14, x8, #33, #1 -; VBITS_GE_2048-NEXT: str w11, [sp, #400] -; VBITS_GE_2048-NEXT: sbfx w11, w8, #17, #1 -; VBITS_GE_2048-NEXT: str w10, [sp, #292] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #4, #1 -; VBITS_GE_2048-NEXT: sbfx x2, x8, #54, #1 -; VBITS_GE_2048-NEXT: sbfx x3, x8, #53, #1 -; VBITS_GE_2048-NEXT: sbfx x4, x8, #52, #1 -; VBITS_GE_2048-NEXT: sbfx x5, x8, #51, #1 -; VBITS_GE_2048-NEXT: sbfx x6, x8, #50, #1 -; VBITS_GE_2048-NEXT: sbfx x7, x8, #49, #1 -; VBITS_GE_2048-NEXT: sbfx x19, x8, #48, #1 -; VBITS_GE_2048-NEXT: sbfx x20, x8, #47, #1 -; VBITS_GE_2048-NEXT: sbfx x21, x8, #46, #1 -; VBITS_GE_2048-NEXT: sbfx x22, x8, #45, #1 -; VBITS_GE_2048-NEXT: str w15, [sp, #488] -; VBITS_GE_2048-NEXT: sbfx x15, x8, #32, #1 -; VBITS_GE_2048-NEXT: str w16, [sp, #484] -; VBITS_GE_2048-NEXT: asr w16, w8, #31 -; VBITS_GE_2048-NEXT: str w17, [sp, #480] -; VBITS_GE_2048-NEXT: sbfx w17, w8, #30, #1 -; VBITS_GE_2048-NEXT: str w18, [sp, #476] -; VBITS_GE_2048-NEXT: sbfx w18, w8, #29, #1 -; VBITS_GE_2048-NEXT: str w12, [sp, #396] -; VBITS_GE_2048-NEXT: str w13, [sp, #392] -; VBITS_GE_2048-NEXT: str w14, [sp, #388] -; VBITS_GE_2048-NEXT: sbfx w12, w8, #16, #1 -; VBITS_GE_2048-NEXT: sbfx w13, w8, #15, #1 -; VBITS_GE_2048-NEXT: sbfx w14, w8, #14, #1 -; VBITS_GE_2048-NEXT: str w11, [sp, #324] -; VBITS_GE_2048-NEXT: sbfx w11, w8, #8, #1 -; VBITS_GE_2048-NEXT: str w10, [sp, #272] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1 -; VBITS_GE_2048-NEXT: sbfx x23, x8, #44, #1 -; 
VBITS_GE_2048-NEXT: sbfx x24, x8, #43, #1 -; VBITS_GE_2048-NEXT: sbfx x25, x8, #42, #1 -; VBITS_GE_2048-NEXT: sbfx x26, x8, #41, #1 -; VBITS_GE_2048-NEXT: sbfx x27, x8, #40, #1 -; VBITS_GE_2048-NEXT: sbfx x28, x8, #39, #1 -; VBITS_GE_2048-NEXT: sbfx x30, x8, #38, #1 -; VBITS_GE_2048-NEXT: str w2, [sp, #472] -; VBITS_GE_2048-NEXT: sbfx w2, w8, #28, #1 -; VBITS_GE_2048-NEXT: str w3, [sp, #468] -; VBITS_GE_2048-NEXT: sbfx w3, w8, #27, #1 -; VBITS_GE_2048-NEXT: str w4, [sp, #464] -; VBITS_GE_2048-NEXT: sbfx w4, w8, #26, #1 -; VBITS_GE_2048-NEXT: str w5, [sp, #460] -; VBITS_GE_2048-NEXT: str w6, [sp, #456] -; VBITS_GE_2048-NEXT: sbfx w5, w8, #25, #1 -; VBITS_GE_2048-NEXT: str w7, [sp, #452] -; VBITS_GE_2048-NEXT: str w19, [sp, #448] -; VBITS_GE_2048-NEXT: sbfx w6, w8, #24, #1 -; VBITS_GE_2048-NEXT: str w20, [sp, #444] -; VBITS_GE_2048-NEXT: str w21, [sp, #440] -; VBITS_GE_2048-NEXT: sbfx w7, w8, #23, #1 -; VBITS_GE_2048-NEXT: str w22, [sp, #436] -; VBITS_GE_2048-NEXT: sbfx w19, w8, #22, #1 -; VBITS_GE_2048-NEXT: sbfx w20, w8, #21, #1 -; VBITS_GE_2048-NEXT: sbfx w21, w8, #20, #1 -; VBITS_GE_2048-NEXT: sbfx w22, w8, #19, #1 -; VBITS_GE_2048-NEXT: str w15, [sp, #384] -; VBITS_GE_2048-NEXT: str w16, [sp, #380] -; VBITS_GE_2048-NEXT: str w17, [sp, #376] -; VBITS_GE_2048-NEXT: str w18, [sp, #372] -; VBITS_GE_2048-NEXT: sbfx w15, w8, #13, #1 -; VBITS_GE_2048-NEXT: sbfx w16, w8, #12, #1 -; VBITS_GE_2048-NEXT: sbfx w17, w8, #11, #1 -; VBITS_GE_2048-NEXT: sbfx w18, w8, #10, #1 -; VBITS_GE_2048-NEXT: str w12, [sp, #320] -; VBITS_GE_2048-NEXT: str w13, [sp, #316] -; VBITS_GE_2048-NEXT: str w14, [sp, #312] -; VBITS_GE_2048-NEXT: sbfx w12, w8, #7, #1 -; VBITS_GE_2048-NEXT: sbfx w13, w8, #6, #1 -; VBITS_GE_2048-NEXT: sbfx w14, w8, #5, #1 -; VBITS_GE_2048-NEXT: str w11, [sp, #288] -; VBITS_GE_2048-NEXT: sbfx w11, w8, #3, #1 -; VBITS_GE_2048-NEXT: str w10, [sp, #264] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #1, #1 -; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_2048-NEXT: str w23, [sp, #432] -; VBITS_GE_2048-NEXT: str w24, [sp, #428] -; VBITS_GE_2048-NEXT: str w25, [sp, #424] -; VBITS_GE_2048-NEXT: str w26, [sp, #420] -; VBITS_GE_2048-NEXT: str w27, [sp, #416] -; VBITS_GE_2048-NEXT: str w28, [sp, #412] -; VBITS_GE_2048-NEXT: str w30, [sp, #408] -; VBITS_GE_2048-NEXT: str w2, [sp, #368] -; VBITS_GE_2048-NEXT: str w3, [sp, #364] -; VBITS_GE_2048-NEXT: str w4, [sp, #360] -; VBITS_GE_2048-NEXT: str w5, [sp, #356] -; VBITS_GE_2048-NEXT: str w6, [sp, #352] -; VBITS_GE_2048-NEXT: str w7, [sp, #348] -; VBITS_GE_2048-NEXT: str w19, [sp, #344] -; VBITS_GE_2048-NEXT: str w20, [sp, #340] -; VBITS_GE_2048-NEXT: str w21, [sp, #336] -; VBITS_GE_2048-NEXT: str w22, [sp, #332] -; VBITS_GE_2048-NEXT: str w15, [sp, #308] -; VBITS_GE_2048-NEXT: str w16, [sp, #304] -; VBITS_GE_2048-NEXT: str w17, [sp, #300] -; VBITS_GE_2048-NEXT: str w18, [sp, #296] -; VBITS_GE_2048-NEXT: str w12, [sp, #284] -; VBITS_GE_2048-NEXT: str w13, [sp, #280] -; VBITS_GE_2048-NEXT: str w14, [sp, #276] -; VBITS_GE_2048-NEXT: str w11, [sp, #268] -; VBITS_GE_2048-NEXT: str w10, [sp, #260] -; VBITS_GE_2048-NEXT: str w8, [sp, #256] -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x9] +; VBITS_GE_2048-NEXT: asr x10, x9, #63 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x9, #61, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #60, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #248] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #59, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #58, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #240] +; VBITS_GE_2048-NEXT: 
sbfx x12, x9, #57, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #56, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #232] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #55, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #54, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #224] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #53, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #52, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #216] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #208] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #49, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #48, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #200] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #47, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #46, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #192] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #45, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #44, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #184] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #43, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #42, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #176] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #41, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #40, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #168] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #38, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #160] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #37, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #36, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #152] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #35, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #34, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #144] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #33, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #32, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #136] +; VBITS_GE_2048-NEXT: asr w10, w9, #31 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #30, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #128] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #29, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #28, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #120] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #112] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #25, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #24, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #104] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #23, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #22, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #96] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #21, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #20, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #88] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #19, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #18, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #80] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #17, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #16, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #72] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #64] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #12, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #56] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #11, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #10, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #48] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #9, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #8, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #40] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #7, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #6, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #32] +; 
VBITS_GE_2048-NEXT: sbfx w12, w9, #5, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #4, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #24] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #16] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_2048-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #8] +; VBITS_GE_2048-NEXT: stp w9, w12, [sp] +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x8] ; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1w { z2.s }, p0/z, [x1] ; VBITS_GE_2048-NEXT: and z0.s, z0.s, #0x1 @@ -1027,12 +977,7 @@ ; VBITS_GE_2048-NEXT: sel z0.s, p1, z1.s, z2.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_2048-NEXT: mov sp, x29 -; VBITS_GE_2048-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload +; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; VBITS_GE_2048-NEXT: ret %mask = load <64 x i1>, <64 x i1>* %c %op1 = load <64 x float>, <64 x float>* %a @@ -1078,20 +1023,20 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldrb w8, [x2] +; CHECK-NEXT: ldrb w9, [x2] +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: lsr w10, w8, #3 -; CHECK-NEXT: lsr w11, w8, #2 +; CHECK-NEXT: lsr w10, w9, #3 +; CHECK-NEXT: lsr w11, w9, #2 +; CHECK-NEXT: sbfx x12, x9, #0, #1 +; CHECK-NEXT: lsr w9, w9, #1 ; CHECK-NEXT: sbfx x10, x10, #0, #1 ; CHECK-NEXT: sbfx x11, x11, #0, #1 +; CHECK-NEXT: sbfx x9, x9, #0, #1 ; CHECK-NEXT: stp x11, x10, [sp, #16] -; CHECK-NEXT: sbfx x10, x8, #0, #1 -; CHECK-NEXT: lsr w8, w8, #1 -; CHECK-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NEXT: stp x10, x8, [sp] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x9] +; CHECK-NEXT: stp x12, x9, [sp] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: and z0.d, z0.d, #0x1 @@ -1119,30 +1064,30 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: ldrb w8, [x2] +; VBITS_GE_512-NEXT: ldrb w9, [x2] +; VBITS_GE_512-NEXT: mov x8, sp ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: mov x9, sp ; VBITS_GE_512-NEXT: ptrue p1.d -; VBITS_GE_512-NEXT: lsr w10, w8, #7 -; VBITS_GE_512-NEXT: lsr w11, w8, #6 -; VBITS_GE_512-NEXT: lsr w12, w8, #5 -; VBITS_GE_512-NEXT: lsr w13, w8, #4 +; VBITS_GE_512-NEXT: lsr w10, w9, #7 +; VBITS_GE_512-NEXT: lsr w11, w9, #6 +; VBITS_GE_512-NEXT: lsr w12, w9, #5 +; VBITS_GE_512-NEXT: lsr w13, w9, #4 ; VBITS_GE_512-NEXT: sbfx x10, x10, #0, #1 ; VBITS_GE_512-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_512-NEXT: sbfx x12, x12, #0, #1 +; VBITS_GE_512-NEXT: sbfx x13, x13, #0, #1 +; VBITS_GE_512-NEXT: lsr w14, w9, #3 ; VBITS_GE_512-NEXT: stp x11, x10, [sp, #48] -; VBITS_GE_512-NEXT: sbfx x11, x12, #0, #1 -; VBITS_GE_512-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_512-NEXT: lsr w10, w8, #3 -; VBITS_GE_512-NEXT: stp x12, x11, [sp, #32] -; VBITS_GE_512-NEXT: lsr w11, w8, #2 +; VBITS_GE_512-NEXT: lsr w10, w9, #2 +; 
VBITS_GE_512-NEXT: stp x13, x12, [sp, #32] +; VBITS_GE_512-NEXT: sbfx x12, x9, #0, #1 +; VBITS_GE_512-NEXT: lsr w9, w9, #1 +; VBITS_GE_512-NEXT: sbfx x11, x14, #0, #1 ; VBITS_GE_512-NEXT: sbfx x10, x10, #0, #1 -; VBITS_GE_512-NEXT: sbfx x11, x11, #0, #1 -; VBITS_GE_512-NEXT: stp x11, x10, [sp, #16] -; VBITS_GE_512-NEXT: sbfx x10, x8, #0, #1 -; VBITS_GE_512-NEXT: lsr w8, w8, #1 -; VBITS_GE_512-NEXT: sbfx x8, x8, #0, #1 -; VBITS_GE_512-NEXT: stp x10, x8, [sp] -; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x9] +; VBITS_GE_512-NEXT: sbfx x9, x9, #0, #1 +; VBITS_GE_512-NEXT: stp x10, x11, [sp, #16] +; VBITS_GE_512-NEXT: stp x12, x9, [sp] +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x8] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: and z0.d, z0.d, #0x1 @@ -1170,50 +1115,50 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: ldrh w8, [x2] +; VBITS_GE_1024-NEXT: ldrh w9, [x2] +; VBITS_GE_1024-NEXT: mov x8, sp ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: mov x9, sp ; VBITS_GE_1024-NEXT: ptrue p1.d -; VBITS_GE_1024-NEXT: lsr w10, w8, #15 -; VBITS_GE_1024-NEXT: lsr w11, w8, #14 -; VBITS_GE_1024-NEXT: lsr w12, w8, #13 -; VBITS_GE_1024-NEXT: lsr w13, w8, #12 +; VBITS_GE_1024-NEXT: lsr w10, w9, #15 +; VBITS_GE_1024-NEXT: lsr w11, w9, #14 +; VBITS_GE_1024-NEXT: lsr w12, w9, #13 +; VBITS_GE_1024-NEXT: lsr w13, w9, #12 ; VBITS_GE_1024-NEXT: sbfx x10, x10, #0, #1 ; VBITS_GE_1024-NEXT: sbfx x11, x11, #0, #1 -; VBITS_GE_1024-NEXT: lsr w14, w8, #11 -; VBITS_GE_1024-NEXT: lsr w15, w8, #10 +; VBITS_GE_1024-NEXT: sbfx x12, x12, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x13, x13, #0, #1 +; VBITS_GE_1024-NEXT: lsr w14, w9, #11 +; VBITS_GE_1024-NEXT: lsr w15, w9, #10 ; VBITS_GE_1024-NEXT: stp x11, x10, [sp, #112] -; VBITS_GE_1024-NEXT: sbfx x11, x12, #0, #1 -; VBITS_GE_1024-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_1024-NEXT: lsr w16, w8, #9 -; VBITS_GE_1024-NEXT: lsr w17, w8, #8 -; VBITS_GE_1024-NEXT: stp x12, x11, [sp, #96] -; VBITS_GE_1024-NEXT: sbfx x12, x14, #0, #1 -; VBITS_GE_1024-NEXT: sbfx x13, x15, #0, #1 -; VBITS_GE_1024-NEXT: lsr w10, w8, #7 -; VBITS_GE_1024-NEXT: lsr w11, w8, #6 -; VBITS_GE_1024-NEXT: stp x13, x12, [sp, #80] -; VBITS_GE_1024-NEXT: sbfx x13, x16, #0, #1 -; VBITS_GE_1024-NEXT: sbfx x14, x17, #0, #1 -; VBITS_GE_1024-NEXT: lsr w12, w8, #5 -; VBITS_GE_1024-NEXT: stp x14, x13, [sp, #64] -; VBITS_GE_1024-NEXT: lsr w13, w8, #4 +; VBITS_GE_1024-NEXT: lsr w10, w9, #9 +; VBITS_GE_1024-NEXT: stp x13, x12, [sp, #96] +; VBITS_GE_1024-NEXT: lsr w13, w9, #8 +; VBITS_GE_1024-NEXT: sbfx x11, x14, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x12, x15, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x10, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x13, x13, #0, #1 +; VBITS_GE_1024-NEXT: lsr w14, w9, #3 +; VBITS_GE_1024-NEXT: stp x12, x11, [sp, #80] +; VBITS_GE_1024-NEXT: lsr w11, w9, #6 +; VBITS_GE_1024-NEXT: stp x13, x10, [sp, #64] +; VBITS_GE_1024-NEXT: lsr w10, w9, #7 +; VBITS_GE_1024-NEXT: lsr w12, w9, #5 +; VBITS_GE_1024-NEXT: lsr w13, w9, #4 ; VBITS_GE_1024-NEXT: sbfx x10, x10, #0, #1 ; VBITS_GE_1024-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x12, x12, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x13, x13, #0, #1 ; VBITS_GE_1024-NEXT: stp x11, x10, [sp, #48] -; VBITS_GE_1024-NEXT: sbfx x11, x12, #0, #1 -; VBITS_GE_1024-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_1024-NEXT: lsr w10, w8, #3 -; VBITS_GE_1024-NEXT: stp x12, x11, [sp, #32] -; VBITS_GE_1024-NEXT: 
lsr w11, w8, #2 -; VBITS_GE_1024-NEXT: sbfx x10, x10, #0, #1 +; VBITS_GE_1024-NEXT: lsr w11, w9, #2 +; VBITS_GE_1024-NEXT: stp x13, x12, [sp, #32] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #0, #1 +; VBITS_GE_1024-NEXT: lsr w9, w9, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x14, #0, #1 ; VBITS_GE_1024-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x9, x9, #0, #1 ; VBITS_GE_1024-NEXT: stp x11, x10, [sp, #16] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #0, #1 -; VBITS_GE_1024-NEXT: lsr w8, w8, #1 -; VBITS_GE_1024-NEXT: sbfx x8, x8, #0, #1 -; VBITS_GE_1024-NEXT: stp x10, x8, [sp] -; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x9] +; VBITS_GE_1024-NEXT: stp x12, x9, [sp] +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x8] ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_1024-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_1024-NEXT: and z0.d, z0.d, #0x1 @@ -1234,130 +1179,128 @@ define void @select_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i1>* %c) #0 { ; VBITS_GE_2048-LABEL: select_v32f64: ; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; VBITS_GE_2048-NEXT: sub x9, sp, #480 -; VBITS_GE_2048-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; VBITS_GE_2048-NEXT: sub x9, sp, #496 ; VBITS_GE_2048-NEXT: mov x29, sp ; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 -; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 32 -; VBITS_GE_2048-NEXT: .cfi_offset w19, -16 -; VBITS_GE_2048-NEXT: .cfi_offset w30, -24 -; VBITS_GE_2048-NEXT: .cfi_offset w29, -32 -; VBITS_GE_2048-NEXT: ldr w8, [x2] +; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 +; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 +; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 +; VBITS_GE_2048-NEXT: ldr w9, [x2] +; VBITS_GE_2048-NEXT: mov x8, sp ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 -; VBITS_GE_2048-NEXT: mov x9, sp ; VBITS_GE_2048-NEXT: ptrue p1.d -; VBITS_GE_2048-NEXT: ubfx x10, x8, #31, #1 -; VBITS_GE_2048-NEXT: ubfx x11, x8, #30, #2 -; VBITS_GE_2048-NEXT: ubfx x12, x8, #29, #3 +; VBITS_GE_2048-NEXT: ubfx x10, x9, #31, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #30, #2 ; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10 ; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 -; VBITS_GE_2048-NEXT: ubfx x13, x8, #28, #4 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #29, #3 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #28, #4 ; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1 ; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 -; VBITS_GE_2048-NEXT: ubfx x14, x8, #27, #5 -; VBITS_GE_2048-NEXT: ubfx x15, x8, #26, #6 -; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #240] -; VBITS_GE_2048-NEXT: sbfx x11, x12, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13 -; VBITS_GE_2048-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x16, x8, #25, #7 -; VBITS_GE_2048-NEXT: ubfx x17, x8, #24, #8 -; VBITS_GE_2048-NEXT: stp x12, x11, [sp, #224] +; VBITS_GE_2048-NEXT: ubfx x14, x9, #27, #5 +; VBITS_GE_2048-NEXT: ubfx x15, x9, #26, #6 ; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 -; VBITS_GE_2048-NEXT: sbfx x12, x14, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w15 killed $w15 killed $x15 def $x15 -; VBITS_GE_2048-NEXT: sbfx x13, x15, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x18, x8, #23, #9 -; VBITS_GE_2048-NEXT: ubfx x2, x8, #22, #10 -; VBITS_GE_2048-NEXT: stp x13, x12, [sp, #208] -; VBITS_GE_2048-NEXT: // kill: def $w16 killed $w16 killed 
$x16 def $x16 -; VBITS_GE_2048-NEXT: sbfx x13, x16, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w17 killed $w17 killed $x17 def $x17 -; VBITS_GE_2048-NEXT: sbfx x14, x17, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x3, x8, #21, #11 -; VBITS_GE_2048-NEXT: ubfx x4, x8, #20, #12 -; VBITS_GE_2048-NEXT: ubfx x10, x8, #15, #17 -; VBITS_GE_2048-NEXT: ubfx x11, x8, #14, #18 -; VBITS_GE_2048-NEXT: stp x14, x13, [sp, #192] -; VBITS_GE_2048-NEXT: // kill: def $w18 killed $w18 killed $x18 def $x18 -; VBITS_GE_2048-NEXT: sbfx x14, x18, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w2 killed $w2 killed $x2 def $x2 -; VBITS_GE_2048-NEXT: sbfx x15, x2, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x5, x8, #19, #13 -; VBITS_GE_2048-NEXT: ubfx x6, x8, #18, #14 -; VBITS_GE_2048-NEXT: ubfx x12, x8, #13, #19 -; VBITS_GE_2048-NEXT: stp x15, x14, [sp, #176] -; VBITS_GE_2048-NEXT: // kill: def $w3 killed $w3 killed $x3 def $x3 -; VBITS_GE_2048-NEXT: sbfx x15, x3, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w4 killed $w4 killed $x4 def $x4 -; VBITS_GE_2048-NEXT: sbfx x16, x4, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #240] +; VBITS_GE_2048-NEXT: sbfx x10, x12, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x13, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x14, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #25, #7 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #23, #9 +; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x12, x10, [sp, #224] +; VBITS_GE_2048-NEXT: sbfx x10, x15, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #24, #8 +; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 +; VBITS_GE_2048-NEXT: stp x10, x13, [sp, #208] +; VBITS_GE_2048-NEXT: sbfx x10, x11, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x12, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #22, #10 +; VBITS_GE_2048-NEXT: sbfx x13, x14, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #21, #11 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x12, x10, [sp, #192] +; VBITS_GE_2048-NEXT: sbfx x10, x11, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #20, #12 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #19, #13 ; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 -; VBITS_GE_2048-NEXT: ubfx x7, x8, #17, #15 -; VBITS_GE_2048-NEXT: ubfx x19, x8, #16, #16 -; VBITS_GE_2048-NEXT: ubfx x13, x8, #12, #20 -; VBITS_GE_2048-NEXT: stp x16, x15, [sp, #160] -; VBITS_GE_2048-NEXT: // kill: def $w5 killed $w5 killed $x5 def $x5 -; VBITS_GE_2048-NEXT: sbfx x16, x5, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w6 killed $w6 killed $x6 def $x6 -; VBITS_GE_2048-NEXT: sbfx x17, x6, #0, #1 -; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1 -; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 -; VBITS_GE_2048-NEXT: ubfx x14, x8, #11, #21 -; VBITS_GE_2048-NEXT: ubfx x15, x8, #10, #22 -; VBITS_GE_2048-NEXT: stp x17, x16, [sp, #144] -; VBITS_GE_2048-NEXT: // kill: def $w7 killed $w7 killed $x7 def $x7 -; VBITS_GE_2048-NEXT: sbfx x17, x7, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w19 killed $w19 killed $x19 def $x19 -; VBITS_GE_2048-NEXT: sbfx x18, x19, #0, #1 -; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #112] -; VBITS_GE_2048-NEXT: sbfx x11, x12, #0, #1 +; VBITS_GE_2048-NEXT: stp x10, x13, [sp, #176] +; VBITS_GE_2048-NEXT: 
sbfx x10, x14, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #18, #14 +; VBITS_GE_2048-NEXT: sbfx x12, x12, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13 -; VBITS_GE_2048-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x16, x8, #9, #23 -; VBITS_GE_2048-NEXT: stp x18, x17, [sp, #128] -; VBITS_GE_2048-NEXT: ubfx x17, x8, #8, #24 -; VBITS_GE_2048-NEXT: ubfx x10, x8, #7, #25 -; VBITS_GE_2048-NEXT: stp x12, x11, [sp, #96] -; VBITS_GE_2048-NEXT: ubfx x11, x8, #6, #26 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #17, #15 ; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 -; VBITS_GE_2048-NEXT: sbfx x12, x14, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w15 killed $w15 killed $x15 def $x15 -; VBITS_GE_2048-NEXT: sbfx x13, x15, #0, #1 -; VBITS_GE_2048-NEXT: stp x13, x12, [sp, #80] -; VBITS_GE_2048-NEXT: ubfx x12, x8, #5, #27 -; VBITS_GE_2048-NEXT: // kill: def $w16 killed $w16 killed $x16 def $x16 -; VBITS_GE_2048-NEXT: sbfx x13, x16, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w17 killed $w17 killed $x17 def $x17 -; VBITS_GE_2048-NEXT: sbfx x14, x17, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #160] +; VBITS_GE_2048-NEXT: sbfx x10, x13, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #16, #16 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #15, #17 ; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 -; VBITS_GE_2048-NEXT: stp x14, x13, [sp, #64] -; VBITS_GE_2048-NEXT: ubfx x13, x8, #4, #28 -; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13 +; VBITS_GE_2048-NEXT: stp x10, x12, [sp, #144] +; VBITS_GE_2048-NEXT: sbfx x10, x14, #0, #1 ; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #14, #18 +; VBITS_GE_2048-NEXT: sbfx x13, x13, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #13, #19 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #128] +; VBITS_GE_2048-NEXT: sbfx x10, x12, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #12, #20 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #11, #21 +; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 ; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 -; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #48] -; VBITS_GE_2048-NEXT: sbfx x11, x12, #0, #1 +; VBITS_GE_2048-NEXT: stp x10, x13, [sp, #112] +; VBITS_GE_2048-NEXT: sbfx x10, x14, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #10, #22 +; VBITS_GE_2048-NEXT: sbfx x12, x12, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13 -; VBITS_GE_2048-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x10, x8, #3, #29 -; VBITS_GE_2048-NEXT: stp x12, x11, [sp, #32] -; VBITS_GE_2048-NEXT: ubfx x11, x8, #2, #30 -; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #9, #23 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #96] +; VBITS_GE_2048-NEXT: sbfx x10, x13, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #8, #24 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #7, #25 ; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 -; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1 +; VBITS_GE_2048-NEXT: 
// kill: def $w13 killed $w13 killed $x13 def $x13 +; VBITS_GE_2048-NEXT: stp x10, x12, [sp, #80] +; VBITS_GE_2048-NEXT: sbfx x10, x14, #0, #1 ; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 -; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #16] -; VBITS_GE_2048-NEXT: ubfx x10, x8, #1, #31 -; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10 -; VBITS_GE_2048-NEXT: sbfx x8, x8, #0, #1 -; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1 -; VBITS_GE_2048-NEXT: stp x8, x10, [sp] -; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x9] +; VBITS_GE_2048-NEXT: ubfx x12, x9, #6, #26 +; VBITS_GE_2048-NEXT: sbfx x13, x13, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #5, #27 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #64] +; VBITS_GE_2048-NEXT: sbfx x10, x12, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #4, #28 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #3, #29 +; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 +; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 +; VBITS_GE_2048-NEXT: stp x10, x13, [sp, #48] +; VBITS_GE_2048-NEXT: sbfx x10, x14, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #2, #30 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #1, #31 +; VBITS_GE_2048-NEXT: sbfx x12, x12, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13 +; VBITS_GE_2048-NEXT: sbfx x9, x9, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #32] +; VBITS_GE_2048-NEXT: sbfx x10, x13, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x14, #0, #1 +; VBITS_GE_2048-NEXT: stp x10, x12, [sp, #16] +; VBITS_GE_2048-NEXT: stp x9, x11, [sp] +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x8] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_2048-NEXT: and z0.d, z0.d, #0x1 @@ -1365,8 +1308,7 @@ ; VBITS_GE_2048-NEXT: sel z0.d, p1, z1.d, z2.d ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_2048-NEXT: mov sp, x29 -; VBITS_GE_2048-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; VBITS_GE_2048-NEXT: ret %mask = load <32 x i1>, <32 x i1>* %c %op1 = load <32 x double>, <32 x double>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll @@ -28,8 +28,8 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) #0 { ; VBITS_GE_256-LABEL: insertelement_v4f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $q0 ; VBITS_GE_256-NEXT: fmov h1, #5.00000000 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $q0 ; VBITS_GE_256-NEXT: mov v0.h[3], v1.h[0] ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0 ; VBITS_GE_256-NEXT: ret @@ -51,15 +51,15 @@ define <16 x half> @insertelement_v16f16(<16 x half>* %a) #0 { ; VBITS_GE_256-LABEL: insertelement_v16f16: ; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w9, #15 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov w9, #15 -; VBITS_GE_256-NEXT: mov z1.h, w9 -; 
VBITS_GE_256-NEXT: index z2.h, #0, #1 +; VBITS_GE_256-NEXT: fmov h2, #5.00000000 +; VBITS_GE_256-NEXT: index z3.h, #0, #1 ; VBITS_GE_256-NEXT: ptrue p1.h -; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z2.h, z1.h -; VBITS_GE_256-NEXT: fmov h1, #5.00000000 -; VBITS_GE_256-NEXT: mov z0.h, p1/m, h1 +; VBITS_GE_256-NEXT: mov z1.h, w9 +; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h +; VBITS_GE_256-NEXT: mov z0.h, p1/m, h2 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x8] ; VBITS_GE_256-NEXT: ret %op1 = load <16 x half>, <16 x half>* %a @@ -70,15 +70,15 @@ define <32 x half> @insertelement_v32f16(<32 x half>* %a) #0 { ; VBITS_GE_512-LABEL: insertelement_v32f16: ; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov w9, #31 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: mov w9, #31 -; VBITS_GE_512-NEXT: mov z1.h, w9 -; VBITS_GE_512-NEXT: index z2.h, #0, #1 +; VBITS_GE_512-NEXT: fmov h2, #5.00000000 +; VBITS_GE_512-NEXT: index z3.h, #0, #1 ; VBITS_GE_512-NEXT: ptrue p1.h -; VBITS_GE_512-NEXT: cmpeq p1.h, p1/z, z2.h, z1.h -; VBITS_GE_512-NEXT: fmov h1, #5.00000000 -; VBITS_GE_512-NEXT: mov z0.h, p1/m, h1 +; VBITS_GE_512-NEXT: mov z1.h, w9 +; VBITS_GE_512-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h +; VBITS_GE_512-NEXT: mov z0.h, p1/m, h2 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8] ; VBITS_GE_512-NEXT: ret %op1 = load <32 x half>, <32 x half>* %a @@ -89,15 +89,15 @@ define <64 x half> @insertelement_v64f16(<64 x half>* %a) #0 { ; VBITS_GE_1024-LABEL: insertelement_v64f16: ; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: mov w9, #63 ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 ; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_1024-NEXT: mov w9, #63 -; VBITS_GE_1024-NEXT: mov z1.h, w9 -; VBITS_GE_1024-NEXT: index z2.h, #0, #1 +; VBITS_GE_1024-NEXT: fmov h2, #5.00000000 +; VBITS_GE_1024-NEXT: index z3.h, #0, #1 ; VBITS_GE_1024-NEXT: ptrue p1.h -; VBITS_GE_1024-NEXT: cmpeq p1.h, p1/z, z2.h, z1.h -; VBITS_GE_1024-NEXT: fmov h1, #5.00000000 -; VBITS_GE_1024-NEXT: mov z0.h, p1/m, h1 +; VBITS_GE_1024-NEXT: mov z1.h, w9 +; VBITS_GE_1024-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h +; VBITS_GE_1024-NEXT: mov z0.h, p1/m, h2 ; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x8] ; VBITS_GE_1024-NEXT: ret %op1 = load <64 x half>, <64 x half>* %a @@ -108,15 +108,15 @@ define <128 x half> @insertelement_v128f16(<128 x half>* %a) #0 { ; VBITS_GE_2048-LABEL: insertelement_v128f16: ; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: mov w9, #127 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 ; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: mov w9, #127 -; VBITS_GE_2048-NEXT: mov z1.h, w9 -; VBITS_GE_2048-NEXT: index z2.h, #0, #1 +; VBITS_GE_2048-NEXT: fmov h2, #5.00000000 +; VBITS_GE_2048-NEXT: index z3.h, #0, #1 ; VBITS_GE_2048-NEXT: ptrue p1.h -; VBITS_GE_2048-NEXT: cmpeq p1.h, p1/z, z2.h, z1.h -; VBITS_GE_2048-NEXT: fmov h1, #5.00000000 -; VBITS_GE_2048-NEXT: mov z0.h, p1/m, h1 +; VBITS_GE_2048-NEXT: mov z1.h, w9 +; VBITS_GE_2048-NEXT: cmpeq p1.h, p1/z, z3.h, z1.h +; VBITS_GE_2048-NEXT: mov z0.h, p1/m, h2 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8] ; VBITS_GE_2048-NEXT: ret %op1 = load <128 x half>, <128 x half>* %a @@ -128,8 +128,8 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) #0 { ; VBITS_GE_256-LABEL: insertelement_v2f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $q0 ; VBITS_GE_256-NEXT: fmov s1, #5.00000000 +; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $q0 ; VBITS_GE_256-NEXT: mov v0.s[1], v1.s[0] ; 
VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $q0 ; VBITS_GE_256-NEXT: ret @@ -151,15 +151,15 @@ define <8 x float> @insertelement_v8f32(<8 x float>* %a) #0 { ; VBITS_GE_256-LABEL: insertelement_v8f32: ; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w9, #7 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov w9, #7 -; VBITS_GE_256-NEXT: mov z1.s, w9 -; VBITS_GE_256-NEXT: index z2.s, #0, #1 +; VBITS_GE_256-NEXT: fmov s2, #5.00000000 +; VBITS_GE_256-NEXT: index z3.s, #0, #1 ; VBITS_GE_256-NEXT: ptrue p1.s -; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z2.s, z1.s -; VBITS_GE_256-NEXT: fmov s1, #5.00000000 -; VBITS_GE_256-NEXT: mov z0.s, p1/m, s1 +; VBITS_GE_256-NEXT: mov z1.s, w9 +; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s +; VBITS_GE_256-NEXT: mov z0.s, p1/m, s2 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x8] ; VBITS_GE_256-NEXT: ret %op1 = load <8 x float>, <8 x float>* %a @@ -170,15 +170,15 @@ define <16 x float> @insertelement_v16f32(<16 x float>* %a) #0 { ; VBITS_GE_512-LABEL: insertelement_v16f32: ; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov w9, #15 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: mov w9, #15 -; VBITS_GE_512-NEXT: mov z1.s, w9 -; VBITS_GE_512-NEXT: index z2.s, #0, #1 +; VBITS_GE_512-NEXT: fmov s2, #5.00000000 +; VBITS_GE_512-NEXT: index z3.s, #0, #1 ; VBITS_GE_512-NEXT: ptrue p1.s -; VBITS_GE_512-NEXT: cmpeq p1.s, p1/z, z2.s, z1.s -; VBITS_GE_512-NEXT: fmov s1, #5.00000000 -; VBITS_GE_512-NEXT: mov z0.s, p1/m, s1 +; VBITS_GE_512-NEXT: mov z1.s, w9 +; VBITS_GE_512-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s +; VBITS_GE_512-NEXT: mov z0.s, p1/m, s2 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x float>, <16 x float>* %a @@ -189,15 +189,15 @@ define <32 x float> @insertelement_v32f32(<32 x float>* %a) #0 { ; VBITS_GE_1024-LABEL: insertelement_v32f32: ; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: mov w9, #31 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_1024-NEXT: mov w9, #31 -; VBITS_GE_1024-NEXT: mov z1.s, w9 -; VBITS_GE_1024-NEXT: index z2.s, #0, #1 +; VBITS_GE_1024-NEXT: fmov s2, #5.00000000 +; VBITS_GE_1024-NEXT: index z3.s, #0, #1 ; VBITS_GE_1024-NEXT: ptrue p1.s -; VBITS_GE_1024-NEXT: cmpeq p1.s, p1/z, z2.s, z1.s -; VBITS_GE_1024-NEXT: fmov s1, #5.00000000 -; VBITS_GE_1024-NEXT: mov z0.s, p1/m, s1 +; VBITS_GE_1024-NEXT: mov z1.s, w9 +; VBITS_GE_1024-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s +; VBITS_GE_1024-NEXT: mov z0.s, p1/m, s2 ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8] ; VBITS_GE_1024-NEXT: ret %op1 = load <32 x float>, <32 x float>* %a @@ -208,15 +208,15 @@ define <64 x float> @insertelement_v64f32(<64 x float>* %a) #0 { ; VBITS_GE_2048-LABEL: insertelement_v64f32: ; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: mov w9, #63 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: mov w9, #63 -; VBITS_GE_2048-NEXT: mov z1.s, w9 -; VBITS_GE_2048-NEXT: index z2.s, #0, #1 +; VBITS_GE_2048-NEXT: fmov s2, #5.00000000 +; VBITS_GE_2048-NEXT: index z3.s, #0, #1 ; VBITS_GE_2048-NEXT: ptrue p1.s -; VBITS_GE_2048-NEXT: cmpeq p1.s, p1/z, z2.s, z1.s -; VBITS_GE_2048-NEXT: fmov s1, #5.00000000 -; VBITS_GE_2048-NEXT: mov z0.s, p1/m, s1 +; VBITS_GE_2048-NEXT: mov z1.s, w9 +; VBITS_GE_2048-NEXT: cmpeq p1.s, p1/z, z3.s, z1.s +; VBITS_GE_2048-NEXT: mov z0.s, p1/m, s2 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8] ; 
VBITS_GE_2048-NEXT: ret %op1 = load <64 x float>, <64 x float>* %a @@ -248,15 +248,15 @@ define <4 x double> @insertelement_v4f64(<4 x double>* %a) #0 { ; VBITS_GE_256-LABEL: insertelement_v4f64: ; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: mov w9, #3 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov w9, #3 -; VBITS_GE_256-NEXT: mov z1.d, x9 -; VBITS_GE_256-NEXT: index z2.d, #0, #1 +; VBITS_GE_256-NEXT: fmov d2, #5.00000000 +; VBITS_GE_256-NEXT: index z3.d, #0, #1 ; VBITS_GE_256-NEXT: ptrue p1.d -; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z2.d, z1.d -; VBITS_GE_256-NEXT: fmov d1, #5.00000000 -; VBITS_GE_256-NEXT: mov z0.d, p1/m, d1 +; VBITS_GE_256-NEXT: mov z1.d, x9 +; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d +; VBITS_GE_256-NEXT: mov z0.d, p1/m, d2 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x8] ; VBITS_GE_256-NEXT: ret %op1 = load <4 x double>, <4 x double>* %a @@ -267,15 +267,15 @@ define <8 x double> @insertelement_v8f64(<8 x double>* %a) #0 { ; VBITS_GE_512-LABEL: insertelement_v8f64: ; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: mov w9, #7 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: mov w9, #7 -; VBITS_GE_512-NEXT: mov z1.d, x9 -; VBITS_GE_512-NEXT: index z2.d, #0, #1 +; VBITS_GE_512-NEXT: fmov d2, #5.00000000 +; VBITS_GE_512-NEXT: index z3.d, #0, #1 ; VBITS_GE_512-NEXT: ptrue p1.d -; VBITS_GE_512-NEXT: cmpeq p1.d, p1/z, z2.d, z1.d -; VBITS_GE_512-NEXT: fmov d1, #5.00000000 -; VBITS_GE_512-NEXT: mov z0.d, p1/m, d1 +; VBITS_GE_512-NEXT: mov z1.d, x9 +; VBITS_GE_512-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d +; VBITS_GE_512-NEXT: mov z0.d, p1/m, d2 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x double>, <8 x double>* %a @@ -286,15 +286,15 @@ define <16 x double> @insertelement_v16f64(<16 x double>* %a) #0 { ; VBITS_GE_1024-LABEL: insertelement_v16f64: ; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: mov w9, #15 ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 ; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: mov w9, #15 -; VBITS_GE_1024-NEXT: mov z1.d, x9 -; VBITS_GE_1024-NEXT: index z2.d, #0, #1 +; VBITS_GE_1024-NEXT: fmov d2, #5.00000000 +; VBITS_GE_1024-NEXT: index z3.d, #0, #1 ; VBITS_GE_1024-NEXT: ptrue p1.d -; VBITS_GE_1024-NEXT: cmpeq p1.d, p1/z, z2.d, z1.d -; VBITS_GE_1024-NEXT: fmov d1, #5.00000000 -; VBITS_GE_1024-NEXT: mov z0.d, p1/m, d1 +; VBITS_GE_1024-NEXT: mov z1.d, x9 +; VBITS_GE_1024-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d +; VBITS_GE_1024-NEXT: mov z0.d, p1/m, d2 ; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8] ; VBITS_GE_1024-NEXT: ret %op1 = load <16 x double>, <16 x double>* %a @@ -305,15 +305,15 @@ define <32 x double> @insertelement_v32f64(<32 x double>* %a) #0 { ; VBITS_GE_2048-LABEL: insertelement_v32f64: ; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: mov w9, #31 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 ; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: mov w9, #31 -; VBITS_GE_2048-NEXT: mov z1.d, x9 -; VBITS_GE_2048-NEXT: index z2.d, #0, #1 +; VBITS_GE_2048-NEXT: fmov d2, #5.00000000 +; VBITS_GE_2048-NEXT: index z3.d, #0, #1 ; VBITS_GE_2048-NEXT: ptrue p1.d -; VBITS_GE_2048-NEXT: cmpeq p1.d, p1/z, z2.d, z1.d -; VBITS_GE_2048-NEXT: fmov d1, #5.00000000 -; VBITS_GE_2048-NEXT: mov z0.d, p1/m, d1 +; VBITS_GE_2048-NEXT: mov z1.d, x9 +; VBITS_GE_2048-NEXT: cmpeq p1.d, p1/z, z3.d, z1.d +; VBITS_GE_2048-NEXT: mov z0.d, p1/m, d2 ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8] ; 
VBITS_GE_2048-NEXT: ret %op1 = load <32 x double>, <32 x double>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -34,22 +34,22 @@ ; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h ; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG0]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s ; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[DIV]].h, [[DIV]].h -; CHECK-NEXT: umov [[SCALAR:w[0-9]+]], [[VEC:v[0-9]+]].h[0] -; CHECK-NEXT: fmov s0, [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[1] -; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[2] -; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[3] -; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[4] -; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[5] -; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[6] -; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[7] -; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR]] +; CHECK-NEXT: umov [[SCALAR0:w[0-9]+]], [[VEC:v[0-9]+]].h[0] +; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC]].h[1] +; CHECK-NEXT: fmov s0, [[SCALAR0]] +; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[2] +; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR1]] +; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR2]] +; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[3] +; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR3]] +; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[4] +; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR4]] +; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[5] +; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR5]] +; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[6] +; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR6]] +; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7] +; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]] ; CHECK: ret %res = sdiv <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -90,11 +90,11 @@ ; FULL VECTOR: ; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32 +; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 ; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b ; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h @@ -116,11 +116,11 @@ ; HALF VECTOR: ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32 +; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 ; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 -; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_EQ_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h ; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h @@ -133,11 +133,11 @@ 
; QUARTER VECTOR OR SMALLER: ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32 +; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 ; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 -; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b +; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h ; VBITS_GE_1024-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s @@ -157,11 +157,11 @@ ; FULL VECTOR: ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64 +; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 ; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 -; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b ; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h @@ -183,11 +183,11 @@ ; HALF VECTOR: ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64 +; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 ; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 -; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h ; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h @@ -200,11 +200,11 @@ ; QUARTER VECTOR OR SMALLER: ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64 +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 ; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 -; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b +; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h ; VBITS_GE_2048-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s @@ -224,11 +224,11 @@ ; FULL VECTOR: ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128 +; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 ; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 -; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b ; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_1024-NEXT: sunpkhi 
[[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h @@ -250,11 +250,11 @@ ; HALF VECTOR: ; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128 +; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 ; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 -; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h ; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h @@ -277,11 +277,11 @@ ; FULL VECTOR: ; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl256 +; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 ; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 -; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b ; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h @@ -314,13 +314,14 @@ ; CHECK: sshll v1.4s, v1.4h, #0 ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 -; CHECK-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP1:z[0-9]+]].s, [[OP2:z[0-9]+]].s -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[3] +; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2:z[0-9]+]].s, [[OP1:z[0-9]+]].s +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[3] ; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: mov v0.h[3], w10 +; CHECK-NEXT: mov v0.h[3], w8 ; CHECK: ret %res = sdiv <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -343,11 +344,11 @@ ; FULL VECTOR: ; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16 +; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 ; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h +; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h ; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s @@ -357,11 +358,11 @@ ; HALF VECTOR OR SMALLER: ; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16 +; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 ; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h +; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_512-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s ; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h ; 
VBITS_GE_512-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0] @@ -378,11 +379,11 @@ ; FULL VECTOR: ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32 +; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 ; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 -; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h +; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h ; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s @@ -392,11 +393,11 @@ ; HALF VECTOR OR SMALLER: ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32 +; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 ; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 -; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h +; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_1024-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h ; VBITS_GE_1024-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0] @@ -413,11 +414,11 @@ ; FULL VECTOR: ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64 +; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 ; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 -; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h +; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h ; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s @@ -427,11 +428,11 @@ ; HALF VECTOR OR SMALLER: ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64 +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 ; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 -; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h +; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_2048-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0] @@ -446,11 +447,11 @@ define void @sdiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { ; CHECK-LABEL: sdiv_v128i16: ; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].h, vl128 +; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 ; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 -; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h +; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, 
[[OP1]].h ; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h ; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s @@ -639,22 +640,22 @@ ; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h ; CHECK-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG0]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s ; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[DIV]].h, [[DIV]].h -; CHECK-NEXT: umov [[SCALAR:w[0-9]+]], [[VEC:v[0-9]+]].h[0] -; CHECK-NEXT: fmov s0, [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[1] -; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[2] -; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[3] -; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[4] -; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[5] -; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[6] -; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[7] -; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR]] +; CHECK-NEXT: umov [[SCALAR0:w[0-9]+]], [[VEC:v[0-9]+]].h[0] +; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC]].h[1] +; CHECK-NEXT: fmov s0, [[SCALAR0]] +; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[2] +; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR1]] +; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR2]] +; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[3] +; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR3]] +; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[4] +; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR4]] +; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[5] +; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR5]] +; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[6] +; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR6]] +; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7] +; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]] ; CHECK: ret %res = udiv <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -695,11 +696,11 @@ ; FULL VECTOR: ; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32 +; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 ; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b ; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h @@ -721,11 +722,11 @@ ; HALF VECTOR: ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32 +; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 ; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 -; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_EQ_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h ; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h @@ -738,11 +739,11 @@ ; QUARTER VECTOR OR SMALLER: ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32 +; 
VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 ; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 -; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b +; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h ; VBITS_GE_1024-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s @@ -762,11 +763,11 @@ ; FULL VECTOR: ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64 +; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 ; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 -; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b ; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h @@ -788,11 +789,11 @@ ; HALF VECTOR: ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64 +; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 ; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 -; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h ; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h @@ -805,11 +806,11 @@ ; QUARTER VECTOR OR SMALLER: ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64 +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 ; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 -; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b +; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h ; VBITS_GE_2048-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s @@ -829,11 +830,11 @@ ; FULL VECTOR: ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128 +; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 ; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 -; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b ; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h @@ -855,11 +856,11 @@ ; HALF VECTOR: ; 
VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128 +; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 ; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 -; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h ; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h @@ -880,11 +881,11 @@ define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { ; CHECK-LABEL: udiv_v256i8: ; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl256 +; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 ; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 -; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b ; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h @@ -917,13 +918,14 @@ ; CHECK: ushll v1.4s, v1.4h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 -; CHECK-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP1:z[0-9]+]].s, [[OP2:z[0-9]+]].s -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[3] +; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2:z[0-9]+]].s, [[OP1:z[0-9]+]].s +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[3] ; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: mov v0.h[3], w10 +; CHECK-NEXT: mov v0.h[3], w8 ; CHECK: ret %res = udiv <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -946,11 +948,11 @@ ; FULL VECTOR: ; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16 +; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 ; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h +; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h ; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s @@ -960,11 +962,11 @@ ; HALF VECTOR OR SMALLER: ; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16 +; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 ; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 -; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h +; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_512-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s ; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h ; 
VBITS_GE_512-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0] @@ -981,11 +983,11 @@ ; FULL VECTOR: ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32 +; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 ; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 -; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h +; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h ; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s @@ -995,11 +997,11 @@ ; HALF VECTOR OR SMALLER: ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32 +; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 ; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 -; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h +; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_1024-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h ; VBITS_GE_1024-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0] @@ -1016,11 +1018,11 @@ ; FULL VECTOR: ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64 +; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 ; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 -; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h +; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h ; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s @@ -1030,11 +1032,11 @@ ; HALF VECTOR OR SMALLER: ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64 +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 ; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64 -; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h +; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_2048-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP1:z[0-9]+]].h }, [[PG1]], [x0] @@ -1049,11 +1051,11 @@ define void @udiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { ; CHECK-LABEL: udiv_v128i16: ; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].h, vl128 +; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64 ; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64 -; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h +; VBITS_EQ_2048-NEXT: uunpkhi 
[[OP1_HI:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h ; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_EQ_2048-NEXT: udivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s @@ -1233,8 +1235,8 @@ define void @udiv_constantsplat_v8i32(<8 x i32>* %a) #1 { ; CHECK-LABEL: udiv_constantsplat_v8i32: ; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 -; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: mov [[OP2:z[0-9]+]].s, #95 +; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s ; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -81,8 +81,8 @@ define void @sext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 { ; CHECK-LABEL: sext_v32i8_v32i16: ; VBITS_GE_512: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].h, vl32 +; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret %a = load <32 x i8>, <32 x i8>* %in @@ -95,8 +95,8 @@ define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 { ; CHECK-LABEL: sext_v64i8_v64i16: ; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].h, vl64 +; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret %a = load <64 x i8>, <64 x i8>* %in @@ -109,8 +109,8 @@ define void @sext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 { ; CHECK-LABEL: sext_v128i8_v128i16: ; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].h, vl128 +; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <128 x i8>, <128 x i8>* %in @@ -163,9 +163,9 @@ define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 { ; CHECK-LABEL: sext_v32i8_v32i32: ; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b +; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32 ; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32 ; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret %a = load <32 x i8>, <32 x i8>* %in @@ -178,9 +178,9 @@ define void @sext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 { ; CHECK-LABEL: sext_v64i8_v64i32: ; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b +; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64 ; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64 ; VBITS_GE_2048-NEXT: 
st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <64 x i8>, <64 x i8>* %in @@ -239,10 +239,10 @@ define void @sext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: sext_v32i8_v32i64: ; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b +; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i8>, <32 x i8>* %in @@ -270,8 +270,8 @@ define void @sext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 { ; CHECK-LABEL: sext_v16i16_v16i32: ; VBITS_GE_512: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].s, vl16 +; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret %a = load <16 x i16>, <16 x i16>* %in @@ -284,8 +284,8 @@ define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 { ; CHECK-LABEL: sext_v32i16_v32i32: ; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret %a = load <32 x i16>, <32 x i16>* %in @@ -298,8 +298,8 @@ define void @sext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 { ; CHECK-LABEL: sext_v64i16_v64i32: ; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <64 x i16>, <64 x i16>* %in @@ -340,9 +340,9 @@ define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 { ; CHECK-LABEL: sext_v16i16_v16i64: ; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h +; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret %a = load <16 x i16>, <16 x i16>* %in @@ -355,9 +355,9 @@ define void @sext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: sext_v32i16_v32i64: ; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h +; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i16>, <32 x i16>* %in @@ -385,8 +385,8 @@ define void @sext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 { 
; CHECK-LABEL: sext_v8i32_v8i64: ; VBITS_GE_512: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret %a = load <8 x i32>, <8 x i32>* %in @@ -399,8 +399,8 @@ define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 { ; CHECK-LABEL: sext_v16i32_v16i64: ; VBITS_GE_1024: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret %a = load <16 x i32>, <16 x i32>* %in @@ -413,8 +413,8 @@ define void @sext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: sext_v32i32_v32i64: ; VBITS_GE_2048: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i32>, <32 x i32>* %in @@ -443,8 +443,8 @@ define void @zext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 { ; CHECK-LABEL: zext_v32i8_v32i16: ; VBITS_GE_512: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].h, vl32 +; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret %a = load <32 x i8>, <32 x i8>* %in @@ -457,8 +457,8 @@ define void @zext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 { ; CHECK-LABEL: zext_v64i8_v64i16: ; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].h, vl64 +; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret %a = load <64 x i8>, <64 x i8>* %in @@ -471,8 +471,8 @@ define void @zext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 { ; CHECK-LABEL: zext_v128i8_v128i16: ; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].h, vl128 +; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <128 x i8>, <128 x i8>* %in @@ -525,9 +525,9 @@ define void @zext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 { ; CHECK-LABEL: zext_v32i8_v32i32: ; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b +; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32 ; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32 ; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; 
VBITS_GE_1024-NEXT: ret %a = load <32 x i8>, <32 x i8>* %in @@ -540,9 +540,9 @@ define void @zext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 { ; CHECK-LABEL: zext_v64i8_v64i32: ; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b +; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64 ; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64 ; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <64 x i8>, <64 x i8>* %in @@ -601,10 +601,10 @@ define void @zext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: zext_v32i8_v32i64: ; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b +; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i8>, <32 x i8>* %in @@ -632,8 +632,8 @@ define void @zext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 { ; CHECK-LABEL: zext_v16i16_v16i32: ; VBITS_GE_512: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].s, vl16 +; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret %a = load <16 x i16>, <16 x i16>* %in @@ -646,8 +646,8 @@ define void @zext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 { ; CHECK-LABEL: zext_v32i16_v32i32: ; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret %a = load <32 x i16>, <32 x i16>* %in @@ -660,8 +660,8 @@ define void @zext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 { ; CHECK-LABEL: zext_v64i16_v64i32: ; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <64 x i16>, <64 x i16>* %in @@ -702,9 +702,9 @@ define void @zext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 { ; CHECK-LABEL: zext_v16i16_v16i64: ; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h +; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret %a = load <16 x i16>, <16 x i16>* %in @@ -717,9 +717,9 @@ define void @zext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: zext_v32i16_v32i64: ; 
VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h +; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i16>, <32 x i16>* %in @@ -747,8 +747,8 @@ define void @zext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 { ; CHECK-LABEL: zext_v8i32_v8i64: ; VBITS_GE_512: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret %a = load <8 x i32>, <8 x i32>* %in @@ -761,8 +761,8 @@ define void @zext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 { ; CHECK-LABEL: zext_v16i32_v16i64: ; VBITS_GE_1024: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret %a = load <16 x i32>, <16 x i32>* %in @@ -775,8 +775,8 @@ define void @zext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: zext_v32i32_v32i64: ; VBITS_GE_2048: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret %a = load <32 x i32>, <32 x i32>* %in diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll @@ -15,8 +15,8 @@ ; CHECK-LABEL: add_v64i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: mov z1.b, #7 // =0x7 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: add z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -32,8 +32,8 @@ ; CHECK-LABEL: add_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: mov z1.h, #15 // =0xf +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: add z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -49,8 +49,8 @@ ; CHECK-LABEL: add_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z1.s, #31 // =0x1f +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: add z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -66,8 +66,8 @@ ; CHECK-LABEL: add_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: mov z1.d, #63 // =0x3f +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: add z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -367,8 +367,8 @@ ; 
CHECK-LABEL: mul_v64i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: mov z1.b, #7 // =0x7 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -384,8 +384,8 @@ ; CHECK-LABEL: mul_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: mov z1.h, #15 // =0xf +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -401,8 +401,8 @@ ; CHECK-LABEL: mul_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z1.s, #31 // =0x1f +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -418,8 +418,8 @@ ; CHECK-LABEL: mul_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: mov z1.d, #63 // =0x3f +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -575,8 +575,8 @@ ; CHECK-LABEL: smax_v64i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: mov z1.b, #7 // =0x7 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -592,8 +592,8 @@ ; CHECK-LABEL: smax_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: mov z1.h, #15 // =0xf +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -609,8 +609,8 @@ ; CHECK-LABEL: smax_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z1.s, #31 // =0x1f +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -626,8 +626,8 @@ ; CHECK-LABEL: smax_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: mov z1.d, #63 // =0x3f +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -647,8 +647,8 @@ ; CHECK-LABEL: smin_v64i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: mov z1.b, #7 // =0x7 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -664,8 +664,8 @@ ; CHECK-LABEL: smin_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: mov z1.h, #15 // =0xf +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -681,8 +681,8 @@ ; CHECK-LABEL: smin_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z1.s, #31 // =0x1f +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -698,8 +698,8 @@ ; CHECK-LABEL: smin_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ld1d { z0.d }, p0/z, 
[x0] ; CHECK-NEXT: mov z1.d, #63 // =0x3f +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -719,8 +719,8 @@ ; CHECK-LABEL: sub_v64i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: mov z1.b, #7 // =0x7 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: sub z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -736,8 +736,8 @@ ; CHECK-LABEL: sub_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: mov z1.h, #15 // =0xf +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: sub z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -753,8 +753,8 @@ ; CHECK-LABEL: sub_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z1.s, #31 // =0x1f +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: sub z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -770,8 +770,8 @@ ; CHECK-LABEL: sub_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: mov z1.d, #63 // =0x3f +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: sub z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -791,8 +791,8 @@ ; CHECK-LABEL: umax_v64i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: mov z1.b, #7 // =0x7 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -808,8 +808,8 @@ ; CHECK-LABEL: umax_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: mov z1.h, #15 // =0xf +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -825,8 +825,8 @@ ; CHECK-LABEL: umax_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z1.s, #31 // =0x1f +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -842,8 +842,8 @@ ; CHECK-LABEL: umax_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: mov z1.d, #63 // =0x3f +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -863,8 +863,8 @@ ; CHECK-LABEL: umin_v64i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: mov z1.b, #7 // =0x7 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -880,8 +880,8 @@ ; CHECK-LABEL: umin_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: mov z1.h, #15 // =0xf +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -897,8 +897,8 @@ ; CHECK-LABEL: umin_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: mov z1.s, #31 // =0x1f +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: umin z0.s, p0/m, 
z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -914,8 +914,8 @@ ; CHECK-LABEL: umin_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: mov z1.d, #63 // =0x3f +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll @@ -39,10 +39,10 @@ ; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b ; CHECK-NEXT: ushr v1.8h, v0.8h, #8 ; CHECK-NEXT: umov w8, v1.h[0] +; CHECK-NEXT: umov w9, v1.h[1] ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v0.b[1], w8 ; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v0.b[1], w9 ; CHECK-NEXT: mov v0.b[2], w8 ; CHECK-NEXT: umov w8, v1.h[3] ; CHECK-NEXT: mov v0.b[3], w8 @@ -210,13 +210,14 @@ ; CHECK-LABEL: smulh_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-NEXT: ushr v0.4s, v0.4s, #16 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[3] +; CHECK-NEXT: ushr v1.4s, v0.4s, #16 +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[3] ; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: mov v0.h[3], w10 +; CHECK-NEXT: mov v0.h[3], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %insert = insertelement <4 x i32> undef, i32 16, i64 0 @@ -666,10 +667,10 @@ ; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b ; CHECK-NEXT: ushr v1.8h, v0.8h, #8 ; CHECK-NEXT: umov w8, v1.h[0] +; CHECK-NEXT: umov w9, v1.h[1] ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v0.b[1], w8 ; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v0.b[1], w9 ; CHECK-NEXT: mov v0.b[2], w8 ; CHECK-NEXT: umov w8, v1.h[3] ; CHECK-NEXT: mov v0.b[3], w8 @@ -837,13 +838,14 @@ ; CHECK-LABEL: umulh_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h -; CHECK-NEXT: ushr v0.4s, v0.4s, #16 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[3] +; CHECK-NEXT: ushr v1.4s, v0.4s, #16 +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[3] ; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: mov v0.h[3], w10 +; CHECK-NEXT: mov v0.h[3], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %insert = insertelement <4 x i32> undef, i32 16, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -34,22 +34,22 @@ ; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h ; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h -; CHECK-NEXT: umov [[SCALAR:w[0-9]+]], [[VEC:v[0-9]+]].h[0] -; CHECK-NEXT: fmov s3, [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[1] -; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[2] -; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[3] -; 
CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[4] -; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[5] -; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[6] -; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR]] -; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[7] -; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR]] +; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].h[0] +; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[1] +; CHECK-NEXT: fmov s3, [[SCALAR1]] +; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[2] +; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR2]] +; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR3]] +; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[3] +; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR4]] +; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[4] +; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR5]] +; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[5] +; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR6]] +; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[6] +; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR7]] +; CHECK-NEXT: umov [[SCALAR8:w[0-9]+]], [[VEC]].h[7] +; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR8]] ; CHECK-NEXT: mls v0.8b, [[FINAL]].8b, v1.8b ; CHECK: ret %res = srem <8 x i8> %op1, %op2 @@ -76,9 +76,9 @@ ; QUARTER VECTOR OR SMALLER: ; VBITS_GE_512: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b ; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b +; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl16 ; VBITS_GE_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_GE_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h -; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl16 ; VBITS_GE_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h ; VBITS_GE_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b @@ -93,11 +93,11 @@ ; FULL VECTOR: ; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32 +; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 ; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8 -; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b ; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h @@ -121,11 +121,11 @@ ; HALF VECTOR: ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32 +; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 ; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1] -; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16 -; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b +; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b ; VBITS_EQ_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h ; VBITS_EQ_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h ; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h @@ -140,14 +140,14 @@ ; QUARTER VECTOR OR SMALLER: ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32 +; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32 ; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0] ; 
 ; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
+; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
 ; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_1024-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
+; VBITS_GE_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
 ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
 ; VBITS_GE_1024-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b
@@ -166,11 +166,11 @@
 ; FULL VECTOR:
 ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64
+; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
 ; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
@@ -194,11 +194,11 @@
 ; HALF VECTOR:
 ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64
+; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
 ; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
 ; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
 ; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
@@ -213,14 +213,14 @@
 ; QUARTER VECTOR OR SMALLER:
 ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64
+; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
 ; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
+; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
 ; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_2048-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
+; VBITS_GE_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
 ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
 ; VBITS_GE_2048-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b
@@ -239,11 +239,11 @@
 ; FULL VECTOR:
 ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128
+; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
 ; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
 ; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
@@ -267,11 +267,11 @@
 ; HALF VECTOR:
 ; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128
+; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
 ; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
 ; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
 ; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
@@ -296,11 +296,11 @@
 ; FULL VECTOR:
 ; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].b, vl256
+; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
 ; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
 ; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
 ; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
@@ -338,11 +338,12 @@
 ; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, z2.s, z3.s
 ; CHECK-NEXT: mov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].s[1]
 ; CHECK-NEXT: mov [[SCALAR2:w[0-9]+]], [[VEC]].s[2]
+; CHECK-NEXT: mov [[VEC2:v[0-9]+]].16b, [[VEC]].16b
+; CHECK-NEXT: mov [[VEC2]].h[1], [[SCALAR1]]
 ; CHECK-NEXT: mov [[SCALAR3:w[0-9]+]], [[VEC]].s[3]
-; CHECK-NEXT: mov [[VEC]].h[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[VEC]].h[2], [[SCALAR2]]
-; CHECK-NEXT: mov [[VEC]].h[3], [[SCALAR3]]
-; CHECK-NEXT: mls v0.4h, [[VEC]].4h, v1.4h
+; CHECK-NEXT: mov [[VEC2]].h[2], [[SCALAR2]]
+; CHECK-NEXT: mov [[VEC2]].h[3], [[SCALAR3]]
+; CHECK-NEXT: mls v0.4h, [[VEC2]].4h, v1.4h
 ; CHECK: ret
 %res = srem <4 x i16> %op1, %op2
 ret <4 x i16> %res
@@ -350,9 +351,9 @@
 define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK-LABEL: srem_v8i16:
-; CHECK: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
+; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
+; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
 ; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8
 ; CHECK-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO]].s, [[OP1_LO]].s
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h
@@ -366,15 +367,16 @@
 ; FULL VECTOR:
 ; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
+; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
 ; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_256-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
+; VBITS_EQ_256-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_256-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
+; VBITS_EQ_256-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
 ; VBITS_EQ_256-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_EQ_256-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
+; VBITS_EQ_256-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
 ; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
 ; VBITS_EQ_256-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_EQ_256-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -382,12 +384,12 @@
 ; HALF VECTOR OR SMALLER:
 ; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
+; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
 ; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_512-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
+; VBITS_GE_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
 ; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; VBITS_GE_512-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_GE_512-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -405,15 +407,16 @@
 ; FULL VECTOR:
 ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32
+; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
+; VBITS_EQ_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
+; VBITS_EQ_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
 ; VBITS_EQ_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_EQ_512-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
+; VBITS_EQ_512-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
 ; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
 ; VBITS_EQ_512-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_EQ_512-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -421,12 +424,12 @@
 ; HALF VECTOR OR SMALLER:
 ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
+; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
 ; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
 ; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_1024-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
+; VBITS_GE_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
 ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; VBITS_GE_1024-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_GE_1024-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -442,15 +445,16 @@
 define void @srem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
 ; CHECK-LABEL: srem_v64i16:
 ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64
+; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
 ; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
+; VBITS_EQ_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
+; VBITS_EQ_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
 ; VBITS_EQ_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_EQ_1024-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
+; VBITS_EQ_1024-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
 ; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
 ; VBITS_EQ_1024-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_EQ_1024-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -458,12 +462,12 @@
 ; HALF VECTOR OR SMALLER:
 ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
+; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
 ; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
 ; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_2048-NEXT: sdiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
+; VBITS_GE_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
 ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; VBITS_GE_2048-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_GE_2048-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -479,15 +483,16 @@
 define void @srem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 ; CHECK-LABEL: srem_v128i16:
 ; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].h, vl128
+; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
 ; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 ; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
+; VBITS_EQ_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_2048-NEXT: sdivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
+; VBITS_EQ_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
 ; VBITS_EQ_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: sdiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_EQ_2048-NEXT: movprfx [[OP3_LO:z[0-9]+]], [[OP1_LO]]
+; VBITS_EQ_2048-NEXT: sdiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP3_LO]].s, [[OP2_LO]].s
 ; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
 ; VBITS_EQ_2048-NEXT: mul [[MUL:z[0-9]+]].h, [[PG]]/m, [[OP2]].h, [[ZIP]].h
 ; VBITS_EQ_2048-NEXT: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[MUL]].h
@@ -708,22 +713,22 @@
 ; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
 ; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
-; CHECK-NEXT: umov [[SCALAR:w[0-9]+]], [[VEC:v[0-9]+]].h[0]
-; CHECK-NEXT: fmov s3, [[SCALAR]]
-; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[1]
-; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR]]
-; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[2]
-; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR]]
-; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[3]
-; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR]]
-; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[4]
-; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR]]
-; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[5]
-; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR]]
-; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[6]
-; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR]]
-; CHECK-NEXT: umov [[SCALAR]], [[VEC]].h[7]
-; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR]]
+; CHECK-NEXT: umov [[SCALAR0:w[0-9]+]], [[VEC:v[0-9]+]].h[0]
+; CHECK-NEXT: umov [[SCALAR1:w[0-9]+]], [[VEC]].h[1]
+; CHECK-NEXT: fmov s3, [[SCALAR0]]
+; CHECK-NEXT: umov [[SCALAR2:w[0-9]+]], [[VEC]].h[2]
+; CHECK-NEXT: mov [[FINAL:v[0-9]+]].b[1], [[SCALAR1]]
+; CHECK-NEXT: mov [[FINAL]].b[2], [[SCALAR2]]
+; CHECK-NEXT: umov [[SCALAR3:w[0-9]+]], [[VEC]].h[3]
+; CHECK-NEXT: mov [[FINAL]].b[3], [[SCALAR3]]
+; CHECK-NEXT: umov [[SCALAR4:w[0-9]+]], [[VEC]].h[4]
+; CHECK-NEXT: mov [[FINAL]].b[4], [[SCALAR4]]
+; CHECK-NEXT: umov [[SCALAR5:w[0-9]+]], [[VEC]].h[5]
+; CHECK-NEXT: mov [[FINAL]].b[5], [[SCALAR5]]
+; CHECK-NEXT: umov [[SCALAR6:w[0-9]+]], [[VEC]].h[6]
+; CHECK-NEXT: mov [[FINAL]].b[6], [[SCALAR6]]
+; CHECK-NEXT: umov [[SCALAR7:w[0-9]+]], [[VEC]].h[7]
+; CHECK-NEXT: mov [[FINAL]].b[7], [[SCALAR7]]
 ; CHECK-NEXT: mls v0.8b, [[FINAL]].8b, v1.8b
 ; CHECK: ret
 %res = urem <8 x i8> %op1, %op2
@@ -750,9 +755,9 @@
 ; QUARTER VECTOR OR SMALLER:
 ; VBITS_GE_512: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
 ; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
+; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
 ; VBITS_GE_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
 ; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; VBITS_GE_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
@@ -767,11 +772,11 @@
 ; FULL VECTOR:
 ; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].b, vl32
+; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
 ; VBITS_EQ_256-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_EQ_256-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
 ; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
@@ -795,11 +800,11 @@
 ; HALF VECTOR:
 ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32
+; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
 ; VBITS_EQ_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
 ; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
@@ -814,14 +819,14 @@
 ; QUARTER VECTOR OR SMALLER:
 ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32
+; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
 ; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
+; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
 ; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_1024-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
+; VBITS_GE_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
 ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
 ; VBITS_GE_1024-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b
@@ -840,11 +845,11 @@
 ; FULL VECTOR:
 ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl64
+; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_EQ_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_EQ_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
 ; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
@@ -868,11 +873,11 @@
 ; HALF VECTOR:
 ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64
+; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
 ; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
 ; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
 ; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
@@ -887,14 +892,14 @@
 ; QUARTER VECTOR OR SMALLER:
 ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64
+; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
 ; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
+; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
 ; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
-; VBITS_GE_2048-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
+; VBITS_GE_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s
 ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b
 ; VBITS_GE_2048-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b
@@ -913,14 +918,14 @@
 ; FULL VECTOR:
 ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl128
+; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
 ; VBITS_EQ_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_EQ_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
 ; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
@@ -941,11 +946,11 @@
 ; HALF VECTOR:
 ; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128
+; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
 ; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
-; VBITS_EQ_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
 ; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
 ; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
@@ -968,11 +973,11 @@
 define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
 ; CHECK-LABEL: urem_v256i8:
 ; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].b, vl256
+; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
 ; VBITS_EQ_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
 ; VBITS_EQ_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
+; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
 ; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
 ; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
@@ -1010,11 +1015,12 @@
 ; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, z2.s, z3.s
 ; CHECK-NEXT: mov [[SCALAR1:w[0-9]+]], [[VEC:v[0-9]+]].s[1]
 ; CHECK-NEXT: mov [[SCALAR2:w[0-9]+]], [[VEC]].s[2]
+; CHECK-NEXT: mov v3.16b, v2.16b
+; CHECK-NEXT: mov [[VECO:v[0-9]+]].h[1], [[SCALAR1]]
 ; CHECK-NEXT: mov [[SCALAR3:w[0-9]+]], [[VEC]].s[3]
-; CHECK-NEXT: mov [[VEC]].h[1], [[SCALAR1]]
-; CHECK-NEXT: mov [[VEC]].h[2], [[SCALAR2]]
-; CHECK-NEXT: mov [[VEC]].h[3], [[SCALAR3]]
-; CHECK-NEXT: mls v0.4h, [[VEC]].4h, v1.4h
+; CHECK-NEXT: mov [[VECO]].h[2], [[SCALAR2]]
+; CHECK-NEXT: mov [[VECO]].h[3], [[SCALAR3]]
+; CHECK-NEXT: mls v0.4h, [[VECO]].4h, v1.4h
 ; CHECK: ret
 %res = urem <4 x i16> %op1, %op2
 ret <4 x i16> %res
@@ -1022,9 +1028,9 @@
 define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
 ; CHECK-LABEL: urem_v8i16:
-; CHECK: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
+; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
+; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
 ; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl8
 ; CHECK-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG1]]/m, [[OP2_LO]].s, [[OP1_LO]].s
 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; CHECK-NEXT: mls v0.8h, v2.8h, v1.8h
@@ -1038,15 +1044,16 @@
 ; FULL VECTOR:
 ; VBITS_EQ_256: ptrue [[PG1:p[0-9]+]].h, vl16
+; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
 ; VBITS_EQ_256-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_EQ_256-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl8
-; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_256-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
+; VBITS_EQ_256-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_256-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
+; VBITS_EQ_256-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
 ; VBITS_EQ_256-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_256-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_EQ_256-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
+; VBITS_EQ_256-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
 ; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
 ; VBITS_EQ_256-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_EQ_256-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -1054,12 +1061,12 @@
 ; HALF VECTOR OR SMALLER:
 ; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
+; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
 ; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_512-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
+; VBITS_GE_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
 ; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; VBITS_GE_512-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_GE_512-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -1077,15 +1084,16 @@
 ; FULL VECTOR:
 ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].h, vl32
+; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_EQ_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_EQ_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
-; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
+; VBITS_EQ_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
+; VBITS_EQ_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
 ; VBITS_EQ_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_512-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_EQ_512-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
+; VBITS_EQ_512-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
 ; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
 ; VBITS_EQ_512-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_EQ_512-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -1093,12 +1101,12 @@
 ; HALF VECTOR OR SMALLER:
 ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
+; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
 ; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
 ; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_1024-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
+; VBITS_GE_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
 ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; VBITS_GE_1024-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_GE_1024-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -1114,15 +1122,16 @@
 define void @urem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
 ; CHECK-LABEL: urem_v64i16:
 ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].h, vl64
+; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
 ; VBITS_EQ_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_EQ_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_EQ_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
-; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
+; VBITS_EQ_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_HI]].s, [[OP1_HI]].s
+; VBITS_EQ_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
 ; VBITS_EQ_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_1024-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_EQ_1024-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]]
+; VBITS_EQ_1024-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s
 ; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h
 ; VBITS_EQ_1024-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_EQ_1024-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -1130,12 +1139,12 @@
 ; HALF VECTOR OR SMALLER:
 ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
+; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
 ; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
-; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
 ; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
-; VBITS_GE_2048-NEXT: udiv [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
+; VBITS_GE_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s
 ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h
 ; VBITS_GE_2048-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h
 ; VBITS_GE_2048-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h
@@ -1151,15 +1160,16 @@
 define void @urem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
 ; CHECK-LABEL: urem_v128i16:
 ; VBITS_EQ_2048: ptrue [[PG:p[0-9]+]].h, vl128
+; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
 ; VBITS_EQ_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 ; VBITS_EQ_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
-; VBITS_EQ_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl64
-; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
-; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
+; VBITS_EQ_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
 ; VBITS_EQ_2048-NEXT: udivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
+; VBITS_EQ_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
 ; VBITS_EQ_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
-; VBITS_EQ_2048-NEXT: udiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
+; VBITS_EQ_2048-NEXT: movprfx [[RES_LO:z[0-9]+]], [[OP1_LO]]
+; VBITS_EQ_2048-NEXT: udiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[RES_LO]].s, [[OP2_LO]].s
 ; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
 ; VBITS_EQ_2048-NEXT: mul [[MUL:z[0-9]+]].h, [[PG]]/m, [[OP2]].h, [[ZIP]].h
 ; VBITS_EQ_2048-NEXT: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[MUL]].h
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
@@ -22,7 +22,7 @@
 ; Don't use SVE for 64-bit vectors.
 define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) #0 {
-; CHECK: select_v8i8:
+; CHECK-LABEL: select_v8i8:
 ; CHECK: tst w0, #0x1
 ; CHECK-NEXT: csetm w8, ne
 ; CHECK-NEXT: dup v2.8b, w8
@@ -34,7 +34,7 @@
 ; Don't use SVE for 128-bit vectors.
 define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) #0 {
-; CHECK: select_v16i8:
+; CHECK-LABEL: select_v16i8:
 ; CHECK: tst w0, #0x1
 ; CHECK-NEXT: csetm w8, ne
 ; CHECK-NEXT: dup v2.16b, w8
@@ -45,14 +45,14 @@
 }
 define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) #0 {
-; CHECK: select_v32i8:
-; CHECK: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
-; CHECK-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v32i8:
+; CHECK: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
 ; CHECK-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; CHECK-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
+; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].b
 ; CHECK-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
 ; CHECK-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].b
 ; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
 ; CHECK-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
 ; CHECK-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
@@ -65,14 +65,14 @@
 }
 define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, i1 %mask) #0 {
-; CHECK: select_v64i8:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
-; VBITS_GE_512-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v64i8:
+; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
+; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
 ; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
+; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].b
 ; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
 ; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].b
 ; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
 ; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
 ; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
@@ -85,14 +85,14 @@
 }
 define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) #0 {
-; CHECK: select_v128i8:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
-; VBITS_GE_1024-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v128i8:
+; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
+; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
 ; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
+; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].b
 ; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
 ; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].b
 ; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
 ; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
 ; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
@@ -105,14 +105,14 @@
 }
 define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) #0 {
-; CHECK: select_v256i8:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
-; VBITS_GE_2048-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v256i8:
+; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
+; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
 ; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
+; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].b
 ; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
 ; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].b
 ; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
 ; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
 ; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
@@ -126,7 +126,7 @@
 ; Don't use SVE for 64-bit vectors.
 define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) #0 {
-; CHECK: select_v4i16:
+; CHECK-LABEL: select_v4i16:
 ; CHECK: tst w0, #0x1
 ; CHECK-NEXT: csetm w8, ne
 ; CHECK-NEXT: dup v2.4h, w8
@@ -138,7 +138,7 @@
 ; Don't use SVE for 128-bit vectors.
 define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) #0 {
-; CHECK: select_v8i16:
+; CHECK-LABEL: select_v8i16:
 ; CHECK: tst w0, #0x1
 ; CHECK-NEXT: csetm w8, ne
 ; CHECK-NEXT: dup v2.8h, w8
@@ -149,14 +149,14 @@
 }
 define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) #0 {
-; CHECK: select_v16i16:
-; CHECK: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
-; CHECK-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v16i16:
+; CHECK: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
 ; CHECK-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; CHECK-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
+; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h
 ; CHECK-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
 ; CHECK-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h
 ; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
 ; CHECK-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
 ; CHECK-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
@@ -169,14 +169,14 @@
 }
 define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, i1 %mask) #0 {
-; CHECK: select_v32i16:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
-; VBITS_GE_512-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v32i16:
+; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
+; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
 ; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
+; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h
 ; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
 ; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h
 ; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
 ; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
 ; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
@@ -189,14 +189,14 @@
 }
 define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) #0 {
-; CHECK: select_v64i16:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
-; VBITS_GE_1024-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v64i16:
+; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
+; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
 ; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
+; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].h
 ; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
 ; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].h
 ; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
 ; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
 ; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
@@ -209,14 +209,14 @@
 }
 define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) #0 {
-; CHECK: select_v128i16:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
-; VBITS_GE_2048-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v128i16:
+; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
+; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
 ; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
+; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].h
 ; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
 ; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].h
 ; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
 ; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
 ; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
@@ -230,7 +230,7 @@
 ; Don't use SVE for 64-bit vectors.
 define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) #0 {
-; CHECK: select_v2i32:
+; CHECK-LABEL: select_v2i32:
 ; CHECK: tst w0, #0x1
 ; CHECK-NEXT: csetm w8, ne
 ; CHECK-NEXT: dup v2.2s, w8
@@ -242,7 +242,7 @@
 ; Don't use SVE for 128-bit vectors.
 define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) #0 {
-; CHECK: select_v4i32:
+; CHECK-LABEL: select_v4i32:
 ; CHECK: tst w0, #0x1
 ; CHECK-NEXT: csetm w8, ne
 ; CHECK-NEXT: dup v2.4s, w8
@@ -253,14 +253,14 @@
 }
 define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) #0 {
-; CHECK: select_v8i32:
-; CHECK: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
-; CHECK-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v8i32:
+; CHECK: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
 ; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
 ; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
+; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s
 ; CHECK-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
 ; CHECK-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
-; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s
 ; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
 ; CHECK-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
 ; CHECK-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
@@ -273,14 +273,14 @@
 }
 define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, i1 %mask) #0 {
-; CHECK: select_v16i32:
-; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
-; VBITS_GE_512-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v16i32:
+; VBITS_GE_512: and w[[AND:[0-9]+]], w2, #0x1
+; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
 ; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
+; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
 ; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
 ; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
-; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
 ; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
 ; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
 ; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
@@ -293,14 +293,14 @@
 }
 define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) #0 {
-; CHECK: select_v32i32:
-; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
-; VBITS_GE_1024-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v32i32:
+; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
+; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
 ; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
+; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
 ; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
 ; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
 ; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
 ; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
-; VBITS_GE_1024-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v16i64:
+; VBITS_GE_1024: and w[[AND:[0-9]+]], w2, #0x1
+; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
 ; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
+; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
 ; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
 ; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
-; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
 ; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
 ; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
 ; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
@@ -417,14 +417,14 @@
 }
 define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, i1 %mask) #0 {
-; CHECK: select_v32i64:
-; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
-; VBITS_GE_2048-NEXT: and w[[AND:[0-9]+]], w2, #0x1
+; CHECK-LABEL: select_v32i64:
+; VBITS_GE_2048: and w[[AND:[0-9]+]], w2, #0x1
+; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
 ; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
+; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
 ; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
 ; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
-; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
 ; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
 ; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
 ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -169,18 +169,18 @@
 ; VBITS_EQ_256-NEXT: .cfi_offset w30, -8
 ; VBITS_EQ_256-NEXT: .cfi_offset w29, -16
 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: mov x8, sp
+; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x8]
+; VBITS_EQ_256-NEXT: mov x8, #8
 ; VBITS_EQ_256-NEXT: ldp q0, q1, [sp]
 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT: mov x8, #8
 ; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h
 ; VBITS_EQ_256-NEXT: ucvtf z0.s, p0/m, z0.s
+; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT: ucvtf z1.s, p0/m, z1.s
 ; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT: mov sp, x29
 ; VBITS_EQ_256-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; VBITS_EQ_256-NEXT: ret
@@ -284,17 +284,17 @@
 ; VBITS_EQ_256-LABEL: ucvtf_v8i16_v8f64:
 ; VBITS_EQ_256: // %bb.0:
 ; VBITS_EQ_256-NEXT: ldr q0, [x0]
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: uunpklo z1.s, z0.h
-; VBITS_EQ_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
 ; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d
 ; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d
-; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1]
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d
+; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: ucvtf_v8i16_v8f64:
@@ -396,12 +396,12 @@
 ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: ptrue p0.s
-; VBITS_EQ_256-NEXT: ptrue p1.h, vl8
 ; VBITS_EQ_256-NEXT: ucvtf z0.h, p0/m, z0.s
 ; VBITS_EQ_256-NEXT: ucvtf z1.h, p0/m, z1.s
 ; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_EQ_256-NEXT: splice z1.h, p1, z1.h, z0.h
+; VBITS_EQ_256-NEXT: ptrue p0.h, vl8
+; VBITS_EQ_256-NEXT: splice z1.h, p0, z1.h, z0.h
 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
 ; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1]
 ; VBITS_EQ_256-NEXT: ret
@@ -412,8 +412,8 @@
 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.s
 ; VBITS_GE_512-NEXT: ucvtf z0.h, p0/m, z0.s
-; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
 ; VBITS_GE_512-NEXT: ret
 %op1 = load <16 x i32>, <16 x i32>* %a
@@ -429,8 +429,8 @@
 ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p0.s
 ; VBITS_GE_1024-NEXT: ucvtf z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
+; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
 ; VBITS_GE_1024-NEXT: ret
 %op1 = load <32 x i32>, <32 x i32>* %a
@@ -446,8 +446,8 @@
 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.s
 ; VBITS_GE_2048-NEXT: ucvtf z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
+; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
 ; VBITS_GE_2048-NEXT: ret
 %op1 = load <64 x i32>, <64 x i32>* %a
@@ -601,18 +601,18 @@
 ; VBITS_EQ_256-NEXT: .cfi_offset w30, -8
 ; VBITS_EQ_256-NEXT: .cfi_offset w29, -16
 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: mov x8, sp
+; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_EQ_256-NEXT: mov x8, #4
 ; VBITS_EQ_256-NEXT: ldp q0, q1, [sp]
 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: mov x8, #4
 ; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
 ; VBITS_EQ_256-NEXT: ucvtf z0.d, p0/m, z0.d
+; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT: ucvtf z1.d, p0/m, z1.d
 ; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT: mov sp, x29
 ; VBITS_EQ_256-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; VBITS_EQ_256-NEXT: ret
@@ -752,9 +752,9 @@
 ; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p0.d
 ; VBITS_GE_1024-NEXT: ucvtf z0.h, p0/m, z0.d
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
 ; VBITS_GE_1024-NEXT: ret
 %op1 = load <16 x i64>, <16 x i64>* %a
@@ -770,9 +770,9 @@
 ; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d
 ; VBITS_GE_2048-NEXT: ucvtf z0.h, p0/m, z0.d
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
 ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
 ; VBITS_GE_2048-NEXT: ret
 %op1 = load <32 x i64>, <32 x i64>* %a
@@ -831,12 +831,12 @@
 ; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: ptrue p0.d
-; VBITS_EQ_256-NEXT: ptrue p1.s, vl4
 ; VBITS_EQ_256-NEXT: ucvtf z0.s, p0/m, z0.d
 ; VBITS_EQ_256-NEXT: ucvtf z1.s, p0/m, z1.d
 ; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT: splice z1.s, p1, z1.s, z0.s
+; VBITS_EQ_256-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_256-NEXT: splice z1.s, p0, z1.s, z0.s
 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
 ; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT: ret
@@ -847,8 +847,8 @@
 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.d
 ; VBITS_GE_512-NEXT: ucvtf z0.s, p0/m, z0.d
-; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
 ; VBITS_GE_512-NEXT: ret
 %op1 = load <8 x i64>, <8 x i64>* %a
@@ -864,8 +864,8 @@
 ; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p0.d
 ; VBITS_GE_1024-NEXT: ucvtf z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
+; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
 ; VBITS_GE_1024-NEXT: ret
 %op1 = load <16 x i64>, <16 x i64>* %a
@@ -881,8 +881,8 @@
 ; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d
 ; VBITS_GE_2048-NEXT: ucvtf z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
+; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
 ; VBITS_GE_2048-NEXT: ret
 %op1 = load <32 x i64>, <32 x i64>* %a
@@ -1134,18 +1134,18 @@
 ; VBITS_EQ_256-NEXT: .cfi_offset w30, -8
 ; VBITS_EQ_256-NEXT: .cfi_offset w29, -16
 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: mov x8, sp
+; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x8]
+; VBITS_EQ_256-NEXT: mov x8, #8
 ; VBITS_EQ_256-NEXT: ldp q0, q1, [sp]
 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT: mov x8, #8
 ; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h
 ; VBITS_EQ_256-NEXT: scvtf z0.s, p0/m, z0.s
+; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT: scvtf z1.s, p0/m, z1.s
 ; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT: mov sp, x29
 ; VBITS_EQ_256-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; VBITS_EQ_256-NEXT: ret
@@ -1249,17 +1249,17 @@
 ; VBITS_EQ_256-LABEL: scvtf_v8i16_v8f64:
 ; VBITS_EQ_256: // %bb.0:
 ; VBITS_EQ_256-NEXT: ldr q0, [x0]
-; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT: mov x8, #4
-; VBITS_EQ_256-NEXT: sunpklo z1.s, z0.h
-; VBITS_EQ_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s
 ; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d
 ; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z0.d
-; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1]
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]
+; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h
+; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d
+; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: scvtf_v8i16_v8f64:
@@ -1361,12 +1361,12 @@
 ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: ptrue p0.s
-; VBITS_EQ_256-NEXT: ptrue p1.h, vl8
 ; VBITS_EQ_256-NEXT: scvtf z0.h, p0/m, z0.s
 ; VBITS_EQ_256-NEXT: scvtf z1.h, p0/m, z1.s
 ; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_EQ_256-NEXT: splice z1.h, p1, z1.h, z0.h
+; VBITS_EQ_256-NEXT: ptrue p0.h, vl8
+; VBITS_EQ_256-NEXT: splice z1.h, p0, z1.h, z0.h
 ; VBITS_EQ_256-NEXT: ptrue p0.h, vl16
 ; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x1]
 ; VBITS_EQ_256-NEXT: ret
@@ -1377,8 +1377,8 @@
 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.s
 ; VBITS_GE_512-NEXT: scvtf z0.h, p0/m, z0.s
-; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1]
 ; VBITS_GE_512-NEXT: ret
 %op1 = load <16 x i32>, <16 x i32>* %a
@@ -1394,8 +1394,8 @@
 ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p0.s
 ; VBITS_GE_1024-NEXT: scvtf z0.h, p0/m, z0.s
-; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
+; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
 ; VBITS_GE_1024-NEXT: ret
 %op1 = load <32 x i32>, <32 x i32>* %a
@@ -1411,8 +1411,8 @@
 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.s
 ; VBITS_GE_2048-NEXT: scvtf z0.h, p0/m, z0.s
-; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
+; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
 ; VBITS_GE_2048-NEXT: ret
 %op1 = load <64 x i32>, <64 x i32>* %a
@@ -1566,18 +1566,18 @@
 ; VBITS_EQ_256-NEXT: .cfi_offset w30, -8
 ; VBITS_EQ_256-NEXT: .cfi_offset w29, -16
 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: mov x8, sp
+; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x8]
+; VBITS_EQ_256-NEXT: mov x8, #4
 ; VBITS_EQ_256-NEXT: ldp q0, q1, [sp]
 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: mov x8, #4
 ; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s
 ; VBITS_EQ_256-NEXT: scvtf z0.d, p0/m, z0.d
+; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT: scvtf z1.d, p0/m, z1.d
 ; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT: mov sp, x29
 ; VBITS_EQ_256-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; VBITS_EQ_256-NEXT: ret
@@ -1717,9 +1717,9 @@
 ; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p0.d
 ; VBITS_GE_1024-NEXT: scvtf z0.h, p0/m, z0.d
+; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x1]
 ; VBITS_GE_1024-NEXT: ret
 %op1 = load <16 x i64>, <16 x i64>* %a
@@ -1735,9 +1735,9 @@
 ; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d
 ; VBITS_GE_2048-NEXT: scvtf z0.h, p0/m, z0.d
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
 ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x1]
 ; VBITS_GE_2048-NEXT: ret
 %op1 = load <32 x i64>, <32 x i64>* %a
@@ -1796,12 +1796,12 @@
 ; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: ptrue p0.d
-; VBITS_EQ_256-NEXT: ptrue p1.s, vl4
 ; VBITS_EQ_256-NEXT: scvtf z0.s, p0/m, z0.d
 ; VBITS_EQ_256-NEXT: scvtf z1.s, p0/m, z1.d
 ; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_EQ_256-NEXT: splice z1.s, p1, z1.s, z0.s
+; VBITS_EQ_256-NEXT: ptrue p0.s, vl4
+; VBITS_EQ_256-NEXT: splice z1.s, p0, z1.s, z0.s
 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
 ; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT: ret
@@ -1812,8 +1812,8 @@
 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p0.d
 ; VBITS_GE_512-NEXT: scvtf z0.s, p0/m, z0.d
-; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1]
 ; VBITS_GE_512-NEXT: ret
 %op1 = load <8 x i64>, <8 x i64>* %a
@@ -1829,8 +1829,8 @@
 ; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p0.d
 ; VBITS_GE_1024-NEXT: scvtf z0.s, p0/m, z0.d
-; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
+; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x1]
 ; VBITS_GE_1024-NEXT: ret
 %op1 = load <16 x i64>, <16 x i64>* %a
@@ -1846,8 +1846,8 @@
 ; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p0.d
 ; VBITS_GE_2048-NEXT: scvtf z0.s, p0/m, z0.d
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
+; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x1]
 ; VBITS_GE_2048-NEXT: ret
 %op1 = load <32 x i64>, <32 x i64>* %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
@@ -54,75 +54,75 @@
 ; CHECK-NEXT: .cfi_def_cfa w29, 16
 ; CHECK-NEXT: .cfi_offset w30, -8
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ldr w8, [x2]
+; CHECK-NEXT: ldr w9, [x2]
+; CHECK-NEXT: mov x8, sp
 ; CHECK-NEXT: ptrue p0.b, vl32
 ; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: asr w9, w8, #31
-; CHECK-NEXT: strb w9, [sp, #31]
-; CHECK-NEXT: sbfx w9, w8, #30, #1
-; CHECK-NEXT: strb w9, [sp, #30]
-; CHECK-NEXT: sbfx w9, w8, #29, #1
-; CHECK-NEXT: strb w9, [sp, #29]
-; CHECK-NEXT: sbfx w9, w8, #28, #1
-; CHECK-NEXT: strb w9, [sp, #28]
-; CHECK-NEXT: sbfx w9, w8, #27, #1
-; CHECK-NEXT: strb w9, [sp, #27]
-; CHECK-NEXT: sbfx w9, w8, #26, #1
-; CHECK-NEXT: strb w9, [sp, #26]
-; CHECK-NEXT: sbfx w9, w8, #25, #1
-; CHECK-NEXT: strb w9, [sp, #25]
-; CHECK-NEXT: sbfx w9, w8, #24, #1
-; CHECK-NEXT: strb w9, [sp, #24]
-; CHECK-NEXT: sbfx w9, w8, #23, #1
-; CHECK-NEXT: strb w9, [sp, #23]
-; CHECK-NEXT: sbfx w9, w8, #22, #1
-; CHECK-NEXT: strb w9, [sp, #22]
#21, #1 -; CHECK-NEXT: strb w9, [sp, #21] -; CHECK-NEXT: sbfx w9, w8, #20, #1 -; CHECK-NEXT: strb w9, [sp, #20] -; CHECK-NEXT: sbfx w9, w8, #19, #1 -; CHECK-NEXT: strb w9, [sp, #19] -; CHECK-NEXT: sbfx w9, w8, #18, #1 -; CHECK-NEXT: strb w9, [sp, #18] -; CHECK-NEXT: sbfx w9, w8, #17, #1 -; CHECK-NEXT: strb w9, [sp, #17] -; CHECK-NEXT: sbfx w9, w8, #16, #1 -; CHECK-NEXT: strb w9, [sp, #16] -; CHECK-NEXT: sbfx w9, w8, #15, #1 -; CHECK-NEXT: strb w9, [sp, #15] -; CHECK-NEXT: sbfx w9, w8, #14, #1 -; CHECK-NEXT: strb w9, [sp, #14] -; CHECK-NEXT: sbfx w9, w8, #13, #1 -; CHECK-NEXT: strb w9, [sp, #13] -; CHECK-NEXT: sbfx w9, w8, #12, #1 -; CHECK-NEXT: strb w9, [sp, #12] -; CHECK-NEXT: sbfx w9, w8, #11, #1 -; CHECK-NEXT: strb w9, [sp, #11] -; CHECK-NEXT: sbfx w9, w8, #10, #1 -; CHECK-NEXT: strb w9, [sp, #10] -; CHECK-NEXT: sbfx w9, w8, #9, #1 -; CHECK-NEXT: strb w9, [sp, #9] -; CHECK-NEXT: sbfx w9, w8, #8, #1 -; CHECK-NEXT: strb w9, [sp, #8] -; CHECK-NEXT: sbfx w9, w8, #7, #1 -; CHECK-NEXT: strb w9, [sp, #7] -; CHECK-NEXT: sbfx w9, w8, #6, #1 -; CHECK-NEXT: strb w9, [sp, #6] -; CHECK-NEXT: sbfx w9, w8, #5, #1 -; CHECK-NEXT: strb w9, [sp, #5] -; CHECK-NEXT: sbfx w9, w8, #4, #1 -; CHECK-NEXT: strb w9, [sp, #4] -; CHECK-NEXT: sbfx w9, w8, #3, #1 -; CHECK-NEXT: strb w9, [sp, #3] -; CHECK-NEXT: sbfx w9, w8, #2, #1 -; CHECK-NEXT: strb w9, [sp, #2] -; CHECK-NEXT: sbfx w9, w8, #1, #1 -; CHECK-NEXT: sbfx w8, w8, #0, #1 -; CHECK-NEXT: strb w9, [sp, #1] -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: strb w8, [sp] -; CHECK-NEXT: ld1b { z0.b }, p0/z, [x9] +; CHECK-NEXT: asr w10, w9, #31 +; CHECK-NEXT: sbfx w11, w9, #30, #1 +; CHECK-NEXT: sbfx w12, w9, #29, #1 +; CHECK-NEXT: strb w10, [sp, #31] +; CHECK-NEXT: sbfx w10, w9, #28, #1 +; CHECK-NEXT: strb w11, [sp, #30] +; CHECK-NEXT: sbfx w11, w9, #27, #1 +; CHECK-NEXT: strb w12, [sp, #29] +; CHECK-NEXT: sbfx w12, w9, #26, #1 +; CHECK-NEXT: strb w10, [sp, #28] +; CHECK-NEXT: sbfx w10, w9, #25, #1 +; CHECK-NEXT: strb w11, [sp, #27] +; CHECK-NEXT: sbfx w11, w9, #24, #1 +; CHECK-NEXT: strb w12, [sp, #26] +; CHECK-NEXT: sbfx w12, w9, #23, #1 +; CHECK-NEXT: strb w10, [sp, #25] +; CHECK-NEXT: sbfx w10, w9, #22, #1 +; CHECK-NEXT: strb w11, [sp, #24] +; CHECK-NEXT: sbfx w11, w9, #21, #1 +; CHECK-NEXT: strb w12, [sp, #23] +; CHECK-NEXT: sbfx w12, w9, #20, #1 +; CHECK-NEXT: strb w10, [sp, #22] +; CHECK-NEXT: sbfx w10, w9, #19, #1 +; CHECK-NEXT: strb w11, [sp, #21] +; CHECK-NEXT: sbfx w11, w9, #18, #1 +; CHECK-NEXT: strb w12, [sp, #20] +; CHECK-NEXT: sbfx w12, w9, #17, #1 +; CHECK-NEXT: strb w10, [sp, #19] +; CHECK-NEXT: sbfx w10, w9, #16, #1 +; CHECK-NEXT: strb w11, [sp, #18] +; CHECK-NEXT: sbfx w11, w9, #15, #1 +; CHECK-NEXT: strb w12, [sp, #17] +; CHECK-NEXT: sbfx w12, w9, #14, #1 +; CHECK-NEXT: strb w10, [sp, #16] +; CHECK-NEXT: sbfx w10, w9, #13, #1 +; CHECK-NEXT: strb w11, [sp, #15] +; CHECK-NEXT: sbfx w11, w9, #12, #1 +; CHECK-NEXT: strb w12, [sp, #14] +; CHECK-NEXT: sbfx w12, w9, #11, #1 +; CHECK-NEXT: strb w10, [sp, #13] +; CHECK-NEXT: sbfx w10, w9, #10, #1 +; CHECK-NEXT: strb w11, [sp, #12] +; CHECK-NEXT: sbfx w11, w9, #9, #1 +; CHECK-NEXT: strb w12, [sp, #11] +; CHECK-NEXT: sbfx w12, w9, #8, #1 +; CHECK-NEXT: strb w10, [sp, #10] +; CHECK-NEXT: sbfx w10, w9, #7, #1 +; CHECK-NEXT: strb w11, [sp, #9] +; CHECK-NEXT: sbfx w11, w9, #6, #1 +; CHECK-NEXT: strb w12, [sp, #8] +; CHECK-NEXT: sbfx w12, w9, #5, #1 +; CHECK-NEXT: strb w10, [sp, #7] +; CHECK-NEXT: sbfx w10, w9, #4, #1 +; CHECK-NEXT: strb w11, [sp, #6] +; CHECK-NEXT: sbfx w11, w9, #3, #1 +; CHECK-NEXT: strb w12, [sp, 
#5] +; CHECK-NEXT: sbfx w12, w9, #2, #1 +; CHECK-NEXT: strb w10, [sp, #4] +; CHECK-NEXT: sbfx w10, w9, #1, #1 +; CHECK-NEXT: sbfx w9, w9, #0, #1 +; CHECK-NEXT: strb w11, [sp, #3] +; CHECK-NEXT: strb w12, [sp, #2] +; CHECK-NEXT: strb w10, [sp, #1] +; CHECK-NEXT: strb w9, [sp] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] ; CHECK-NEXT: and z0.b, z0.b, #0x1 @@ -150,139 +150,139 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: ldr x8, [x2] +; VBITS_GE_512-NEXT: ldr x9, [x2] +; VBITS_GE_512-NEXT: mov x8, sp ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: ptrue p1.b -; VBITS_GE_512-NEXT: asr x9, x8, #63 -; VBITS_GE_512-NEXT: strb w9, [sp, #63] -; VBITS_GE_512-NEXT: sbfx x9, x8, #62, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #62] -; VBITS_GE_512-NEXT: sbfx x9, x8, #61, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #61] -; VBITS_GE_512-NEXT: sbfx x9, x8, #60, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #60] -; VBITS_GE_512-NEXT: sbfx x9, x8, #59, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #59] -; VBITS_GE_512-NEXT: sbfx x9, x8, #58, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #58] -; VBITS_GE_512-NEXT: sbfx x9, x8, #57, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #57] -; VBITS_GE_512-NEXT: sbfx x9, x8, #56, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #56] -; VBITS_GE_512-NEXT: sbfx x9, x8, #55, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #55] -; VBITS_GE_512-NEXT: sbfx x9, x8, #54, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #54] -; VBITS_GE_512-NEXT: sbfx x9, x8, #53, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #53] -; VBITS_GE_512-NEXT: sbfx x9, x8, #52, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #52] -; VBITS_GE_512-NEXT: sbfx x9, x8, #51, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #51] -; VBITS_GE_512-NEXT: sbfx x9, x8, #50, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #50] -; VBITS_GE_512-NEXT: sbfx x9, x8, #49, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #49] -; VBITS_GE_512-NEXT: sbfx x9, x8, #48, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #48] -; VBITS_GE_512-NEXT: sbfx x9, x8, #47, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #47] -; VBITS_GE_512-NEXT: sbfx x9, x8, #46, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #46] -; VBITS_GE_512-NEXT: sbfx x9, x8, #45, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #45] -; VBITS_GE_512-NEXT: sbfx x9, x8, #44, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #44] -; VBITS_GE_512-NEXT: sbfx x9, x8, #43, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #43] -; VBITS_GE_512-NEXT: sbfx x9, x8, #42, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #42] -; VBITS_GE_512-NEXT: sbfx x9, x8, #41, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #41] -; VBITS_GE_512-NEXT: sbfx x9, x8, #40, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #40] -; VBITS_GE_512-NEXT: sbfx x9, x8, #39, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #39] -; VBITS_GE_512-NEXT: sbfx x9, x8, #38, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #38] -; VBITS_GE_512-NEXT: sbfx x9, x8, #37, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #37] -; VBITS_GE_512-NEXT: sbfx x9, x8, #36, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #36] -; VBITS_GE_512-NEXT: sbfx x9, x8, #35, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #35] -; VBITS_GE_512-NEXT: sbfx x9, x8, #34, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #34] -; VBITS_GE_512-NEXT: sbfx x9, x8, #33, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #33] -; VBITS_GE_512-NEXT: sbfx x9, x8, #32, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #32] -; VBITS_GE_512-NEXT: asr w9, w8, #31 -; VBITS_GE_512-NEXT: strb w9, [sp, #31] -; VBITS_GE_512-NEXT: sbfx w9, w8, 
#30, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #30] -; VBITS_GE_512-NEXT: sbfx w9, w8, #29, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #29] -; VBITS_GE_512-NEXT: sbfx w9, w8, #28, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #28] -; VBITS_GE_512-NEXT: sbfx w9, w8, #27, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #27] -; VBITS_GE_512-NEXT: sbfx w9, w8, #26, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #26] -; VBITS_GE_512-NEXT: sbfx w9, w8, #25, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #25] -; VBITS_GE_512-NEXT: sbfx w9, w8, #24, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #24] -; VBITS_GE_512-NEXT: sbfx w9, w8, #23, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #23] -; VBITS_GE_512-NEXT: sbfx w9, w8, #22, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #22] -; VBITS_GE_512-NEXT: sbfx w9, w8, #21, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #21] -; VBITS_GE_512-NEXT: sbfx w9, w8, #20, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #20] -; VBITS_GE_512-NEXT: sbfx w9, w8, #19, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #19] -; VBITS_GE_512-NEXT: sbfx w9, w8, #18, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #18] -; VBITS_GE_512-NEXT: sbfx w9, w8, #17, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #17] -; VBITS_GE_512-NEXT: sbfx w9, w8, #16, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #16] -; VBITS_GE_512-NEXT: sbfx w9, w8, #15, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #15] -; VBITS_GE_512-NEXT: sbfx w9, w8, #14, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #14] -; VBITS_GE_512-NEXT: sbfx w9, w8, #13, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #13] -; VBITS_GE_512-NEXT: sbfx w9, w8, #12, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #12] -; VBITS_GE_512-NEXT: sbfx w9, w8, #11, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #11] -; VBITS_GE_512-NEXT: sbfx w9, w8, #10, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #10] -; VBITS_GE_512-NEXT: sbfx w9, w8, #9, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #9] -; VBITS_GE_512-NEXT: sbfx w9, w8, #8, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #8] -; VBITS_GE_512-NEXT: sbfx w9, w8, #7, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #7] -; VBITS_GE_512-NEXT: sbfx w9, w8, #6, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #6] -; VBITS_GE_512-NEXT: sbfx w9, w8, #5, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #5] -; VBITS_GE_512-NEXT: sbfx w9, w8, #4, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #4] -; VBITS_GE_512-NEXT: sbfx w9, w8, #3, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #3] -; VBITS_GE_512-NEXT: sbfx w9, w8, #2, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #2] -; VBITS_GE_512-NEXT: sbfx w9, w8, #1, #1 -; VBITS_GE_512-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_512-NEXT: strb w9, [sp, #1] -; VBITS_GE_512-NEXT: mov x9, sp -; VBITS_GE_512-NEXT: strb w8, [sp] -; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x9] +; VBITS_GE_512-NEXT: asr x10, x9, #63 +; VBITS_GE_512-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_512-NEXT: sbfx x12, x9, #61, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #63] +; VBITS_GE_512-NEXT: sbfx x10, x9, #60, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #62] +; VBITS_GE_512-NEXT: sbfx x11, x9, #59, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #61] +; VBITS_GE_512-NEXT: sbfx x12, x9, #58, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #60] +; VBITS_GE_512-NEXT: sbfx x10, x9, #57, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #59] +; VBITS_GE_512-NEXT: sbfx x11, x9, #56, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #58] +; VBITS_GE_512-NEXT: sbfx x12, x9, #55, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #57] +; VBITS_GE_512-NEXT: sbfx x10, x9, #54, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #56] +; VBITS_GE_512-NEXT: sbfx x11, x9, #53, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #55] +; VBITS_GE_512-NEXT: sbfx x12, x9, #52, #1 +; VBITS_GE_512-NEXT: strb 
w10, [sp, #54] +; VBITS_GE_512-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #53] +; VBITS_GE_512-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #52] +; VBITS_GE_512-NEXT: sbfx x12, x9, #49, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #51] +; VBITS_GE_512-NEXT: sbfx x10, x9, #48, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #50] +; VBITS_GE_512-NEXT: sbfx x11, x9, #47, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #49] +; VBITS_GE_512-NEXT: sbfx x12, x9, #46, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #48] +; VBITS_GE_512-NEXT: sbfx x10, x9, #45, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #47] +; VBITS_GE_512-NEXT: sbfx x11, x9, #44, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #46] +; VBITS_GE_512-NEXT: sbfx x12, x9, #43, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #45] +; VBITS_GE_512-NEXT: sbfx x10, x9, #42, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #44] +; VBITS_GE_512-NEXT: sbfx x11, x9, #41, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #43] +; VBITS_GE_512-NEXT: sbfx x12, x9, #40, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #42] +; VBITS_GE_512-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #41] +; VBITS_GE_512-NEXT: sbfx x11, x9, #38, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #40] +; VBITS_GE_512-NEXT: sbfx x12, x9, #37, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #39] +; VBITS_GE_512-NEXT: sbfx x10, x9, #36, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #38] +; VBITS_GE_512-NEXT: sbfx x11, x9, #35, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #37] +; VBITS_GE_512-NEXT: sbfx x12, x9, #34, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #36] +; VBITS_GE_512-NEXT: sbfx x10, x9, #33, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #35] +; VBITS_GE_512-NEXT: sbfx x11, x9, #32, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #34] +; VBITS_GE_512-NEXT: asr w12, w9, #31 +; VBITS_GE_512-NEXT: strb w10, [sp, #33] +; VBITS_GE_512-NEXT: sbfx w10, w9, #30, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #32] +; VBITS_GE_512-NEXT: sbfx w11, w9, #29, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #31] +; VBITS_GE_512-NEXT: sbfx w12, w9, #28, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #30] +; VBITS_GE_512-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #29] +; VBITS_GE_512-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #28] +; VBITS_GE_512-NEXT: sbfx w12, w9, #25, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #27] +; VBITS_GE_512-NEXT: sbfx w10, w9, #24, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #26] +; VBITS_GE_512-NEXT: sbfx w11, w9, #23, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #25] +; VBITS_GE_512-NEXT: sbfx w12, w9, #22, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #24] +; VBITS_GE_512-NEXT: sbfx w10, w9, #21, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #23] +; VBITS_GE_512-NEXT: sbfx w11, w9, #20, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #22] +; VBITS_GE_512-NEXT: sbfx w12, w9, #19, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #21] +; VBITS_GE_512-NEXT: sbfx w10, w9, #18, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #20] +; VBITS_GE_512-NEXT: sbfx w11, w9, #17, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #19] +; VBITS_GE_512-NEXT: sbfx w12, w9, #16, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #18] +; VBITS_GE_512-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #17] +; VBITS_GE_512-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #16] +; VBITS_GE_512-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #15] +; VBITS_GE_512-NEXT: sbfx w10, w9, #12, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #14] +; VBITS_GE_512-NEXT: sbfx w11, w9, #11, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #13] +; 
VBITS_GE_512-NEXT: sbfx w12, w9, #10, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #12] +; VBITS_GE_512-NEXT: sbfx w10, w9, #9, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #11] +; VBITS_GE_512-NEXT: sbfx w11, w9, #8, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #10] +; VBITS_GE_512-NEXT: sbfx w12, w9, #7, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #9] +; VBITS_GE_512-NEXT: sbfx w10, w9, #6, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #8] +; VBITS_GE_512-NEXT: sbfx w11, w9, #5, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #7] +; VBITS_GE_512-NEXT: sbfx w12, w9, #4, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #6] +; VBITS_GE_512-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_512-NEXT: strb w11, [sp, #5] +; VBITS_GE_512-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_512-NEXT: strb w12, [sp, #4] +; VBITS_GE_512-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_512-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_512-NEXT: strb w10, [sp, #3] +; VBITS_GE_512-NEXT: strb w11, [sp, #2] +; VBITS_GE_512-NEXT: strb w12, [sp, #1] +; VBITS_GE_512-NEXT: strb w9, [sp] +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x8] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z2.b }, p0/z, [x1] ; VBITS_GE_512-NEXT: and z0.b, z0.b, #0x1 @@ -310,268 +310,268 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: ldr x8, [x2, #8] +; VBITS_GE_1024-NEXT: ldr x9, [x2, #8] +; VBITS_GE_1024-NEXT: mov x8, sp ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 ; VBITS_GE_1024-NEXT: ptrue p1.b -; VBITS_GE_1024-NEXT: asr x9, x8, #63 -; VBITS_GE_1024-NEXT: strb w9, [sp, #127] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #62, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #126] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #61, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #125] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #60, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #124] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #59, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #123] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #58, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #122] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #57, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #121] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #56, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #120] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #55, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #119] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #54, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #118] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #53, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #117] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #52, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #116] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #51, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #115] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #50, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #114] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #49, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #113] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #48, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #112] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #47, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #111] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #46, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #110] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #45, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #109] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #44, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #108] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #43, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #107] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #42, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #106] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #41, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #105] -; VBITS_GE_1024-NEXT: sbfx x9, 
x8, #40, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #104] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #39, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #103] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #38, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #102] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #37, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #101] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #36, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #100] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #35, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #99] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #34, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #98] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #33, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #97] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #32, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #96] -; VBITS_GE_1024-NEXT: asr w9, w8, #31 -; VBITS_GE_1024-NEXT: strb w9, [sp, #95] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #30, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #94] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #29, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #93] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #28, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #92] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #27, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #91] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #26, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #90] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #25, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #89] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #24, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #88] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #23, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #87] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #22, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #86] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #21, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #85] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #20, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #84] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #19, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #83] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #18, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #82] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #17, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #81] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #16, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #80] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #15, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #79] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #14, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #78] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #13, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #77] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #12, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #76] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #11, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #75] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #10, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #74] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #9, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #73] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #8, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #72] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #7, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #71] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #6, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #70] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #5, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #69] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #4, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #68] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #3, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #67] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #2, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #66] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #1, #1 -; VBITS_GE_1024-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_1024-NEXT: strb w9, [sp, #65] -; VBITS_GE_1024-NEXT: strb w8, [sp, #64] -; VBITS_GE_1024-NEXT: ldr x8, [x2] -; VBITS_GE_1024-NEXT: mov x9, sp 
-; VBITS_GE_1024-NEXT: asr x10, x8, #63 +; VBITS_GE_1024-NEXT: asr x10, x9, #63 +; VBITS_GE_1024-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_1024-NEXT: sbfx x12, x9, #61, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #127] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #60, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #126] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #59, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #125] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #58, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #124] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #57, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #123] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #56, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #122] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #55, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #121] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #54, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #120] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #53, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #119] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #52, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #118] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #117] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #116] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #49, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #115] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #48, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #114] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #47, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #113] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #46, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #112] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #45, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #111] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #44, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #110] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #43, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #109] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #42, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #108] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #41, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #107] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #40, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #106] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #105] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #38, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #104] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #37, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #103] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #36, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #102] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #35, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #101] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #34, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #100] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #33, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #99] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #32, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #98] +; VBITS_GE_1024-NEXT: asr w12, w9, #31 +; VBITS_GE_1024-NEXT: strb w10, [sp, #97] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #30, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #96] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #29, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #95] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #28, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #94] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #93] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #92] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #25, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #91] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #24, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #90] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #23, #1 +; 
VBITS_GE_1024-NEXT: strb w12, [sp, #89] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #22, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #88] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #21, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #87] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #20, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #86] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #19, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #85] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #18, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #84] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #17, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #83] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #16, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #82] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #81] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #80] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #79] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #12, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #78] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #11, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #77] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #10, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #76] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #9, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #75] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #8, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #74] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #7, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #73] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #6, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #72] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #5, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #71] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #4, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #70] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #69] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #68] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_1024-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_1024-NEXT: strb w10, [sp, #67] +; VBITS_GE_1024-NEXT: strb w11, [sp, #66] +; VBITS_GE_1024-NEXT: strb w12, [sp, #65] +; VBITS_GE_1024-NEXT: strb w9, [sp, #64] +; VBITS_GE_1024-NEXT: ldr x9, [x2] +; VBITS_GE_1024-NEXT: asr x10, x9, #63 +; VBITS_GE_1024-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_1024-NEXT: sbfx x12, x9, #61, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #63] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #62, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #62] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #61, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #61] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #60, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x9, #60, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #62] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #59, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #61] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #58, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #60] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #59, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #59] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #58, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #58] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #57, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x9, #57, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #59] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #56, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #58] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #55, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #57] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #56, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #56] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #55, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #55] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #54, #1 +; 
VBITS_GE_1024-NEXT: sbfx x10, x9, #54, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #56] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #53, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #55] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #52, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #54] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #53, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #53] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #52, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #52] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #51, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #53] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #52] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #49, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #51] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #50, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #50] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #49, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #49] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #48, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x9, #48, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #50] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #47, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #49] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #46, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #48] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #47, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #47] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #46, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #46] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #45, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x9, #45, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #47] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #44, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #46] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #43, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #45] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #44, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #44] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #43, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #43] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #42, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x9, #42, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #44] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #41, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #43] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #40, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #42] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #41, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #41] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #40, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #40] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #39, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #41] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #38, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #40] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #37, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #39] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #38, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #38] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #37, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #37] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #36, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x9, #36, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #38] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #35, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #37] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #34, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #36] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #35, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #35] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #34, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #34] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #33, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x9, #33, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #35] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #32, #1 +; VBITS_GE_1024-NEXT: strb w12, 
[sp, #34] +; VBITS_GE_1024-NEXT: asr w12, w9, #31 ; VBITS_GE_1024-NEXT: strb w10, [sp, #33] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #32, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #32] -; VBITS_GE_1024-NEXT: asr w10, w8, #31 -; VBITS_GE_1024-NEXT: strb w10, [sp, #31] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #30, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #30, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #32] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #29, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #31] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #28, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #30] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #29, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #29] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #28, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #28] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #27, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #29] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #28] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #25, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #27] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #26, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #26] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #25, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #25] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #24, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #24, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #26] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #23, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #25] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #22, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #24] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #23, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #23] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #22, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #22] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #21, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #21, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #23] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #20, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #22] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #19, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #21] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #20, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #20] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #19, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #19] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #18, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #18, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #20] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #17, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #19] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #16, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #18] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #17, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #17] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #16, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #16] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #15, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #17] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #16] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #13, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #15] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #14, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #14] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #13, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #13] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #12, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #12, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #14] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #11, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #13] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #10, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #12] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #11, #1 -; VBITS_GE_1024-NEXT: strb w10, 
[sp, #11] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #10, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #10] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #9, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #9, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #11] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #8, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #10] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #7, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #9] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #8, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #8] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #7, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #7] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #6, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #6, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #8] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #5, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #7] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #4, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #6] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #5, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #5] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #4, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #4] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #3, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_1024-NEXT: strb w11, [sp, #5] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_1024-NEXT: strb w12, [sp, #4] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_1024-NEXT: sbfx w9, w9, #0, #1 ; VBITS_GE_1024-NEXT: strb w10, [sp, #3] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #2, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #2] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #1, #1 -; VBITS_GE_1024-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_1024-NEXT: strb w10, [sp, #1] -; VBITS_GE_1024-NEXT: strb w8, [sp] -; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x9] +; VBITS_GE_1024-NEXT: strb w11, [sp, #2] +; VBITS_GE_1024-NEXT: strb w12, [sp, #1] +; VBITS_GE_1024-NEXT: strb w9, [sp] +; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x8] ; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x0] ; VBITS_GE_1024-NEXT: ld1b { z2.b }, p0/z, [x1] ; VBITS_GE_1024-NEXT: and z0.b, z0.b, #0x1 @@ -599,526 +599,526 @@ ; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 ; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 -; VBITS_GE_2048-NEXT: ldr x8, [x2, #24] +; VBITS_GE_2048-NEXT: ldr x9, [x2, #24] +; VBITS_GE_2048-NEXT: mov x8, sp ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256 ; VBITS_GE_2048-NEXT: ptrue p1.b -; VBITS_GE_2048-NEXT: asr x9, x8, #63 -; VBITS_GE_2048-NEXT: strb w9, [sp, #255] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #62, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #254] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #61, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #253] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #60, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #252] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #59, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #251] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #58, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #250] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #57, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #249] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #56, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #248] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #55, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #247] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #54, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #246] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #53, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #245] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #52, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #244] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #243] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #50, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #242] 
-; VBITS_GE_2048-NEXT: sbfx x9, x8, #49, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #241] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #48, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #240] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #47, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #239] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #46, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #238] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #45, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #237] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #44, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #236] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #43, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #235] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #42, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #234] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #41, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #233] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #40, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #232] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #231] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #38, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #230] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #37, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #229] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #36, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #228] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #35, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #227] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #34, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #226] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #33, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #225] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #32, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #224] -; VBITS_GE_2048-NEXT: asr w9, w8, #31 -; VBITS_GE_2048-NEXT: strb w9, [sp, #223] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #30, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #222] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #29, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #221] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #28, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #220] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #219] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #26, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #218] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #25, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #217] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #24, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #216] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #23, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #215] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #22, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #214] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #21, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #213] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #20, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #212] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #19, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #211] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #18, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #210] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #17, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #209] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #16, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #208] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #207] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #14, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #206] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #13, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #205] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #12, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #204] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #11, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #203] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #10, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #202] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #9, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, 
#201] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #8, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #200] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #7, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #199] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #6, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #198] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #5, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #197] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #4, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #196] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #195] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #2, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #194] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #1, #1 -; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #193] -; VBITS_GE_2048-NEXT: strb w8, [sp, #192] -; VBITS_GE_2048-NEXT: ldr x8, [x2, #16] -; VBITS_GE_2048-NEXT: asr x9, x8, #63 -; VBITS_GE_2048-NEXT: strb w9, [sp, #191] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #62, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #190] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #61, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #189] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #60, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #188] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #59, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #187] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #58, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #186] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #57, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #185] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #56, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #184] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #55, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #183] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #54, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #182] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #53, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #181] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #52, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #180] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #179] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #50, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #178] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #49, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #177] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #48, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #176] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #47, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #175] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #46, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #174] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #45, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #173] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #44, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #172] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #43, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #171] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #42, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #170] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #41, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #169] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #40, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #168] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #167] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #38, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #166] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #37, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #165] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #36, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #164] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #35, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #163] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #34, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #162] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #33, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #161] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #32, 
#1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #160] -; VBITS_GE_2048-NEXT: asr w9, w8, #31 -; VBITS_GE_2048-NEXT: strb w9, [sp, #159] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #30, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #158] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #29, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #157] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #28, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #156] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #155] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #26, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #154] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #25, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #153] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #24, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #152] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #23, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #151] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #22, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #150] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #21, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #149] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #20, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #148] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #19, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #147] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #18, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #146] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #17, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #145] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #16, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #144] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #143] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #14, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #142] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #13, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #141] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #12, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #140] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #11, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #139] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #10, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #138] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #9, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #137] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #8, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #136] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #7, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #135] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #6, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #134] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #5, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #133] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #4, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #132] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #131] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #2, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #130] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #1, #1 -; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #129] -; VBITS_GE_2048-NEXT: strb w8, [sp, #128] -; VBITS_GE_2048-NEXT: ldr x8, [x2, #8] -; VBITS_GE_2048-NEXT: asr x9, x8, #63 -; VBITS_GE_2048-NEXT: strb w9, [sp, #127] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #62, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #126] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #61, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #125] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #60, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #124] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #59, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #123] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #58, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #122] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #57, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #121] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #56, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #120] -; 
VBITS_GE_2048-NEXT: sbfx x9, x8, #55, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #119] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #54, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #118] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #53, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #117] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #52, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #116] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #115] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #50, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #114] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #49, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #113] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #48, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #112] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #47, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #111] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #46, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #110] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #45, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #109] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #44, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #108] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #43, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #107] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #42, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #106] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #41, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #105] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #40, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #104] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #103] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #38, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #102] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #37, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #101] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #36, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #100] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #35, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #99] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #34, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #98] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #33, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #97] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #32, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #96] -; VBITS_GE_2048-NEXT: asr w9, w8, #31 -; VBITS_GE_2048-NEXT: strb w9, [sp, #95] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #30, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #94] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #29, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #93] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #28, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #92] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #91] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #26, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #90] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #25, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #89] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #24, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #88] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #23, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #87] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #22, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #86] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #21, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #85] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #20, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #84] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #19, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #83] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #18, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #82] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #17, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #81] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #16, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #80] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #79] -; 
VBITS_GE_2048-NEXT: sbfx w9, w8, #14, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #78] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #13, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #77] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #12, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #76] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #11, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #75] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #10, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #74] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #9, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #73] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #8, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #72] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #7, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #71] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #6, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #70] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #5, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #69] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #4, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #68] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #67] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #2, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #66] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #1, #1 -; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_2048-NEXT: strb w9, [sp, #65] -; VBITS_GE_2048-NEXT: strb w8, [sp, #64] -; VBITS_GE_2048-NEXT: ldr x8, [x2] -; VBITS_GE_2048-NEXT: mov x9, sp -; VBITS_GE_2048-NEXT: asr x10, x8, #63 +; VBITS_GE_2048-NEXT: asr x10, x9, #63 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x9, #61, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #255] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #60, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #254] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #59, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #253] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #58, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #252] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #57, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #251] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #56, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #250] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #55, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #249] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #54, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #248] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #53, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #247] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #52, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #246] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #245] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #244] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #49, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #243] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #48, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #242] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #47, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #241] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #46, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #240] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #45, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #239] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #44, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #238] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #43, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #237] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #42, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #236] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #41, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #235] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #40, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #234] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #233] +; VBITS_GE_2048-NEXT: sbfx x11, 
x9, #38, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #232] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #37, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #231] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #36, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #230] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #35, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #229] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #34, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #228] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #33, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #227] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #32, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #226] +; VBITS_GE_2048-NEXT: asr w12, w9, #31 +; VBITS_GE_2048-NEXT: strb w10, [sp, #225] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #30, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #224] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #29, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #223] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #28, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #222] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #221] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #220] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #25, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #219] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #24, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #218] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #23, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #217] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #22, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #216] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #21, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #215] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #20, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #214] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #19, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #213] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #18, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #212] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #17, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #211] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #16, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #210] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #209] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #208] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #207] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #12, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #206] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #11, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #205] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #10, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #204] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #9, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #203] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #8, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #202] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #7, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #201] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #6, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #200] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #5, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #199] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #4, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #198] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #197] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #196] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_2048-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #195] +; VBITS_GE_2048-NEXT: strb w11, [sp, #194] +; VBITS_GE_2048-NEXT: strb w12, [sp, #193] +; VBITS_GE_2048-NEXT: strb w9, [sp, #192] +; VBITS_GE_2048-NEXT: ldr x9, [x2, #16] +; 
VBITS_GE_2048-NEXT: asr x10, x9, #63 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x9, #61, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #191] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #60, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #190] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #59, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #189] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #58, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #188] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #57, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #187] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #56, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #186] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #55, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #185] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #54, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #184] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #53, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #183] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #52, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #182] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #181] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #180] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #49, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #179] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #48, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #178] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #47, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #177] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #46, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #176] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #45, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #175] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #44, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #174] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #43, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #173] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #42, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #172] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #41, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #171] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #40, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #170] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #169] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #38, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #168] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #37, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #167] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #36, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #166] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #35, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #165] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #34, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #164] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #33, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #163] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #32, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #162] +; VBITS_GE_2048-NEXT: asr w12, w9, #31 +; VBITS_GE_2048-NEXT: strb w10, [sp, #161] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #30, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #160] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #29, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #159] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #28, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #158] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #157] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #156] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #25, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #155] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #24, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #154] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #23, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #153] +; 
VBITS_GE_2048-NEXT: sbfx w12, w9, #22, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #152] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #21, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #151] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #20, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #150] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #19, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #149] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #18, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #148] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #17, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #147] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #16, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #146] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #145] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #144] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #143] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #12, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #142] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #11, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #141] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #10, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #140] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #9, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #139] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #8, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #138] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #7, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #137] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #6, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #136] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #5, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #135] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #4, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #134] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #133] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #132] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_2048-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #131] +; VBITS_GE_2048-NEXT: strb w11, [sp, #130] +; VBITS_GE_2048-NEXT: strb w12, [sp, #129] +; VBITS_GE_2048-NEXT: strb w9, [sp, #128] +; VBITS_GE_2048-NEXT: ldr x9, [x2, #8] +; VBITS_GE_2048-NEXT: asr x10, x9, #63 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x9, #61, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #127] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #60, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #126] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #59, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #125] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #58, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #124] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #57, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #123] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #56, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #122] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #55, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #121] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #54, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #120] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #53, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #119] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #52, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #118] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #117] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #116] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #49, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #115] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #48, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #114] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #47, #1 +; 
VBITS_GE_2048-NEXT: strb w12, [sp, #113] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #46, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #112] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #45, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #111] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #44, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #110] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #43, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #109] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #42, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #108] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #41, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #107] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #40, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #106] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #105] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #38, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #104] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #37, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #103] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #36, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #102] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #35, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #101] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #34, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #100] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #33, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #99] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #32, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #98] +; VBITS_GE_2048-NEXT: asr w12, w9, #31 +; VBITS_GE_2048-NEXT: strb w10, [sp, #97] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #30, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #96] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #29, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #95] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #28, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #94] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #93] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #92] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #25, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #91] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #24, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #90] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #23, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #89] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #22, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #88] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #21, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #87] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #20, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #86] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #19, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #85] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #18, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #84] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #17, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #83] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #16, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #82] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #81] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #80] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #79] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #12, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #78] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #11, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #77] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #10, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #76] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #9, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #75] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #8, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #74] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #7, #1 +; VBITS_GE_2048-NEXT: strb 
w10, [sp, #73] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #6, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #72] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #5, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #71] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #4, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #70] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #69] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #68] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_2048-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_2048-NEXT: strb w10, [sp, #67] +; VBITS_GE_2048-NEXT: strb w11, [sp, #66] +; VBITS_GE_2048-NEXT: strb w12, [sp, #65] +; VBITS_GE_2048-NEXT: strb w9, [sp, #64] +; VBITS_GE_2048-NEXT: ldr x9, [x2] +; VBITS_GE_2048-NEXT: asr x10, x9, #63 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x9, #61, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #63] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #62, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #62] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #61, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #61] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #60, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #60, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #62] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #59, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #61] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #58, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #60] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #59, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #59] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #58, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #58] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #57, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #57, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #59] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #56, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #58] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #55, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #57] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #56, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #56] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #55, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #55] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #54, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #54, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #56] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #53, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #55] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #52, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #54] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #53, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #53] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #52, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #52] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #51, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #53] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #52] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #49, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #51] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #50, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #50] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #49, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #49] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #48, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #48, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #50] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #47, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #49] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #46, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #48] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #47, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #47] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #46, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #46] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #45, 
#1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #45, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #47] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #44, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #46] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #43, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #45] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #44, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #44] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #43, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #43] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #42, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #42, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #44] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #41, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #43] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #40, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #42] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #41, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #41] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #40, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #40] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #39, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #41] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #38, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #40] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #37, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #39] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #38, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #38] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #37, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #37] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #36, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #36, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #38] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #35, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #37] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #34, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #36] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #35, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #35] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #34, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #34] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #33, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #33, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #35] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #32, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #34] +; VBITS_GE_2048-NEXT: asr w12, w9, #31 ; VBITS_GE_2048-NEXT: strb w10, [sp, #33] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #32, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #32] -; VBITS_GE_2048-NEXT: asr w10, w8, #31 -; VBITS_GE_2048-NEXT: strb w10, [sp, #31] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #30, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #30, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #32] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #29, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #31] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #28, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #30] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #29, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #29] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #28, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #28] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #27, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #29] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #28] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #25, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #27] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #26, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #26] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #25, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #25] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #24, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #24, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #26] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #23, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, 
#25] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #22, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #24] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #23, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #23] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #22, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #22] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #21, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #21, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #23] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #20, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #22] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #19, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #21] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #20, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #20] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #19, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #19] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #18, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #18, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #20] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #17, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #19] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #16, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #18] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #17, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #17] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #16, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #16] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #15, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #17] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #16] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #13, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #15] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #14, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #14] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #13, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #13] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #12, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #12, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #14] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #11, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #13] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #10, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #12] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #11, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #11] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #10, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #10] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #9, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #9, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #11] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #8, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #10] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #7, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #9] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #8, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #8] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #7, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #7] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #6, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #6, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #8] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #5, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #7] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #4, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #6] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #5, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #5] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #4, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #4] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #3, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_2048-NEXT: strb w11, [sp, #5] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_2048-NEXT: strb w12, [sp, #4] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_2048-NEXT: sbfx w9, w9, #0, #1 ; VBITS_GE_2048-NEXT: strb w10, [sp, #3] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1 -; 
VBITS_GE_2048-NEXT: strb w10, [sp, #2] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #1, #1 -; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_2048-NEXT: strb w10, [sp, #1] -; VBITS_GE_2048-NEXT: strb w8, [sp] -; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x9] +; VBITS_GE_2048-NEXT: strb w11, [sp, #2] +; VBITS_GE_2048-NEXT: strb w12, [sp, #1] +; VBITS_GE_2048-NEXT: strb w9, [sp] +; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x8] ; VBITS_GE_2048-NEXT: ld1b { z1.b }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1b { z2.b }, p0/z, [x1] ; VBITS_GE_2048-NEXT: and z0.b, z0.b, #0x1 @@ -1171,43 +1171,43 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldrh w8, [x2] +; CHECK-NEXT: ldrh w9, [x2] +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: sbfx w9, w8, #15, #1 -; CHECK-NEXT: strh w9, [sp, #30] -; CHECK-NEXT: sbfx w9, w8, #14, #1 -; CHECK-NEXT: strh w9, [sp, #28] -; CHECK-NEXT: sbfx w9, w8, #13, #1 -; CHECK-NEXT: strh w9, [sp, #26] -; CHECK-NEXT: sbfx w9, w8, #12, #1 -; CHECK-NEXT: strh w9, [sp, #24] -; CHECK-NEXT: sbfx w9, w8, #11, #1 -; CHECK-NEXT: strh w9, [sp, #22] -; CHECK-NEXT: sbfx w9, w8, #10, #1 -; CHECK-NEXT: strh w9, [sp, #20] -; CHECK-NEXT: sbfx w9, w8, #9, #1 -; CHECK-NEXT: strh w9, [sp, #18] -; CHECK-NEXT: sbfx w9, w8, #8, #1 -; CHECK-NEXT: strh w9, [sp, #16] -; CHECK-NEXT: sbfx w9, w8, #7, #1 -; CHECK-NEXT: strh w9, [sp, #14] -; CHECK-NEXT: sbfx w9, w8, #6, #1 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: sbfx w9, w8, #5, #1 -; CHECK-NEXT: strh w9, [sp, #10] -; CHECK-NEXT: sbfx w9, w8, #4, #1 -; CHECK-NEXT: strh w9, [sp, #8] -; CHECK-NEXT: sbfx w9, w8, #3, #1 -; CHECK-NEXT: strh w9, [sp, #6] -; CHECK-NEXT: sbfx w9, w8, #2, #1 -; CHECK-NEXT: strh w9, [sp, #4] -; CHECK-NEXT: sbfx w9, w8, #1, #1 -; CHECK-NEXT: sbfx w8, w8, #0, #1 -; CHECK-NEXT: strh w9, [sp, #2] -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: strh w8, [sp] -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x9] +; CHECK-NEXT: sbfx w10, w9, #15, #1 +; CHECK-NEXT: sbfx w11, w9, #14, #1 +; CHECK-NEXT: sbfx w12, w9, #13, #1 +; CHECK-NEXT: strh w10, [sp, #30] +; CHECK-NEXT: sbfx w10, w9, #12, #1 +; CHECK-NEXT: strh w11, [sp, #28] +; CHECK-NEXT: sbfx w11, w9, #11, #1 +; CHECK-NEXT: strh w12, [sp, #26] +; CHECK-NEXT: sbfx w12, w9, #10, #1 +; CHECK-NEXT: strh w10, [sp, #24] +; CHECK-NEXT: sbfx w10, w9, #9, #1 +; CHECK-NEXT: strh w11, [sp, #22] +; CHECK-NEXT: sbfx w11, w9, #8, #1 +; CHECK-NEXT: strh w12, [sp, #20] +; CHECK-NEXT: sbfx w12, w9, #7, #1 +; CHECK-NEXT: strh w10, [sp, #18] +; CHECK-NEXT: sbfx w10, w9, #6, #1 +; CHECK-NEXT: strh w11, [sp, #16] +; CHECK-NEXT: sbfx w11, w9, #5, #1 +; CHECK-NEXT: strh w12, [sp, #14] +; CHECK-NEXT: sbfx w12, w9, #4, #1 +; CHECK-NEXT: strh w10, [sp, #12] +; CHECK-NEXT: sbfx w10, w9, #3, #1 +; CHECK-NEXT: strh w11, [sp, #10] +; CHECK-NEXT: sbfx w11, w9, #2, #1 +; CHECK-NEXT: strh w12, [sp, #8] +; CHECK-NEXT: sbfx w12, w9, #1, #1 +; CHECK-NEXT: sbfx w9, w9, #0, #1 +; CHECK-NEXT: strh w10, [sp, #6] +; CHECK-NEXT: strh w11, [sp, #4] +; CHECK-NEXT: strh w12, [sp, #2] +; CHECK-NEXT: strh w9, [sp] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] ; CHECK-NEXT: and z0.h, z0.h, #0x1 @@ -1235,75 +1235,75 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: ldr w8, [x2] +; VBITS_GE_512-NEXT: ldr w9, [x2] +; VBITS_GE_512-NEXT: mov x8, sp ; 
VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ptrue p1.h -; VBITS_GE_512-NEXT: asr w9, w8, #31 -; VBITS_GE_512-NEXT: strh w9, [sp, #62] -; VBITS_GE_512-NEXT: sbfx w9, w8, #30, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #60] -; VBITS_GE_512-NEXT: sbfx w9, w8, #29, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #58] -; VBITS_GE_512-NEXT: sbfx w9, w8, #28, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #56] -; VBITS_GE_512-NEXT: sbfx w9, w8, #27, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #54] -; VBITS_GE_512-NEXT: sbfx w9, w8, #26, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #52] -; VBITS_GE_512-NEXT: sbfx w9, w8, #25, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #50] -; VBITS_GE_512-NEXT: sbfx w9, w8, #24, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #48] -; VBITS_GE_512-NEXT: sbfx w9, w8, #23, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #46] -; VBITS_GE_512-NEXT: sbfx w9, w8, #22, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #44] -; VBITS_GE_512-NEXT: sbfx w9, w8, #21, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #42] -; VBITS_GE_512-NEXT: sbfx w9, w8, #20, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #40] -; VBITS_GE_512-NEXT: sbfx w9, w8, #19, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #38] -; VBITS_GE_512-NEXT: sbfx w9, w8, #18, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #36] -; VBITS_GE_512-NEXT: sbfx w9, w8, #17, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #34] -; VBITS_GE_512-NEXT: sbfx w9, w8, #16, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #32] -; VBITS_GE_512-NEXT: sbfx w9, w8, #15, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #30] -; VBITS_GE_512-NEXT: sbfx w9, w8, #14, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #28] -; VBITS_GE_512-NEXT: sbfx w9, w8, #13, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #26] -; VBITS_GE_512-NEXT: sbfx w9, w8, #12, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #24] -; VBITS_GE_512-NEXT: sbfx w9, w8, #11, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #22] -; VBITS_GE_512-NEXT: sbfx w9, w8, #10, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #20] -; VBITS_GE_512-NEXT: sbfx w9, w8, #9, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #18] -; VBITS_GE_512-NEXT: sbfx w9, w8, #8, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #16] -; VBITS_GE_512-NEXT: sbfx w9, w8, #7, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #14] -; VBITS_GE_512-NEXT: sbfx w9, w8, #6, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #12] -; VBITS_GE_512-NEXT: sbfx w9, w8, #5, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #10] -; VBITS_GE_512-NEXT: sbfx w9, w8, #4, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #8] -; VBITS_GE_512-NEXT: sbfx w9, w8, #3, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #6] -; VBITS_GE_512-NEXT: sbfx w9, w8, #2, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #4] -; VBITS_GE_512-NEXT: sbfx w9, w8, #1, #1 -; VBITS_GE_512-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_512-NEXT: strh w9, [sp, #2] -; VBITS_GE_512-NEXT: mov x9, sp -; VBITS_GE_512-NEXT: strh w8, [sp] -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x9] +; VBITS_GE_512-NEXT: asr w10, w9, #31 +; VBITS_GE_512-NEXT: sbfx w11, w9, #30, #1 +; VBITS_GE_512-NEXT: sbfx w12, w9, #29, #1 +; VBITS_GE_512-NEXT: strh w10, [sp, #62] +; VBITS_GE_512-NEXT: sbfx w10, w9, #28, #1 +; VBITS_GE_512-NEXT: strh w11, [sp, #60] +; VBITS_GE_512-NEXT: sbfx w11, w9, #27, #1 +; VBITS_GE_512-NEXT: strh w12, [sp, #58] +; VBITS_GE_512-NEXT: sbfx w12, w9, #26, #1 +; VBITS_GE_512-NEXT: strh w10, [sp, #56] +; VBITS_GE_512-NEXT: sbfx w10, w9, #25, #1 +; VBITS_GE_512-NEXT: strh w11, [sp, #54] +; VBITS_GE_512-NEXT: sbfx w11, w9, #24, #1 +; VBITS_GE_512-NEXT: strh w12, [sp, #52] +; VBITS_GE_512-NEXT: sbfx w12, w9, #23, #1 +; VBITS_GE_512-NEXT: strh w10, [sp, #50] +; VBITS_GE_512-NEXT: sbfx w10, w9, #22, #1 +; 
VBITS_GE_512-NEXT: strh w11, [sp, #48] +; VBITS_GE_512-NEXT: sbfx w11, w9, #21, #1 +; VBITS_GE_512-NEXT: strh w12, [sp, #46] +; VBITS_GE_512-NEXT: sbfx w12, w9, #20, #1 +; VBITS_GE_512-NEXT: strh w10, [sp, #44] +; VBITS_GE_512-NEXT: sbfx w10, w9, #19, #1 +; VBITS_GE_512-NEXT: strh w11, [sp, #42] +; VBITS_GE_512-NEXT: sbfx w11, w9, #18, #1 +; VBITS_GE_512-NEXT: strh w12, [sp, #40] +; VBITS_GE_512-NEXT: sbfx w12, w9, #17, #1 +; VBITS_GE_512-NEXT: strh w10, [sp, #38] +; VBITS_GE_512-NEXT: sbfx w10, w9, #16, #1 +; VBITS_GE_512-NEXT: strh w11, [sp, #36] +; VBITS_GE_512-NEXT: sbfx w11, w9, #15, #1 +; VBITS_GE_512-NEXT: strh w12, [sp, #34] +; VBITS_GE_512-NEXT: sbfx w12, w9, #14, #1 +; VBITS_GE_512-NEXT: strh w10, [sp, #32] +; VBITS_GE_512-NEXT: sbfx w10, w9, #13, #1 +; VBITS_GE_512-NEXT: strh w11, [sp, #30] +; VBITS_GE_512-NEXT: sbfx w11, w9, #12, #1 +; VBITS_GE_512-NEXT: strh w12, [sp, #28] +; VBITS_GE_512-NEXT: sbfx w12, w9, #11, #1 +; VBITS_GE_512-NEXT: strh w10, [sp, #26] +; VBITS_GE_512-NEXT: sbfx w10, w9, #10, #1 +; VBITS_GE_512-NEXT: strh w11, [sp, #24] +; VBITS_GE_512-NEXT: sbfx w11, w9, #9, #1 +; VBITS_GE_512-NEXT: strh w12, [sp, #22] +; VBITS_GE_512-NEXT: sbfx w12, w9, #8, #1 +; VBITS_GE_512-NEXT: strh w10, [sp, #20] +; VBITS_GE_512-NEXT: sbfx w10, w9, #7, #1 +; VBITS_GE_512-NEXT: strh w11, [sp, #18] +; VBITS_GE_512-NEXT: sbfx w11, w9, #6, #1 +; VBITS_GE_512-NEXT: strh w12, [sp, #16] +; VBITS_GE_512-NEXT: sbfx w12, w9, #5, #1 +; VBITS_GE_512-NEXT: strh w10, [sp, #14] +; VBITS_GE_512-NEXT: sbfx w10, w9, #4, #1 +; VBITS_GE_512-NEXT: strh w11, [sp, #12] +; VBITS_GE_512-NEXT: sbfx w11, w9, #3, #1 +; VBITS_GE_512-NEXT: strh w12, [sp, #10] +; VBITS_GE_512-NEXT: sbfx w12, w9, #2, #1 +; VBITS_GE_512-NEXT: strh w10, [sp, #8] +; VBITS_GE_512-NEXT: sbfx w10, w9, #1, #1 +; VBITS_GE_512-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_512-NEXT: strh w11, [sp, #6] +; VBITS_GE_512-NEXT: strh w12, [sp, #4] +; VBITS_GE_512-NEXT: strh w10, [sp, #2] +; VBITS_GE_512-NEXT: strh w9, [sp] +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x8] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x1 @@ -1331,139 +1331,139 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: ldr x8, [x2] +; VBITS_GE_1024-NEXT: ldr x9, [x2] +; VBITS_GE_1024-NEXT: mov x8, sp ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 ; VBITS_GE_1024-NEXT: ptrue p1.h -; VBITS_GE_1024-NEXT: asr x9, x8, #63 -; VBITS_GE_1024-NEXT: strh w9, [sp, #126] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #62, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #124] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #61, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #122] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #60, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #120] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #59, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #118] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #58, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #116] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #57, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #114] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #56, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #112] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #55, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #110] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #54, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #108] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #53, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #106] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #52, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #104] -; 
VBITS_GE_1024-NEXT: sbfx x9, x8, #51, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #102] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #50, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #100] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #49, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #98] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #48, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #96] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #47, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #94] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #46, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #92] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #45, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #90] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #44, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #88] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #43, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #86] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #42, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #84] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #41, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #82] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #40, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #80] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #39, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #78] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #38, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #76] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #37, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #74] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #36, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #72] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #35, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #70] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #34, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #68] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #33, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #66] -; VBITS_GE_1024-NEXT: sbfx x9, x8, #32, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #64] -; VBITS_GE_1024-NEXT: asr w9, w8, #31 -; VBITS_GE_1024-NEXT: strh w9, [sp, #62] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #30, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #60] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #29, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #58] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #28, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #56] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #27, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #54] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #26, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #52] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #25, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #50] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #24, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #48] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #23, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #46] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #22, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #44] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #21, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #42] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #20, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #40] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #19, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #38] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #18, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #36] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #17, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #34] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #16, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #32] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #15, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #30] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #14, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #28] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #13, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #26] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #12, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #24] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #11, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #22] -; VBITS_GE_1024-NEXT: sbfx w9, 
w8, #10, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #20] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #9, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #18] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #8, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #16] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #7, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #14] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #6, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #12] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #5, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #10] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #4, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #8] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #3, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #6] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #2, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #4] -; VBITS_GE_1024-NEXT: sbfx w9, w8, #1, #1 -; VBITS_GE_1024-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_1024-NEXT: strh w9, [sp, #2] -; VBITS_GE_1024-NEXT: mov x9, sp -; VBITS_GE_1024-NEXT: strh w8, [sp] -; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x9] +; VBITS_GE_1024-NEXT: asr x10, x9, #63 +; VBITS_GE_1024-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_1024-NEXT: sbfx x12, x9, #61, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #126] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #60, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #124] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #59, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #122] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #58, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #120] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #57, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #118] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #56, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #116] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #55, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #114] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #54, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #112] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #53, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #110] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #52, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #108] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #106] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #104] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #49, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #102] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #48, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #100] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #47, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #98] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #46, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #96] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #45, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #94] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #44, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #92] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #43, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #90] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #42, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #88] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #41, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #86] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #40, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #84] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #82] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #38, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #80] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #37, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #78] +; VBITS_GE_1024-NEXT: sbfx x10, x9, #36, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #76] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #35, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #74] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #34, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #72] +; 
VBITS_GE_1024-NEXT: sbfx x10, x9, #33, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #70] +; VBITS_GE_1024-NEXT: sbfx x11, x9, #32, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #68] +; VBITS_GE_1024-NEXT: asr w12, w9, #31 +; VBITS_GE_1024-NEXT: strh w10, [sp, #66] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #30, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #64] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #29, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #62] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #28, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #60] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #58] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #56] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #25, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #54] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #24, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #52] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #23, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #50] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #22, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #48] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #21, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #46] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #20, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #44] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #19, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #42] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #18, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #40] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #17, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #38] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #16, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #36] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #34] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #32] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #30] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #12, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #28] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #11, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #26] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #10, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #24] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #9, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #22] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #8, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #20] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #7, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #18] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #6, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #16] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #5, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #14] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #4, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #12] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_1024-NEXT: strh w11, [sp, #10] +; VBITS_GE_1024-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_1024-NEXT: strh w12, [sp, #8] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_1024-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_1024-NEXT: strh w10, [sp, #6] +; VBITS_GE_1024-NEXT: strh w11, [sp, #4] +; VBITS_GE_1024-NEXT: strh w12, [sp, #2] +; VBITS_GE_1024-NEXT: strh w9, [sp] +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x8] ; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x0] ; VBITS_GE_1024-NEXT: ld1h { z2.h }, p0/z, [x1] ; VBITS_GE_1024-NEXT: and z0.h, z0.h, #0x1 @@ -1491,268 +1491,268 @@ ; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 ; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 -; VBITS_GE_2048-NEXT: ldr x8, [x2, #8] +; VBITS_GE_2048-NEXT: ldr x9, [x2, #8] +; VBITS_GE_2048-NEXT: mov x8, sp ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 ; 
VBITS_GE_2048-NEXT: ptrue p1.h -; VBITS_GE_2048-NEXT: asr x9, x8, #63 -; VBITS_GE_2048-NEXT: strh w9, [sp, #254] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #62, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #252] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #61, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #250] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #60, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #248] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #59, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #246] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #58, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #244] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #57, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #242] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #56, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #240] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #55, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #238] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #54, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #236] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #53, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #234] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #52, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #232] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #51, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #230] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #50, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #228] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #49, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #226] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #48, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #224] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #47, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #222] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #46, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #220] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #45, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #218] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #44, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #216] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #43, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #214] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #42, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #212] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #41, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #210] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #40, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #208] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #39, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #206] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #38, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #204] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #37, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #202] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #36, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #200] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #35, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #198] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #34, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #196] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #33, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #194] -; VBITS_GE_2048-NEXT: sbfx x9, x8, #32, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #192] -; VBITS_GE_2048-NEXT: asr w9, w8, #31 -; VBITS_GE_2048-NEXT: strh w9, [sp, #190] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #30, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #188] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #29, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #186] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #28, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #184] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #27, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #182] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #26, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #180] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #25, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #178] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #24, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #176] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #23, #1 -; 
VBITS_GE_2048-NEXT: strh w9, [sp, #174] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #22, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #172] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #21, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #170] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #20, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #168] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #19, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #166] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #18, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #164] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #17, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #162] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #16, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #160] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #15, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #158] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #14, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #156] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #13, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #154] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #12, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #152] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #11, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #150] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #10, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #148] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #9, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #146] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #8, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #144] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #7, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #142] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #6, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #140] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #5, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #138] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #4, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #136] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #3, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #134] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #2, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #132] -; VBITS_GE_2048-NEXT: sbfx w9, w8, #1, #1 -; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_2048-NEXT: strh w9, [sp, #130] -; VBITS_GE_2048-NEXT: strh w8, [sp, #128] -; VBITS_GE_2048-NEXT: ldr x8, [x2] -; VBITS_GE_2048-NEXT: mov x9, sp -; VBITS_GE_2048-NEXT: asr x10, x8, #63 +; VBITS_GE_2048-NEXT: asr x10, x9, #63 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x9, #61, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #254] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #60, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #252] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #59, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #250] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #58, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #248] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #57, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #246] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #56, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #244] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #55, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #242] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #54, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #240] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #53, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #238] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #52, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #236] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #234] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #232] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #49, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #230] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #48, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #228] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #47, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, 
#226] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #46, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #224] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #45, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #222] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #44, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #220] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #43, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #218] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #42, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #216] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #41, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #214] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #40, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #212] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #210] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #38, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #208] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #37, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #206] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #36, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #204] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #35, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #202] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #34, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #200] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #33, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #198] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #32, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #196] +; VBITS_GE_2048-NEXT: asr w12, w9, #31 +; VBITS_GE_2048-NEXT: strh w10, [sp, #194] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #30, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #192] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #29, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #190] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #28, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #188] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #186] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #184] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #25, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #182] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #24, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #180] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #23, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #178] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #22, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #176] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #21, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #174] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #20, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #172] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #19, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #170] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #18, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #168] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #17, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #166] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #16, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #164] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #162] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #160] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #158] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #12, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #156] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #11, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #154] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #10, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #152] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #9, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #150] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #8, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #148] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #7, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, 
#146] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #6, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #144] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #5, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #142] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #4, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #140] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #138] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #136] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_2048-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_2048-NEXT: strh w10, [sp, #134] +; VBITS_GE_2048-NEXT: strh w11, [sp, #132] +; VBITS_GE_2048-NEXT: strh w12, [sp, #130] +; VBITS_GE_2048-NEXT: strh w9, [sp, #128] +; VBITS_GE_2048-NEXT: ldr x9, [x2] +; VBITS_GE_2048-NEXT: asr x10, x9, #63 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x9, #61, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #126] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #62, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #124] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #61, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #122] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #60, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #60, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #124] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #59, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #122] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #58, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #120] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #59, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #118] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #58, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #116] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #57, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #57, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #118] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #56, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #116] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #55, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #114] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #56, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #112] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #55, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #110] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #54, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #54, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #112] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #53, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #110] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #52, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #108] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #53, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #106] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #52, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #104] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #51, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #106] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #104] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #49, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #102] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #50, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #100] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #49, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #98] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #48, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #48, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #100] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #47, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #98] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #46, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #96] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #47, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #94] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #46, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #92] -; 
VBITS_GE_2048-NEXT: sbfx x10, x8, #45, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #45, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #94] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #44, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #92] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #43, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #90] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #44, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #88] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #43, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #86] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #42, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #42, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #88] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #41, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #86] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #40, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #84] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #41, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #82] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #40, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #80] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #39, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #82] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #38, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #80] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #37, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #78] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #38, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #76] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #37, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #74] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #36, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #36, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #76] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #35, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #74] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #34, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #72] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #35, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #70] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #34, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #68] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #33, #1 +; VBITS_GE_2048-NEXT: sbfx x10, x9, #33, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #70] +; VBITS_GE_2048-NEXT: sbfx x11, x9, #32, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #68] +; VBITS_GE_2048-NEXT: asr w12, w9, #31 ; VBITS_GE_2048-NEXT: strh w10, [sp, #66] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #32, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #64] -; VBITS_GE_2048-NEXT: asr w10, w8, #31 -; VBITS_GE_2048-NEXT: strh w10, [sp, #62] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #30, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #30, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #64] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #29, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #62] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #28, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #60] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #29, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #58] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #28, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #56] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #27, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #58] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #56] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #25, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #54] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #26, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #52] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #25, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #50] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #24, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #24, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #52] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #23, #1 
+; VBITS_GE_2048-NEXT: strh w12, [sp, #50] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #22, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #48] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #23, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #46] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #22, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #44] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #21, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #21, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #46] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #20, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #44] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #19, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #42] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #20, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #40] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #19, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #38] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #18, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #18, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #40] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #17, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #38] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #16, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #36] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #17, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #34] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #16, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #32] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #15, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #34] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #32] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #13, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #30] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #14, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #28] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #13, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #26] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #12, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #12, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #28] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #11, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #26] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #10, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #24] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #11, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #22] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #10, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #20] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #9, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #9, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #22] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #8, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #20] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #7, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #18] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #8, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #16] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #7, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #14] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #6, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #6, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #16] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #5, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #14] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #4, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #12] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #5, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #10] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #4, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #8] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #3, #1 +; VBITS_GE_2048-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_2048-NEXT: strh w11, [sp, #10] +; VBITS_GE_2048-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_2048-NEXT: strh w12, [sp, #8] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_2048-NEXT: sbfx w9, w9, #0, #1 ; VBITS_GE_2048-NEXT: strh w10, [sp, #6] -; 
VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #4] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #1, #1 -; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_2048-NEXT: strh w10, [sp, #2] -; VBITS_GE_2048-NEXT: strh w8, [sp] -; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x9] +; VBITS_GE_2048-NEXT: strh w11, [sp, #4] +; VBITS_GE_2048-NEXT: strh w12, [sp, #2] +; VBITS_GE_2048-NEXT: strh w9, [sp] +; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x8] ; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1h { z2.h }, p0/z, [x1] ; VBITS_GE_2048-NEXT: and z0.h, z0.h, #0x1 @@ -1805,23 +1805,23 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldrb w8, [x2] +; CHECK-NEXT: ldrb w9, [x2] +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: sbfx w10, w8, #7, #1 -; CHECK-NEXT: sbfx w11, w8, #6, #1 +; CHECK-NEXT: sbfx w10, w9, #7, #1 +; CHECK-NEXT: sbfx w11, w9, #6, #1 +; CHECK-NEXT: sbfx w12, w9, #5, #1 +; CHECK-NEXT: sbfx w13, w9, #4, #1 ; CHECK-NEXT: stp w11, w10, [sp, #24] -; CHECK-NEXT: sbfx w10, w8, #3, #1 -; CHECK-NEXT: sbfx w11, w8, #2, #1 -; CHECK-NEXT: sbfx w12, w8, #5, #1 -; CHECK-NEXT: sbfx w13, w8, #4, #1 -; CHECK-NEXT: stp w11, w10, [sp, #8] -; CHECK-NEXT: sbfx w10, w8, #1, #1 -; CHECK-NEXT: sbfx w8, w8, #0, #1 +; CHECK-NEXT: sbfx w10, w9, #3, #1 +; CHECK-NEXT: sbfx w11, w9, #2, #1 ; CHECK-NEXT: stp w13, w12, [sp, #16] -; CHECK-NEXT: stp w8, w10, [sp] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9] +; CHECK-NEXT: sbfx w12, w9, #1, #1 +; CHECK-NEXT: sbfx w9, w9, #0, #1 +; CHECK-NEXT: stp w11, w10, [sp, #8] +; CHECK-NEXT: stp w9, w12, [sp] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] ; CHECK-NEXT: and z0.s, z0.s, #0x1 @@ -1849,35 +1849,35 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: ldrh w8, [x2] +; VBITS_GE_512-NEXT: ldrh w9, [x2] +; VBITS_GE_512-NEXT: mov x8, sp ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: mov x9, sp ; VBITS_GE_512-NEXT: ptrue p1.s -; VBITS_GE_512-NEXT: sbfx w10, w8, #15, #1 -; VBITS_GE_512-NEXT: sbfx w11, w8, #14, #1 +; VBITS_GE_512-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_512-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_512-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_512-NEXT: sbfx w13, w9, #12, #1 ; VBITS_GE_512-NEXT: stp w11, w10, [sp, #56] -; VBITS_GE_512-NEXT: sbfx w10, w8, #7, #1 -; VBITS_GE_512-NEXT: sbfx w11, w8, #6, #1 -; VBITS_GE_512-NEXT: sbfx w12, w8, #13, #1 -; VBITS_GE_512-NEXT: sbfx w13, w8, #12, #1 -; VBITS_GE_512-NEXT: stp w11, w10, [sp, #24] -; VBITS_GE_512-NEXT: sbfx w10, w8, #3, #1 -; VBITS_GE_512-NEXT: sbfx w11, w8, #2, #1 -; VBITS_GE_512-NEXT: sbfx w14, w8, #11, #1 -; VBITS_GE_512-NEXT: sbfx w15, w8, #10, #1 -; VBITS_GE_512-NEXT: sbfx w16, w8, #9, #1 -; VBITS_GE_512-NEXT: sbfx w17, w8, #8, #1 +; VBITS_GE_512-NEXT: sbfx w10, w9, #11, #1 +; VBITS_GE_512-NEXT: sbfx w11, w9, #10, #1 ; VBITS_GE_512-NEXT: stp w13, w12, [sp, #48] -; VBITS_GE_512-NEXT: sbfx w12, w8, #5, #1 -; VBITS_GE_512-NEXT: sbfx w13, w8, #4, #1 -; VBITS_GE_512-NEXT: stp w11, w10, [sp, #8] -; VBITS_GE_512-NEXT: sbfx w10, w8, #1, #1 -; VBITS_GE_512-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_512-NEXT: stp w15, w14, [sp, #40] -; VBITS_GE_512-NEXT: stp w17, w16, [sp, #32] +; VBITS_GE_512-NEXT: sbfx w12, w9, #9, #1 +; VBITS_GE_512-NEXT: sbfx w13, w9, 
#8, #1 +; VBITS_GE_512-NEXT: stp w11, w10, [sp, #40] +; VBITS_GE_512-NEXT: sbfx w10, w9, #7, #1 +; VBITS_GE_512-NEXT: sbfx w11, w9, #6, #1 +; VBITS_GE_512-NEXT: stp w13, w12, [sp, #32] +; VBITS_GE_512-NEXT: sbfx w12, w9, #5, #1 +; VBITS_GE_512-NEXT: sbfx w13, w9, #4, #1 +; VBITS_GE_512-NEXT: stp w11, w10, [sp, #24] +; VBITS_GE_512-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_512-NEXT: sbfx w11, w9, #2, #1 ; VBITS_GE_512-NEXT: stp w13, w12, [sp, #16] -; VBITS_GE_512-NEXT: stp w8, w10, [sp] -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x9] +; VBITS_GE_512-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_512-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_512-NEXT: stp w11, w10, [sp, #8] +; VBITS_GE_512-NEXT: stp w9, w12, [sp] +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x8] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: and z0.s, z0.s, #0x1 @@ -1898,68 +1898,66 @@ define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, <32 x i1>* %c) #0 { ; VBITS_GE_1024-LABEL: select_v32i32: ; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; VBITS_GE_1024-NEXT: sub x9, sp, #224 -; VBITS_GE_1024-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; VBITS_GE_1024-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; VBITS_GE_1024-NEXT: sub x9, sp, #240 ; VBITS_GE_1024-NEXT: mov x29, sp ; VBITS_GE_1024-NEXT: and sp, x9, #0xffffffffffffff80 -; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 32 -; VBITS_GE_1024-NEXT: .cfi_offset w19, -16 -; VBITS_GE_1024-NEXT: .cfi_offset w30, -24 -; VBITS_GE_1024-NEXT: .cfi_offset w29, -32 -; VBITS_GE_1024-NEXT: ldr w8, [x2] +; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 +; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 +; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 +; VBITS_GE_1024-NEXT: ldr w9, [x2] +; VBITS_GE_1024-NEXT: mov x8, sp ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: mov x9, sp ; VBITS_GE_1024-NEXT: ptrue p1.s -; VBITS_GE_1024-NEXT: asr w10, w8, #31 -; VBITS_GE_1024-NEXT: sbfx w11, w8, #30, #1 +; VBITS_GE_1024-NEXT: asr w10, w9, #31 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #30, #1 +; VBITS_GE_1024-NEXT: sbfx w12, w9, #29, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #28, #1 ; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #120] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #15, #1 -; VBITS_GE_1024-NEXT: sbfx w11, w8, #14, #1 -; VBITS_GE_1024-NEXT: sbfx w12, w8, #29, #1 -; VBITS_GE_1024-NEXT: sbfx w13, w8, #28, #1 -; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #56] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #7, #1 -; VBITS_GE_1024-NEXT: sbfx w11, w8, #6, #1 -; VBITS_GE_1024-NEXT: sbfx w14, w8, #27, #1 -; VBITS_GE_1024-NEXT: sbfx w15, w8, #26, #1 -; VBITS_GE_1024-NEXT: sbfx w16, w8, #25, #1 -; VBITS_GE_1024-NEXT: sbfx w17, w8, #24, #1 +; VBITS_GE_1024-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #26, #1 ; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #112] -; VBITS_GE_1024-NEXT: sbfx w12, w8, #13, #1 -; VBITS_GE_1024-NEXT: sbfx w13, w8, #12, #1 -; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #24] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #3, #1 -; VBITS_GE_1024-NEXT: sbfx w11, w8, #2, #1 -; VBITS_GE_1024-NEXT: sbfx w18, w8, #23, #1 -; VBITS_GE_1024-NEXT: sbfx w2, w8, #22, #1 -; VBITS_GE_1024-NEXT: sbfx w3, w8, #21, #1 -; VBITS_GE_1024-NEXT: sbfx w4, w8, #20, #1 -; VBITS_GE_1024-NEXT: sbfx w5, w8, #19, #1 -; VBITS_GE_1024-NEXT: sbfx w6, w8, #18, #1 -; VBITS_GE_1024-NEXT: sbfx w7, w8, #17, #1 -; VBITS_GE_1024-NEXT: sbfx w19, w8, #16, #1 -; VBITS_GE_1024-NEXT: stp w15, w14, [sp, #104] -; VBITS_GE_1024-NEXT: stp w17, 
w16, [sp, #96] -; VBITS_GE_1024-NEXT: sbfx w14, w8, #11, #1 -; VBITS_GE_1024-NEXT: sbfx w15, w8, #10, #1 -; VBITS_GE_1024-NEXT: sbfx w16, w8, #9, #1 -; VBITS_GE_1024-NEXT: sbfx w17, w8, #8, #1 +; VBITS_GE_1024-NEXT: sbfx w12, w9, #25, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #24, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #104] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #23, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #22, #1 +; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #96] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #21, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #20, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #88] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #19, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #18, #1 +; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #80] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #17, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #16, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #72] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #64] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #12, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #56] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #11, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #10, #1 ; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #48] -; VBITS_GE_1024-NEXT: sbfx w12, w8, #5, #1 -; VBITS_GE_1024-NEXT: sbfx w13, w8, #4, #1 -; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #8] -; VBITS_GE_1024-NEXT: sbfx w10, w8, #1, #1 -; VBITS_GE_1024-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_1024-NEXT: stp w2, w18, [sp, #88] -; VBITS_GE_1024-NEXT: stp w4, w3, [sp, #80] -; VBITS_GE_1024-NEXT: stp w6, w5, [sp, #72] -; VBITS_GE_1024-NEXT: stp w19, w7, [sp, #64] -; VBITS_GE_1024-NEXT: stp w15, w14, [sp, #40] -; VBITS_GE_1024-NEXT: stp w17, w16, [sp, #32] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #9, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #8, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #40] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #7, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #6, #1 +; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #32] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #5, #1 +; VBITS_GE_1024-NEXT: sbfx w13, w9, #4, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #24] +; VBITS_GE_1024-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_1024-NEXT: sbfx w11, w9, #2, #1 ; VBITS_GE_1024-NEXT: stp w13, w12, [sp, #16] -; VBITS_GE_1024-NEXT: stp w8, w10, [sp] -; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x9] +; VBITS_GE_1024-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_1024-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_1024-NEXT: stp w11, w10, [sp, #8] +; VBITS_GE_1024-NEXT: stp w9, w12, [sp] +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x8] ; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_1024-NEXT: ld1w { z2.s }, p0/z, [x1] ; VBITS_GE_1024-NEXT: and z0.s, z0.s, #0x1 @@ -1967,8 +1965,7 @@ ; VBITS_GE_1024-NEXT: sel z0.s, p1, z1.s, z2.s ; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_1024-NEXT: mov sp, x29 -; VBITS_GE_1024-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; VBITS_GE_1024-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; VBITS_GE_1024-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; VBITS_GE_1024-NEXT: ret %mask = load <32 x i1>, <32 x i1>* %c %op1 = load <32 x i32>, <32 x i32>* %a @@ -1981,161 +1978,114 @@ define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, <64 x i1>* %c) #0 { ; VBITS_GE_2048-LABEL: select_v64i32: ; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-96]! 
// 16-byte Folded Spill -; VBITS_GE_2048-NEXT: sub x9, sp, #672 -; VBITS_GE_2048-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill -; VBITS_GE_2048-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill -; VBITS_GE_2048-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill -; VBITS_GE_2048-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill -; VBITS_GE_2048-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; VBITS_GE_2048-NEXT: sub x9, sp, #496 ; VBITS_GE_2048-NEXT: mov x29, sp ; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 -; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 96 -; VBITS_GE_2048-NEXT: .cfi_offset w19, -8 -; VBITS_GE_2048-NEXT: .cfi_offset w20, -16 -; VBITS_GE_2048-NEXT: .cfi_offset w21, -24 -; VBITS_GE_2048-NEXT: .cfi_offset w22, -32 -; VBITS_GE_2048-NEXT: .cfi_offset w23, -40 -; VBITS_GE_2048-NEXT: .cfi_offset w24, -48 -; VBITS_GE_2048-NEXT: .cfi_offset w25, -56 -; VBITS_GE_2048-NEXT: .cfi_offset w26, -64 -; VBITS_GE_2048-NEXT: .cfi_offset w27, -72 -; VBITS_GE_2048-NEXT: .cfi_offset w28, -80 -; VBITS_GE_2048-NEXT: .cfi_offset w30, -88 -; VBITS_GE_2048-NEXT: .cfi_offset w29, -96 -; VBITS_GE_2048-NEXT: ldr x8, [x2] +; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 +; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 +; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 +; VBITS_GE_2048-NEXT: ldr x9, [x2] +; VBITS_GE_2048-NEXT: mov x8, sp ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 -; VBITS_GE_2048-NEXT: add x9, sp, #256 ; VBITS_GE_2048-NEXT: ptrue p1.s -; VBITS_GE_2048-NEXT: asr x10, x8, #63 -; VBITS_GE_2048-NEXT: str w10, [sp, #508] -; VBITS_GE_2048-NEXT: sbfx x10, x8, #37, #1 -; VBITS_GE_2048-NEXT: sbfx x11, x8, #62, #1 -; VBITS_GE_2048-NEXT: str w10, [sp, #404] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #18, #1 -; VBITS_GE_2048-NEXT: sbfx x12, x8, #61, #1 -; VBITS_GE_2048-NEXT: sbfx x13, x8, #60, #1 -; VBITS_GE_2048-NEXT: sbfx x14, x8, #59, #1 -; VBITS_GE_2048-NEXT: str w11, [sp, #504] -; VBITS_GE_2048-NEXT: sbfx x11, x8, #36, #1 -; VBITS_GE_2048-NEXT: str w10, [sp, #328] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #9, #1 -; VBITS_GE_2048-NEXT: sbfx x15, x8, #58, #1 -; VBITS_GE_2048-NEXT: sbfx x16, x8, #57, #1 -; VBITS_GE_2048-NEXT: sbfx x17, x8, #56, #1 -; VBITS_GE_2048-NEXT: sbfx x18, x8, #55, #1 -; VBITS_GE_2048-NEXT: str w12, [sp, #500] -; VBITS_GE_2048-NEXT: sbfx x12, x8, #35, #1 -; VBITS_GE_2048-NEXT: str w13, [sp, #496] -; VBITS_GE_2048-NEXT: sbfx x13, x8, #34, #1 -; VBITS_GE_2048-NEXT: str w14, [sp, #492] -; VBITS_GE_2048-NEXT: sbfx x14, x8, #33, #1 -; VBITS_GE_2048-NEXT: str w11, [sp, #400] -; VBITS_GE_2048-NEXT: sbfx w11, w8, #17, #1 -; VBITS_GE_2048-NEXT: str w10, [sp, #292] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #4, #1 -; VBITS_GE_2048-NEXT: sbfx x2, x8, #54, #1 -; VBITS_GE_2048-NEXT: sbfx x3, x8, #53, #1 -; VBITS_GE_2048-NEXT: sbfx x4, x8, #52, #1 -; VBITS_GE_2048-NEXT: sbfx x5, x8, #51, #1 -; VBITS_GE_2048-NEXT: sbfx x6, x8, #50, #1 -; VBITS_GE_2048-NEXT: sbfx x7, x8, #49, #1 -; VBITS_GE_2048-NEXT: sbfx x19, x8, #48, #1 -; VBITS_GE_2048-NEXT: sbfx x20, x8, #47, #1 -; VBITS_GE_2048-NEXT: sbfx x21, x8, #46, #1 -; VBITS_GE_2048-NEXT: sbfx x22, x8, #45, #1 -; VBITS_GE_2048-NEXT: str w15, [sp, #488] -; VBITS_GE_2048-NEXT: sbfx x15, x8, #32, #1 -; VBITS_GE_2048-NEXT: str w16, [sp, #484] -; VBITS_GE_2048-NEXT: asr w16, w8, #31 -; VBITS_GE_2048-NEXT: str w17, [sp, #480] -; VBITS_GE_2048-NEXT: sbfx w17, w8, #30, #1 -; VBITS_GE_2048-NEXT: str w18, [sp, #476] -; VBITS_GE_2048-NEXT: sbfx w18, w8, #29, #1 -; VBITS_GE_2048-NEXT: str w12, 
[sp, #396] -; VBITS_GE_2048-NEXT: str w13, [sp, #392] -; VBITS_GE_2048-NEXT: str w14, [sp, #388] -; VBITS_GE_2048-NEXT: sbfx w12, w8, #16, #1 -; VBITS_GE_2048-NEXT: sbfx w13, w8, #15, #1 -; VBITS_GE_2048-NEXT: sbfx w14, w8, #14, #1 -; VBITS_GE_2048-NEXT: str w11, [sp, #324] -; VBITS_GE_2048-NEXT: sbfx w11, w8, #8, #1 -; VBITS_GE_2048-NEXT: str w10, [sp, #272] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #2, #1 -; VBITS_GE_2048-NEXT: sbfx x23, x8, #44, #1 -; VBITS_GE_2048-NEXT: sbfx x24, x8, #43, #1 -; VBITS_GE_2048-NEXT: sbfx x25, x8, #42, #1 -; VBITS_GE_2048-NEXT: sbfx x26, x8, #41, #1 -; VBITS_GE_2048-NEXT: sbfx x27, x8, #40, #1 -; VBITS_GE_2048-NEXT: sbfx x28, x8, #39, #1 -; VBITS_GE_2048-NEXT: sbfx x30, x8, #38, #1 -; VBITS_GE_2048-NEXT: str w2, [sp, #472] -; VBITS_GE_2048-NEXT: sbfx w2, w8, #28, #1 -; VBITS_GE_2048-NEXT: str w3, [sp, #468] -; VBITS_GE_2048-NEXT: sbfx w3, w8, #27, #1 -; VBITS_GE_2048-NEXT: str w4, [sp, #464] -; VBITS_GE_2048-NEXT: sbfx w4, w8, #26, #1 -; VBITS_GE_2048-NEXT: str w5, [sp, #460] -; VBITS_GE_2048-NEXT: str w6, [sp, #456] -; VBITS_GE_2048-NEXT: sbfx w5, w8, #25, #1 -; VBITS_GE_2048-NEXT: str w7, [sp, #452] -; VBITS_GE_2048-NEXT: str w19, [sp, #448] -; VBITS_GE_2048-NEXT: sbfx w6, w8, #24, #1 -; VBITS_GE_2048-NEXT: str w20, [sp, #444] -; VBITS_GE_2048-NEXT: str w21, [sp, #440] -; VBITS_GE_2048-NEXT: sbfx w7, w8, #23, #1 -; VBITS_GE_2048-NEXT: str w22, [sp, #436] -; VBITS_GE_2048-NEXT: sbfx w19, w8, #22, #1 -; VBITS_GE_2048-NEXT: sbfx w20, w8, #21, #1 -; VBITS_GE_2048-NEXT: sbfx w21, w8, #20, #1 -; VBITS_GE_2048-NEXT: sbfx w22, w8, #19, #1 -; VBITS_GE_2048-NEXT: str w15, [sp, #384] -; VBITS_GE_2048-NEXT: str w16, [sp, #380] -; VBITS_GE_2048-NEXT: str w17, [sp, #376] -; VBITS_GE_2048-NEXT: str w18, [sp, #372] -; VBITS_GE_2048-NEXT: sbfx w15, w8, #13, #1 -; VBITS_GE_2048-NEXT: sbfx w16, w8, #12, #1 -; VBITS_GE_2048-NEXT: sbfx w17, w8, #11, #1 -; VBITS_GE_2048-NEXT: sbfx w18, w8, #10, #1 -; VBITS_GE_2048-NEXT: str w12, [sp, #320] -; VBITS_GE_2048-NEXT: str w13, [sp, #316] -; VBITS_GE_2048-NEXT: str w14, [sp, #312] -; VBITS_GE_2048-NEXT: sbfx w12, w8, #7, #1 -; VBITS_GE_2048-NEXT: sbfx w13, w8, #6, #1 -; VBITS_GE_2048-NEXT: sbfx w14, w8, #5, #1 -; VBITS_GE_2048-NEXT: str w11, [sp, #288] -; VBITS_GE_2048-NEXT: sbfx w11, w8, #3, #1 -; VBITS_GE_2048-NEXT: str w10, [sp, #264] -; VBITS_GE_2048-NEXT: sbfx w10, w8, #1, #1 -; VBITS_GE_2048-NEXT: sbfx w8, w8, #0, #1 -; VBITS_GE_2048-NEXT: str w23, [sp, #432] -; VBITS_GE_2048-NEXT: str w24, [sp, #428] -; VBITS_GE_2048-NEXT: str w25, [sp, #424] -; VBITS_GE_2048-NEXT: str w26, [sp, #420] -; VBITS_GE_2048-NEXT: str w27, [sp, #416] -; VBITS_GE_2048-NEXT: str w28, [sp, #412] -; VBITS_GE_2048-NEXT: str w30, [sp, #408] -; VBITS_GE_2048-NEXT: str w2, [sp, #368] -; VBITS_GE_2048-NEXT: str w3, [sp, #364] -; VBITS_GE_2048-NEXT: str w4, [sp, #360] -; VBITS_GE_2048-NEXT: str w5, [sp, #356] -; VBITS_GE_2048-NEXT: str w6, [sp, #352] -; VBITS_GE_2048-NEXT: str w7, [sp, #348] -; VBITS_GE_2048-NEXT: str w19, [sp, #344] -; VBITS_GE_2048-NEXT: str w20, [sp, #340] -; VBITS_GE_2048-NEXT: str w21, [sp, #336] -; VBITS_GE_2048-NEXT: str w22, [sp, #332] -; VBITS_GE_2048-NEXT: str w15, [sp, #308] -; VBITS_GE_2048-NEXT: str w16, [sp, #304] -; VBITS_GE_2048-NEXT: str w17, [sp, #300] -; VBITS_GE_2048-NEXT: str w18, [sp, #296] -; VBITS_GE_2048-NEXT: str w12, [sp, #284] -; VBITS_GE_2048-NEXT: str w13, [sp, #280] -; VBITS_GE_2048-NEXT: str w14, [sp, #276] -; VBITS_GE_2048-NEXT: str w11, [sp, #268] -; VBITS_GE_2048-NEXT: str w10, [sp, #260] -; VBITS_GE_2048-NEXT: 
str w8, [sp, #256] -; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x9] +; VBITS_GE_2048-NEXT: asr x10, x9, #63 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #62, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x9, #61, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #60, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #248] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #59, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #58, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #240] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #57, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #56, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #232] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #55, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #54, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #224] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #53, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #52, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #216] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #51, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #50, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #208] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #49, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #48, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #200] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #47, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #46, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #192] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #45, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #44, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #184] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #43, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #42, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #176] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #41, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #40, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #168] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #39, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #38, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #160] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #37, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #36, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #152] +; VBITS_GE_2048-NEXT: sbfx x10, x9, #35, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x9, #34, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #144] +; VBITS_GE_2048-NEXT: sbfx x12, x9, #33, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x9, #32, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #136] +; VBITS_GE_2048-NEXT: asr w10, w9, #31 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #30, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #128] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #29, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #28, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #120] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #27, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #26, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #112] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #25, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #24, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #104] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #23, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #22, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #96] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #21, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #20, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #88] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #19, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #18, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #80] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #17, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #16, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #72] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #15, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #14, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #64] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #13, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #12, #1 +; 
VBITS_GE_2048-NEXT: stp w11, w10, [sp, #56] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #11, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #10, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #48] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #9, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #8, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #40] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #7, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #6, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #32] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #5, #1 +; VBITS_GE_2048-NEXT: sbfx w13, w9, #4, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #24] +; VBITS_GE_2048-NEXT: sbfx w10, w9, #3, #1 +; VBITS_GE_2048-NEXT: sbfx w11, w9, #2, #1 +; VBITS_GE_2048-NEXT: stp w13, w12, [sp, #16] +; VBITS_GE_2048-NEXT: sbfx w12, w9, #1, #1 +; VBITS_GE_2048-NEXT: sbfx w9, w9, #0, #1 +; VBITS_GE_2048-NEXT: stp w11, w10, [sp, #8] +; VBITS_GE_2048-NEXT: stp w9, w12, [sp] +; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x8] ; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1w { z2.s }, p0/z, [x1] ; VBITS_GE_2048-NEXT: and z0.s, z0.s, #0x1 @@ -2143,12 +2093,7 @@ ; VBITS_GE_2048-NEXT: sel z0.s, p1, z1.s, z2.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_2048-NEXT: mov sp, x29 -; VBITS_GE_2048-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload +; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; VBITS_GE_2048-NEXT: ret %mask = load <64 x i1>, <64 x i1>* %c %op1 = load <64 x i32>, <64 x i32>* %a @@ -2194,20 +2139,20 @@ ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldrb w8, [x2] +; CHECK-NEXT: ldrb w9, [x2] +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: ptrue p0.d, vl4 -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: lsr w10, w8, #3 -; CHECK-NEXT: lsr w11, w8, #2 +; CHECK-NEXT: lsr w10, w9, #3 +; CHECK-NEXT: lsr w11, w9, #2 +; CHECK-NEXT: sbfx x12, x9, #0, #1 +; CHECK-NEXT: lsr w9, w9, #1 ; CHECK-NEXT: sbfx x10, x10, #0, #1 ; CHECK-NEXT: sbfx x11, x11, #0, #1 +; CHECK-NEXT: sbfx x9, x9, #0, #1 ; CHECK-NEXT: stp x11, x10, [sp, #16] -; CHECK-NEXT: sbfx x10, x8, #0, #1 -; CHECK-NEXT: lsr w8, w8, #1 -; CHECK-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NEXT: stp x10, x8, [sp] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x9] +; CHECK-NEXT: stp x12, x9, [sp] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: and z0.d, z0.d, #0x1 @@ -2235,30 +2180,30 @@ ; VBITS_GE_512-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_512-NEXT: .cfi_offset w30, -8 ; VBITS_GE_512-NEXT: .cfi_offset w29, -16 -; VBITS_GE_512-NEXT: ldrb w8, [x2] +; VBITS_GE_512-NEXT: ldrb w9, [x2] +; VBITS_GE_512-NEXT: mov x8, sp ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: mov x9, sp ; VBITS_GE_512-NEXT: ptrue p1.d -; VBITS_GE_512-NEXT: lsr w10, w8, #7 -; VBITS_GE_512-NEXT: lsr w11, w8, #6 -; VBITS_GE_512-NEXT: lsr w12, w8, #5 -; VBITS_GE_512-NEXT: lsr w13, w8, #4 +; VBITS_GE_512-NEXT: lsr w10, w9, #7 +; VBITS_GE_512-NEXT: lsr w11, w9, #6 +; VBITS_GE_512-NEXT: lsr w12, w9, #5 +; VBITS_GE_512-NEXT: lsr w13, w9, #4 ; VBITS_GE_512-NEXT: sbfx x10, x10, #0, #1 ; 
VBITS_GE_512-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_512-NEXT: sbfx x12, x12, #0, #1 +; VBITS_GE_512-NEXT: sbfx x13, x13, #0, #1 +; VBITS_GE_512-NEXT: lsr w14, w9, #3 ; VBITS_GE_512-NEXT: stp x11, x10, [sp, #48] -; VBITS_GE_512-NEXT: sbfx x11, x12, #0, #1 -; VBITS_GE_512-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_512-NEXT: lsr w10, w8, #3 -; VBITS_GE_512-NEXT: stp x12, x11, [sp, #32] -; VBITS_GE_512-NEXT: lsr w11, w8, #2 +; VBITS_GE_512-NEXT: lsr w10, w9, #2 +; VBITS_GE_512-NEXT: stp x13, x12, [sp, #32] +; VBITS_GE_512-NEXT: sbfx x12, x9, #0, #1 +; VBITS_GE_512-NEXT: lsr w9, w9, #1 +; VBITS_GE_512-NEXT: sbfx x11, x14, #0, #1 ; VBITS_GE_512-NEXT: sbfx x10, x10, #0, #1 -; VBITS_GE_512-NEXT: sbfx x11, x11, #0, #1 -; VBITS_GE_512-NEXT: stp x11, x10, [sp, #16] -; VBITS_GE_512-NEXT: sbfx x10, x8, #0, #1 -; VBITS_GE_512-NEXT: lsr w8, w8, #1 -; VBITS_GE_512-NEXT: sbfx x8, x8, #0, #1 -; VBITS_GE_512-NEXT: stp x10, x8, [sp] -; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x9] +; VBITS_GE_512-NEXT: sbfx x9, x9, #0, #1 +; VBITS_GE_512-NEXT: stp x10, x11, [sp, #16] +; VBITS_GE_512-NEXT: stp x12, x9, [sp] +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x8] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: and z0.d, z0.d, #0x1 @@ -2286,50 +2231,50 @@ ; VBITS_GE_1024-NEXT: .cfi_def_cfa w29, 16 ; VBITS_GE_1024-NEXT: .cfi_offset w30, -8 ; VBITS_GE_1024-NEXT: .cfi_offset w29, -16 -; VBITS_GE_1024-NEXT: ldrh w8, [x2] +; VBITS_GE_1024-NEXT: ldrh w9, [x2] +; VBITS_GE_1024-NEXT: mov x8, sp ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: mov x9, sp ; VBITS_GE_1024-NEXT: ptrue p1.d -; VBITS_GE_1024-NEXT: lsr w10, w8, #15 -; VBITS_GE_1024-NEXT: lsr w11, w8, #14 -; VBITS_GE_1024-NEXT: lsr w12, w8, #13 -; VBITS_GE_1024-NEXT: lsr w13, w8, #12 +; VBITS_GE_1024-NEXT: lsr w10, w9, #15 +; VBITS_GE_1024-NEXT: lsr w11, w9, #14 +; VBITS_GE_1024-NEXT: lsr w12, w9, #13 +; VBITS_GE_1024-NEXT: lsr w13, w9, #12 ; VBITS_GE_1024-NEXT: sbfx x10, x10, #0, #1 ; VBITS_GE_1024-NEXT: sbfx x11, x11, #0, #1 -; VBITS_GE_1024-NEXT: lsr w14, w8, #11 -; VBITS_GE_1024-NEXT: lsr w15, w8, #10 +; VBITS_GE_1024-NEXT: sbfx x12, x12, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x13, x13, #0, #1 +; VBITS_GE_1024-NEXT: lsr w14, w9, #11 +; VBITS_GE_1024-NEXT: lsr w15, w9, #10 ; VBITS_GE_1024-NEXT: stp x11, x10, [sp, #112] -; VBITS_GE_1024-NEXT: sbfx x11, x12, #0, #1 -; VBITS_GE_1024-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_1024-NEXT: lsr w16, w8, #9 -; VBITS_GE_1024-NEXT: lsr w17, w8, #8 -; VBITS_GE_1024-NEXT: stp x12, x11, [sp, #96] -; VBITS_GE_1024-NEXT: sbfx x12, x14, #0, #1 -; VBITS_GE_1024-NEXT: sbfx x13, x15, #0, #1 -; VBITS_GE_1024-NEXT: lsr w10, w8, #7 -; VBITS_GE_1024-NEXT: lsr w11, w8, #6 -; VBITS_GE_1024-NEXT: stp x13, x12, [sp, #80] -; VBITS_GE_1024-NEXT: sbfx x13, x16, #0, #1 -; VBITS_GE_1024-NEXT: sbfx x14, x17, #0, #1 -; VBITS_GE_1024-NEXT: lsr w12, w8, #5 -; VBITS_GE_1024-NEXT: stp x14, x13, [sp, #64] -; VBITS_GE_1024-NEXT: lsr w13, w8, #4 +; VBITS_GE_1024-NEXT: lsr w10, w9, #9 +; VBITS_GE_1024-NEXT: stp x13, x12, [sp, #96] +; VBITS_GE_1024-NEXT: lsr w13, w9, #8 +; VBITS_GE_1024-NEXT: sbfx x11, x14, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x12, x15, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x10, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x13, x13, #0, #1 +; VBITS_GE_1024-NEXT: lsr w14, w9, #3 +; VBITS_GE_1024-NEXT: stp x12, x11, [sp, #80] +; VBITS_GE_1024-NEXT: lsr w11, w9, #6 +; VBITS_GE_1024-NEXT: stp x13, x10, [sp, #64] +; VBITS_GE_1024-NEXT: lsr w10, w9, #7 +; VBITS_GE_1024-NEXT: lsr w12, w9, 
#5 +; VBITS_GE_1024-NEXT: lsr w13, w9, #4 ; VBITS_GE_1024-NEXT: sbfx x10, x10, #0, #1 ; VBITS_GE_1024-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x12, x12, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x13, x13, #0, #1 ; VBITS_GE_1024-NEXT: stp x11, x10, [sp, #48] -; VBITS_GE_1024-NEXT: sbfx x11, x12, #0, #1 -; VBITS_GE_1024-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_1024-NEXT: lsr w10, w8, #3 -; VBITS_GE_1024-NEXT: stp x12, x11, [sp, #32] -; VBITS_GE_1024-NEXT: lsr w11, w8, #2 -; VBITS_GE_1024-NEXT: sbfx x10, x10, #0, #1 +; VBITS_GE_1024-NEXT: lsr w11, w9, #2 +; VBITS_GE_1024-NEXT: stp x13, x12, [sp, #32] +; VBITS_GE_1024-NEXT: sbfx x12, x9, #0, #1 +; VBITS_GE_1024-NEXT: lsr w9, w9, #1 +; VBITS_GE_1024-NEXT: sbfx x10, x14, #0, #1 ; VBITS_GE_1024-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_1024-NEXT: sbfx x9, x9, #0, #1 ; VBITS_GE_1024-NEXT: stp x11, x10, [sp, #16] -; VBITS_GE_1024-NEXT: sbfx x10, x8, #0, #1 -; VBITS_GE_1024-NEXT: lsr w8, w8, #1 -; VBITS_GE_1024-NEXT: sbfx x8, x8, #0, #1 -; VBITS_GE_1024-NEXT: stp x10, x8, [sp] -; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x9] +; VBITS_GE_1024-NEXT: stp x12, x9, [sp] +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x8] ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_1024-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_1024-NEXT: and z0.d, z0.d, #0x1 @@ -2350,130 +2295,128 @@ define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, <32 x i1>* %c) #0 { ; VBITS_GE_2048-LABEL: select_v32i64: ; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; VBITS_GE_2048-NEXT: sub x9, sp, #480 -; VBITS_GE_2048-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; VBITS_GE_2048-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; VBITS_GE_2048-NEXT: sub x9, sp, #496 ; VBITS_GE_2048-NEXT: mov x29, sp ; VBITS_GE_2048-NEXT: and sp, x9, #0xffffffffffffff00 -; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 32 -; VBITS_GE_2048-NEXT: .cfi_offset w19, -16 -; VBITS_GE_2048-NEXT: .cfi_offset w30, -24 -; VBITS_GE_2048-NEXT: .cfi_offset w29, -32 -; VBITS_GE_2048-NEXT: ldr w8, [x2] +; VBITS_GE_2048-NEXT: .cfi_def_cfa w29, 16 +; VBITS_GE_2048-NEXT: .cfi_offset w30, -8 +; VBITS_GE_2048-NEXT: .cfi_offset w29, -16 +; VBITS_GE_2048-NEXT: ldr w9, [x2] +; VBITS_GE_2048-NEXT: mov x8, sp ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 -; VBITS_GE_2048-NEXT: mov x9, sp ; VBITS_GE_2048-NEXT: ptrue p1.d -; VBITS_GE_2048-NEXT: ubfx x10, x8, #31, #1 -; VBITS_GE_2048-NEXT: ubfx x11, x8, #30, #2 -; VBITS_GE_2048-NEXT: ubfx x12, x8, #29, #3 +; VBITS_GE_2048-NEXT: ubfx x10, x9, #31, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #30, #2 ; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10 ; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 -; VBITS_GE_2048-NEXT: ubfx x13, x8, #28, #4 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #29, #3 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #28, #4 ; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1 ; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 -; VBITS_GE_2048-NEXT: ubfx x14, x8, #27, #5 -; VBITS_GE_2048-NEXT: ubfx x15, x8, #26, #6 -; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #240] -; VBITS_GE_2048-NEXT: sbfx x11, x12, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13 -; VBITS_GE_2048-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x16, x8, #25, #7 -; VBITS_GE_2048-NEXT: ubfx x17, x8, #24, #8 -; VBITS_GE_2048-NEXT: stp x12, x11, [sp, #224] +; VBITS_GE_2048-NEXT: ubfx x14, x9, #27, #5 +; 
VBITS_GE_2048-NEXT: ubfx x15, x9, #26, #6 ; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 -; VBITS_GE_2048-NEXT: sbfx x12, x14, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w15 killed $w15 killed $x15 def $x15 -; VBITS_GE_2048-NEXT: sbfx x13, x15, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x18, x8, #23, #9 -; VBITS_GE_2048-NEXT: ubfx x2, x8, #22, #10 -; VBITS_GE_2048-NEXT: stp x13, x12, [sp, #208] -; VBITS_GE_2048-NEXT: // kill: def $w16 killed $w16 killed $x16 def $x16 -; VBITS_GE_2048-NEXT: sbfx x13, x16, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w17 killed $w17 killed $x17 def $x17 -; VBITS_GE_2048-NEXT: sbfx x14, x17, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x3, x8, #21, #11 -; VBITS_GE_2048-NEXT: ubfx x4, x8, #20, #12 -; VBITS_GE_2048-NEXT: ubfx x10, x8, #15, #17 -; VBITS_GE_2048-NEXT: ubfx x11, x8, #14, #18 -; VBITS_GE_2048-NEXT: stp x14, x13, [sp, #192] -; VBITS_GE_2048-NEXT: // kill: def $w18 killed $w18 killed $x18 def $x18 -; VBITS_GE_2048-NEXT: sbfx x14, x18, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w2 killed $w2 killed $x2 def $x2 -; VBITS_GE_2048-NEXT: sbfx x15, x2, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x5, x8, #19, #13 -; VBITS_GE_2048-NEXT: ubfx x6, x8, #18, #14 -; VBITS_GE_2048-NEXT: ubfx x12, x8, #13, #19 -; VBITS_GE_2048-NEXT: stp x15, x14, [sp, #176] -; VBITS_GE_2048-NEXT: // kill: def $w3 killed $w3 killed $x3 def $x3 -; VBITS_GE_2048-NEXT: sbfx x15, x3, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w4 killed $w4 killed $x4 def $x4 -; VBITS_GE_2048-NEXT: sbfx x16, x4, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #240] +; VBITS_GE_2048-NEXT: sbfx x10, x12, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x13, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x13, x14, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #25, #7 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #23, #9 +; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x12, x10, [sp, #224] +; VBITS_GE_2048-NEXT: sbfx x10, x15, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #24, #8 +; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 +; VBITS_GE_2048-NEXT: stp x10, x13, [sp, #208] +; VBITS_GE_2048-NEXT: sbfx x10, x11, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x12, x12, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #22, #10 +; VBITS_GE_2048-NEXT: sbfx x13, x14, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #21, #11 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x12, x10, [sp, #192] +; VBITS_GE_2048-NEXT: sbfx x10, x11, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #20, #12 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #19, #13 ; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 -; VBITS_GE_2048-NEXT: ubfx x7, x8, #17, #15 -; VBITS_GE_2048-NEXT: ubfx x19, x8, #16, #16 -; VBITS_GE_2048-NEXT: ubfx x13, x8, #12, #20 -; VBITS_GE_2048-NEXT: stp x16, x15, [sp, #160] -; VBITS_GE_2048-NEXT: // kill: def $w5 killed $w5 killed $x5 def $x5 -; VBITS_GE_2048-NEXT: sbfx x16, x5, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w6 killed $w6 killed $x6 def $x6 -; VBITS_GE_2048-NEXT: sbfx x17, x6, #0, #1 -; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1 -; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 -; VBITS_GE_2048-NEXT: ubfx x14, x8, #11, #21 -; 
VBITS_GE_2048-NEXT: ubfx x15, x8, #10, #22 -; VBITS_GE_2048-NEXT: stp x17, x16, [sp, #144] -; VBITS_GE_2048-NEXT: // kill: def $w7 killed $w7 killed $x7 def $x7 -; VBITS_GE_2048-NEXT: sbfx x17, x7, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w19 killed $w19 killed $x19 def $x19 -; VBITS_GE_2048-NEXT: sbfx x18, x19, #0, #1 -; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #112] -; VBITS_GE_2048-NEXT: sbfx x11, x12, #0, #1 +; VBITS_GE_2048-NEXT: stp x10, x13, [sp, #176] +; VBITS_GE_2048-NEXT: sbfx x10, x14, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #18, #14 +; VBITS_GE_2048-NEXT: sbfx x12, x12, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13 -; VBITS_GE_2048-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x16, x8, #9, #23 -; VBITS_GE_2048-NEXT: stp x18, x17, [sp, #128] -; VBITS_GE_2048-NEXT: ubfx x17, x8, #8, #24 -; VBITS_GE_2048-NEXT: ubfx x10, x8, #7, #25 -; VBITS_GE_2048-NEXT: stp x12, x11, [sp, #96] -; VBITS_GE_2048-NEXT: ubfx x11, x8, #6, #26 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #17, #15 ; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 -; VBITS_GE_2048-NEXT: sbfx x12, x14, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w15 killed $w15 killed $x15 def $x15 -; VBITS_GE_2048-NEXT: sbfx x13, x15, #0, #1 -; VBITS_GE_2048-NEXT: stp x13, x12, [sp, #80] -; VBITS_GE_2048-NEXT: ubfx x12, x8, #5, #27 -; VBITS_GE_2048-NEXT: // kill: def $w16 killed $w16 killed $x16 def $x16 -; VBITS_GE_2048-NEXT: sbfx x13, x16, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w17 killed $w17 killed $x17 def $x17 -; VBITS_GE_2048-NEXT: sbfx x14, x17, #0, #1 -; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #160] +; VBITS_GE_2048-NEXT: sbfx x10, x13, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #16, #16 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #15, #17 ; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 -; VBITS_GE_2048-NEXT: stp x14, x13, [sp, #64] -; VBITS_GE_2048-NEXT: ubfx x13, x8, #4, #28 -; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13 +; VBITS_GE_2048-NEXT: stp x10, x12, [sp, #144] +; VBITS_GE_2048-NEXT: sbfx x10, x14, #0, #1 ; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #14, #18 +; VBITS_GE_2048-NEXT: sbfx x13, x13, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #13, #19 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #128] +; VBITS_GE_2048-NEXT: sbfx x10, x12, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #12, #20 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #11, #21 +; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 ; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 -; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #48] -; VBITS_GE_2048-NEXT: sbfx x11, x12, #0, #1 +; VBITS_GE_2048-NEXT: stp x10, x13, [sp, #112] +; VBITS_GE_2048-NEXT: sbfx x10, x14, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #10, #22 +; VBITS_GE_2048-NEXT: sbfx x12, x12, #0, #1 ; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13 -; VBITS_GE_2048-NEXT: sbfx x12, x13, #0, #1 -; VBITS_GE_2048-NEXT: ubfx x10, x8, #3, #29 -; VBITS_GE_2048-NEXT: stp x12, x11, [sp, #32] -; VBITS_GE_2048-NEXT: ubfx x11, x8, #2, #30 -; VBITS_GE_2048-NEXT: // 
kill: def $w10 killed $w10 killed $x10 def $x10 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #9, #23 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #96] +; VBITS_GE_2048-NEXT: sbfx x10, x13, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #8, #24 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #7, #25 ; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 -; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13 +; VBITS_GE_2048-NEXT: stp x10, x12, [sp, #80] +; VBITS_GE_2048-NEXT: sbfx x10, x14, #0, #1 ; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 -; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #16] -; VBITS_GE_2048-NEXT: ubfx x10, x8, #1, #31 -; VBITS_GE_2048-NEXT: // kill: def $w10 killed $w10 killed $x10 def $x10 -; VBITS_GE_2048-NEXT: sbfx x8, x8, #0, #1 -; VBITS_GE_2048-NEXT: sbfx x10, x10, #0, #1 -; VBITS_GE_2048-NEXT: stp x8, x10, [sp] -; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x9] +; VBITS_GE_2048-NEXT: ubfx x12, x9, #6, #26 +; VBITS_GE_2048-NEXT: sbfx x13, x13, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #5, #27 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #64] +; VBITS_GE_2048-NEXT: sbfx x10, x12, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x11, x9, #4, #28 +; VBITS_GE_2048-NEXT: ubfx x12, x9, #3, #29 +; VBITS_GE_2048-NEXT: // kill: def $w11 killed $w11 killed $x11 def $x11 +; VBITS_GE_2048-NEXT: // kill: def $w12 killed $w12 killed $x12 def $x12 +; VBITS_GE_2048-NEXT: stp x10, x13, [sp, #48] +; VBITS_GE_2048-NEXT: sbfx x10, x14, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x11, #0, #1 +; VBITS_GE_2048-NEXT: ubfx x13, x9, #2, #30 +; VBITS_GE_2048-NEXT: ubfx x14, x9, #1, #31 +; VBITS_GE_2048-NEXT: sbfx x12, x12, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w13 killed $w13 killed $x13 def $x13 +; VBITS_GE_2048-NEXT: sbfx x9, x9, #0, #1 +; VBITS_GE_2048-NEXT: // kill: def $w14 killed $w14 killed $x14 def $x14 +; VBITS_GE_2048-NEXT: stp x11, x10, [sp, #32] +; VBITS_GE_2048-NEXT: sbfx x10, x13, #0, #1 +; VBITS_GE_2048-NEXT: sbfx x11, x14, #0, #1 +; VBITS_GE_2048-NEXT: stp x10, x12, [sp, #16] +; VBITS_GE_2048-NEXT: stp x9, x11, [sp] +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x8] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_2048-NEXT: and z0.d, z0.d, #0x1 @@ -2481,8 +2424,7 @@ ; VBITS_GE_2048-NEXT: sel z0.d, p1, z1.d, z2.d ; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_2048-NEXT: mov sp, x29 -; VBITS_GE_2048-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; VBITS_GE_2048-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; VBITS_GE_2048-NEXT: ret %mask = load <32 x i1>, <32 x i1>* %c %op1 = load <32 x i64>, <32 x i64>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -28,15 +28,15 @@ ; CHECK-LABEL: masked_gather_v2i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: ldr q0, [x1] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v1.s[1], w9 -; CHECK-NEXT: cmeq v1.2s, v1.2s, #0 -; CHECK-NEXT: ushll v1.2d, 
v1.2s, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: ld1sb { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [z1.d] ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: st1b { z0.s }, p0, [x0] @@ -81,23 +81,23 @@ ; VBITS_EQ_256-NEXT: ldr d0, [x0] ; VBITS_EQ_256-NEXT: mov x8, #4 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 -; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_EQ_256-NEXT: cmeq v0.8b, v0.8b, #0 -; VBITS_EQ_256-NEXT: zip2 v3.8b, v0.8b, v0.8b +; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: zip2 v1.8b, v0.8b, v0.8b ; VBITS_EQ_256-NEXT: zip1 v0.8b, v0.8b, v0.8b -; VBITS_EQ_256-NEXT: shl v3.4h, v3.4h, #8 +; VBITS_EQ_256-NEXT: shl v1.4h, v1.4h, #8 ; VBITS_EQ_256-NEXT: shl v0.4h, v0.4h, #8 -; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_EQ_256-NEXT: sshr v3.4h, v3.4h, #8 +; VBITS_EQ_256-NEXT: sshr v1.4h, v1.4h, #8 ; VBITS_EQ_256-NEXT: sshr v0.4h, v0.4h, #8 -; VBITS_EQ_256-NEXT: uunpklo z3.s, z3.h +; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h ; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: uunpklo z3.d, z3.s +; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s ; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z3.d, #0 +; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 ; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_EQ_256-NEXT: ld1sb { z0.d }, p1/z, [z1.d] -; VBITS_EQ_256-NEXT: ld1sb { z1.d }, p0/z, [z2.d] +; VBITS_EQ_256-NEXT: ld1sb { z0.d }, p1/z, [z2.d] +; VBITS_EQ_256-NEXT: ld1sb { z1.d }, p0/z, [z3.d] ; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h @@ -159,15 +159,15 @@ ; VBITS_GE_2048-LABEL: masked_gather_v32i8: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.b, vl32 +; VBITS_GE_2048-NEXT: ptrue p2.d, vl32 ; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 -; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: cmpeq p2.b, p0/z, z0.b, #0 -; VBITS_GE_2048-NEXT: mov z0.b, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_2048-NEXT: ld1d { z1.d }, p2/z, [x1] +; VBITS_GE_2048-NEXT: cmpeq p1.b, p0/z, z0.b, #0 +; VBITS_GE_2048-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_2048-NEXT: cmpne p1.d, p2/z, z0.d, #0 ; VBITS_GE_2048-NEXT: ld1b { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h @@ -190,15 +190,15 @@ ; CHECK-LABEL: masked_gather_v2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #2] -; CHECK-NEXT: ldr q0, [x1] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v1.s[1], w9 -; CHECK-NEXT: cmeq v1.2s, v1.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: ld1sh { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ldrh w8, [x0, #2] +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: 
cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [z1.d]
 ; CHECK-NEXT: ptrue p0.s, vl2
 ; CHECK-NEXT: xtn v0.2s, v0.2d
 ; CHECK-NEXT: st1h { z0.s }, p0, [x0]
@@ -241,24 +241,24 @@
 ; VBITS_EQ_256-NEXT: ldr q0, [x0]
 ; VBITS_EQ_256-NEXT: mov x8, #4
 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: cmeq v0.8h, v0.8h, #0
-; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: uunpklo z3.s, z0.h
-; VBITS_EQ_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8
 ; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: uunpklo z3.d, z3.s
 ; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z3.d, #0
-; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z0.d, #0
-; VBITS_EQ_256-NEXT: ld1h { z2.d }, p1/z, [z2.d]
-; VBITS_EQ_256-NEXT: ld1h { z0.d }, p0/z, [z1.d]
-; VBITS_EQ_256-NEXT: uzp1 z1.s, z2.s, z2.s
+; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z0.d, #0
+; VBITS_EQ_256-NEXT: ld1h { z0.d }, p1/z, [z3.d]
+; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [z2.d]
 ; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h
 ; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0]
-; VBITS_EQ_256-NEXT: str q1, [x0]
+; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_EQ_256-NEXT: mov v0.d[1], v1.d[0]
+; VBITS_EQ_256-NEXT: str q0, [x0]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: masked_gather_v8i16:
@@ -287,15 +287,15 @@
 ; VBITS_GE_1024-LABEL: masked_gather_v16i16:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: cmpeq p2.h, p0/z, z0.h, #0
-; VBITS_GE_1024-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z1.d]
+; VBITS_GE_1024-NEXT: ld1d { z0.d }, p1/z, [x1]
+; VBITS_GE_1024-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_1024-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z1.d, #0
+; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z0.d]
 ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
@@ -312,15 +312,15 @@
 ; VBITS_GE_2048-LABEL: masked_gather_v32i16:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: cmpeq p2.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [z1.d]
+; VBITS_GE_2048-NEXT: ld1d { z0.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0
+; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [z0.d]
 ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
@@ -341,8 +341,8 @@
 ; CHECK-LABEL: masked_gather_v2i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0
 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
@@ -391,23 +391,23 @@
 ; VBITS_EQ_256-NEXT: .cfi_offset w30, -8
 ; VBITS_EQ_256-NEXT: .cfi_offset w29, -16
 ; VBITS_EQ_256-NEXT: ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT: mov x8, #4
+; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_EQ_256-NEXT: mov x9, sp
 ; VBITS_EQ_256-NEXT: ptrue p1.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0
-; VBITS_EQ_256-NEXT: mov x8, sp
-; VBITS_EQ_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_EQ_256-NEXT: ld1d { z0.d }, p1/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
 ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1]
-; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x8]
-; VBITS_EQ_256-NEXT: ldr q0, [sp, #16]
-; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z0.d, #0
-; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [z1.d]
+; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x9]
+; VBITS_EQ_256-NEXT: ldr q1, [sp, #16]
+; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z1.d, #0
+; VBITS_EQ_256-NEXT: ld1w { z0.d }, p2/z, [z0.d]
 ; VBITS_EQ_256-NEXT: ldr q1, [sp]
-; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
 ; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z1.d, #0
+; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_EQ_256-NEXT: ld1w { z1.d }, p1/z, [z2.d]
 ; VBITS_EQ_256-NEXT: ptrue p1.s, vl4
 ; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s
@@ -420,8 +420,8 @@
 ; VBITS_GE_512-LABEL: masked_gather_v8i32:
 ; VBITS_GE_512: // %bb.0:
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
-; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p1.d, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq p2.s, p0/z, z0.s, #0
 ; VBITS_GE_512-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
@@ -443,8 +443,8 @@
 ; VBITS_GE_1024-LABEL: masked_gather_v16i32:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_1024-NEXT: cmpeq p2.s, p0/z, z0.s, #0
 ; VBITS_GE_1024-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
@@ -466,8 +466,8 @@
 ; VBITS_GE_2048-LABEL: masked_gather_v32i32:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: cmpeq p2.s, p0/z, z0.s, #0
 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
@@ -516,8 +516,8 @@
 ; CHECK-LABEL: masked_gather_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: cmeq v0.2d, v0.2d, #0
 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [z1.d]
@@ -628,22 +628,22 @@
 define void @masked_gather_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 {
 ; CHECK-LABEL: masked_gather_v2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: movi d2, #0000000000000000
-; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: ldr s1, [x0]
+; CHECK-NEXT: movi d0, #0000000000000000
 ; CHECK-NEXT: ptrue p0.d, vl4
-; CHECK-NEXT: fcmeq v0.4h, v0.4h, #0.0
-; CHECK-NEXT: umov w8, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[1]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: shl v0.2s, v0.2s, #16
-; CHECK-NEXT: sshr v0.2s, v0.2s, #16
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov v2.h[0], w9
-; CHECK-NEXT: mov v2.h[1], w8
-; CHECK-NEXT: shl v0.4h, v2.4h, #15
+; CHECK-NEXT: fcmeq v1.4h, v1.4h, #0.0
+; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: umov w9, v1.h[1]
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov v1.s[1], w9
+; CHECK-NEXT: shl v1.2s, v1.2s, #16
+; CHECK-NEXT: sshr v1.2s, v1.2s, #16
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: mov w9, v1.s[1]
+; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: mov v0.h[0], w8
+; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
 ; CHECK-NEXT: sshr v0.4h, v0.4h, #15
 ; CHECK-NEXT: uunpklo z0.s, z0.h
 ; CHECK-NEXT: uunpklo z0.d, z0.s
@@ -711,15 +711,15 @@
 ; VBITS_GE_1024-LABEL: masked_gather_v16f16:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
-; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_1024-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z1.d]
+; VBITS_GE_1024-NEXT: ld1d { z0.d }, p1/z, [x1]
+; VBITS_GE_1024-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_1024-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z1.d, #0
+; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z0.d]
 ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0]
@@ -736,15 +736,15 @@
 ; VBITS_GE_2048-LABEL: masked_gather_v32f16:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [z1.d]
+; VBITS_GE_2048-NEXT: ld1d { z0.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0
+; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [z0.d]
 ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
@@ -765,8 +765,8 @@
 ; CHECK-LABEL: masked_gather_v2f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0
 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
@@ -807,8 +807,8 @@
 ; VBITS_GE_512-LABEL: masked_gather_v8f32:
 ; VBITS_GE_512: // %bb.0:
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
-; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p1.d, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
 ; VBITS_GE_512-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
@@ -830,8 +830,8 @@
 ; VBITS_GE_1024-LABEL: masked_gather_v16f32:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_1024-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
 ; VBITS_GE_1024-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
@@ -853,8 +853,8 @@
 ; VBITS_GE_2048-LABEL: masked_gather_v32f32:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
@@ -903,8 +903,8 @@
 ; CHECK-LABEL: masked_gather_v2f64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0
 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [z1.d]
@@ -998,15 +998,15 @@
 ; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f16:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z1.d, lsl #1]
+; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0
+; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d, lsl #1]
 ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
@@ -1026,8 +1026,8 @@
 ; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_sext_f32:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
@@ -1073,15 +1073,15 @@
 ; VBITS_GE_2048-LABEL: masked_gather_32b_scaled_zext:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z1.d, lsl #1]
+; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0
+; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d, lsl #1]
 ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
@@ -1101,15 +1101,15 @@
 ; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_sext:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z1.d]
+; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0
+; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d]
 ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
@@ -1130,15 +1130,15 @@
 ; VBITS_GE_2048-LABEL: masked_gather_32b_unscaled_zext:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1w { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0
-; VBITS_GE_2048-NEXT: mov z0.h, p2/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z1.d]
+; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0
+; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d]
 ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0]
@@ -1158,8 +1158,8 @@
 ; VBITS_GE_2048-LABEL: masked_gather_64b_scaled:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
@@ -1182,8 +1182,8 @@
 ; VBITS_GE_2048-LABEL: masked_gather_64b_unscaled:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
@@ -1208,14 +1208,14 @@
 ; VBITS_GE_2048-LABEL: masked_gather_vec_plus_reg:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: mov z2.d, x2
 ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
+; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d
 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d
 ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0
 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d]
 ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
@@ -1236,14 +1236,14 @@
 ; VBITS_GE_2048-LABEL: masked_gather_vec_plus_imm:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: mov z2.d, #4 // =0x4
 ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
+; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d
 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d
 ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0
 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d]
 ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
@@ -1263,17 +1263,17 @@
 ; VBITS_GE_2048-LABEL: masked_gather_passthru:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
-; VBITS_GE_2048-NEXT: ld1w { z2.s }, p0/z, [x2]
 ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d]
-; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_2048-NEXT: sel z0.s, p2, z0.s, z2.s
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x2]
+; VBITS_GE_2048-NEXT: ld1w { z1.d }, p1/z, [z1.d]
+; VBITS_GE_2048-NEXT: uzp1 z1.s, z1.s, z1.s
+; VBITS_GE_2048-NEXT: mov z0.s, p2/m, z1.s
 ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0]
 ; VBITS_GE_2048-NEXT: ret
 %cvals = load <32 x float>, <32 x float>* %a
@@ -1289,8 +1289,8 @@
 ; VBITS_GE_2048-LABEL: masked_gather_passthru_0:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0
 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -26,22 +26,22 @@
 define <2 x half> @masked_load_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
 ; CHECK-LABEL: masked_load_v2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldr s1, [x1]
+; CHECK-NEXT: ldr s1, [x0]
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: ldr s2, [x1]
 ; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: fcmeq v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: umov w8, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[1]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: mov v0.s[1], w9
-; CHECK-NEXT: shl v0.2s, v0.2s, #16
-; CHECK-NEXT: sshr v0.2s, v0.2s, #16
-; CHECK-NEXT: movi d1, #0000000000000000
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov v1.h[0], w9
-; CHECK-NEXT: mov v1.h[1], w8
-; CHECK-NEXT: shl v0.4h, v1.4h, #15
+; CHECK-NEXT: fcmeq v1.4h, v1.4h, v2.4h
+; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: umov w9, v1.h[1]
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: mov v1.s[1], w9
+; CHECK-NEXT: shl v1.2s, v1.2s, #16
+; CHECK-NEXT: sshr v1.2s, v1.2s, #16
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: mov w9, v1.s[1]
+; CHECK-NEXT: mov v0.h[0], w8
+; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
 ; CHECK-NEXT: sshr v0.4h, v0.4h, #15
 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
@@ -58,8 +58,8 @@
 ; CHECK-LABEL: masked_load_v2f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
 ; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: ldr d1, [x1]
 ; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
@@ -76,8 +76,8 @@
 ; CHECK-LABEL: masked_load_v4f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
@@ -269,8 +269,8 @@
 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq p0.b, p0/z, z0.b, #0
 ; VBITS_GE_512-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
 ; VBITS_GE_512-NEXT: cmpne p1.h, p0/z, z0.h, #0
 ; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
@@ -328,8 +328,8 @@
 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, #0
 ; VBITS_GE_512-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
 ; VBITS_GE_512-NEXT: cmpne p1.s, p0/z, z0.s, #0
 ; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
@@ -367,8 +367,8 @@
 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0
 ; VBITS_GE_512-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
@@ -387,8 +387,8 @@
 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq p0.b, p0/z, z0.b, #0
 ; VBITS_GE_512-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
 ; VBITS_GE_512-NEXT: cmpne p1.h, p0/z, z0.h, #0
 ; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
@@ -446,8 +446,8 @@
 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, #0
 ; VBITS_GE_512-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
 ; VBITS_GE_512-NEXT: cmpne p1.s, p0/z, z0.s, #0
 ; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
@@ -485,8 +485,8 @@
 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0
 ; VBITS_GE_512-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
 ; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0]
 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -28,16 +28,16 @@
 ; CHECK-LABEL: masked_scatter_v2i8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: ldrb w9, [x0, #1]
-; CHECK-NEXT: ldr q0, [x1]
 ; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov v1.s[1], w9
-; CHECK-NEXT: cmeq v2.2s, v1.2s, #0
-; CHECK-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-NEXT: ldr q2, [x1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldrb w8, [x0, #1]
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: cmeq v1.2s, v0.2s, #0
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT: st1b { z1.d }, p0, [z0.d]
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: st1b { z0.d }, p0, [z2.d]
 ; CHECK-NEXT: ret
 %vals = load <2 x i8>, <2 x i8>* %a
 %ptrs = load <2 x i8*>, <2 x i8*>* %b
@@ -54,11 +54,11 @@
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT: cmeq v2.4h, v0.4h, #0
-; CHECK-NEXT: uunpklo z2.s, z2.h
 ; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uunpklo z2.s, z2.h
 ; CHECK-NEXT: uunpklo z2.d, z2.s
 ; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT: uunpklo z0.d, z0.s
 ; CHECK-NEXT: st1b { z0.d }, p0, [z1.d]
 ; CHECK-NEXT: ret
 %vals = load <4 x i8>, <4 x i8>* %a
@@ -75,29 +75,29 @@
 ; VBITS_EQ_256-NEXT: ldr d0, [x0]
 ; VBITS_EQ_256-NEXT: mov x8, #4
 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: cmeq v3.8b, v0.8b, #0
-; VBITS_EQ_256-NEXT: zip1 v4.8b, v3.8b, v0.8b
-; VBITS_EQ_256-NEXT: zip2 v3.8b, v3.8b, v0.8b
-; VBITS_EQ_256-NEXT: shl v4.4h, v4.4h, #8
-; VBITS_EQ_256-NEXT: shl v3.4h, v3.4h, #8
-; VBITS_EQ_256-NEXT: sshr v4.4h, v4.4h, #8
-; VBITS_EQ_256-NEXT: sshr v3.4h, v3.4h, #8
-; VBITS_EQ_256-NEXT: uunpklo z4.s, z4.h
-; VBITS_EQ_256-NEXT: uunpklo z3.s, z3.h
-; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: uunpklo z4.d, z4.s
-; VBITS_EQ_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z4.d, #0
-; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z3.d, #0
-; VBITS_EQ_256-NEXT: zip1 v3.8b, v0.8b, v0.8b
+; VBITS_EQ_256-NEXT: cmeq v1.8b, v0.8b, #0
+; VBITS_EQ_256-NEXT: zip1 v5.8b, v0.8b, v0.8b
+; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: zip1 v2.8b, v1.8b, v0.8b
+; VBITS_EQ_256-NEXT: zip2 v1.8b, v1.8b, v0.8b
 ; VBITS_EQ_256-NEXT: zip2 v0.8b, v0.8b, v0.8b
-; VBITS_EQ_256-NEXT: uunpklo z3.s, z3.h
+; VBITS_EQ_256-NEXT: shl v2.4h, v2.4h, #8
+; VBITS_EQ_256-NEXT: shl v1.4h, v1.4h, #8
 ; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: uunpklo z3.d, z3.s
 ; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: st1b { z3.d }, p1, [z2.d]
-; VBITS_EQ_256-NEXT: st1b { z0.d }, p0, [z1.d]
+; VBITS_EQ_256-NEXT: sshr v2.4h, v2.4h, #8
+; VBITS_EQ_256-NEXT: sshr v1.4h, v1.4h, #8
+; VBITS_EQ_256-NEXT: uunpklo z2.s, z2.h
+; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_EQ_256-NEXT: uunpklo z2.d, z2.s
+; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
+; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; VBITS_EQ_256-NEXT: uunpklo z1.s, z5.h
+; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT: st1b { z1.d }, p1, [z4.d]
+; VBITS_EQ_256-NEXT: st1b { z0.d }, p0, [z3.d]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: masked_scatter_v8i8:
@@ -106,13 +106,13 @@
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmeq v2.8b, v0.8b, #0
-; VBITS_GE_512-NEXT: uunpklo z2.h, z2.b
 ; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b
-; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
 ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: uunpklo z2.h, z2.b
+; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
 ; VBITS_GE_512-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [z1.d]
 ; VBITS_GE_512-NEXT: ret
 %vals = load <8 x i8>, <8 x i8>* %a
@@ -129,13 +129,13 @@
 ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
 ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_1024-NEXT: cmeq v2.16b, v0.16b, #0
-; VBITS_GE_1024-NEXT: uunpklo z2.h, z2.b
 ; VBITS_GE_1024-NEXT: uunpklo z0.h, z0.b
-; VBITS_GE_1024-NEXT: uunpklo z2.s, z2.h
 ; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: uunpklo z2.h, z2.b
+; VBITS_GE_1024-NEXT: uunpklo z2.s, z2.h
 ; VBITS_GE_1024-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_1024-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_1024-NEXT: st1b { z0.d }, p0, [z1.d]
 ; VBITS_GE_1024-NEXT: ret
 %vals = load <16 x i8>, <16 x i8>* %a
@@ -149,19 +149,19 @@
 ; VBITS_GE_2048-LABEL: masked_scatter_v32i8:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.b, vl32
-; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
-; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_2048-NEXT: ld1d { z2.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
-; VBITS_GE_2048-NEXT: mov z2.b, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: uunpklo z2.h, z2.b
 ; VBITS_GE_2048-NEXT: uunpklo z0.h, z0.b
-; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h
+; VBITS_GE_2048-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
+; VBITS_GE_2048-NEXT: uunpklo z1.h, z1.b
 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
-; VBITS_GE_2048-NEXT: st1b { z0.d }, p0, [z1.d]
+; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h
+; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s
+; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z1.d, #0
+; VBITS_GE_2048-NEXT: st1b { z0.d }, p0, [z2.d]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x i8>, <32 x i8>* %a
 %ptrs = load <32 x i8*>, <32 x i8*>* %b
@@ -178,16 +178,16 @@
 ; CHECK-LABEL: masked_scatter_v2i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x0, #2]
-; CHECK-NEXT: ldr q0, [x1]
 ; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov v1.s[1], w9
-; CHECK-NEXT: cmeq v2.2s, v1.2s, #0
-; CHECK-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-NEXT: ldr q2, [x1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldrh w8, [x0, #2]
+; CHECK-NEXT: mov v0.s[1], w8
+; CHECK-NEXT: cmeq v1.2s, v0.2s, #0
+; CHECK-NEXT: ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT: st1h { z1.d }, p0, [z0.d]
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: st1h { z0.d }, p0, [z2.d]
 ; CHECK-NEXT: ret
 %vals = load <2 x i16>, <2 x i16>* %a
 %ptrs = load <2 x i16*>, <2 x i16*>* %b
@@ -203,11 +203,11 @@
 ; CHECK-NEXT: ptrue p0.d, vl4
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT: cmeq v2.4h, v0.4h, #0
-; CHECK-NEXT: uunpklo z2.s, z2.h
 ; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uunpklo z2.s, z2.h
 ; CHECK-NEXT: uunpklo z2.d, z2.s
 ; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT: uunpklo z0.d, z0.s
 ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
 ; CHECK-NEXT: ret
 %vals = load <4 x i16>, <4 x i16>* %a
@@ -224,23 +224,23 @@
 ; VBITS_EQ_256-NEXT: ldr q0, [x0]
 ; VBITS_EQ_256-NEXT: mov x8, #4
 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
-; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT: cmeq v3.8h, v0.8h, #0
-; VBITS_EQ_256-NEXT: uunpklo z4.s, z3.h
-; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1]
-; VBITS_EQ_256-NEXT: uunpklo z4.d, z4.s
-; VBITS_EQ_256-NEXT: ext v3.16b, v3.16b, v3.16b, #8
-; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z4.d, #0
-; VBITS_EQ_256-NEXT: uunpklo z4.s, z0.h
-; VBITS_EQ_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; VBITS_EQ_256-NEXT: uunpklo z3.s, z3.h
+; VBITS_EQ_256-NEXT: cmeq v1.8h, v0.8h, #0
+; VBITS_EQ_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8
 ; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT: uunpklo z3.d, z3.s
-; VBITS_EQ_256-NEXT: uunpklo z4.d, z4.s
-; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z3.d, #0
 ; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT: st1h { z4.d }, p1, [z2.d]
-; VBITS_EQ_256-NEXT: st1h { z0.d }, p0, [z1.d]
+; VBITS_EQ_256-NEXT: uunpklo z2.s, z1.h
+; VBITS_EQ_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; VBITS_EQ_256-NEXT: uunpklo z2.d, z2.s
+; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z2.d, #0
+; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1]
+; VBITS_EQ_256-NEXT: uunpklo z3.s, z3.h
+; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h
+; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT: st1h { z0.d }, p1, [z2.d]
+; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; VBITS_EQ_256-NEXT: uunpklo z1.d, z3.s
+; VBITS_EQ_256-NEXT: st1h { z1.d }, p0, [z4.d]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: masked_scatter_v8i16:
@@ -249,11 +249,11 @@
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmeq v2.8h, v0.8h, #0
-; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
 ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
 ; VBITS_GE_512-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z1.d]
 ; VBITS_GE_512-NEXT: ret
 %vals = load <8 x i16>, <8 x i16>* %a
@@ -267,16 +267,16 @@
 ; VBITS_GE_1024-LABEL: masked_scatter_v16i16:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_1024-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_1024-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_1024-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_1024-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_1024-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [z1.d]
 ; VBITS_GE_1024-NEXT: ret
 %vals = load <16 x i16>, <16 x i16>* %a
@@ -290,16 +290,16 @@
 ; VBITS_GE_2048-LABEL: masked_scatter_v32i16:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [z1.d]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x i16>, <32 x i16>* %a
@@ -317,13 +317,13 @@
 ; CHECK-LABEL: masked_scatter_v2i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: cmeq v2.2s, v0.2s, #0
-; CHECK-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-NEXT: ldr q2, [x1]
+; CHECK-NEXT: cmeq v1.2s, v0.2s, #0
 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: st1w { z0.d }, p0, [z2.d]
 ; CHECK-NEXT: ret
 %vals = load <2 x i32>, <2 x i32>* %a
 %ptrs = load <2 x i32*>, <2 x i32*>* %b
@@ -339,9 +339,9 @@
 ; CHECK-NEXT: ptrue p0.d, vl4
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT: cmeq v2.4s, v0.4s, #0
+; CHECK-NEXT: uunpklo z0.d, z0.s
 ; CHECK-NEXT: uunpklo z2.d, z2.s
 ; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT: uunpklo z0.d, z0.s
 ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
 ; CHECK-NEXT: ret
 %vals = load <4 x i32>, <4 x i32>* %a
@@ -363,23 +363,23 @@
 ; VBITS_EQ_256-NEXT: .cfi_offset w30, -8
 ; VBITS_EQ_256-NEXT: .cfi_offset w29, -16
 ; VBITS_EQ_256-NEXT: ptrue p1.s, vl8
-; VBITS_EQ_256-NEXT: ld1w { z0.s }, p1/z, [x0]
 ; VBITS_EQ_256-NEXT: mov x8, #4
+; VBITS_EQ_256-NEXT: ld1w { z0.s }, p1/z, [x0]
+; VBITS_EQ_256-NEXT: add x9, sp, #32
+; VBITS_EQ_256-NEXT: mov x10, sp
 ; VBITS_EQ_256-NEXT: ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1]
 ; VBITS_EQ_256-NEXT: cmpeq p2.s, p1/z, z0.s, #0
-; VBITS_EQ_256-NEXT: add x8, sp, #32
 define void @masked_scatter_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 {
 ; CHECK-LABEL: masked_scatter_v2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: movi d2, #0000000000000000
-; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: ldr s1, [x0]
+; CHECK-NEXT: movi d0, #0000000000000000
 ; CHECK-NEXT: ptrue p0.d, vl4
-; CHECK-NEXT: fcmeq v3.4h, v0.4h, #0.0
-; CHECK-NEXT: umov w8, v3.h[0]
-; CHECK-NEXT: umov w9, v3.h[1]
-; CHECK-NEXT: fmov s3, w8
-; CHECK-NEXT: mov v3.s[1], w9
-; CHECK-NEXT: shl v3.2s, v3.2s, #16
-; CHECK-NEXT: sshr v3.2s, v3.2s, #16
-; CHECK-NEXT: fmov w9, s3
-; CHECK-NEXT: mov w8, v3.s[1]
-; CHECK-NEXT: mov v2.h[0], w9
-; CHECK-NEXT: mov v2.h[1], w8
-; CHECK-NEXT: shl v2.4h, v2.4h, #15
-; CHECK-NEXT: sshr v2.4h, v2.4h, #15
-; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: umov w8, v2.h[0]
+; CHECK-NEXT: umov w9, v2.h[1]
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: mov v2.s[1], w9
+; CHECK-NEXT: shl v2.2s, v2.2s, #16
+; CHECK-NEXT: sshr v2.2s, v2.2s, #16
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: mov w9, v2.s[1]
+; CHECK-NEXT: ldr q2, [x1]
+; CHECK-NEXT: mov v0.h[0], w8
+; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
+; CHECK-NEXT: sshr v0.4h, v0.4h, #15
 ; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
 ; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: uunpklo z0.d, z1.s
+; CHECK-NEXT: st1h { z0.d }, p0, [z2.d]
 ; CHECK-NEXT: ret
 %vals = load <2 x half>, <2 x half>* %a
 %ptrs = load <2 x half*>, <2 x half*>* %b
@@ -617,11 +617,11 @@
 ; CHECK-NEXT: ptrue p0.d, vl4
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT: fcmeq v2.4h, v0.4h, #0.0
-; CHECK-NEXT: uunpklo z2.s, z2.h
 ; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uunpklo z2.s, z2.h
 ; CHECK-NEXT: uunpklo z2.d, z2.s
 ; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT: uunpklo z0.d, z0.s
 ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d]
 ; CHECK-NEXT: ret
 %vals = load <4 x half>, <4 x half>* %a
@@ -638,11 +638,11 @@
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq v2.8h, v0.8h, #0.0
-; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
 ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h
 ; VBITS_GE_512-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z1.d]
 ; VBITS_GE_512-NEXT: ret
 %vals = load <8 x half>, <8 x half>* %a
@@ -656,16 +656,16 @@
 ; VBITS_GE_1024-LABEL: masked_scatter_v16f16:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.h, vl16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
+; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_1024-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_1024-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_1024-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_1024-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_1024-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [z1.d]
 ; VBITS_GE_1024-NEXT: ret
 %vals = load <16 x half>, <16 x half>* %a
@@ -679,16 +679,16 @@
 ; VBITS_GE_2048-LABEL: masked_scatter_v32f16:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [z1.d]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x half>, <32 x half>* %a
@@ -706,13 +706,13 @@
 ; CHECK-LABEL: masked_scatter_v2f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fcmeq v2.2s, v0.2s, #0.0
-; CHECK-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-NEXT: ldr q2, [x1]
+; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0
 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
-; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: st1w { z0.d }, p0, [z2.d]
 ; CHECK-NEXT: ret
 %vals = load <2 x float>, <2 x float>* %a
 %ptrs = load <2 x float*>, <2 x float*>* %b
@@ -728,9 +728,9 @@
 ; CHECK-NEXT: ptrue p0.d, vl4
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0
+; CHECK-NEXT: uunpklo z0.d, z0.s
 ; CHECK-NEXT: uunpklo z2.d, z2.s
 ; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT: uunpklo z0.d, z0.s
 ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d]
 ; CHECK-NEXT: ret
 %vals = load <4 x float>, <4 x float>* %a
@@ -744,14 +744,14 @@
 ; VBITS_GE_512-LABEL: masked_scatter_v8f32:
 ; VBITS_GE_512: // %bb.0:
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl8
-; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue p1.d, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_512-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_512-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [z1.d]
 ; VBITS_GE_512-NEXT: ret
 %vals = load <8 x float>, <8 x float>* %a
@@ -765,14 +765,14 @@
 ; VBITS_GE_1024-LABEL: masked_scatter_v16f32:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue p1.d, vl16
+; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_1024-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_1024-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_1024-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_1024-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [z1.d]
 ; VBITS_GE_1024-NEXT: ret
 %vals = load <16 x float>, <16 x float>* %a
@@ -786,14 +786,14 @@
 ; VBITS_GE_2048-LABEL: masked_scatter_v32f32:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x float>, <32 x float>* %a
@@ -831,11 +831,11 @@
 ; CHECK-LABEL: masked_scatter_v2f64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0
-; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0
-; CHECK-NEXT: st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT: ldr q2, [x1]
+; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0
+; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: st1d { z0.d }, p0, [z2.d]
 ; CHECK-NEXT: ret
 %vals = load <2 x double>, <2 x double>* %a
 %ptrs = load <2 x double*>, <2 x double*>* %b
@@ -916,16 +916,16 @@
 ; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_sext_f16:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d, lsl #1]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x half>, <32 x half>* %a
@@ -942,14 +942,14 @@
 ; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_sext_f32:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x float>, <32 x float>* %a
@@ -985,16 +985,16 @@
 ; VBITS_GE_2048-LABEL: masked_scatter_32b_scaled_zext:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1w { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d, lsl #1]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x half>, <32 x half>* %a
@@ -1011,16 +1011,16 @@
 ; VBITS_GE_2048-LABEL: masked_scatter_32b_unscaled_sext:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x half>, <32 x half>* %a
@@ -1038,16 +1038,16 @@
 ; VBITS_GE_2048-LABEL: masked_scatter_32b_unscaled_zext:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
-; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1w { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h
-; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h
 ; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x half>, <32 x half>* %a
@@ -1064,14 +1064,14 @@
 ; VBITS_GE_2048-LABEL: masked_scatter_64b_scaled:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x float>, <32 x float>* %a
@@ -1086,14 +1086,14 @@
 ; VBITS_GE_2048-LABEL: masked_scatter_64b_unscaled:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
-; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
 ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
-; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x float>, <32 x float>* %a
@@ -1116,10 +1116,10 @@
 ; VBITS_GE_2048-NEXT: mov z2.d, x2
 ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
 ; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d
-; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
+; VBITS_GE_2048-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_2048-NEXT: uunpklo z2.d, z3.s
+; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x float>, <32 x float>* %a
@@ -1142,10 +1142,10 @@
 ; VBITS_GE_2048-NEXT: mov z2.d, #4 // =0x4
 ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
 ; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d
-; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s
-; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
+; VBITS_GE_2048-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_2048-NEXT: uunpklo z2.d, z3.s
+; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0
 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d]
 ; VBITS_GE_2048-NEXT: ret
 %vals = load <32 x float>, <32 x float>* %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -26,25 +26,25 @@
 define void @masked_store_v2f16(<2 x half>* %ap, <2 x half>* %bp) #0 {
 ; CHECK-LABEL: masked_store_v2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldr s1, [x1]
-; CHECK-NEXT: movi d2, #0000000000000000
+; CHECK-NEXT: ldr s1, [x0]
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: ldr s2, [x1]
 ; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: fcmeq v1.4h, v0.4h, v1.4h
-; CHECK-NEXT: umov w8, v1.h[0]
-; CHECK-NEXT: umov w9, v1.h[1]
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: mov v1.s[1], w9
-; CHECK-NEXT: shl v1.2s, v1.2s, #16
-; CHECK-NEXT: sshr v1.2s, v1.2s, #16
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: mov w8, v1.s[1]
-; CHECK-NEXT: mov v2.h[0], w9
-; CHECK-NEXT: mov v2.h[1], w8
-; CHECK-NEXT: shl v1.4h, v2.4h, #15
-; CHECK-NEXT: sshr v1.4h, v1.4h, #15
-; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0
-; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: fcmeq v2.4h, v1.4h, v2.4h
+; CHECK-NEXT: umov w8, v2.h[0]
+; CHECK-NEXT: umov w9, v2.h[1]
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: mov v2.s[1], w9
+; CHECK-NEXT: shl v2.2s, v2.2s, #16
+; CHECK-NEXT: sshr v2.2s, v2.2s, #16
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: mov w9, v2.s[1]
+; CHECK-NEXT: mov v0.h[0], w8
+; CHECK-NEXT: mov v0.h[1], w9
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
+; CHECK-NEXT: sshr v0.4h, v0.4h, #15
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
 ; CHECK-NEXT: ret
 %a = load <2 x half>, <2 x half>* %ap
 %b = load <2 x half>, <2 x half>* %bp
@@ -58,8 +58,8 @@
 ; CHECK-LABEL: masked_store_v2f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
 ; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: ldr d1, [x1]
 ; CHECK-NEXT: fcmeq v1.2s, v0.2s, v1.2s
 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
@@ -75,8 +75,8 @@
 ; CHECK-LABEL: masked_store_v4f32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: ldr q1, [x1]
 ; CHECK-NEXT: fcmeq v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
 ; CHECK-NEXT: st1w { z0.s }, p0, [x1]
@@ -159,15 +159,15 @@
 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d
-; VBITS_GE_512-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff
-; VBITS_GE_512-NEXT: uzp1 z1.s, z1.s, z1.s
 ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_512-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; VBITS_GE_512-NEXT: ptrue p0.b, vl8
+; VBITS_GE_512-NEXT: uzp1 z1.s, z1.s, z1.s
 ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
 ; VBITS_GE_512-NEXT: uzp1 z1.b, z1.b, z1.b
 ; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z1.b, #0
-; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2]
 ; VBITS_GE_512-NEXT: ret
 %a = load <8 x i64>, <8 x i64>* %ap
@@ -184,14 +184,14 @@
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: ptrue p1.h, vl8
 ; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_512-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT: ptrue p0.h, vl8
 ; VBITS_GE_512-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
-; VBITS_GE_512-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_512-NEXT: cmpne p0.h, p1/z, z1.h, #0
 ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: uzp1 z1.h, z1.h, z1.h
+; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z1.h, #0
 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2]
 ; VBITS_GE_512-NEXT: ret
 %a = load <8 x i64>, <8 x i64>* %ap
@@ -208,12 +208,12 @@
 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: ptrue p1.s, vl8
 ; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d
+; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
 ; VBITS_GE_512-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
 ; VBITS_GE_512-NEXT: uzp1 z1.s, z1.s, z1.s
-; VBITS_GE_512-NEXT: cmpne p0.s, p1/z, z1.s, #0
-; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s
+; VBITS_GE_512-NEXT: cmpne p0.s, p0/z, z1.s, #0
 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2]
 ; VBITS_GE_512-NEXT: ret
 %a = load <8 x i64>, <8 x i64>* %ap
@@ -230,14 +230,14 @@
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: ptrue p1.b, vl16
 ; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_512-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT: ptrue p0.b, vl16
 ; VBITS_GE_512-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
-; VBITS_GE_512-NEXT: uzp1 z1.b, z1.b, z1.b
-; VBITS_GE_512-NEXT: cmpne p0.b, p1/z, z1.b, #0
 ; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: uzp1 z1.b, z1.b, z1.b
+; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z1.b, #0
 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2]
 ; VBITS_GE_512-NEXT: ret
 %a = load <16 x i32>, <16 x i32>* %ap
@@ -254,12 +254,12 @@
 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: ptrue p1.h, vl16
 ; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
 ; VBITS_GE_512-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
 ; VBITS_GE_512-NEXT: uzp1 z1.h, z1.h, z1.h
-; VBITS_GE_512-NEXT: cmpne p0.h, p1/z, z1.h, #0
-; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h
+; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z1.h, #0
 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2]
 ; VBITS_GE_512-NEXT: ret
 %a = load <16 x i32>, <16 x i32>* %ap
@@ -276,12 +276,12 @@
 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT: ptrue p1.b, vl32
 ; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h
+; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
 ; VBITS_GE_512-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
+; VBITS_GE_512-NEXT: ptrue p0.b, vl32
 ; VBITS_GE_512-NEXT: uzp1 z1.b, z1.b, z1.b
-; VBITS_GE_512-NEXT: cmpne p0.b, p1/z, z1.b, #0
-; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b
+; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z1.b, #0
 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2]
 ; VBITS_GE_512-NEXT: ret
 %a = load <32 x i16>, <32 x i16>* %ap
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle.ll
@@ -66,19 +66,19 @@
 ; Ensure sensible type legalisation.
 ; VBITS_EQ_256-LABEL: shuffle_ext_byone_v64i8:
 ; VBITS_EQ_256: // %bb.0:
-; VBITS_EQ_256-NEXT: ptrue p0.b, vl32
 ; VBITS_EQ_256-NEXT: mov w8, #32
+; VBITS_EQ_256-NEXT: ptrue p0.b, vl32
 ; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
 ; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
 ; VBITS_EQ_256-NEXT: ld1b { z2.b }, p0/z, [x1]
 ; VBITS_EQ_256-NEXT: mov z0.b, z0.b[31]
-; VBITS_EQ_256-NEXT: fmov w9, s0
 ; VBITS_EQ_256-NEXT: mov z3.b, z2.b[31]
+; VBITS_EQ_256-NEXT: fmov w9, s0
+; VBITS_EQ_256-NEXT: fmov w10, s3
 ; VBITS_EQ_256-NEXT: insr z2.b, w9
-; VBITS_EQ_256-NEXT: fmov w9, s3
-; VBITS_EQ_256-NEXT: insr z1.b, w9
-; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x0, x8]
+; VBITS_EQ_256-NEXT: insr z1.b, w10
 ; VBITS_EQ_256-NEXT: st1b { z2.b }, p0, [x0]
+; VBITS_EQ_256-NEXT: st1b { z1.b }, p0, [x0, x8]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v64i8:
@@ -109,9 +109,9 @@
 ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v128i8:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.b, vl128
+; VBITS_GE_1024-NEXT: mov w8, #127
 ; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: mov w8, #127
 ; VBITS_GE_1024-NEXT: whilels p1.b, xzr, x8
 ; VBITS_GE_1024-NEXT: lastb w8, p1, z0.b
 ; VBITS_GE_1024-NEXT: insr z1.b, w8
@@ -143,9 +143,9 @@
 ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v256i8:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.b, vl256
+; VBITS_GE_2048-NEXT: mov w8, #255
 ; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1b { z1.b }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: mov w8, #255
 ; VBITS_GE_2048-NEXT: whilels p1.b, xzr, x8
 ; VBITS_GE_2048-NEXT: lastb w8, p1, z0.b
 ; VBITS_GE_2048-NEXT: insr z1.b, w8
@@ -238,13 +238,13 @@
 ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
 ; VBITS_EQ_256-NEXT: ld1h { z2.h }, p0/z, [x1]
 ; VBITS_EQ_256-NEXT: mov z0.h, z0.h[15]
-; VBITS_EQ_256-NEXT: fmov w9, s0
 ; VBITS_EQ_256-NEXT: mov z3.h, z2.h[15]
+; VBITS_EQ_256-NEXT: fmov w9, s0
+; VBITS_EQ_256-NEXT: fmov w10, s3
 ; VBITS_EQ_256-NEXT: insr z2.h, w9
-; VBITS_EQ_256-NEXT: fmov w9, s3
-; VBITS_EQ_256-NEXT: insr z1.h, w9
-; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT: insr z1.h, w10
 ; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0]
+; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v32i16:
@@ -271,9 +271,9 @@
 ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64i16:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT: mov w8, #63
 ; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: mov w8, #63
 ; VBITS_GE_1024-NEXT: whilels p1.h, xzr, x8
 ; VBITS_GE_1024-NEXT: lastb w8, p1, z0.h
 ; VBITS_GE_1024-NEXT: insr z1.h, w8
@@ -297,9 +297,9 @@
 ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128i16:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
+; VBITS_GE_2048-NEXT: mov w8, #127
 ; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: mov w8, #127
 ; VBITS_GE_2048-NEXT: whilels p1.h, xzr, x8
 ; VBITS_GE_2048-NEXT: lastb w8, p1, z0.h
 ; VBITS_GE_2048-NEXT: insr z1.h, w8
@@ -375,13 +375,13 @@
 ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
 ; VBITS_EQ_256-NEXT: ld1w { z2.s }, p0/z, [x1]
 ; VBITS_EQ_256-NEXT: mov z0.s, z0.s[7]
-; VBITS_EQ_256-NEXT: fmov w9, s0
 ; VBITS_EQ_256-NEXT: mov z3.s, z2.s[7]
+; VBITS_EQ_256-NEXT: fmov w9, s0
+; VBITS_EQ_256-NEXT: fmov w10, s3
 ; VBITS_EQ_256-NEXT: insr z2.s, w9
-; VBITS_EQ_256-NEXT: fmov w9, s3
-; VBITS_EQ_256-NEXT: insr z1.s, w9
-; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT: insr z1.s, w10
 ; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0]
+; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v16i32:
@@ -406,9 +406,9 @@
 ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32i32:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: mov w8, #31
 ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: mov w8, #31
 ; VBITS_GE_1024-NEXT: whilels p1.s, xzr, x8
 ; VBITS_GE_1024-NEXT: lastb w8, p1, z0.s
 ; VBITS_GE_1024-NEXT: insr z1.s, w8
@@ -428,9 +428,9 @@
 ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64i32:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT: mov w8, #63
 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: mov w8, #63
 ; VBITS_GE_2048-NEXT: whilels p1.s, xzr, x8
 ; VBITS_GE_2048-NEXT: lastb w8, p1, z0.s
 ; VBITS_GE_2048-NEXT: insr z1.s, w8
@@ -488,13 +488,13 @@
 ; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1]
 ; VBITS_EQ_256-NEXT: mov z0.d, z0.d[3]
-; VBITS_EQ_256-NEXT: fmov x9, d0
 ; VBITS_EQ_256-NEXT: mov z3.d, z2.d[3]
+; VBITS_EQ_256-NEXT: fmov x9, d0
+; VBITS_EQ_256-NEXT: fmov x10, d3
 ; VBITS_EQ_256-NEXT: insr z2.d, x9
-; VBITS_EQ_256-NEXT: fmov x9, d3
-; VBITS_EQ_256-NEXT: insr z1.d, x9
-; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
+; VBITS_EQ_256-NEXT: insr z1.d, x10
 ; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0]
+; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v8i64:
@@ -518,9 +518,9 @@
 ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16i64:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: mov w8, #15
 ; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: mov w8, #15
 ; VBITS_GE_1024-NEXT: whilels p1.d, xzr, x8
 ; VBITS_GE_1024-NEXT: lastb x8, p1, z0.d
 ; VBITS_GE_1024-NEXT: insr z1.d, x8
@@ -538,9 +538,9 @@
 ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v32i64:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
+; VBITS_GE_2048-NEXT: mov w8, #31
 ; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: mov w8, #31
 ; VBITS_GE_2048-NEXT: whilels p1.d, xzr, x8
 ; VBITS_GE_2048-NEXT: lastb x8, p1, z0.d
 ; VBITS_GE_2048-NEXT: insr z1.d, x8
@@ -607,8 +607,8 @@
 ; VBITS_EQ_256-NEXT: mov z3.h, z2.h[15]
 ; VBITS_EQ_256-NEXT: insr z2.h, h0
 ; VBITS_EQ_256-NEXT: insr z1.h, h3
-; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT: st1h { z2.h }, p0, [x0]
+; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v32f16:
@@ -634,9 +634,9 @@
 ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v64f16:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
+; VBITS_GE_1024-NEXT: mov w8, #63
 ; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: mov w8, #63
 ; VBITS_GE_1024-NEXT: whilels p1.h, xzr, x8
 ; VBITS_GE_1024-NEXT: lastb h0, p1, z0.h
 ; VBITS_GE_1024-NEXT: insr z1.h, h0
@@ -660,9 +660,9 @@
 ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v128f16:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
+; VBITS_GE_2048-NEXT: mov w8, #127
 ; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1h { z1.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: mov w8, #127
 ; VBITS_GE_2048-NEXT: whilels p1.h, xzr, x8
 ; VBITS_GE_2048-NEXT: lastb h0, p1, z0.h
 ; VBITS_GE_2048-NEXT: insr z1.h, h0
@@ -740,8 +740,8 @@
 ; VBITS_EQ_256-NEXT: mov z3.s, z2.s[7]
 ; VBITS_EQ_256-NEXT: insr z2.s, s0
 ; VBITS_EQ_256-NEXT: insr z1.s, s3
-; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT: st1w { z2.s }, p0, [x0]
+; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v16f32:
@@ -765,9 +765,9 @@
 ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v32f32:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
+; VBITS_GE_1024-NEXT: mov w8, #31
 ; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: mov w8, #31
 ; VBITS_GE_1024-NEXT: whilels p1.s, xzr, x8
 ; VBITS_GE_1024-NEXT: lastb s0, p1, z0.s
 ; VBITS_GE_1024-NEXT: insr z1.s, s0
@@ -787,9 +787,9 @@
 ; VBITS_GE_2048-LABEL: shuffle_ext_byone_v64f32:
 ; VBITS_GE_2048: // %bb.0:
 ; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
+; VBITS_GE_2048-NEXT: mov w8, #63
 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1w { z1.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: mov w8, #63
 ; VBITS_GE_2048-NEXT: whilels p1.s, xzr, x8
 ; VBITS_GE_2048-NEXT: lastb s0, p1, z0.s
 ; VBITS_GE_2048-NEXT: insr z1.s, s0
@@ -849,8 +849,8 @@
 ; VBITS_EQ_256-NEXT: mov z3.d, z2.d[3]
 ; VBITS_EQ_256-NEXT: insr z2.d, d0
 ; VBITS_EQ_256-NEXT: insr z1.d, d3
-; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: st1d { z2.d }, p0, [x0]
+; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
 ; VBITS_EQ_256-NEXT: ret
 ;
 ; VBITS_GE_512-LABEL: shuffle_ext_byone_v8f64:
@@ -873,9 +873,9 @@
 ; VBITS_GE_1024-LABEL: shuffle_ext_byone_v16f64:
 ; VBITS_GE_1024: // %bb.0:
 ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
+; VBITS_GE_1024-NEXT: mov w8, #15
 ; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1]
-; VBITS_GE_1024-NEXT: mov w8, #15
 ; VBITS_GE_1024-NEXT: whilels p1.d, xzr, x8
 ; VBITS_GE_1024-NEXT: lastb d0, p1, z0.d
 ; VBITS_GE_1024-NEXT: insr z1.d, d0
@@ -893,9 +893,9 @@
 ; VBITS_GE_2048-LABEL:
shuffle_ext_byone_v32f64: ; VBITS_GE_2048: // %bb.0: ; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: mov w8, #31 ; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_2048-NEXT: mov w8, #31 ; VBITS_GE_2048-NEXT: whilels p1.d, xzr, x8 ; VBITS_GE_2048-NEXT: lastb d0, p1, z0.d ; VBITS_GE_2048-NEXT: insr z1.d, d0 @@ -939,14 +939,14 @@ ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov z2.d, z0.d[3] -; CHECK-NEXT: mov z3.d, z1.d[1] +; CHECK-NEXT: mov z2.d, z1.d[1] +; CHECK-NEXT: stp d1, d2, [sp, #16] +; CHECK-NEXT: mov z1.d, z0.d[3] ; CHECK-NEXT: mov z0.d, z0.d[2] -; CHECK-NEXT: stp d1, d3, [sp, #16] -; CHECK-NEXT: stp d0, d2, [sp] +; CHECK-NEXT: stp d0, d1, [sp] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: mov sp, x29 diff --git a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll --- a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll +++ b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll @@ -33,10 +33,9 @@ ; CHECK-LABEL: sti32ldi32ext: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: movprfx z1, z0 -; CHECK-NEXT: sxtw z1.d, p0/m, z0.d -; CHECK-NEXT: st1w { z0.d }, p0, [x0] -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: st1w { z1.d }, p0, [x0] ; CHECK-NEXT: ret entry: %0 = trunc %v to diff --git a/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll b/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll --- a/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll @@ -69,10 +69,10 @@ ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, #1, mul vl] ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1, #2, mul vl] ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1, #3, mul vl] -; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: fcvt z1.h, p0/m, z1.d -; CHECK-NEXT: fcvt z2.h, p0/m, z2.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: fcvt z3.h, p0/m, z3.d +; CHECK-NEXT: fcvt z2.h, p0/m, z2.d ; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h diff --git a/llvm/test/CodeGen/AArch64/sve-gep.ll b/llvm/test/CodeGen/AArch64/sve-gep.ll --- a/llvm/test/CodeGen/AArch64/sve-gep.ll +++ b/llvm/test/CodeGen/AArch64/sve-gep.ll @@ -34,8 +34,8 @@ ; CHECK-LABEL: fixed_of_scalable_1: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: dup v0.2d, x8 ; CHECK-NEXT: dup v1.2d, x0 +; CHECK-NEXT: dup v0.2d, x8 ; CHECK-NEXT: add v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %d = getelementptr , * %base, <2 x i64> @@ -203,8 +203,8 @@ ; CHECK-LABEL: scalable_of_scalable_1: ; CHECK: // %bb.0: ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: mov z1.d, x0 +; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: ret %idx = shufflevector insertelement ( undef, i64 1, i32 0), zeroinitializer, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll --- a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll +++ b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll @@ -175,11 +175,11 @@ define @zero_fill_non_zero_index( %pg, %a) #0 { ; CHECK-LABEL: zero_fill_non_zero_index: ; CHECK: // %bb.0: -; CHECK-NEXT: 
uminv d0, p0, z0.d ; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: uminv d0, p0, z0.d +; CHECK-NEXT: fmov x9, d0 ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmov x9, d0 ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z0.d ; CHECK-NEXT: mov z0.d, #0 // =0x0 @@ -195,8 +195,8 @@ define @zero_fill_type_mismatch( %pg, %a) #0 { ; CHECK-LABEL: zero_fill_type_mismatch: ; CHECK: // %bb.0: -; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: ret %t1 = call i64 @llvm.aarch64.sve.uminv.nxv2i64( %pg, %a) %t2 = insertelement zeroinitializer, i64 %t1, i64 0 @@ -211,11 +211,10 @@ ; CHECK-LABEL: zero_fill_no_zero_upper_lanes: ; CHECK: // %bb.0: ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z0.d -; CHECK-NEXT: ptrue p1.d, vl1 -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: mov z1.d, p1/m, x8 -; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %t1 = call @llvm.aarch64.sve.umin.nxv2i64( %pg, %a, %a) %t2 = extractelement %t1, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll --- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll @@ -4,8 +4,8 @@ define @test_lane0_16xi8( %a) { ; CHECK-LABEL: test_lane0_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl1 ; CHECK-NEXT: mov w8, #30 +; CHECK-NEXT: ptrue p0.b, vl1 ; CHECK-NEXT: mov z0.b, p0/m, w8 ; CHECK-NEXT: ret %b = insertelement %a, i8 30, i32 0 @@ -15,8 +15,8 @@ define @test_lane0_8xi16( %a) { ; CHECK-LABEL: test_lane0_8xi16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov w8, #30 +; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: ret %b = insertelement %a, i16 30, i32 0 @@ -26,8 +26,8 @@ define @test_lane0_4xi32( %a) { ; CHECK-LABEL: test_lane0_4xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov w8, #30 +; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: ret %b = insertelement %a, i32 30, i32 0 @@ -37,8 +37,8 @@ define @test_lane0_2xi64( %a) { ; CHECK-LABEL: test_lane0_2xi64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov w8, #30 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %b = insertelement %a, i64 30, i32 0 @@ -83,12 +83,12 @@ ; CHECK-LABEL: test_lane4_2xi64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #4 -; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: mov w9, #30 +; CHECK-NEXT: index z2.d, #0, #1 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d -; CHECK-NEXT: mov w8, #30 -; CHECK-NEXT: mov z0.d, p0/m, x8 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z1.d +; CHECK-NEXT: mov z0.d, p0/m, x9 ; CHECK-NEXT: ret %b = insertelement %a, i64 30, i32 4 ret %b @@ -99,11 +99,11 @@ ; CHECK-LABEL: test_lane9_8xf16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #9 -; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: fmov h1, #1.00000000 +; CHECK-NEXT: index z3.h, #0, #1 ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h -; CHECK-NEXT: fmov h1, #1.00000000 +; CHECK-NEXT: cmpeq p0.h, p0/z, z3.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, h1 ; CHECK-NEXT: ret %b = insertelement %a, half 1.0, i32 9 @@ -114,12 +114,12 @@ ; CHECK-LABEL: test_lane1_16xi8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, 
#1 -; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: mov w9, #30 +; CHECK-NEXT: index z2.b, #0, #1 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z2.b, w8 -; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b -; CHECK-NEXT: mov w8, #30 -; CHECK-NEXT: mov z0.b, p0/m, w8 +; CHECK-NEXT: mov z1.b, w8 +; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z1.b +; CHECK-NEXT: mov z0.b, p0/m, w9 ; CHECK-NEXT: ret %b = insertelement %a, i8 30, i32 1 ret %b @@ -130,12 +130,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: mov w9, #30 +; CHECK-NEXT: index z2.b, #0, #1 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z2.b, w8 -; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b -; CHECK-NEXT: mov w8, #30 -; CHECK-NEXT: mov z0.b, p0/m, w8 +; CHECK-NEXT: mov z1.b, w8 +; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z1.b +; CHECK-NEXT: mov z0.b, p0/m, w9 ; CHECK-NEXT: ret %b = insertelement %a, i8 30, i32 %x ret %b @@ -156,10 +156,10 @@ ; CHECK-LABEL: test_lane6_undef_8xi16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #6 -; CHECK-NEXT: index z0.h, #0, #1 -; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: index z1.h, #0, #1 ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z0.h, w8 +; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z0.h ; CHECK-NEXT: mov z0.h, p0/m, w0 ; CHECK-NEXT: ret %b = insertelement undef, i16 %a, i32 6 @@ -191,12 +191,12 @@ ; CHECK-LABEL: test_insert64_of_extract64_16xi8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64 -; CHECK-NEXT: index z2.b, #0, #1 -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: whilels p1.b, xzr, x8 -; CHECK-NEXT: mov z3.b, w8 -; CHECK-NEXT: lastb w8, p1, z1.b -; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b +; CHECK-NEXT: index z3.b, #0, #1 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: whilels p0.b, xzr, x8 +; CHECK-NEXT: mov z2.b, w8 +; CHECK-NEXT: lastb w8, p0, z1.b +; CHECK-NEXT: cmpeq p0.b, p1/z, z3.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, w8 ; CHECK-NEXT: ret %c = extractelement %b, i32 64 @@ -207,13 +207,13 @@ define @test_insert3_of_extract1_16xi8( %a, %b) { ; CHECK-LABEL: test_insert3_of_extract1_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #3 -; CHECK-NEXT: umov w8, v1.b[1] -; CHECK-NEXT: index z1.b, #0, #1 -; CHECK-NEXT: mov z2.b, w9 +; CHECK-NEXT: mov w8, #3 +; CHECK-NEXT: umov w9, v1.b[1] +; CHECK-NEXT: index z2.b, #0, #1 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b -; CHECK-NEXT: mov z0.b, p0/m, w8 +; CHECK-NEXT: mov z1.b, w8 +; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z1.b +; CHECK-NEXT: mov z0.b, p0/m, w9 ; CHECK-NEXT: ret %c = extractelement %b, i32 1 %d = insertelement %a, i8 %c, i32 3 @@ -278,10 +278,10 @@ define @test_insert_with_index_nxv2f16(half %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: index z2.d, #0, #1 +; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: mov z2.d, x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret %res = insertelement undef, half %h, i64 %idx @@ -291,10 +291,10 @@ define @test_insert_with_index_nxv4f16(half %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, w0 -; CHECK-NEXT: index z2.s, #0, #1 +; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: mov z2.s, w0 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z1.s +; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: 
ret %res = insertelement undef, half %h, i64 %idx @@ -304,10 +304,10 @@ define @test_insert_with_index_nxv8f16(half %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.h, w0 -; CHECK-NEXT: index z2.h, #0, #1 +; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: mov z2.h, w0 ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z1.h +; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret %res = insertelement undef, half %h, i64 %idx @@ -317,10 +317,10 @@ define @test_insert_with_index_nxv2f32(float %f, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: index z2.d, #0, #1 +; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: mov z2.d, x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d ; CHECK-NEXT: mov z0.s, p0/m, s0 ; CHECK-NEXT: ret %res = insertelement undef, float %f, i64 %idx @@ -330,10 +330,10 @@ define @test_insert_with_index_nxv4f32(float %f, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, w0 -; CHECK-NEXT: index z2.s, #0, #1 +; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: mov z2.s, w0 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z1.s +; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, s0 ; CHECK-NEXT: ret %res = insertelement undef, float %f, i64 %idx @@ -343,10 +343,10 @@ define @test_insert_with_index_nxv2f64(double %d, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: index z2.d, #0, #1 +; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: mov z2.d, x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d ; CHECK-NEXT: mov z0.d, p0/m, d0 ; CHECK-NEXT: ret %res = insertelement undef, double %d, i64 %idx @@ -357,12 +357,12 @@ define @test_predicate_insert_2xi1_immediate ( %val, i1 %elt) { ; CHECK-LABEL: test_predicate_insert_2xi1_immediate: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d, vl1 ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: mov z0.d, p0/m, x0 -; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: mov z0.d, p1/m, x0 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z0.d, z0.d, #0x1 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ret %res = insertelement %val, i1 %elt, i32 0 @@ -373,10 +373,10 @@ ; CHECK-LABEL: test_predicate_insert_4xi1_immediate: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #2 -; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: index z1.s, #0, #1 ; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: cmpeq p2.s, p1/z, z0.s, z1.s +; CHECK-NEXT: mov z0.s, w8 +; CHECK-NEXT: cmpeq p2.s, p1/z, z1.s, z0.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 ; CHECK-NEXT: mov z0.s, p2/m, w0 ; CHECK-NEXT: and z0.s, z0.s, #0x1 @@ -391,15 +391,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: index z0.h, #0, #1 +; CHECK-NEXT: mov w9, #1 +; CHECK-NEXT: index z1.h, #0, #1 ; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 -; CHECK-NEXT: cmpeq p0.h, p1/z, z0.h, z2.h -; CHECK-NEXT: mov w8, #1 -; CHECK-NEXT: mov z1.h, p0/m, w8 -; CHECK-NEXT: and z1.h, z1.h, #0x1 -; CHECK-NEXT: cmpne p0.h, p1/z, z1.h, #0 +; CHECK-NEXT: mov z0.h, w8 +; 
CHECK-NEXT: cmpeq p2.h, p1/z, z1.h, z0.h +; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 +; CHECK-NEXT: mov z0.h, p2/m, w9 +; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: cmpne p0.h, p1/z, z0.h, #0 ; CHECK-NEXT: ret %res = insertelement %val, i1 1, i32 %idx ret %res @@ -409,11 +409,11 @@ ; CHECK-LABEL: test_predicate_insert_16xi1_immediate: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w9, #4 -; CHECK-NEXT: index z0.b, #0, #1 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: mov z1.b, w9 ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z1.b +; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: mov z0.b, w9 +; CHECK-NEXT: cmpeq p2.b, p1/z, z1.b, z0.b ; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 ; CHECK-NEXT: mov z0.b, p2/m, w8 ; CHECK-NEXT: and z0.b, z0.b, #0x1 @@ -429,12 +429,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: cmpeq p2.d, p1/z, z0.d, z1.d -; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: mov z0.d, x8 +; CHECK-NEXT: cmpeq p2.d, p1/z, z1.d, z0.d +; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 ; CHECK-NEXT: mov z0.d, p2/m, x0 ; CHECK-NEXT: and z0.d, z0.d, #0x1 ; CHECK-NEXT: cmpne p0.d, p1/z, z0.d, #0 @@ -448,10 +448,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: index z1.s, #0, #1 ; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: cmpeq p2.s, p1/z, z0.s, z1.s +; CHECK-NEXT: mov z0.s, w8 +; CHECK-NEXT: cmpeq p2.s, p1/z, z1.s, z0.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 ; CHECK-NEXT: mov z0.s, p2/m, w0 ; CHECK-NEXT: and z0.s, z0.s, #0x1 @@ -465,10 +465,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: index z0.h, #0, #1 +; CHECK-NEXT: index z1.h, #0, #1 ; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: cmpeq p2.h, p1/z, z0.h, z1.h +; CHECK-NEXT: mov z0.h, w8 +; CHECK-NEXT: cmpeq p2.h, p1/z, z1.h, z0.h ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 ; CHECK-NEXT: mov z0.h, p2/m, w0 ; CHECK-NEXT: and z0.h, z0.h, #0x1 @@ -483,10 +483,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: index z0.b, #0, #1 +; CHECK-NEXT: index z1.b, #0, #1 ; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: mov z1.b, w8 -; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z1.b +; CHECK-NEXT: mov z0.b, w8 +; CHECK-NEXT: cmpeq p2.b, p1/z, z1.b, z0.b ; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 ; CHECK-NEXT: mov z0.b, p2/m, w0 ; CHECK-NEXT: and z0.b, z0.b, #0x1 @@ -503,21 +503,21 @@ ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: rdvl x10, #2 +; CHECK-NEXT: rdvl x8, #2 ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: sxtw x9, w1 -; CHECK-NEXT: sub x10, x10, #1 -; CHECK-NEXT: cmp x9, x10 +; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1 +; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: csel x9, x9, x10, lo -; CHECK-NEXT: st1b { z0.b }, p1, [x8, #1, mul vl] -; CHECK-NEXT: st1b { z1.b }, p1, 
[sp] -; CHECK-NEXT: strb w0, [x8, x9] +; CHECK-NEXT: st1b { z0.b }, p1, [x10, #1, mul vl] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: st1b { z0.b }, p1, [sp] +; CHECK-NEXT: strb w0, [x10, x8] ; CHECK-NEXT: ld1b { z0.b }, p1/z, [sp] -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x8, #1, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x10, #1, mul vl] ; CHECK-NEXT: and z0.b, z0.b, #0x1 ; CHECK-NEXT: and z1.b, z1.b, #0x1 ; CHECK-NEXT: cmpne p0.b, p1/z, z0.b, #0 diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll @@ -22,15 +22,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: sub x9, x9, #2 -; CHECK-NEXT: mov w8, #2 -; CHECK-NEXT: cmp x9, #2 -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: cmp x8, #2 ; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: lsl x8, x8, #3 ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 @@ -61,15 +61,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: cntw x9 -; CHECK-NEXT: sub x9, x9, #4 -; CHECK-NEXT: mov w8, #4 -; CHECK-NEXT: cmp x9, #4 -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: mov w9, #4 +; CHECK-NEXT: sub x8, x8, #4 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: lsl x8, x8, #2 -; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: cmp x8, #4 ; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: lsl x8, x8, #2 ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 @@ -100,15 +100,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: cnth x9 -; CHECK-NEXT: sub x9, x9, #8 -; CHECK-NEXT: mov w8, #8 -; CHECK-NEXT: cmp x9, #8 -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: mov w9, #8 +; CHECK-NEXT: sub x8, x8, #8 ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: lsl x8, x8, #1 -; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: cmp x8, #8 ; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: lsl x8, x8, #1 ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 @@ -139,14 +139,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: sub x9, x9, #16 -; CHECK-NEXT: mov w8, #16 -; CHECK-NEXT: cmp x9, #16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #16 +; CHECK-NEXT: sub x8, x8, #16 ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: cmp x8, #16 ; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 @@ -213,9 +213,9 @@ ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: str q1, [sp, #32] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, #1, mul vl] @@ -256,8 +256,8 @@ ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: str q0, [sp, #16] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, #1, mul vl] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [sp] @@ -304,9 +304,9 @@ ; CHECK-LABEL: insert_nxv8i16_nxv2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: uunpklo z2.s, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: uunpklo z2.d, z2.s ; CHECK-NEXT: uzp1 z1.s, z2.s, z1.s -; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h ; CHECK-NEXT: ret %r = call @llvm.experimental.vector.insert.nxv8i16.nxv2i16( %vec, %in, i64 2) @@ -321,15 +321,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: sub x9, x9, #2 -; CHECK-NEXT: mov w8, #2 -; CHECK-NEXT: cmp x9, #2 -; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: mov w9, #2 +; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: cmp x8, #2 ; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: lsl x8, x8, #3 ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 @@ -344,16 +344,16 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: subs x8, x8, #4 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] ; CHECK-NEXT: csel x8, xzr, x8, lo ; CHECK-NEXT: mov w9, #4 ; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: st1d { z1.d }, p0, [x9, x8, lsl #3] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll --- a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll @@ -56,8 +56,8 @@ ; CHECK-LABEL: smax_i16_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #257 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -95,8 +95,8 @@ ; CHECK-LABEL: smax_i32_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-129 -; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %elt = insertelement undef, i32 -129, i32 0 @@ -134,8 +134,8 @@ ; CHECK-LABEL: smax_i64_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #65535 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -200,8 +200,8 @@ ; CHECK-LABEL: smin_i16_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #257 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -239,8 +239,8 @@ ; CHECK-LABEL: smin_i32_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-129 -; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %elt = insertelement undef, i32 -129, i32 0 @@ -278,8 +278,8 @@ ; CHECK-LABEL: smin_i64_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #65535 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -332,8 +332,8 @@ ; CHECK-LABEL: umax_i16_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #257 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -359,8 +359,8 @@ ; CHECK-LABEL: umax_i32_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #257 -; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %elt = insertelement undef, i32 257, i32 0 @@ -386,8 +386,8 @@ ; CHECK-LABEL: umax_i64_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #65535 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -440,8 +440,8 @@ ; CHECK-LABEL: umin_i16_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #257 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ptrue 
p0.h +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -467,8 +467,8 @@ ; CHECK-LABEL: umin_i32_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #257 -; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %elt = insertelement undef, i32 257, i32 0 @@ -494,8 +494,8 @@ ; CHECK-LABEL: umin_i64_out_of_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #65535 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -600,8 +600,8 @@ ; CHECK-LABEL: mul_i16_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #255 -; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 255, i32 0 @@ -614,8 +614,8 @@ ; CHECK-LABEL: mul_i32_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #255 -; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %elt = insertelement undef, i32 255, i32 0 @@ -628,8 +628,8 @@ ; CHECK-LABEL: mul_i64_range: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #255 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 255, i32 0 diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll --- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll @@ -165,9 +165,9 @@ ; CHECK-LABEL: abs_nxv8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: abs z2.d, p0/m, z2.d ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: abs z1.d, p0/m, z1.d -; CHECK-NEXT: abs z2.d, p0/m, z2.d ; CHECK-NEXT: abs z3.d, p0/m, z3.d ; CHECK-NEXT: ret %res = call @llvm.abs.nxv8i64( %a, i1 false) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll @@ -238,12 +238,12 @@ define @index_rr_i32_not_combine(i32 %a, i32 %b) { ; CHECK-LABEL: index_rr_i32_not_combine: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, w0 -; CHECK-NEXT: mov z1.s, w1 -; CHECK-NEXT: index z2.s, #0, #1 +; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: mov z1.s, w0 +; CHECK-NEXT: mov z2.s, w1 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mla z0.s, p0/m, z2.s, z1.s -; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: mla z1.s, p0/m, z0.s, z2.s +; CHECK-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEXT: ret %val = insertelement poison, i32 %a, i32 0 %val1 = shufflevector %val, poison, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith.ll @@ -413,8 +413,8 @@ define @add_i64_tuple3(* %out, %in1, %in2, %in3) { ; CHECK-LABEL: add_i64_tuple3: ; CHECK: // %bb.0: -; CHECK-NEXT: add z0.d, z0.d, z0.d ; CHECK-NEXT: add z1.d, z1.d, z1.d +; CHECK-NEXT: add z0.d, z0.d, z0.d ; CHECK-NEXT: add z2.d, z2.d, z2.d ; CHECK-NEXT: ret %tuple = tail call @llvm.aarch64.sve.tuple.create3.nxv6i64.nxv2i64( %in1, %in2, %in3) @@ -425,9 +425,9 @@ define 
@add_i64_tuple4(* %out, %in1, %in2, %in3, %in4) { ; CHECK-LABEL: add_i64_tuple4: ; CHECK: // %bb.0: +; CHECK-NEXT: add z2.d, z2.d, z2.d ; CHECK-NEXT: add z0.d, z0.d, z0.d ; CHECK-NEXT: add z1.d, z1.d, z1.d -; CHECK-NEXT: add z2.d, z2.d, z2.d ; CHECK-NEXT: add z3.d, z3.d, z3.d ; CHECK-NEXT: ret %tuple = tail call @llvm.aarch64.sve.tuple.create4.nxv8i64.nxv2i64( %in1, %in2, %in3, %in4) diff --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll --- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll @@ -9,9 +9,9 @@ ; CHECK-LABEL: test_post_ld1_insert: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] -; CHECK-NEXT: add x9, x0, x2, lsl #2 -; CHECK-NEXT: str x9, [x1] ; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: add x8, x0, x2, lsl #2 +; CHECK-NEXT: str x8, [x1] ; CHECK-NEXT: ret %load = load i32, i32* %a %ins = insertelement undef, i32 %load, i32 0 @@ -24,8 +24,8 @@ ; CHECK-LABEL: test_post_ld1_dup: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: add x8, x0, x2, lsl #3 +; CHECK-NEXT: ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: str x8, [x1] ; CHECK-NEXT: ret %load = load double, double* %a diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll --- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll @@ -20,8 +20,8 @@ ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: adrp x8, :got:g8 -; CHECK-NEXT: ldr x8, [x8, :got_lo12:g8] ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ldr x8, [x8, :got_lo12:g8] ; CHECK-NEXT: ldrb w8, [x8] ; CHECK-NEXT: strb w8, [sp, #12] ; CHECK-NEXT: ld1rb { z0.b }, p0/z, [sp, #14] diff --git a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll --- a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll +++ b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -loop-reduce < %s | FileCheck %s --check-prefix=IR ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefix=ASM ; Note: To update this test, please run utils/update_test_checks.py and utils/update_llc_test_checks.py separately on opt/llc run line. 
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -106,9 +106,9 @@
 ; CHECK-LABEL: masked_gather_nxv8f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
 ; CHECK-NEXT: punpkhi p2.h, p1.b
 ; CHECK-NEXT: punpklo p1.h, p1.b
-; CHECK-NEXT: punpklo p0.h, p0.b
 ; CHECK-NEXT: ld1h { z3.d }, p2/z, [z3.d]
 ; CHECK-NEXT: ld1h { z2.d }, p1/z, [z2.d]
 ; CHECK-NEXT: punpkhi p1.h, p0.b
@@ -186,10 +186,10 @@
 ; CHECK-NEXT: sunpklo z0.h, z0.b
 ; CHECK-NEXT: punpklo p0.h, p0.b
 ; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
 ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
 ; CHECK-NEXT: sunpkhi z2.s, z0.h
 ; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: punpklo p0.h, p0.b
 ; CHECK-NEXT: ld1sb { z2.s }, p1/z, [x0, z2.s, sxtw]
 ; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
@@ -205,9 +205,9 @@
 ; CHECK-LABEL: masked_gather_nxv32i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: punpklo p3.h, p2.b
 ; CHECK-NEXT: punpkhi p2.h, p2.b
-; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: ld1w { z0.s }, p3/z, [x0, z0.s, sxtw #2]
 ; CHECK-NEXT: ld1w { z1.s }, p2/z, [x0, z1.s, sxtw #2]
 ; CHECK-NEXT: punpklo p2.h, p0.b
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
--- a/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
@@ -99,9 +99,9 @@
 ; CHECK-NEXT: ld1w { z30.s }, p2/z, [x1, #1, mul vl]
 ; CHECK-NEXT: ld1w { z31.s }, p2/z, [x1]
 ; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: punpklo p3.h, p2.b
 ; CHECK-NEXT: punpkhi p2.h, p2.b
-; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: st1w { z0.s }, p3, [x0, z31.s, sxtw #2]
 ; CHECK-NEXT: st1w { z1.s }, p2, [x0, z30.s, sxtw #2]
 ; CHECK-NEXT: punpklo p2.h, p0.b
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
--- a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
@@ -76,8 +76,8 @@
 define void @masked_scatter_splat_constant_pointer ( %pg) {
 ; CHECK-LABEL: masked_scatter_splat_constant_pointer:
 ; CHECK: // %bb.0: // %vector.body
-; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: mov z0.d, #0 // =0x0
+; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: st1w { z0.d }, p1, [x8, z0.d, lsl #2]
 ; CHECK-NEXT: st1w { z0.d }, p0, [x8, z0.d, lsl #2]
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-arith.ll b/llvm/test/CodeGen/AArch64/sve-pred-arith.ll
--- a/llvm/test/CodeGen/AArch64/sve-pred-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-arith.ll
@@ -58,15 +58,15 @@
 ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ldr p4, [x3]
+; CHECK-NEXT: ldr p4, [x1]
 ; CHECK-NEXT: ldr p5, [x0]
-; CHECK-NEXT: ldr p6, [x1]
+; CHECK-NEXT: ldr p6, [x3]
 ; CHECK-NEXT: ldr p7, [x2]
 ; CHECK-NEXT: ptrue p8.b
 ; CHECK-NEXT: eor p0.b, p8/z, p0.b, p5.b
-; CHECK-NEXT: eor p1.b, p8/z, p1.b, p6.b
+; CHECK-NEXT: eor p1.b, p8/z, p1.b, p4.b
 ; CHECK-NEXT: eor p2.b, p8/z, p2.b, p7.b
-; CHECK-NEXT: eor p3.b, p8/z, p3.b, p4.b
+; CHECK-NEXT: eor p3.b, p8/z, p3.b, p6.b
 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
@@ -138,15 +138,15 @@
 ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ldr p4, [x3]
+; CHECK-NEXT: ldr p4, [x1]
 ; CHECK-NEXT: ldr p5, [x0]
-; CHECK-NEXT: ldr p6, [x1]
+; CHECK-NEXT: ldr p6, [x3]
 ; CHECK-NEXT: ldr p7, [x2]
 ; CHECK-NEXT: ptrue p8.b
 ; CHECK-NEXT: eor p0.b, p8/z, p0.b, p5.b
-; CHECK-NEXT: eor p1.b, p8/z, p1.b, p6.b
+; CHECK-NEXT: eor p1.b, p8/z, p1.b, p4.b
 ; CHECK-NEXT: eor p2.b, p8/z, p2.b, p7.b
-; CHECK-NEXT: eor p3.b, p8/z, p3.b, p4.b
+; CHECK-NEXT: eor p3.b, p8/z, p3.b, p6.b
 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll
--- a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll
@@ -240,11 +240,11 @@
 ; CHECK-LABEL: sext_b_to_d:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: sunpklo z1.h, z0.b
-; CHECK-NEXT: sunpkhi z0.h, z0.b
+; CHECK-NEXT: sunpkhi z6.h, z0.b
 ; CHECK-NEXT: sunpklo z2.s, z1.h
 ; CHECK-NEXT: sunpkhi z3.s, z1.h
-; CHECK-NEXT: sunpklo z5.s, z0.h
-; CHECK-NEXT: sunpkhi z7.s, z0.h
+; CHECK-NEXT: sunpklo z5.s, z6.h
+; CHECK-NEXT: sunpkhi z7.s, z6.h
 ; CHECK-NEXT: sunpklo z0.d, z2.s
 ; CHECK-NEXT: sunpkhi z1.d, z2.s
 ; CHECK-NEXT: sunpklo z2.d, z3.s
@@ -309,11 +309,11 @@
 ; CHECK-LABEL: zext_b_to_d:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: uunpklo z1.h, z0.b
-; CHECK-NEXT: uunpkhi z0.h, z0.b
+; CHECK-NEXT: uunpkhi z6.h, z0.b
 ; CHECK-NEXT: uunpklo z2.s, z1.h
 ; CHECK-NEXT: uunpkhi z3.s, z1.h
-; CHECK-NEXT: uunpklo z5.s, z0.h
-; CHECK-NEXT: uunpkhi z7.s, z0.h
+; CHECK-NEXT: uunpklo z5.s, z6.h
+; CHECK-NEXT: uunpkhi z7.s, z6.h
 ; CHECK-NEXT: uunpklo z0.d, z2.s
 ; CHECK-NEXT: uunpkhi z1.d, z2.s
 ; CHECK-NEXT: uunpklo z2.d, z3.s
diff --git a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
--- a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll
@@ -23,17 +23,17 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: rdvl x10, #2
+; CHECK-NEXT: rdvl x8, #2
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: sxtw x9, w0
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x9, x8
 ; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: cmp x9, x10
-; CHECK-NEXT: st1b { z1.b }, p0, [x8, #1, mul vl]
+; CHECK-NEXT: csel x8, x9, x8, lo
+; CHECK-NEXT: st1b { z1.b }, p0, [x10, #1, mul vl]
 ; CHECK-NEXT: st1b { z0.b }, p0, [sp]
-; CHECK-NEXT: csel x9, x9, x10, lo
-; CHECK-NEXT: ldrb w0, [x8, x9]
+; CHECK-NEXT: ldrb w0, [x10, x8]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -48,17 +48,17 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: rdvl x10, #1
+; CHECK-NEXT: rdvl x8, #1
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: sxtw x9, w0
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x9, x8
 ; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: cmp x9, x10
-; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT: csel x8, x9, x8, lo
+; CHECK-NEXT: st1h { z1.h }, p0, [x10, #1, mul vl]
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: csel x9, x9, x10, lo
-; CHECK-NEXT: ldrh w0, [x8, x9, lsl #1]
+; CHECK-NEXT: ldrh w0, [x10, x8, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -73,17 +73,17 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: cnth x10
+; CHECK-NEXT: cnth x8
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: sxtw x9, w0
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x9, x8
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: cmp x9, x10
-; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT: csel x8, x9, x8, lo
+; CHECK-NEXT: st1w { z1.s }, p0, [x10, #1, mul vl]
 ; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: csel x9, x9, x10, lo
-; CHECK-NEXT: ldr w0, [x8, x9, lsl #2]
+; CHECK-NEXT: ldr w0, [x10, x8, lsl #2]
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -98,18 +98,18 @@
 ; CHECK-NEXT: addvl sp, sp, #-4
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: cnth x10
+; CHECK-NEXT: cnth x9
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: sxtw x9, w0
-; CHECK-NEXT: sub x10, x10, #1
-; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtw x10, w0
+; CHECK-NEXT: sub x9, x9, #1
 ; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: cmp x9, x10
+; CHECK-NEXT: cmp x10, x9
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: csel x9, x10, x9, lo
 ; CHECK-NEXT: st1d { z3.d }, p0, [x8, #3, mul vl]
 ; CHECK-NEXT: st1d { z2.d }, p0, [x8, #2, mul vl]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl]
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: csel x9, x9, x10, lo
 ; CHECK-NEXT: ldr x0, [x8, x9, lsl #3]
 ; CHECK-NEXT: addvl sp, sp, #4
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -145,16 +145,16 @@
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: rdvl x10, #1
-; CHECK-NEXT: sub x10, x10, #1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov w10, #128
+; CHECK-NEXT: cmp x8, #128
 ; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: mov w9, #128
-; CHECK-NEXT: cmp x10, #128
-; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT: csel x8, x8, x10, lo
+; CHECK-NEXT: st1h { z1.h }, p0, [x9, #1, mul vl]
 ; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: ldrh w0, [x8, x9, lsl #1]
+; CHECK-NEXT: ldrh w0, [x9, x8, lsl #1]
 ; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -169,18 +169,18 @@ ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: mov w9, #34464 -; CHECK-NEXT: rdvl x10, #1 -; CHECK-NEXT: movk w9, #1, lsl #16 -; CHECK-NEXT: sub x10, x10, #1 -; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: rdvl x9, #1 +; CHECK-NEXT: mov w10, #34464 +; CHECK-NEXT: movk w10, #1, lsl #16 +; CHECK-NEXT: sub x9, x9, #1 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: cmp x10, x9 +; CHECK-NEXT: cmp x9, x10 +; CHECK-NEXT: csel x9, x9, x10, lo +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z3.s }, p0, [x8, #3, mul vl] ; CHECK-NEXT: st1w { z2.s }, p0, [x8, #2, mul vl] ; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] -; CHECK-NEXT: csel x9, x10, x9, lo ; CHECK-NEXT: ldr w0, [x8, x9, lsl #2] ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -196,16 +196,16 @@ ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: cntw x10 -; CHECK-NEXT: sub x10, x10, #1 +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: sub x8, x8, #1 +; CHECK-NEXT: mov w10, #10 +; CHECK-NEXT: cmp x8, #10 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: mov w9, #10 -; CHECK-NEXT: cmp x10, #10 -; CHECK-NEXT: st1d { z1.d }, p0, [x8, #1, mul vl] +; CHECK-NEXT: csel x8, x8, x10, lo +; CHECK-NEXT: st1d { z1.d }, p0, [x9, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: csel x9, x10, x9, lo -; CHECK-NEXT: ldr x0, [x8, x9, lsl #3] +; CHECK-NEXT: ldr x0, [x9, x8, lsl #3] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll --- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll @@ -6,8 +6,8 @@ define @fcvts_nxv8f16( %a) { ; CHECK-LABEL: fcvts_nxv8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: uunpkhi z2.s, z0.h ; CHECK-NEXT: fcvt z0.s, p0/m, z1.h ; CHECK-NEXT: fcvt z1.s, p0/m, z2.h @@ -19,8 +19,8 @@ define @fcvtd_nxv4f16( %a) { ; CHECK-LABEL: fcvtd_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s ; CHECK-NEXT: fcvt z0.d, p0/m, z1.h ; CHECK-NEXT: fcvt z1.d, p0/m, z2.h @@ -51,8 +51,8 @@ define @fcvtd_nxv4f32( %a) { ; CHECK-LABEL: fcvtd_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s ; CHECK-NEXT: fcvt z0.d, p0/m, z1.s ; CHECK-NEXT: fcvt z1.d, p0/m, z2.s @@ -64,8 +64,8 @@ define @fcvtd_nxv8f32( %a) { ; CHECK-LABEL: fcvtd_nxv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z3.d, z0.s ; CHECK-NEXT: uunpklo z4.d, z1.s ; CHECK-NEXT: uunpkhi z5.d, z1.s @@ -136,9 +136,9 @@ ; CHECK-LABEL: fcvts_nxv8f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z3.s, p0/m, z3.d ; CHECK-NEXT: fcvt z1.s, p0/m, z1.d ; CHECK-NEXT: fcvt z0.s, p0/m, z0.d -; 
CHECK-NEXT: fcvt z3.s, p0/m, z3.d ; CHECK-NEXT: fcvt z2.s, p0/m, z2.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: uzp1 z1.s, z2.s, z3.s @@ -182,8 +182,8 @@ define @fcvtzs_d_nxv4f32( %a) { ; CHECK-LABEL: fcvtzs_d_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.s ; CHECK-NEXT: fcvtzs z1.d, p0/m, z2.s @@ -195,8 +195,8 @@ define @fcvtzs_s_nxv16f16( %a) { ; CHECK-LABEL: fcvtzs_s_nxv16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: uunpkhi z5.s, z1.h @@ -228,8 +228,8 @@ define @fcvtzu_d_nxv4f32( %a) { ; CHECK-LABEL: fcvtzu_d_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.s ; CHECK-NEXT: fcvtzu z1.d, p0/m, z2.s @@ -292,8 +292,8 @@ define @scvtf_d_nxv4i32( %a) { ; CHECK-LABEL: scvtf_d_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sunpklo z1.d, z0.s ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sunpklo z1.d, z0.s ; CHECK-NEXT: sunpkhi z2.d, z0.s ; CHECK-NEXT: scvtf z0.d, p0/m, z1.d ; CHECK-NEXT: scvtf z1.d, p0/m, z2.d @@ -305,13 +305,13 @@ define @scvtf_d_nxv4i1( %a) { ; CHECK-LABEL: scvtf_d_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b +; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: ptrue p2.d -; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: mov z0.d, p2/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.d, p2/m, z0.d -; CHECK-NEXT: scvtf z1.d, p2/m, z1.d +; CHECK-NEXT: scvtf z0.d, p1/m, z0.d +; CHECK-NEXT: scvtf z1.d, p1/m, z1.d ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -352,8 +352,8 @@ define @ucvtf_d_nxv4i32( %a) { ; CHECK-LABEL: ucvtf_d_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z1.d ; CHECK-NEXT: ucvtf z1.d, p0/m, z2.d @@ -365,13 +365,13 @@ define @ucvtf_d_nxv4i1( %a) { ; CHECK-LABEL: ucvtf_d_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b +; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: ptrue p2.d -; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1 +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: mov z0.d, p2/z, #1 // =0x1 ; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.d, p2/m, z0.d -; CHECK-NEXT: ucvtf z1.d, p2/m, z1.d +; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d +; CHECK-NEXT: ucvtf z1.d, p1/m, z1.d ; CHECK-NEXT: ret %res = uitofp %a to ret %res diff --git a/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll --- a/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll @@ -23,8 +23,8 @@ define float @faddv_nxv8f32(float %init, %a) { ; CHECK-LABEL: faddv_nxv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fadd z1.s, z1.s, z2.s ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fadd z1.s, z1.s, z2.s ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll --- a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll +++ 
@@ -6,10 +6,10 @@
define @promote_insert_8i8( %a, i8 %elt, i64 %idx) {
; CHECK-LABEL: promote_insert_8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.h, w1
-; CHECK-NEXT: index z2.h, #0, #1
+; CHECK-NEXT: index z1.h, #0, #1
+; CHECK-NEXT: mov z2.h, w1
; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z1.h
+; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h
; CHECK-NEXT: mov z0.h, p0/m, w0
; CHECK-NEXT: ret
%ins = insertelement %a, i8 %elt, i64 %idx
@@ -24,11 +24,11 @@
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: rdvl x8, #2
+; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: sub x8, x8, #1
-; CHECK-NEXT: cmp x1, x8
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: cmp x1, x8
; CHECK-NEXT: csel x8, x1, x8, lo
-; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: st1b { z1.b }, p0, [x9, #1, mul vl]
; CHECK-NEXT: st1b { z0.b }, p0, [sp]
; CHECK-NEXT: strb w0, [x9, x8]
@@ -49,11 +49,11 @@
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: cnth x8
+; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: sub x8, x8, #1
-; CHECK-NEXT: cmp x0, x8
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cmp x0, x8
; CHECK-NEXT: csel x8, x0, x8, lo
-; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: st1w { z1.s }, p0, [x9, #1, mul vl]
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
; CHECK-NEXT: str s2, [x9, x8, lsl #2]
@@ -74,11 +74,11 @@
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: cnth x8
+; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: sub x8, x8, #1
-; CHECK-NEXT: cmp x1, x8
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cmp x1, x8
; CHECK-NEXT: csel x8, x1, x8, lo
-; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: st1d { z3.d }, p0, [x9, #3, mul vl]
; CHECK-NEXT: st1d { z2.d }, p0, [x9, #2, mul vl]
; CHECK-NEXT: st1d { z1.d }, p0, [x9, #1, mul vl]
@@ -101,10 +101,10 @@
; CHECK-LABEL: promote_insert_4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #5
-; CHECK-NEXT: index z1.s, #0, #1
-; CHECK-NEXT: mov z2.s, w8
+; CHECK-NEXT: index z2.s, #0, #1
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s
+; CHECK-NEXT: mov z1.s, w8
+; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z1.s
; CHECK-NEXT: mov z0.s, p0/m, w0
; CHECK-NEXT: ret
%ins = insertelement %a, i16 %elt, i64 5
@@ -118,10 +118,10 @@
; CHECK-LABEL: split_insert_32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #3
-; CHECK-NEXT: index z2.b, #0, #1
-; CHECK-NEXT: mov z3.b, w8
+; CHECK-NEXT: index z3.b, #0, #1
; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b
+; CHECK-NEXT: mov z2.b, w8
+; CHECK-NEXT: cmpeq p0.b, p0/z, z3.b, z2.b
; CHECK-NEXT: mov z0.b, p0/m, w0
; CHECK-NEXT: ret
%ins = insertelement %a, i8 %elt, i64 3
@@ -135,21 +135,21 @@
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: rdvl x10, #2
-; CHECK-NEXT: sub x10, x10, #1
-; CHECK-NEXT: mov w9, #128
-; CHECK-NEXT: cmp x10, #128
+; CHECK-NEXT: rdvl x8, #2
+; CHECK-NEXT: mov w10, #128
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: cmp x8, #128
; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: st1h { z3.h }, p0, [x8, #3, mul vl]
-; CHECK-NEXT: st1h { z2.h }, p0, [x8, #2, mul vl]
-; CHECK-NEXT: st1h { z1.h }, p0, [x8, #1, mul vl]
+; CHECK-NEXT: csel x8, x8, x10, lo
+; CHECK-NEXT: st1h { z3.h }, p0, [x9, #3, mul vl]
+; CHECK-NEXT: st1h { z2.h }, p0, [x9, #2, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x9, #1, mul vl]
; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: strh w0, [x8, x9, lsl #1]
-; CHECK-NEXT: ld1h { z1.h }, p0/z, [x8, #1, mul vl]
-; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8, #2, mul vl]
-; CHECK-NEXT: ld1h { z3.h }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT: strh w0, [x9, x8, lsl #1]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x9, #1, mul vl]
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x9, #2, mul vl]
+; CHECK-NEXT: ld1h { z3.h }, p0/z, [x9, #3, mul vl]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -165,18 +165,18 @@
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cnth x8
; CHECK-NEXT: mov w9, #16960
-; CHECK-NEXT: cnth x10
; CHECK-NEXT: movk w9, #15, lsl #16
-; CHECK-NEXT: sub x10, x10, #1
-; CHECK-NEXT: cmp x10, x9
+; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: csel x9, x10, x9, lo
-; CHECK-NEXT: st1w { z1.s }, p0, [x8, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x10, #1, mul vl]
; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: str w0, [x8, x9, lsl #2]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: str w0, [x10, x8, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x10, #1, mul vl]
; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
--- a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
@@ -17,8 +17,8 @@
define i32 @andv_nxv8i32( %a) {
; CHECK-LABEL: andv_nxv8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, z1.d
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: and z0.d, z0.d, z1.d
; CHECK-NEXT: andv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -45,8 +45,8 @@
; CHECK: // %bb.0:
; CHECK-NEXT: orr z1.d, z1.d, z3.d
; CHECK-NEXT: orr z0.d, z0.d, z2.d
-; CHECK-NEXT: orr z0.d, z0.d, z1.d
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: orr z0.d, z0.d, z1.d
; CHECK-NEXT: orv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
@@ -71,8 +71,8 @@
define i32 @xorv_nxv8i32( %a) {
; CHECK-LABEL: xorv_nxv8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: eor z0.d, z0.d, z1.d
; CHECK-NEXT: eorv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -97,8 +97,8 @@
define i16 @uaddv_nxv16i16( %a) {
; CHECK-LABEL: uaddv_nxv16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
@@ -112,8 +112,8 @@
; CHECK: // %bb.0:
; CHECK-NEXT: add z1.s, z1.s, z3.s
; CHECK-NEXT: add z0.s, z0.s, z2.s
-; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
diff --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll
--- a/llvm/test/CodeGen/AArch64/sve-split-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll
@@ -123,10 +123,10 @@
; CHECK-NEXT: punpklo p2.h, p1.b
; CHECK-NEXT: punpkhi p1.h, p1.b
; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p1/z, [x0, #1, mul vl]
-; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpklo p2.h, p0.b
; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: ld1d { z2.d }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #2, mul vl]
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT: ret
%load = call @llvm.masked.load.nxv8i64( *%a, i32 1, %pg, undef)
diff --git a/llvm/test/CodeGen/AArch64/sve-split-store.ll b/llvm/test/CodeGen/AArch64/sve-split-store.ll
--- a/llvm/test/CodeGen/AArch64/sve-split-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-store.ll
@@ -80,11 +80,11 @@
; CHECK: // %bb.0:
; CHECK-NEXT: punpkhi p2.h, p1.b
; CHECK-NEXT: punpklo p1.h, p1.b
-; CHECK-NEXT: punpkhi p3.h, p0.b
-; CHECK-NEXT: punpklo p0.h, p0.b
; CHECK-NEXT: st1h { z3.h }, p2, [x0, #3, mul vl]
; CHECK-NEXT: st1h { z2.h }, p1, [x0, #2, mul vl]
-; CHECK-NEXT: st1h { z1.h }, p3, [x0, #1, mul vl]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1h { z1.h }, p1, [x0, #1, mul vl]
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
call void @llvm.masked.store.nxv32i16( %data, *%a, i32 1, %pg)
diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
--- a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll
@@ -135,10 +135,10 @@
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z0.s, #1.00000000
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: st1w { z0.d }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: st1w { z1.d }, p1, [x0, #2, mul vl]
; CHECK-NEXT: ret
%ins = insertelement undef, float 1.0, i32 0
%splat = shufflevector %ins, undef, zeroinitializer
@@ -151,10 +151,10 @@
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z0.h, #1.00000000
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: uunpklo z1.s, z0.h
+; CHECK-NEXT: ptrue p1.s
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
-; CHECK-NEXT: uunpklo z0.s, z0.h
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: st1h { z0.s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: st1h { z1.s }, p1, [x0, #2, mul vl]
; CHECK-NEXT: ret
%ins = insertelement undef, half 1.0, i32 0
%splat = shufflevector %ins, undef, zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/sve-stepvector.ll b/llvm/test/CodeGen/AArch64/sve-stepvector.ll
--- a/llvm/test/CodeGen/AArch64/sve-stepvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stepvector.ll
@@ -49,8 +49,8 @@
; CHECK-LABEL: stepvector_nxv4i64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cntd x8
-; CHECK-NEXT: mov z1.d, x8
; CHECK-NEXT: index z0.d, #0, #1
+; CHECK-NEXT: mov z1.d, x8
; CHECK-NEXT: add z1.d, z0.d, z1.d
; CHECK-NEXT: ret
entry:
@@ -186,9 +186,9 @@
define @multiple_use_stepvector_nxv4i32_1(i32 %data) {
; CHECK-LABEL: multiple_use_stepvector_nxv4i32_1:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z0.s, w0
; CHECK-NEXT: index z1.s, w0, #1
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: sub z0.s, z0.s, z1.s
; CHECK-NEXT: ret
@@ -218,9 +218,9 @@
define @multiple_use_stepvector_nxv2i64_1(i64 %data) {
; CHECK-LABEL: multiple_use_stepvector_nxv2i64_1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, x0
-; CHECK-NEXT: index z1.d, #0, #1
-; CHECK-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEXT: index z0.d, #0, #1
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: add z1.d, z1.d, z0.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: ret
@@ -343,9 +343,9 @@
; CHECK-LABEL: sub_multiple_use_stepvector_nxv8i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: index z0.h, #0, #1
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: subr z1.h, z1.h, #2 // =0x2
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
entry:
@@ -386,13 +386,13 @@
define @split_sub_stepvector_nxv16i32() {
; CHECK-LABEL: split_sub_stepvector_nxv16i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cntw x9
-; CHECK-NEXT: cnth x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: cnth x9
+; CHECK-NEXT: neg x8, x8
; CHECK-NEXT: neg x9, x9
; CHECK-NEXT: index z0.s, #0, #-1
-; CHECK-NEXT: neg x8, x8
-; CHECK-NEXT: mov z1.s, w9
-; CHECK-NEXT: mov z3.s, w8
+; CHECK-NEXT: mov z1.s, w8
+; CHECK-NEXT: mov z3.s, w9
; CHECK-NEXT: add z1.s, z0.s, z1.s
; CHECK-NEXT: add z2.s, z0.s, z3.s
; CHECK-NEXT: add z3.s, z1.s, z3.s
diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll
--- a/llvm/test/CodeGen/AArch64/sve-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll
@@ -95,10 +95,10 @@
; CHECK-NEXT: and z0.d, z0.d, #0x1
; CHECK-NEXT: cmpne p1.d, p0/z, z3.d, #0
; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0
-; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s
-; CHECK-NEXT: cmpne p2.d, p0/z, z1.d, #0
+; CHECK-NEXT: cmpne p3.d, p0/z, z1.d, #0
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT: uzp1 p0.s, p0.s, p2.s
+; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s
+; CHECK-NEXT: uzp1 p0.s, p0.s, p3.s
; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h
; CHECK-NEXT: ret
entry:
@@ -111,33 +111,35 @@
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: and z7.d, z7.d, #0x1
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: and z6.d, z6.d, #0x1
; CHECK-NEXT: and z5.d, z5.d, #0x1
; CHECK-NEXT: and z4.d, z4.d, #0x1
-; CHECK-NEXT: and z3.d, z3.d, #0x1
-; CHECK-NEXT: and z2.d, z2.d, #0x1
; CHECK-NEXT: cmpne p1.d, p0/z, z7.d, #0
; CHECK-NEXT: cmpne p2.d, p0/z, z6.d, #0
; CHECK-NEXT: cmpne p3.d, p0/z, z5.d, #0
; CHECK-NEXT: cmpne p4.d, p0/z, z4.d, #0
+; CHECK-NEXT: and z3.d, z3.d, #0x1
+; CHECK-NEXT: and z2.d, z2.d, #0x1
; CHECK-NEXT: and z1.d, z1.d, #0x1
; CHECK-NEXT: and z0.d, z0.d, #0x1
; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s
-; CHECK-NEXT: cmpne p2.d, p0/z, z3.d, #0
-; CHECK-NEXT: uzp1 p3.s, p4.s, p3.s
+; CHECK-NEXT: uzp1 p2.s, p4.s, p3.s
+; CHECK-NEXT: cmpne p3.d, p0/z, z3.d, #0
; CHECK-NEXT: cmpne p4.d, p0/z, z2.d, #0
-; CHECK-NEXT: uzp1 p2.s, p4.s, p2.s
-; CHECK-NEXT: cmpne p4.d, p0/z, z1.d, #0
+; CHECK-NEXT: cmpne p5.d, p0/z, z1.d, #0
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT: uzp1 p0.s, p0.s, p4.s
+; CHECK-NEXT: uzp1 p3.s, p4.s, p3.s
+; CHECK-NEXT: uzp1 p0.s, p0.s, p5.s
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p1.h, p3.h, p1.h
-; CHECK-NEXT: uzp1 p0.h, p0.h, p2.h
+; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h
+; CHECK-NEXT: uzp1 p0.h, p0.h, p3.h
; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -187,8 +189,8 @@
define void @trunc_promoteIntRes( %0, i16* %ptr) {
; CHECK-LABEL: trunc_promoteIntRes:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
--- a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
@@ -7,16 +7,16 @@
define void @func_vscale_none(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-NOARG-LABEL: func_vscale_none:
; CHECK-NOARG: // %bb.0:
-; CHECK-NOARG-NEXT: ldp q0, q1, [x0]
-; CHECK-NOARG-NEXT: ldp q2, q3, [x1]
-; CHECK-NOARG-NEXT: ldp q4, q5, [x0, #32]
-; CHECK-NOARG-NEXT: ldp q7, q6, [x1, #32]
-; CHECK-NOARG-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NOARG-NEXT: add v0.4s, v0.4s, v2.4s
-; CHECK-NOARG-NEXT: add v2.4s, v5.4s, v6.4s
-; CHECK-NOARG-NEXT: add v3.4s, v4.4s, v7.4s
-; CHECK-NOARG-NEXT: stp q3, q2, [x0, #32]
-; CHECK-NOARG-NEXT: stp q0, q1, [x0]
+; CHECK-NOARG-NEXT: ldp q0, q1, [x0, #32]
+; CHECK-NOARG-NEXT: ldp q4, q5, [x1, #32]
+; CHECK-NOARG-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-NOARG-NEXT: ldp q2, q3, [x0]
+; CHECK-NOARG-NEXT: add v1.4s, v1.4s, v5.4s
+; CHECK-NOARG-NEXT: ldp q6, q4, [x1]
+; CHECK-NOARG-NEXT: stp q0, q1, [x0, #32]
+; CHECK-NOARG-NEXT: add v2.4s, v2.4s, v6.4s
+; CHECK-NOARG-NEXT: add v3.4s, v3.4s, v4.4s
+; CHECK-NOARG-NEXT: stp q2, q3, [x0]
; CHECK-NOARG-NEXT: ret
;
; CHECK-ARG-LABEL: func_vscale_none:
@@ -39,16 +39,16 @@
define void @func_vscale1_1(<16 x i32>* %a, <16 x i32>* %b) #1 {
; CHECK-LABEL: func_vscale1_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: ldp q2, q3, [x1]
-; CHECK-NEXT: ldp q4, q5, [x0, #32]
-; CHECK-NEXT: ldp q7, q6, [x1, #32]
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: add v2.4s, v5.4s, v6.4s
-; CHECK-NEXT: add v3.4s, v4.4s, v7.4s
-; CHECK-NEXT: stp q3, q2, [x0, #32]
-; CHECK-NEXT: stp q0, q1, [x0]
+; CHECK-NEXT: ldp q0, q1, [x0, #32]
+; CHECK-NEXT: ldp q4, q5, [x1, #32]
+; CHECK-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: ldp q2, q3, [x0]
+; CHECK-NEXT: add v1.4s, v1.4s, v5.4s
+; CHECK-NEXT: ldp q6, q4, [x1]
+; CHECK-NEXT: stp q0, q1, [x0, #32]
+; CHECK-NEXT: add v2.4s, v2.4s, v6.4s
+; CHECK-NEXT: add v3.4s, v3.4s, v4.4s
+; CHECK-NEXT: stp q2, q3, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b
diff --git a/llvm/test/CodeGen/AArch64/sve-vscale.ll b/llvm/test/CodeGen/AArch64/sve-vscale.ll
--- a/llvm/test/CodeGen/AArch64/sve-vscale.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vscale.ll
@@ -70,8 +70,8 @@
; CHECK-LABEL: rdvl_3:
; CHECK: rdvl [[VL_B:x[0-9]+]], #1
-; CHECK-NEXT: lsr [[VL_Q:x[0-9]+]], [[VL_B]], #4
; CHECK-NEXT: mov w[[MUL:[0-9]+]], #3
+; CHECK-NEXT: lsr [[VL_Q:x[0-9]+]], [[VL_B]], #4
; CHECK-NEXT: mul x0, [[VL_Q]], x[[MUL]]
; CHECK-NEXT: ret
define i32 @rdvl_3() nounwind {
diff --git a/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll b/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
--- a/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
@@ -119,8 +119,8 @@
; CHECK-LABEL: sel_16_illegal_wrong_extension:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #128
-; CHECK-NEXT: mov z0.h, w8
; CHECK-NEXT: mov z1.h, #0 // =0x0
+; CHECK-NEXT: mov z0.h, w8
; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
%vec = shufflevector insertelement ( undef, i16 128, i32 0), zeroinitializer, zeroinitializer
@@ -132,8 +132,8 @@
; CHECK-LABEL: sel_32_illegal_wrong_extension:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #128
-; CHECK-NEXT: mov z0.s, w8
; CHECK-NEXT: mov z1.s, #0 // =0x0
+; CHECK-NEXT: mov z0.s, w8
; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
%vec = shufflevector insertelement ( undef, i32 128, i32 0), zeroinitializer, zeroinitializer
@@ -145,8 +145,8 @@
; CHECK-LABEL: sel_64_illegal_wrong_extension:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #128
-; CHECK-NEXT: mov z0.d, x8
; CHECK-NEXT: mov z1.d, #0 // =0x0
+; CHECK-NEXT: mov z0.d, x8
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
%vec = shufflevector insertelement ( undef, i64 128, i32 0), zeroinitializer, zeroinitializer
@@ -158,8 +158,8 @@
; CHECK-LABEL: sel_16_illegal_shifted:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #513
-; CHECK-NEXT: mov z0.h, w8
; CHECK-NEXT: mov z1.h, #0 // =0x0
+; CHECK-NEXT: mov z0.h, w8
; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
%vec = shufflevector insertelement ( undef, i16 513, i32 0), zeroinitializer, zeroinitializer
@@ -171,8 +171,8 @@
; CHECK-LABEL: sel_32_illegal_shifted:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #513
-; CHECK-NEXT: mov z0.s, w8
; CHECK-NEXT: mov z1.s, #0 // =0x0
+; CHECK-NEXT: mov z0.s, w8
; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
%vec = shufflevector insertelement ( undef, i32 513, i32 0), zeroinitializer, zeroinitializer
@@ -184,8 +184,8 @@
; CHECK-LABEL: sel_64_illegal_shifted:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #513
-; CHECK-NEXT: mov z0.d, x8
; CHECK-NEXT: mov z1.d, #0 // =0x0
+; CHECK-NEXT: mov z0.d, x8
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
%vec = shufflevector insertelement ( undef, i64 513, i32 0), zeroinitializer, zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/swift-async.ll b/llvm/test/CodeGen/AArch64/swift-async.ll
--- a/llvm/test/CodeGen/AArch64/swift-async.ll
+++ b/llvm/test/CodeGen/AArch64/swift-async.ll
@@ -11,14 +11,14 @@
; CHECK: sub sp, sp, #32
; CHECK: stp x29, x30, [sp, #16]
-; CHECK-NOAUTH: str x22, [sp, #8]
+; CHECK-NOAUTH-DAG: str x22, [sp, #8]
; CHECK-AUTH: add x16, sp, #8
; CHECK-AUTH: movk x16, #49946, lsl #48
; CHECK-AUTH: mov x17, x22
; CHECK-AUTH: pacdb x17, x16
; CHECK-AUTH: str x17, [sp, #8]
-; CHECK: add x29, sp, #16
+; CHECK-DAG: add x29, sp, #16
; CHECK: .cfi_def_cfa w29, 16
; CHECK: .cfi_offset w30, -8
; CHECK: .cfi_offset w29, -16
@@ -38,14 +38,14 @@
; CHECK: str x23, [sp, #-32]!
; CHECK: stp x29, x30, [sp, #16]
-; CHECK-NOAUTH: str x22, [sp, #8]
+; CHECK-NOAUTH-DAG: str x22, [sp, #8]
; CHECK-AUTH: add x16, sp, #8
; CHECK-AUTH: movk x16, #49946, lsl #48
; CHECK-AUTH: mov x17, x22
; CHECK-AUTH: pacdb x17, x16
; CHECK-AUTH: str x17, [sp, #8]
-; CHECK: add x29, sp, #16
+; CHECK-DAG: add x29, sp, #16
; CHECK: .cfi_def_cfa w29, 16
; CHECK: .cfi_offset w30, -8
; CHECK: .cfi_offset w29, -16
@@ -66,14 +66,14 @@
; CHECK: sub sp, sp, #64
; CHECK: stp x29, x30, [sp, #48]
-; CHECK-NOAUTH: str x22, [sp, #40]
+; CHECK-NOAUTH-DAG: str x22, [sp, #40]
; CHECK-AUTH: add x16, sp, #40
; CHECK-AUTH: movk x16, #49946, lsl #48
; CHECK-AUTH: mov x17, x22
; CHECK-AUTH: pacdb x17, x16
; CHECK-AUTH: str x17, [sp, #40]
-; CHECK: add x29, sp, #48
+; CHECK-DAG: add x29, sp, #48
; CHECK: .cfi_def_cfa w29, 16
; CHECK: .cfi_offset w30, -8
; CHECK: .cfi_offset w29, -16
@@ -138,8 +138,8 @@
; CHECK-LABEL: large_frame:
; CHECK: str x28, [sp, #-32]!
; CHECK: stp x29, x30, [sp, #16]
-; CHECK-NOAUTH: str x22, [sp, #8]
-; CHECK: add x29, sp, #16
+; CHECK-NOAUTH-DAG: str x22, [sp, #8]
+; CHECK-DAG: add x29, sp, #16
; CHECK: sub sp, sp, #1024
; [...]
; CHECK: add sp, sp, #1024
@@ -157,8 +157,8 @@
; CHECK: str d8, [sp, #-48]!
; CHECK: str x19, [sp, #16]
; CHECK: stp x29, x30, [sp, #32]
-; CHECK-NOAUTH: str x22, [sp, #24]
-; CHECK: add x29, sp, #32
+; CHECK-NOAUTH-DAG: str x22, [sp, #24]
+; CHECK-DAG: add x29, sp, #32
; CHECK: .cfi_def_cfa w29, 16
; CHECK: .cfi_offset w30, -8
diff --git a/llvm/test/CodeGen/AArch64/swift-return.ll b/llvm/test/CodeGen/AArch64/swift-return.ll
--- a/llvm/test/CodeGen/AArch64/swift-return.ll
+++ b/llvm/test/CodeGen/AArch64/swift-return.ll
@@ -277,8 +277,8 @@
; CHECK-LABEL: _test12
; CHECK: fadd.4s v0, v0, v1
-; CHECK: fadd.4s v0, v0, v2
; CHECK: fmov s1, s3
+; CHECK: fadd.4s v0, v0, v2
define swiftcc { <4 x float>, float } @test12() #0 {
entry:
%call = call swiftcc { <4 x float>, <4 x float>, <4 x float>, float } @gen12()
diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll
--- a/llvm/test/CodeGen/AArch64/swifterror.ll
+++ b/llvm/test/CodeGen/AArch64/swifterror.ll
@@ -14,8 +14,8 @@
; CHECK-APPLE: mov w0, #16
; CHECK-APPLE: malloc
; CHECK-APPLE: mov [[ID:w[0-9]+]], #1
-; CHECK-APPLE: strb [[ID]], [x0, #8]
; CHECK-APPLE: mov x21, x0
+; CHECK-APPLE: strb [[ID]], [x0, #8]
; CHECK-APPLE-NOT: x21
; CHECK-O0-LABEL: foo:
@@ -249,9 +249,9 @@
; CHECK-APPLE: mov w0, #16
; CHECK-APPLE: malloc
; CHECK-APPLE: mov [[ID:w[0-9]+]], #1
+; CHECK-APPLE: mov x21, x0
; CHECK-APPLE: strb [[ID]], [x0, #8]
; CHECK-APPLE: str w{{.*}}, [{{.*}}[[SRET]], #4]
-; CHECK-APPLE: mov x21, x0
; CHECK-APPLE-NOT: x21
; CHECK-O0-LABEL: foo_sret:
@@ -375,10 +375,10 @@
; CHECK-APPLE-LABEL: caller4:
; CHECK-APPLE-AARCH64: mov [[ID:x[0-9]+]], x0
+; CHECK-APPLE-AARCH64: mov x21, xzr
; CHECK-APPLE-AARCH64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
; CHECK-APPLE-AARCH64: str {{x[0-9]+}}, [sp]
-; CHECK-APPLE-AARCH64: mov x21, xzr
; CHECK-APPLE-AARCH64: bl {{.*}}foo_vararg
; CHECK-APPLE-AARCH64: mov x0, x21
; CHECK-APPLE-AARCH64: cbnz x21
diff --git a/llvm/test/CodeGen/AArch64/tiny-model-pic.ll b/llvm/test/CodeGen/AArch64/tiny-model-pic.ll
--- a/llvm/test/CodeGen/AArch64/tiny-model-pic.ll
+++ b/llvm/test/CodeGen/AArch64/tiny-model-pic.ll
@@ -149,16 +149,16 @@
; CHECK-PIC-LABEL: bar1:
; CHECK-PIC: // %bb.0: // %entry
; CHECK-PIC-NEXT: adr x8, lsrc
-; CHECK-PIC-NEXT: ldrb w8, [x8]
; CHECK-PIC-NEXT: adr x9, ldst
+; CHECK-PIC-NEXT: ldrb w8, [x8]
; CHECK-PIC-NEXT: strb w8, [x9]
; CHECK-PIC-NEXT: ret
;
; CHECK-PIC-GLOBISEL-LABEL: bar1:
; CHECK-PIC-GLOBISEL: // %bb.0: // %entry
; CHECK-PIC-GLOBISEL-NEXT: adr x8, lsrc
-; CHECK-PIC-GLOBISEL-NEXT: ldrb w8, [x8]
; CHECK-PIC-GLOBISEL-NEXT: adr x9, ldst
+; CHECK-PIC-GLOBISEL-NEXT: ldrb w8, [x8]
; CHECK-PIC-GLOBISEL-NEXT: strb w8, [x9]
; CHECK-PIC-GLOBISEL-NEXT: ret
entry:
@@ -223,8 +223,8 @@
; CHECK-PIC-LABEL: bar3:
; CHECK-PIC: // %bb.0: // %entry
; CHECK-PIC-NEXT: adr x8, lsrc
-; CHECK-PIC-NEXT: ldrb w8, [x8]
; CHECK-PIC-NEXT: ldr x9, lptr
+; CHECK-PIC-NEXT: ldrb w8, [x8]
; CHECK-PIC-NEXT: strb w8, [x9]
; CHECK-PIC-NEXT: ret
;
@@ -267,16 +267,16 @@
; CHECK-PIC-LABEL: baz1:
; CHECK-PIC: // %bb.0: // %entry
; CHECK-PIC-NEXT: adr x8, lbsrc
-; CHECK-PIC-NEXT: ldrb w8, [x8]
; CHECK-PIC-NEXT: adr x9, lbdst
+; CHECK-PIC-NEXT: ldrb w8, [x8]
; CHECK-PIC-NEXT: strb w8, [x9]
; CHECK-PIC-NEXT: ret
;
; CHECK-PIC-GLOBISEL-LABEL: baz1:
; CHECK-PIC-GLOBISEL: // %bb.0: // %entry
; CHECK-PIC-GLOBISEL-NEXT: adr x8, lbsrc
-; CHECK-PIC-GLOBISEL-NEXT: ldrb w8, [x8]
; CHECK-PIC-GLOBISEL-NEXT: adr x9, lbdst
+; CHECK-PIC-GLOBISEL-NEXT: ldrb w8, [x8]
; CHECK-PIC-GLOBISEL-NEXT: strb w8, [x9]
; CHECK-PIC-GLOBISEL-NEXT: ret
entry:
@@ -341,8 +341,8 @@
; CHECK-PIC-LABEL: baz3:
; CHECK-PIC: // %bb.0: // %entry
; CHECK-PIC-NEXT: adr x8, lbsrc
-; CHECK-PIC-NEXT: ldrb w8, [x8]
; CHECK-PIC-NEXT: ldr x9, lptr
+; CHECK-PIC-NEXT: ldrb w8, [x8]
; CHECK-PIC-NEXT: strb w8, [x9]
; CHECK-PIC-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/tiny-model-static.ll b/llvm/test/CodeGen/AArch64/tiny-model-static.ll
--- a/llvm/test/CodeGen/AArch64/tiny-model-static.ll
+++ b/llvm/test/CodeGen/AArch64/tiny-model-static.ll
@@ -85,16 +85,16 @@
; CHECK-LABEL: bar1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adr x8, lsrc
-; CHECK-NEXT: ldrb w8, [x8]
; CHECK-NEXT: adr x9, ldst
+; CHECK-NEXT: ldrb w8, [x8]
; CHECK-NEXT: strb w8, [x9]
; CHECK-NEXT: ret
;
; CHECK-GLOBISEL-LABEL: bar1:
; CHECK-GLOBISEL: // %bb.0: // %entry
; CHECK-GLOBISEL-NEXT: adr x8, lsrc
-; CHECK-GLOBISEL-NEXT: ldrb w8, [x8]
; CHECK-GLOBISEL-NEXT: adr x9, ldst
+; CHECK-GLOBISEL-NEXT: ldrb w8, [x8]
; CHECK-GLOBISEL-NEXT: strb w8, [x9]
; CHECK-GLOBISEL-NEXT: ret
entry:
@@ -128,8 +128,8 @@
; CHECK-LABEL: bar3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adr x8, lsrc
-; CHECK-NEXT: ldrb w8, [x8]
; CHECK-NEXT: ldr x9, lptr
+; CHECK-NEXT: ldrb w8, [x8]
; CHECK-NEXT: strb w8, [x9]
; CHECK-NEXT: ret
;
@@ -156,16 +156,16 @@
; CHECK-LABEL: baz1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adr x8, lbsrc
-; CHECK-NEXT: ldrb w8, [x8]
; CHECK-NEXT: adr x9, lbdst
+; CHECK-NEXT: ldrb w8, [x8]
; CHECK-NEXT: strb w8, [x9]
; CHECK-NEXT: ret
;
; CHECK-GLOBISEL-LABEL: baz1:
; CHECK-GLOBISEL: // %bb.0: // %entry
; CHECK-GLOBISEL-NEXT: adr x8, lbsrc
-; CHECK-GLOBISEL-NEXT: ldrb w8, [x8]
; CHECK-GLOBISEL-NEXT: adr x9, lbdst
+; CHECK-GLOBISEL-NEXT: ldrb w8, [x8]
; CHECK-GLOBISEL-NEXT: strb w8, [x9]
; CHECK-GLOBISEL-NEXT: ret
entry:
@@ -199,8 +199,8 @@
; CHECK-LABEL: baz3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adr x8, lbsrc
-; CHECK-NEXT: ldrb w8, [x8]
; CHECK-NEXT: ldr x9, lptr
+; CHECK-NEXT: ldrb w8, [x8]
; CHECK-NEXT: strb w8, [x9]
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
--- a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
+++ b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
@@ -5,12 +5,12 @@
; CHECK-LABEL: overflow_add:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w1, w0
+; CHECK-NEXT: mov w9, #2
; CHECK-NEXT: orr w8, w8, #0x1
; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: mov w9, #5
; CHECK-NEXT: cmp w8, #1024
-; CHECK-NEXT: mov w8, #2
-; CHECK-NEXT: csel w0, w8, w9, hi
+; CHECK-NEXT: mov w8, #5
+; CHECK-NEXT: csel w0, w9, w8, hi
; CHECK-NEXT: ret
%add = add i16 %b, %a
%or = or i16 %add, 1
@@ -23,12 +23,12 @@
; CHECK-LABEL: overflow_sub:
; CHECK: // %bb.0:
; CHECK-NEXT: sub w8, w0, w1
+; CHECK-NEXT: mov w9, #2
; CHECK-NEXT: orr w8, w8, #0x1
; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: mov w9, #5
; CHECK-NEXT: cmp w8, #1024
-; CHECK-NEXT: mov w8, #2
-; CHECK-NEXT: csel w0, w8, w9, hi
+; CHECK-NEXT: mov w8, #5
+; CHECK-NEXT: csel w0, w9, w8, hi
; CHECK-NEXT: ret
%add = sub i16 %a, %b
%or = or i16 %add, 1
@@ -40,13 +40,13 @@
define zeroext i16 @overflow_mul(i16 zeroext %a, i16 zeroext %b) {
; CHECK-LABEL: overflow_mul:
; CHECK: // %bb.0:
-; CHECK-NEXT: mul w8, w1, w0
-; CHECK-NEXT: orr w8, w8, #0x1
-; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: mov w9, #5
-; CHECK-NEXT: cmp w8, #1024
-; CHECK-NEXT: mov w8, #2
-; CHECK-NEXT: csel w0, w8, w9, hi
+; CHECK-NEXT: mul w9, w1, w0
+; CHECK-NEXT: mov w8, #5
+; CHECK-NEXT: orr w9, w9, #0x1
+; CHECK-NEXT: and w9, w9, #0xffff
+; CHECK-NEXT: cmp w9, #1024
+; CHECK-NEXT: mov w9, #2
+; CHECK-NEXT: csel w0, w9, w8, hi
; CHECK-NEXT: ret
%add = mul i16 %b, %a
%or = or i16 %add, 1
@@ -58,13 +58,13 @@
define zeroext i16 @overflow_shl(i16 zeroext %a, i16 zeroext %b) {
; CHECK-LABEL: overflow_shl:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsl w8, w0, w1
-; CHECK-NEXT: orr w8, w8, #0x1
-; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: mov w9, #5
-; CHECK-NEXT: cmp w8, #1024
-; CHECK-NEXT: mov w8, #2
-; CHECK-NEXT: csel w0, w8, w9, hi
+; CHECK-NEXT: lsl w9, w0, w1
+; CHECK-NEXT: mov w8, #5
+; CHECK-NEXT: orr w9, w9, #0x1
+; CHECK-NEXT: and w9, w9, #0xffff
+; CHECK-NEXT: cmp w9, #1024
+; CHECK-NEXT: mov w9, #2
+; CHECK-NEXT: csel w0, w9, w8, hi
; CHECK-NEXT: ret
%add = shl i16 %a, %b
%or = or i16 %add, 1
@@ -76,11 +76,11 @@
define i32 @overflow_add_no_consts(i8 zeroext %a, i8 zeroext %b, i8 zeroext %limit) {
; CHECK-LABEL: overflow_add_no_consts:
; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w1, w0
-; CHECK-NEXT: mov w9, #16
-; CHECK-NEXT: cmp w2, w8, uxtb
-; CHECK-NEXT: mov w8, #8
-; CHECK-NEXT: csel w0, w8, w9, lo
+; CHECK-NEXT: add w9, w1, w0
+; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: cmp w2, w9, uxtb
+; CHECK-NEXT: mov w9, #8
+; CHECK-NEXT: csel w0, w9, w8, lo
; CHECK-NEXT: ret
%add = add i8 %b, %a
%cmp = icmp ugt i8 %add, %limit
@@ -92,11 +92,11 @@
; CHECK-LABEL: overflow_add_const_limit:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w1, w0
+; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: and w8, w8, #0xff
-; CHECK-NEXT: mov w9, #16
; CHECK-NEXT: cmp w8, #128
-; CHECK-NEXT: mov w8, #8
-; CHECK-NEXT: csel w0, w8, w9, hi
+; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: csel w0, w9, w8, hi
; CHECK-NEXT: ret
%add = add i8 %b, %a
%cmp = icmp ugt i8 %add, -128
@@ -108,10 +108,10 @@
; CHECK-LABEL: overflow_add_positive_const_limit:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-1
-; CHECK-NEXT: mov w9, #16
+; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: cmp w8, w0, sxtb
-; CHECK-NEXT: mov w8, #8
-; CHECK-NEXT: csel w0, w8, w9, gt
+; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: csel w0, w9, w8, gt
; CHECK-NEXT: ret
%cmp = icmp slt i8 %a, -1
%res = select i1 %cmp, i32 8, i32 16
@@ -121,8 +121,8 @@
define i32 @unsafe_add_underflow(i8 zeroext %a) {
; CHECK-LABEL: unsafe_add_underflow:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp w0, #1
; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: cmp w0, #1
; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: csel w0, w9, w8, eq
; CHECK-NEXT: ret
@@ -134,8 +134,8 @@
define i32 @safe_add_underflow(i8 zeroext %a) {
; CHECK-LABEL: safe_add_underflow:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: csel w0, w9, w8, eq
; CHECK-NEXT: ret
@@ -147,11 +147,11 @@
define i32 @safe_add_underflow_neg(i8 zeroext %a) {
; CHECK-LABEL: safe_add_underflow_neg:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub w8, w0, #2
-; CHECK-NEXT: mov w9, #16
-; CHECK-NEXT: cmp w8, #251
-; CHECK-NEXT: mov w8, #8
-; CHECK-NEXT: csel w0, w8, w9, lo
+; CHECK-NEXT: sub w9, w0, #2
+; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: cmp w9, #251
+; CHECK-NEXT: mov w9, #8
+; CHECK-NEXT: csel w0, w9, w8, lo
; CHECK-NEXT: ret
%add = add i8 %a, -2
%cmp = icmp ult i8 %add, -5
@@ -163,10 +163,10 @@
; CHECK-LABEL: overflow_sub_negative_const_limit:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-1
-; CHECK-NEXT: mov w9, #16
+; CHECK-NEXT: mov w9, #8
; CHECK-NEXT: cmp w8, w0, sxtb
-; CHECK-NEXT: mov w8, #8
-; CHECK-NEXT: csel w0, w8, w9, gt
+; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: csel w0, w9, w8, gt
; CHECK-NEXT: ret
%cmp = icmp slt i8 %a, -1
%res = select i1 %cmp, i32 8, i32 16
@@ -176,12 +176,12 @@
define i32 @unsafe_sub_underflow(i8 zeroext %a) {
; CHECK-LABEL: unsafe_sub_underflow:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub w8, w0, #6
-; CHECK-NEXT: and w8, w8, #0xff
-; CHECK-NEXT: mov w9, #16
-; CHECK-NEXT: cmp w8, #250
-; CHECK-NEXT: mov w8, #8
-; CHECK-NEXT: csel w0, w8, w9, hi
+; CHECK-NEXT: sub w9, w0, #6
+; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: and w9, w9, #0xff
+; CHECK-NEXT: cmp w9, #250
+; CHECK-NEXT: mov w9, #8
+; CHECK-NEXT: csel w0, w9, w8, hi
; CHECK-NEXT: ret
%sub = add i8 %a, -6
%cmp = icmp ugt i8 %sub, -6
@@ -192,8 +192,8 @@
define i32 @safe_sub_underflow(i8 zeroext %a) {
; CHECK-LABEL: safe_sub_underflow:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: mov w8, #8
+; CHECK-NEXT: cmp w0, #0
; CHECK-NEXT: mov w9, #16
; CHECK-NEXT: csel w0, w9, w8, eq
; CHECK-NEXT: ret
@@ -205,11 +205,11 @@
define i32 @safe_sub_underflow_neg(i8 zeroext %a) {
; CHECK-LABEL: safe_sub_underflow_neg:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub w8, w0, #4
-; CHECK-NEXT: mov w9, #16
-; CHECK-NEXT: cmp w8, #250
-; CHECK-NEXT: mov w8, #8
-; CHECK-NEXT: csel w0, w8, w9, hi
+; CHECK-NEXT: sub w9, w0, #4
+; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: cmp w9, #250
+; CHECK-NEXT: mov w9, #8
+; CHECK-NEXT: csel w0, w9, w8, hi
; CHECK-NEXT: ret
%sub = add i8 %a, -4
%cmp = icmp ugt i8 %sub, -6
@@ -220,12 +220,12 @@
define i32 @unsafe_sub_underflow_neg(i8 zeroext %a) {
; CHECK-LABEL: unsafe_sub_underflow_neg:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub w8, w0, #4
-; CHECK-NEXT: and w8, w8, #0xff
-; CHECK-NEXT: mov w9, #16
-; CHECK-NEXT: cmp w8, #253
-; CHECK-NEXT: mov w8, #8
-; CHECK-NEXT: csel w0, w8, w9, lo
+; CHECK-NEXT: sub w9, w0, #4
+; CHECK-NEXT: mov w8, #16
+; CHECK-NEXT: and w9, w9, #0xff
+; CHECK-NEXT: cmp w9, #253
+; CHECK-NEXT: mov w9, #8
+; CHECK-NEXT: csel w0, w9, w8, lo
; CHECK-NEXT: ret
%sub = add i8 %a, -4
%cmp = icmp ult i8 %sub, -3
@@ -280,14 +280,14 @@
define i8 @convert_add_order(i8 zeroext %arg) {
; CHECK-LABEL: convert_add_order:
; CHECK: // %bb.0:
-; CHECK-NEXT: orr w8, w0, #0x1
-; CHECK-NEXT: sub w10, w8, #40
-; CHECK-NEXT: mov w9, #1
+; CHECK-NEXT: orr w9, w0, #0x1
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: sub w10, w9, #40
; CHECK-NEXT: cmp w10, #20
-; CHECK-NEXT: cinc w9, w9, hs
-; CHECK-NEXT: cmp w8, #50
-; CHECK-NEXT: mov w8, #255
-; CHECK-NEXT: csel w8, w9, w8, lo
+; CHECK-NEXT: cinc w8, w8, hs
+; CHECK-NEXT: cmp w9, #50
+; CHECK-NEXT: mov w9, #255
+; CHECK-NEXT: csel w8, w8, w9, lo
; CHECK-NEXT: and w0, w8, w0
; CHECK-NEXT: ret
%shl = or i8 %arg, 1
@@ -304,11 +304,11 @@
; CHECK-LABEL: underflow_if_sub:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: mov w9, #100
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: and w8, w8, w0
; CHECK-NEXT: add w8, w8, #245
; CHECK-NEXT: cmp w8, w1
-; CHECK-NEXT: mov w9, #100
; CHECK-NEXT: csel w0, w8, w9, lo
; CHECK-NEXT: ret
%cmp = icmp sgt i32 %arg, 0
@@ -325,11 +325,11 @@
; CHECK-LABEL: underflow_if_sub_signext:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: mov w9, #100
; CHECK-NEXT: cset w8, gt
; CHECK-NEXT: and w8, w8, w0
; CHECK-NEXT: add w8, w8, #245
; CHECK-NEXT: cmp w8, w1, uxtb
-; CHECK-NEXT: mov w9, #100
; CHECK-NEXT: csel w0, w8, w9, lo
; CHECK-NEXT: ret
%cmp = icmp sgt i32 %arg, 0
diff --git a/llvm/test/CodeGen/AArch64/typepromotion-signed.ll b/llvm/test/CodeGen/AArch64/typepromotion-signed.ll
--- a/llvm/test/CodeGen/AArch64/typepromotion-signed.ll
+++ b/llvm/test/CodeGen/AArch64/typepromotion-signed.ll
@@ -56,13 +56,13 @@
define i32 @test_signext_b(i8* nocapture readonly %ptr, i8 signext %arg) {
; CHECK-LABEL: test_signext_b:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: mov w9, #20894
-; CHECK-NEXT: add w8, w8, w1
-; CHECK-NEXT: sxtb w8, w8
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: mov w8, #42
-; CHECK-NEXT: csel w0, w8, w9, ge
+; CHECK-NEXT: ldrb w9, [x0]
+; CHECK-NEXT: mov w8, #20894
+; CHECK-NEXT: add w9, w9, w1
+; CHECK-NEXT: sxtb w9, w9
+; CHECK-NEXT: cmp w9, #0
+; CHECK-NEXT: mov w9, #42
+; CHECK-NEXT: csel w0, w9, w8, ge
; CHECK-NEXT: ret
entry:
%0 = load i8, i8* %ptr, align 1
@@ -76,12 +76,12 @@
; CHECK-LABEL: test_signext_b_ult_slt:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: mov w9, #57
-; CHECK-NEXT: add w10, w8, w1, uxtb
-; CHECK-NEXT: cmp w10, #127
+; CHECK-NEXT: add w9, w8, w1, uxtb
+; CHECK-NEXT: cmp w9, #127
+; CHECK-NEXT: mov w9, #42
; CHECK-NEXT: ccmp w8, #0, #0, ne
-; CHECK-NEXT: mov w8, #42
-; CHECK-NEXT: csel w0, w8, w9, eq
+; CHECK-NEXT: mov w8, #57
+; CHECK-NEXT: csel w0, w9, w8, eq
; CHECK-NEXT: ret
entry:
%0 = load i8, i8* %ptr, align 1
@@ -96,13 +96,13 @@
define i32 @test_signext_h(i16* nocapture readonly %ptr, i16 signext %arg) {
; CHECK-LABEL: test_signext_h:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: mov w9, #20894
-; CHECK-NEXT: add w8, w8, w1
-; CHECK-NEXT: sxth w8, w8
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: mov w8, #42
-; CHECK-NEXT: csel w0, w8, w9, ge
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: mov w8, #20894
+; CHECK-NEXT: add w9, w9, w1
+; CHECK-NEXT: sxth w9, w9
+; CHECK-NEXT: cmp w9, #0
+; CHECK-NEXT: mov w9, #42
+; CHECK-NEXT: csel w0, w9, w8, ge
; CHECK-NEXT: ret
entry:
%0 = load i16, i16* %ptr, align 1
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat.ll b/llvm/test/CodeGen/AArch64/uadd_sat.ll
--- a/llvm/test/CodeGen/AArch64/uadd_sat.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat.ll
@@ -31,8 +31,8 @@
; CHECK-LABEL: func16:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: add w8, w8, w1, uxth
; CHECK-NEXT: mov w9, #65535
+; CHECK-NEXT: add w8, w8, w1, uxth
; CHECK-NEXT: cmp w8, w9
; CHECK-NEXT: csel w0, w8, w9, lo
; CHECK-NEXT: ret
@@ -44,9 +44,9 @@
; CHECK-LABEL: func8:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0xff
+; CHECK-NEXT: mov w9, #255
; CHECK-NEXT: add w8, w8, w1, uxtb
; CHECK-NEXT: cmp w8, #255
-; CHECK-NEXT: mov w9, #255
; CHECK-NEXT: csel w0, w8, w9, lo
; CHECK-NEXT: ret
%tmp = call i8 @llvm.uadd.sat.i8(i8 %x, i8 %y);
@@ -59,8 +59,8 @@
; CHECK-NEXT: and w8, w1, #0xf
; CHECK-NEXT: and w9, w0, #0xf
; CHECK-NEXT: add w8, w9, w8
-; CHECK-NEXT: cmp w8, #15
; CHECK-NEXT: mov w9, #15
+; CHECK-NEXT: cmp w8, #15
; CHECK-NEXT: csel w0, w8, w9, lo
; CHECK-NEXT: ret
%tmp = call i4 @llvm.uadd.sat.i4(i4 %x, i4 %y);
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_plus.ll b/llvm/test/CodeGen/AArch64/uadd_sat_plus.ll
--- a/llvm/test/CodeGen/AArch64/uadd_sat_plus.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_plus.ll
@@ -33,12 +33,12 @@
define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
; CHECK-LABEL: func16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
; CHECK-NEXT: mul w9, w1, w2
-; CHECK-NEXT: add w8, w8, w9, uxth
-; CHECK-NEXT: mov w9, #65535
-; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: csel w0, w8, w9, lo
+; CHECK-NEXT: and w10, w0, #0xffff
+; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: add w9, w10, w9, uxth
+; CHECK-NEXT: cmp w9, w8
+; CHECK-NEXT: csel w0, w9, w8, lo
; CHECK-NEXT: ret
%a = mul i16 %y, %z
%tmp = call i16 @llvm.uadd.sat.i16(i16 %x, i16 %a)
@@ -48,12 +48,12 @@
define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
; CHECK-LABEL: func8:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
; CHECK-NEXT: mul w9, w1, w2
-; CHECK-NEXT: add w8, w8, w9, uxtb
-; CHECK-NEXT: cmp w8, #255
-; CHECK-NEXT: mov w9, #255
-; CHECK-NEXT: csel w0, w8, w9, lo
+; CHECK-NEXT: and w10, w0, #0xff
+; CHECK-NEXT: mov w8, #255
+; CHECK-NEXT: add w9, w10, w9, uxtb
+; CHECK-NEXT: cmp w9, #255
+; CHECK-NEXT: csel w0, w9, w8, lo
; CHECK-NEXT: ret
%a = mul i8 %y, %z
%tmp = call i8 @llvm.uadd.sat.i8(i8 %x, i8 %a)
@@ -64,12 +64,12 @@
; CHECK-LABEL: func4:
; CHECK: // %bb.0:
; CHECK-NEXT: mul w9, w1, w2
-; CHECK-NEXT: and w8, w0, #0xf
+; CHECK-NEXT: and w10, w0, #0xf
+; CHECK-NEXT: mov w8, #15
; CHECK-NEXT: and w9, w9, #0xf
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: cmp w8, #15
-; CHECK-NEXT: mov w9, #15
-; CHECK-NEXT: csel w0, w8, w9, lo
+; CHECK-NEXT: add w9, w10, w9
+; CHECK-NEXT: cmp w9, #15
+; CHECK-NEXT: csel w0, w9, w8, lo
; CHECK-NEXT: ret
%a = mul i4 %y, %z
%tmp = call i4 @llvm.uadd.sat.i4(i4 %x, i4 %a)
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -54,9 +54,9 @@
define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; CHECK-LABEL: v64i8:
; CHECK: // %bb.0:
+; CHECK-NEXT: uqadd v2.16b, v2.16b, v6.16b
; CHECK-NEXT: uqadd v0.16b, v0.16b, v4.16b
; CHECK-NEXT: uqadd v1.16b, v1.16b, v5.16b
-; CHECK-NEXT: uqadd v2.16b, v2.16b, v6.16b
; CHECK-NEXT: uqadd v3.16b, v3.16b, v7.16b
; CHECK-NEXT: ret
%z = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %x, <64 x i8> %y)
@@ -85,9 +85,9 @@
define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; CHECK-LABEL: v32i16:
; CHECK: // %bb.0:
+; CHECK-NEXT: uqadd v2.8h, v2.8h, v6.8h
; CHECK-NEXT: uqadd v0.8h, v0.8h, v4.8h
; CHECK-NEXT: uqadd v1.8h, v1.8h, v5.8h
-; CHECK-NEXT: uqadd v2.8h, v2.8h, v6.8h
; CHECK-NEXT: uqadd v3.8h, v3.8h, v7.8h
; CHECK-NEXT: ret
%z = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %x, <32 x i16> %y)
@@ -112,13 +112,13 @@
define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
; CHECK-LABEL: v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr s0, [x0]
-; CHECK-NEXT: ldr s1, [x1]
-; CHECK-NEXT: movi d2, #0xff00ff00ff00ff
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ldr s1, [x0]
+; CHECK-NEXT: movi d0, #0xff00ff00ff00ff
+; CHECK-NEXT: ldr s2, [x1]
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: umin v0.4h, v0.4h, v2.4h
+; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: add v1.4h, v1.4h, v2.4h
+; CHECK-NEXT: umin v0.4h, v1.4h, v0.4h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: str s0, [x2]
; CHECK-NEXT: ret
@@ -132,17 +132,17 @@
define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
; CHECK-LABEL: v2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: ldrb w9, [x1]
-; CHECK-NEXT: ldrb w10, [x0, #1]
-; CHECK-NEXT: ldrb w11, [x1, #1]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: mov v2.s[1], w11
-; CHECK-NEXT: movi d1, #0x0000ff000000ff
-; CHECK-NEXT: add v0.2s, v0.2s, v2.2s
-; CHECK-NEXT: umin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ldrb w8, [x1]
+; CHECK-NEXT: movi d0, #0x0000ff000000ff
+; CHECK-NEXT: ldrb w9, [x0]
+; CHECK-NEXT: ldrb w10, [x1, #1]
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: ldrb w9, [x0, #1]
+; CHECK-NEXT: mov v2.s[1], w10
+; CHECK-NEXT: mov v1.s[1], w9
+; CHECK-NEXT: add v1.2s, v1.2s, v2.2s
+; CHECK-NEXT: umin v0.2s, v1.2s, v0.2s
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: strb w9, [x2]
@@ -173,17 +173,17 @@
define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
; CHECK-LABEL: v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x1]
-; CHECK-NEXT: ldrh w10, [x0, #2]
-; CHECK-NEXT: ldrh w11, [x1, #2]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: fmov s2, w9
-; CHECK-NEXT: mov v0.s[1], w10
-; CHECK-NEXT: mov v2.s[1], w11
-; CHECK-NEXT: movi d1, #0x00ffff0000ffff
-; CHECK-NEXT: add v0.2s, v0.2s, v2.2s
-; CHECK-NEXT: umin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ldrh w8, [x1]
+; CHECK-NEXT: movi d0, #0x00ffff0000ffff
+; CHECK-NEXT: ldrh w9, [x0]
+; CHECK-NEXT: ldrh w10, [x1, #2]
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEXT: mov v2.s[1], w10
+; CHECK-NEXT: mov v1.s[1], w9
+; CHECK-NEXT: add v1.2s, v1.2s, v2.2s
+; CHECK-NEXT: umin v0.2s, v1.2s, v0.2s
; CHECK-NEXT: mov w8, v0.s[1]
; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: strh w9, [x2]
@@ -208,10 +208,10 @@
define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind {
; CHECK-LABEL: v12i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: ldp q3, q2, [x1]
-; CHECK-NEXT: uqadd v1.8h, v1.8h, v2.8h
-; CHECK-NEXT: uqadd v0.8h, v0.8h, v3.8h
+; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: uqadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: uqadd v1.8h, v2.8h, v3.8h
; CHECK-NEXT: str q0, [x2]
; CHECK-NEXT: str d1, [x2, #16]
; CHECK-NEXT: ret
@@ -305,9 +305,9 @@
define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; CHECK-LABEL: v16i32:
; CHECK: // %bb.0:
+; CHECK-NEXT: uqadd v2.4s, v2.4s, v6.4s
; CHECK-NEXT: uqadd v0.4s, v0.4s, v4.4s
; CHECK-NEXT: uqadd v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: uqadd v2.4s, v2.4s, v6.4s
; CHECK-NEXT: uqadd v3.4s, v3.4s, v7.4s
; CHECK-NEXT: ret
%z = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %x, <16 x i32> %y)
@@ -336,9 +336,9 @@
define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK-LABEL: v8i64:
; CHECK: // %bb.0:
+; CHECK-NEXT: uqadd v2.2d, v2.2d, v6.2d
; CHECK-NEXT: uqadd v0.2d, v0.2d, v4.2d
; CHECK-NEXT: uqadd v1.2d, v1.2d, v5.2d
-; CHECK-NEXT: uqadd v2.2d, v2.2d, v6.2d
; CHECK-NEXT: uqadd v3.2d, v3.2d, v7.2d
; CHECK-NEXT: ret
%z = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> %x, <8 x i64> %y)
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -5,26 +5,25 @@
; AARCH-LABEL: muloti_test:
; AARCH: // %bb.0: // %start
; AARCH-NEXT: cmp x3, #0
-; AARCH-NEXT: umulh x8, x1, x2
-; AARCH-NEXT: cset w10, ne
+; AARCH-NEXT: umulh x10, x1, x2
+; AARCH-NEXT: cset w8, ne
; AARCH-NEXT: cmp x1, #0
+; AARCH-NEXT: cset w9, ne
+; AARCH-NEXT: cmp xzr, x10
+; AARCH-NEXT: and w8, w9, w8
; AARCH-NEXT: mul x9, x3, x0
+; AARCH-NEXT: umulh x10, x3, x0
; AARCH-NEXT: cset w11, ne
-; AARCH-NEXT: cmp xzr, x8
-; AARCH-NEXT: umulh x8, x3, x0
; AARCH-NEXT: madd x9, x1, x2, x9
-; AARCH-NEXT: and w10, w11, w10
+; AARCH-NEXT: orr w8, w8, w11
+; AARCH-NEXT: cmp xzr, x10
+; AARCH-NEXT: umulh x10, x0, x2
; AARCH-NEXT: cset w11, ne
-; AARCH-NEXT: cmp xzr, x8
-; AARCH-NEXT: umulh x8, x0, x2
-; AARCH-NEXT: orr w10, w10, w11
-; AARCH-NEXT: cset w11, ne
-; AARCH-NEXT: adds x1, x8, x9
-; AARCH-NEXT: orr w8, w10, w11
-; AARCH-NEXT: cset w9, hs
-; AARCH-NEXT: orr w8, w8, w9
; AARCH-NEXT: mul x0, x0, x2
-; AARCH-NEXT: mov w2, w8
+; AARCH-NEXT: adds x1, x10, x9
+; AARCH-NEXT: orr w8, w8, w11
+; AARCH-NEXT: cset w9, hs
+; AARCH-NEXT: orr w2, w8, w9
; AARCH-NEXT: ret
start:
%0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-innerouter.ll
@@ -10,8 +10,8 @@
; CHECK-LABEL: out8_constmask:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr w8, w0, #2
-; CHECK-NEXT: bfi w1, w8, #2, #4
; CHECK-NEXT: mov w0, w1
+; CHECK-NEXT: bfi w0, w8, #2, #4
; CHECK-NEXT: ret
%mx = and i8 %x, 60
%my = and i8 %y, -61
@@ -23,8 +23,8 @@
; CHECK-LABEL: out16_constmask:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr w8, w0, #4
-; CHECK-NEXT: bfi w1, w8, #4, #8
; CHECK-NEXT: mov w0, w1
+; CHECK-NEXT: bfi w0, w8, #4, #8
; CHECK-NEXT: ret
%mx = and i16 %x, 4080
%my = and i16 %y, -4081
@@ -36,8 +36,8 @@
; CHECK-LABEL: out32_constmask:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr w8, w0, #8
-; CHECK-NEXT: bfi w1, w8, #8, #16
; CHECK-NEXT: mov w0, w1
+; CHECK-NEXT: bfi w0, w8, #8, #16
; CHECK-NEXT: ret
%mx = and i32 %x, 16776960
%my = and i32 %y, -16776961
@@ -49,8 +49,8 @@
; CHECK-LABEL: out64_constmask:
; CHECK: // %bb.0:
; CHECK-NEXT: lsr x8, x0, #16
-; CHECK-NEXT: bfi x1, x8, #16, #32
; CHECK-NEXT: mov x0, x1
+; CHECK-NEXT: bfi x0, x8, #16, #32
; CHECK-NEXT: ret
%mx = and i64 %x, 281474976645120
%my = and i64 %y, -281474976645121
@@ -206,8 +206,8 @@
; CHECK-NEXT: eor w8, w0, w1
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: and w20, w8, #0xffff00
-; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: mov w19, w1
+; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: bl use32
; CHECK-NEXT: eor w0, w20, w19
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
@@ -245,10 +245,10 @@
define i32 @n0_badconstmask(i32 %x, i32 %y) {
; CHECK-LABEL: n0_badconstmask:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w1, #0xffffff00
-; CHECK-NEXT: and w8, w0, #0xffff00
-; CHECK-NEXT: and w9, w9, #0xff0001ff
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: and w8, w1, #0xffffff00
+; CHECK-NEXT: and w9, w0, #0xffff00
+; CHECK-NEXT: and w8, w8, #0xff0001ff
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%mx = and i32 %x, 16776960
%my = and i32 %y, -16776960 ; instead of -16776961
diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbits.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbits.ll
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbits.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbits.ll
@@ -212,8 +212,8 @@
; CHECK-NEXT: eor w8, w0, w1
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: and w20, w8, #0x55555555
-; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: mov w19, w1
+; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: bl use32
; CHECK-NEXT: eor w0, w20, w19
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
@@ -251,11 +251,11 @@
define i32 @n0_badconstmask(i32 %x, i32 %y) {
; CHECK-LABEL: n0_badconstmask:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #43691
-; CHECK-NEXT: movk w9, #43690, lsl #16
-; CHECK-NEXT: and w8, w0, #0x55555555
-; CHECK-NEXT: and w9, w1, w9
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: mov w8, #43691
+; CHECK-NEXT: and w9, w0, #0x55555555
+; CHECK-NEXT: movk w8, #43690, lsl #16
+; CHECK-NEXT: and w8, w1, w8
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%mx = and i32 %x, 1431655765
%my = and i32 %y, -1431655765 ; instead of -1431655766
diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
@@ -208,8 +208,8 @@
; CHECK-NEXT: eor w8, w0, w1
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: and w20, w8, #0xf0f0f0f
-; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: mov w19, w1
+; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: bl use32
; CHECK-NEXT: eor w0, w20, w19
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
@@ -247,11 +247,11 @@
define i32 @n0_badconstmask(i32 %x, i32 %y) {
; CHECK-LABEL: n0_badconstmask:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #61681
-; CHECK-NEXT: movk w9, #61680, lsl #16
-; CHECK-NEXT: and w8, w0, #0xf0f0f0f
-; CHECK-NEXT: and w9, w1, w9
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: mov w8, #61681
+; CHECK-NEXT: and w9, w0, #0xf0f0f0f
+; CHECK-NEXT: movk w8, #61680, lsl #16
+; CHECK-NEXT: and w8, w1, w8
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%mx = and i32 %x, 252645135
%my = and i32 %y, -252645135 ; instead of -252645136
diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-lowhigh.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-lowhigh.ll
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-lowhigh.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-constmask-lowhigh.ll
@@ -201,8 +201,8 @@
; CHECK-NEXT: eor w8, w0, w1
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: and w20, w8, #0xffff
-; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: mov w19, w1
+; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: bl use32
; CHECK-NEXT: eor w0, w20, w19
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-scalar-variablemask.ll
@@ -6,9 +6,9 @@
define i8 @out8(i8 %x, i8 %y, i8 %mask) {
; CHECK-LABEL: out8:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, w2
-; CHECK-NEXT: bic w9, w1, w2
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: bic w8, w1, w2
+; CHECK-NEXT: and w9, w0, w2
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%mx = and i8 %x, %mask
%notmask = xor i8 %mask, -1
@@ -20,9 +20,9 @@
define i16 @out16(i16 %x, i16 %y, i16 %mask) {
; CHECK-LABEL: out16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, w2
-; CHECK-NEXT: bic w9, w1, w2
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: bic w8, w1, w2
+; CHECK-NEXT: and w9, w0, w2
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%mx = and i16 %x, %mask
%notmask = xor i16 %mask, -1
@@ -34,9 +34,9 @@
define i32 @out32(i32 %x, i32 %y, i32 %mask) {
; CHECK-LABEL: out32:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, w2
-; CHECK-NEXT: bic w9, w1, w2
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: bic w8, w1, w2
+; CHECK-NEXT: and w9, w0, w2
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%mx = and i32 %x, %mask
%notmask = xor i32 %mask, -1
@@ -48,9 +48,9 @@
define i64 @out64(i64 %x, i64 %y, i64 %mask) {
; CHECK-LABEL: out64:
; CHECK: // %bb.0:
-; CHECK-NEXT: and x8, x0, x2
-; CHECK-NEXT: bic x9, x1, x2
-; CHECK-NEXT: orr x0, x8, x9
+; CHECK-NEXT: bic x8, x1, x2
+; CHECK-NEXT: and x9, x0, x2
+; CHECK-NEXT: orr x0, x9, x8
; CHECK-NEXT: ret
%mx = and i64 %x, %mask
%notmask = xor i64 %mask, -1
@@ -155,9 +155,9 @@
define i32 @in_commutativity_1_0_0(i32 %x, i32 %y, i32 %mask) {
; CHECK-LABEL: in_commutativity_1_0_0:
; CHECK: // %bb.0:
-; CHECK-NEXT: bic w8, w0, w2
-; CHECK-NEXT: and w9, w1, w2
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: and w8, w1, w2
+; CHECK-NEXT: bic w9, w0, w2
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%n0 = xor i32 %x, %y
%n1 = and i32 %n0, %mask
@@ -167,9 +167,9 @@
define i32 @in_commutativity_1_0_1(i32 %x, i32 %y, i32 %mask) {
; CHECK-LABEL: in_commutativity_1_0_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: bic w8, w0, w2
-; CHECK-NEXT: and w9, w1, w2
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: and w8, w1, w2
+; CHECK-NEXT: bic w9, w0, w2
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%n0 = xor i32 %x, %y
%n1 = and i32 %mask, %n0 ; swapped
@@ -179,9 +179,9 @@
define i32 @in_commutativity_1_1_0(i32 %x, i32 %y, i32 %mask) {
; CHECK-LABEL: in_commutativity_1_1_0:
; CHECK: // %bb.0:
-; CHECK-NEXT: bic w8, w0, w2
-; CHECK-NEXT: and w9, w1, w2
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: and w8, w1, w2
+; CHECK-NEXT: bic w9, w0, w2
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%n0 = xor i32 %x, %y
%n1 = and i32 %n0, %mask
@@ -191,9 +191,9 @@
define i32 @in_commutativity_1_1_1(i32 %x, i32 %y, i32 %mask) {
; CHECK-LABEL: in_commutativity_1_1_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: bic w8, w0, w2
-; CHECK-NEXT: and w9, w1, w2
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: and w8, w1, w2
+; CHECK-NEXT: bic w9, w0, w2
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%n0 = xor i32 %x, %y
%n1 = and i32 %mask, %n0 ; swapped
@@ -384,10 +384,10 @@
define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
; CHECK-LABEL: out_constant_varx_42:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #42
-; CHECK-NEXT: and w8, w2, w0
-; CHECK-NEXT: bic w9, w9, w2
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: mov w8, #42
+; CHECK-NEXT: and w9, w2, w0
+; CHECK-NEXT: bic w8, w8, w2
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%notmask = xor i32 %mask, -1
%mx = and i32 %mask, %x
@@ -399,8 +399,8 @@
; CHECK-LABEL: in_constant_varx_42:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #42
-; CHECK-NEXT: bic w8, w8, w2
; CHECK-NEXT: and w9, w0, w2
+; CHECK-NEXT: bic w8, w8, w2
; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%n0 = xor i32 %x, 42 ; %x
@@ -412,10 +412,10 @@
define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
; CHECK-LABEL: out_constant_varx_42_invmask:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #42
-; CHECK-NEXT: bic w8, w0, w2
-; CHECK-NEXT: and w9, w2, w9
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: mov w8, #42
+; CHECK-NEXT: bic w9, w0, w2
+; CHECK-NEXT: and w8, w2, w8
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%notmask = xor i32 %mask, -1
%mx = and i32 %notmask, %x
@@ -428,8 +428,8 @@
; CHECK-LABEL: in_constant_varx_42_invmask:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #42
-; CHECK-NEXT: and w8, w2, w8
; CHECK-NEXT: bic w9, w0, w2
+; CHECK-NEXT: and w8, w2, w8
; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%notmask = xor i32 %mask, -1
@@ -491,8 +491,8 @@
; CHECK-LABEL: out_constant_42_vary:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #42
-; CHECK-NEXT: and w8, w2, w8
; CHECK-NEXT: bic w9, w1, w2
+; CHECK-NEXT: and w8, w2, w8
; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%notmask = xor i32 %mask, -1
@@ -504,10 +504,10 @@
define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
; CHECK-LABEL: in_constant_42_vary:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #42
-; CHECK-NEXT: bic w8, w1, w2
-; CHECK-NEXT: and w9, w2, w9
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: mov w8, #42
+; CHECK-NEXT: bic w9, w1, w2
+; CHECK-NEXT: and w8, w2, w8
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%n0 = xor i32 42, %y ; %x
%n1 = and i32 %n0, %mask
@@ -519,8 +519,8 @@
; CHECK-LABEL: out_constant_42_vary_invmask:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #42
-; CHECK-NEXT: bic w8, w8, w2
; CHECK-NEXT: and w9, w2, w1
+; CHECK-NEXT: bic w8, w8, w2
; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%notmask = xor i32 %mask, -1
@@ -533,10 +533,10 @@
define i32 @in_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
; CHECK-LABEL: in_constant_42_vary_invmask:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #42
-; CHECK-NEXT: and w8, w1, w2
-; CHECK-NEXT: bic w9, w9, w2
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: mov w8, #42
+; CHECK-NEXT: and w9, w1, w2
+; CHECK-NEXT: bic w8, w8, w2
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%notmask = xor i32 %mask, -1
%n0 = xor i32 42, %y ; %x
@@ -556,8 +556,8 @@
; CHECK-NEXT: eor w8, w0, w1
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: and w20, w8, w3
-; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: mov w19, w1
+; CHECK-NEXT: mov w0, w20
; CHECK-NEXT: bl use32
; CHECK-NEXT: eor w0, w20, w19
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
@@ -605,10 +605,10 @@
define i32 @n0_badxor(i32 %x, i32 %y, i32 %mask) {
; CHECK-LABEL: n0_badxor:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor w9, w2, #0x1
-; CHECK-NEXT: and w8, w0, w2
-; CHECK-NEXT: and w9, w1, w9
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: eor w8, w2, #0x1
+; CHECK-NEXT: and w9, w0, w2
+; CHECK-NEXT: and w8, w1, w8
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%mx = and i32 %x, %mask
%notmask = xor i32 %mask, 1 ; instead of -1
diff --git a/llvm/test/CodeGen/AArch64/urem-lkk.ll b/llvm/test/CodeGen/AArch64/urem-lkk.ll
--- a/llvm/test/CodeGen/AArch64/urem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-lkk.ll
@@ -10,8 +10,8 @@
; CHECK-NEXT: lsr x8, x8, #32
; CHECK-NEXT: sub w9, w0, w8
; CHECK-NEXT: add w8, w8, w9, lsr #1
-; CHECK-NEXT: lsr w8, w8, #6
; CHECK-NEXT: mov w9, #95
+; CHECK-NEXT: lsr w8, w8, #6
; CHECK-NEXT: msub w0, w8, w9, w0
; CHECK-NEXT: ret
%1 = urem i32 %x, 95
@@ -23,10 +23,10 @@
; CHECK-LABEL: fold_urem_positive_even:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #16323
+; CHECK-NEXT: mov w9, #1060
; CHECK-NEXT: movk w8, #63310, lsl #16
; CHECK-NEXT: umull x8, w0, w8
; CHECK-NEXT: lsr x8, x8, #42
-; CHECK-NEXT: mov w9, #1060
; CHECK-NEXT: msub w0, w8, w9, w0
; CHECK-NEXT: ret
%1 = urem i32 %x, 1060
@@ -44,8 +44,8 @@
; CHECK-NEXT: lsr x8, x8, #32
; CHECK-NEXT: sub w9, w0, w8
; CHECK-NEXT: add w8, w8, w9, lsr #1
-; CHECK-NEXT: lsr w8, w8, #6
; CHECK-NEXT: mov w9, #95
+; CHECK-NEXT: lsr w8, w8, #6
; CHECK-NEXT: msub w9, w8, w9, w0
; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
@@ -88,14 +88,14 @@
define i64 @dont_fold_urem_i64(i64 %x) {
; CHECK-LABEL: dont_fold_urem_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x9, #58849
-; CHECK-NEXT: movk x9, #48148, lsl #16
-; CHECK-NEXT: movk x9, #33436, lsl #32
-; CHECK-NEXT: lsr x8, x0, #1
-; CHECK-NEXT: movk x9, #21399, lsl #48
-; CHECK-NEXT: umulh x8, x8, x9
-; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mov x8, #58849
+; CHECK-NEXT: lsr x9, x0, #1
+; CHECK-NEXT: movk x8, #48148, lsl #16
+; CHECK-NEXT:
movk x8, #33436, lsl #32 +; CHECK-NEXT: movk x8, #21399, lsl #48 +; CHECK-NEXT: umulh x8, x9, x8 ; CHECK-NEXT: mov w9, #98 +; CHECK-NEXT: lsr x8, x8, #4 ; CHECK-NEXT: msub x0, x8, x9, x0 ; CHECK-NEXT: ret %1 = urem i64 %x, 98 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll @@ -65,27 +65,27 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; CHECK-LABEL: test_urem_vec: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] ; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: adrp x9, .LCPI4_1 +; CHECK-NEXT: adrp x8, .LCPI4_0 ; CHECK-NEXT: mov v0.h[1], w1 -; CHECK-NEXT: ldr d3, [x9, :lo12:.LCPI4_1] -; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: adrp x8, .LCPI4_1 ; CHECK-NEXT: mov v0.h[2], w2 ; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_2] -; CHECK-NEXT: mul v0.4h, v0.4h, v3.4h +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h +; CHECK-NEXT: movi d1, #0x0000000000ffff +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_2] ; CHECK-NEXT: adrp x8, .LCPI4_3 -; CHECK-NEXT: shl v3.4h, v0.4h, #1 -; CHECK-NEXT: movi d2, #0x0000000000ffff -; CHECK-NEXT: ushl v1.4h, v3.4h, v1.4h -; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_3] +; CHECK-NEXT: shl v2.4h, v0.4h, #1 ; CHECK-NEXT: bic v0.4h, #248, lsl #8 -; CHECK-NEXT: ushl v0.4h, v0.4h, v2.4h -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ushl v2.4h, v2.4h, v3.4h +; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_3] +; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b ; CHECK-NEXT: bic v0.4h, #248, lsl #8 -; CHECK-NEXT: cmhi v0.4h, v0.4h, v3.4h +; CHECK-NEXT: cmhi v0.4h, v0.4h, v1.4h ; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: umov w1, v0.h[1] ; CHECK-NEXT: umov w2, v0.h[2] diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll @@ -5,8 +5,8 @@ ; CHECK-LABEL: t32_3_1: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: mov w9, #1431655765 +; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, lo @@ -20,8 +20,8 @@ ; CHECK-LABEL: t32_3_2: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: mov w9, #-1431655766 +; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 ; CHECK-NEXT: mov w9, #1431655765 ; CHECK-NEXT: cmp w8, w9 @@ -37,8 +37,8 @@ ; CHECK-LABEL: t32_5_1: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: mov w9, #858993459 +; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, lo @@ -52,8 +52,8 @@ ; CHECK-LABEL: t32_5_2: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: mov w9, #1717986918 +; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 ; CHECK-NEXT: mov w9, #858993459 ; CHECK-NEXT: cmp w8, w9 @@ -68,8 +68,8 @@ ; CHECK-LABEL: t32_5_3: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: 
mov w9, #-1717986919 +; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 ; CHECK-NEXT: mov w9, #858993459 ; CHECK-NEXT: cmp w8, w9 @@ -84,8 +84,8 @@ ; CHECK-LABEL: t32_5_4: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: mov w9, #-858993460 +; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 ; CHECK-NEXT: mov w9, #858993459 ; CHECK-NEXT: cmp w8, w9 @@ -101,12 +101,12 @@ ; CHECK-LABEL: t32_6_1: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: mov w9, #1431655765 +; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 ; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: ror w8, w8, #1 ; CHECK-NEXT: movk w9, #10922, lsl #16 +; CHECK-NEXT: ror w8, w8, #1 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -119,12 +119,12 @@ ; CHECK-LABEL: t32_6_2: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: mov w9, #-1431655766 +; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 ; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: ror w8, w8, #1 ; CHECK-NEXT: movk w9, #10922, lsl #16 +; CHECK-NEXT: ror w8, w8, #1 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -137,12 +137,12 @@ ; CHECK-LABEL: t32_6_3: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: mov w9, #43691 ; CHECK-NEXT: movk w8, #43690, lsl #16 +; CHECK-NEXT: movk w9, #10922, lsl #16 ; CHECK-NEXT: mul w8, w0, w8 ; CHECK-NEXT: sub w8, w8, #1 -; CHECK-NEXT: mov w9, #43691 ; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: movk w9, #10922, lsl #16 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -154,13 +154,13 @@ define i1 @t32_6_4(i32 %X) nounwind { ; CHECK-LABEL: t32_6_4: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: sub w8, w0, #4 -; CHECK-NEXT: movk w9, #43690, lsl #16 -; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: sub w9, w0, #4 +; CHECK-NEXT: movk w8, #43690, lsl #16 +; CHECK-NEXT: mul w8, w9, w8 ; CHECK-NEXT: mov w9, #43690 -; CHECK-NEXT: ror w8, w8, #1 ; CHECK-NEXT: movk w9, #10922, lsl #16 +; CHECK-NEXT: ror w8, w8, #1 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -172,13 +172,13 @@ define i1 @t32_6_5(i32 %X) nounwind { ; CHECK-LABEL: t32_6_5: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: sub w8, w0, #5 -; CHECK-NEXT: movk w9, #43690, lsl #16 -; CHECK-NEXT: mul w8, w8, w9 +; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: sub w9, w0, #5 +; CHECK-NEXT: movk w8, #43690, lsl #16 +; CHECK-NEXT: mul w8, w9, w8 ; CHECK-NEXT: mov w9, #43690 -; CHECK-NEXT: ror w8, w8, #1 ; CHECK-NEXT: movk w9, #10922, lsl #16 +; CHECK-NEXT: ror w8, w8, #1 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -224,8 +224,8 @@ ; CHECK-LABEL: t64_3_2: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #-6148914691236517206 -; CHECK-NEXT: movk x8, #43691 ; CHECK-NEXT: mov x9, #-6148914691236517206 +; CHECK-NEXT: movk x8, #43691 ; CHECK-NEXT: madd x8, x0, x8, x9 ; CHECK-NEXT: mov x9, #6148914691236517205 ; CHECK-NEXT: cmp x8, x9 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll b/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll @@ -5,12 +5,12 @@ ; CHECK-LABEL: test_minsize: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #5 +; CHECK-NEXT: mov w9, #42 ; CHECK-NEXT: udiv w8, 
w0, w8 ; CHECK-NEXT: add w8, w8, w8, lsl #2 -; CHECK-NEXT: mov w9, #-10 ; CHECK-NEXT: cmp w0, w8 -; CHECK-NEXT: mov w8, #42 -; CHECK-NEXT: csel w0, w8, w9, eq +; CHECK-NEXT: mov w8, #-10 +; CHECK-NEXT: csel w0, w9, w8, eq ; CHECK-NEXT: ret %rem = urem i32 %X, 5 %cmp = icmp eq i32 %rem, 0 @@ -22,14 +22,14 @@ ; CHECK-LABEL: test_optsize: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: mov w9, #13108 +; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: movk w9, #13107, lsl #16 ; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: mov w10, #-10 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: mov w8, #42 -; CHECK-NEXT: csel w0, w8, w10, lo +; CHECK-NEXT: mov w8, #-10 +; CHECK-NEXT: mov w9, #42 +; CHECK-NEXT: csel w0, w9, w8, lo ; CHECK-NEXT: ret %rem = urem i32 %X, 5 %cmp = icmp eq i32 %rem, 0 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -6,17 +6,17 @@ ; CHECK-LABEL: test_urem_odd_even: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: adrp x9, .LCPI0_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: adrp x8, .LCPI0_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1] -; CHECK-NEXT: adrp x8, .LCPI0_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2] -; CHECK-NEXT: adrp x8, .LCPI0_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: adrp x8, .LCPI0_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -34,11 +34,11 @@ ; CHECK-LABEL: test_urem_odd_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: adrp x9, .LCPI1_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI1_1] +; CHECK-NEXT: adrp x8, .LCPI1_1 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_1] +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -51,11 +51,11 @@ ; CHECK-LABEL: test_urem_odd_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: adrp x9, .LCPI2_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_1] +; CHECK-NEXT: adrp x8, .LCPI2_1 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -70,17 +70,17 @@ ; CHECK-LABEL: test_urem_even_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: adrp x9, .LCPI3_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] ; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: adrp x8, .LCPI3_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] -; CHECK-NEXT: adrp x8, .LCPI3_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x8, 
.LCPI3_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -94,17 +94,17 @@ ; CHECK-LABEL: test_urem_even_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: adrp x9, .LCPI4_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] ; CHECK-NEXT: adrp x8, .LCPI4_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] -; CHECK-NEXT: adrp x8, .LCPI4_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2] -; CHECK-NEXT: adrp x8, .LCPI4_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI4_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -120,17 +120,17 @@ ; CHECK-LABEL: test_urem_odd_even_allones_eq: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI5_0 +; CHECK-NEXT: adrp x9, .LCPI5_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] ; CHECK-NEXT: adrp x8, .LCPI5_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1] -; CHECK-NEXT: adrp x8, .LCPI5_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2] -; CHECK-NEXT: adrp x8, .LCPI5_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_1] +; CHECK-NEXT: adrp x8, .LCPI5_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -144,17 +144,17 @@ ; CHECK-LABEL: test_urem_odd_even_allones_ne: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI6_0 +; CHECK-NEXT: adrp x9, .LCPI6_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] ; CHECK-NEXT: adrp x8, .LCPI6_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1] -; CHECK-NEXT: adrp x8, .LCPI6_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2] -; CHECK-NEXT: adrp x8, .LCPI6_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI6_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_1] +; CHECK-NEXT: adrp x8, .LCPI6_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -172,17 +172,17 @@ ; CHECK-LABEL: test_urem_odd_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI7_0 +; CHECK-NEXT: adrp x9, .LCPI7_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] ; CHECK-NEXT: adrp x8, .LCPI7_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1] -; CHECK-NEXT: adrp x8, .LCPI7_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2] -; 
CHECK-NEXT: adrp x8, .LCPI7_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI7_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_1] +; CHECK-NEXT: adrp x8, .LCPI7_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -198,17 +198,17 @@ ; CHECK-LABEL: test_urem_even_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: adrp x9, .LCPI8_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] ; CHECK-NEXT: adrp x8, .LCPI8_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_1] -; CHECK-NEXT: adrp x8, .LCPI8_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI8_2] -; CHECK-NEXT: adrp x8, .LCPI8_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_1] +; CHECK-NEXT: adrp x8, .LCPI8_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -224,17 +224,17 @@ ; CHECK-LABEL: test_urem_odd_even_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: adrp x9, .LCPI9_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] ; CHECK-NEXT: adrp x8, .LCPI9_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1] -; CHECK-NEXT: adrp x8, .LCPI9_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2] -; CHECK-NEXT: adrp x8, .LCPI9_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI9_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_1] +; CHECK-NEXT: adrp x8, .LCPI9_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -251,12 +251,12 @@ define <4 x i32> @test_urem_odd_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_odd_one: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI10_0] ; CHECK-NEXT: mov w8, #52429 +; CHECK-NEXT: adrp x9, .LCPI10_0 ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI10_0] ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -273,14 +273,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #28087 ; CHECK-NEXT: movk w8, #46811, lsl #16 -; CHECK-NEXT: adrp x9, .LCPI11_0 ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI11_0] +; CHECK-NEXT: adrp x8, .LCPI11_0 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: shl v1.4s, v0.4s, #31 ; CHECK-NEXT: ushr v0.4s, v0.4s, #1 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] +; 
CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -295,17 +295,17 @@ ; CHECK-LABEL: test_urem_odd_even_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: adrp x9, .LCPI12_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] ; CHECK-NEXT: adrp x8, .LCPI12_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1] -; CHECK-NEXT: adrp x8, .LCPI12_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2] -; CHECK-NEXT: adrp x8, .LCPI12_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI12_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_1] +; CHECK-NEXT: adrp x8, .LCPI12_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -323,17 +323,17 @@ ; CHECK-LABEL: test_urem_odd_INT_MIN: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: adrp x9, .LCPI13_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: adrp x8, .LCPI13_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1] -; CHECK-NEXT: adrp x8, .LCPI13_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2] -; CHECK-NEXT: adrp x8, .LCPI13_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI13_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_1] +; CHECK-NEXT: adrp x8, .LCPI13_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -349,17 +349,17 @@ ; CHECK-LABEL: test_urem_even_INT_MIN: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: adrp x9, .LCPI14_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] ; CHECK-NEXT: adrp x8, .LCPI14_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1] -; CHECK-NEXT: adrp x8, .LCPI14_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2] -; CHECK-NEXT: adrp x8, .LCPI14_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_1] +; CHECK-NEXT: adrp x8, .LCPI14_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -375,17 +375,17 @@ ; CHECK-LABEL: test_urem_odd_even_INT_MIN: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI15_0 +; CHECK-NEXT: adrp x9, .LCPI15_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] ; CHECK-NEXT: adrp x8, .LCPI15_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] -; CHECK-NEXT: adrp x8, .LCPI15_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2] -; CHECK-NEXT: adrp x8, .LCPI15_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI15_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_1] +; CHECK-NEXT: adrp x8, .LCPI15_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl 
v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -403,17 +403,17 @@ ; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI16_0 +; CHECK-NEXT: adrp x9, .LCPI16_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: adrp x8, .LCPI16_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] -; CHECK-NEXT: adrp x8, .LCPI16_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2] -; CHECK-NEXT: adrp x8, .LCPI16_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI16_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_1] +; CHECK-NEXT: adrp x8, .LCPI16_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -429,17 +429,17 @@ ; CHECK-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: adrp x9, .LCPI17_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] ; CHECK-NEXT: adrp x8, .LCPI17_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1] -; CHECK-NEXT: adrp x8, .LCPI17_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2] -; CHECK-NEXT: adrp x8, .LCPI17_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI17_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_1] +; CHECK-NEXT: adrp x8, .LCPI17_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -455,17 +455,17 @@ ; CHECK-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: adrp x9, .LCPI18_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] ; CHECK-NEXT: adrp x8, .LCPI18_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] -; CHECK-NEXT: adrp x8, .LCPI18_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2] -; CHECK-NEXT: adrp x8, .LCPI18_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_1] +; CHECK-NEXT: adrp x8, .LCPI18_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -483,11 +483,11 @@ ; CHECK-LABEL: test_urem_odd_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: adrp x9, .LCPI19_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI19_1] +; CHECK-NEXT: adrp x8, .LCPI19_1 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: cmhs v0.4s, 
v2.4s, v0.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_1] +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -502,17 +502,17 @@ ; CHECK-LABEL: test_urem_even_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: adrp x9, .LCPI20_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] ; CHECK-NEXT: adrp x8, .LCPI20_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1] -; CHECK-NEXT: adrp x8, .LCPI20_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2] -; CHECK-NEXT: adrp x8, .LCPI20_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI20_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_1] +; CHECK-NEXT: adrp x8, .LCPI20_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -528,17 +528,17 @@ ; CHECK-LABEL: test_urem_odd_even_allones_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI21_0 +; CHECK-NEXT: adrp x9, .LCPI21_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] ; CHECK-NEXT: adrp x8, .LCPI21_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1] -; CHECK-NEXT: adrp x8, .LCPI21_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2] -; CHECK-NEXT: adrp x8, .LCPI21_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI21_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_1] +; CHECK-NEXT: adrp x8, .LCPI21_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -556,17 +556,17 @@ ; CHECK-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI22_0 +; CHECK-NEXT: adrp x9, .LCPI22_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] ; CHECK-NEXT: adrp x8, .LCPI22_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1] -; CHECK-NEXT: adrp x8, .LCPI22_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2] -; CHECK-NEXT: adrp x8, .LCPI22_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI22_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_1] +; CHECK-NEXT: adrp x8, .LCPI22_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -582,17 +582,17 @@ ; CHECK-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI23_0 +; CHECK-NEXT: adrp x9, .LCPI23_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] ; CHECK-NEXT: adrp x8, .LCPI23_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1] -; CHECK-NEXT: adrp x8, .LCPI23_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2] -; CHECK-NEXT: adrp x8, .LCPI23_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI23_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, 
:lo12:.LCPI23_1] +; CHECK-NEXT: adrp x8, .LCPI23_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -608,17 +608,17 @@ ; CHECK-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI24_0 +; CHECK-NEXT: adrp x9, .LCPI24_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] ; CHECK-NEXT: adrp x8, .LCPI24_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_1] -; CHECK-NEXT: adrp x8, .LCPI24_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2] -; CHECK-NEXT: adrp x8, .LCPI24_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI24_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_1] +; CHECK-NEXT: adrp x8, .LCPI24_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -635,17 +635,17 @@ ; CHECK-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI25_0 +; CHECK-NEXT: adrp x9, .LCPI25_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0] ; CHECK-NEXT: adrp x8, .LCPI25_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_1] -; CHECK-NEXT: adrp x8, .LCPI25_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI25_2] -; CHECK-NEXT: adrp x8, .LCPI25_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI25_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_1] +; CHECK-NEXT: adrp x8, .LCPI25_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -660,17 +660,17 @@ ; CHECK-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI26_0 +; CHECK-NEXT: adrp x9, .LCPI26_2 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0] ; CHECK-NEXT: adrp x8, .LCPI26_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1] -; CHECK-NEXT: adrp x8, .LCPI26_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI26_2] -; CHECK-NEXT: adrp x8, .LCPI26_3 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI26_2] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_1] +; CHECK-NEXT: adrp x8, .LCPI26_3 +; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_3] -; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s -; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s -; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ 
b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -6,14 +6,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] -; CHECK-NEXT: adrp x9, .LCPI0_1 ; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1] ; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: adrp x8, .LCPI0_1 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -28,12 +28,12 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] ; CHECK-NEXT: mov w8, #52429 ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: mov w9, #13106 -; CHECK-NEXT: movk w9, #13107, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s -; CHECK-NEXT: dup v1.4s, w9 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: mov w8, #13106 +; CHECK-NEXT: movk w8, #13107, lsl #16 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret @@ -49,15 +49,15 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: mov w8, #43691 ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mov w9, #43690 -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s -; CHECK-NEXT: movk w9, #10922, lsl #16 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: mov w8, #43690 +; CHECK-NEXT: movk w8, #10922, lsl #16 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: shl v1.4s, v0.4s, #31 ; CHECK-NEXT: ushr v0.4s, v0.4s, #1 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: dup v1.4s, w9 +; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret @@ -71,17 +71,17 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: movk w9, #43690, lsl #16 -; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: dup v2.4s, w9 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: shl v1.4s, v0.4s, #31 ; CHECK-NEXT: ushr v0.4s, v0.4s, #1 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -94,16 +94,16 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI4_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: adrp x8, .LCPI4_1 -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] -; CHECK-NEXT: movk w9, #43690, lsl #16 -; CHECK-NEXT: dup v3.4s, w9 +; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s -; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: adrp x8, .LCPI4_1 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_1] +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi d1, #0x00ffffffff0000 +; CHECK-NEXT: xtn v0.4h, 
v0.4s ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll @@ -7,12 +7,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #23593 ; CHECK-NEXT: movk w8, #49807, lsl #16 -; CHECK-NEXT: mov w9, #28835 -; CHECK-NEXT: movk w9, #2621, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: mov w8, #28835 +; CHECK-NEXT: movk w8, #2621, lsl #16 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -29,14 +29,14 @@ ; CHECK-NEXT: mov w8, #23593 ; CHECK-NEXT: movk w8, #49807, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: mov w9, #23592 +; CHECK-NEXT: mov w8, #23592 +; CHECK-NEXT: movk w8, #655, lsl #16 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: movk w9, #655, lsl #16 ; CHECK-NEXT: shl v1.4s, v0.4s, #30 ; CHECK-NEXT: ushr v0.4s, v0.4s, #2 -; CHECK-NEXT: dup v2.4s, w9 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -53,11 +53,11 @@ ; CHECK-LABEL: test_urem_odd_neg25: ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: adrp x9, .LCPI2_1 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_1] +; CHECK-NEXT: adrp x8, .LCPI2_1 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -74,12 +74,12 @@ ; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] ; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: shl v1.4s, v0.4s, #30 ; CHECK-NEXT: ushr v0.4s, v0.4s, #2 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -98,15 +98,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #34079 ; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: movi v1.4s, #25 ; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s ; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s ; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: movi v1.4s, #25 ; CHECK-NEXT: ushr v2.4s, v2.4s, #3 ; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -120,15 +120,15 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #34079 ; CHECK-NEXT: movk w8, #20971, lsl #16 +; CHECK-NEXT: movi v1.4s, #100 ; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s ; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s ; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: movi v1.4s, #100 ; CHECK-NEXT: ushr v2.4s, v2.4s, #5 ; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi 
v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -168,8 +168,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #15 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -197,8 +197,8 @@ define <4 x i32> @test_urem_allones(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_allones: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v0.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: neg v0.4s, v0.4s ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll @@ -7,9 +7,9 @@ ; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: adrp x8, .LCPI0_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1] ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_1] +; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -20,15 +20,15 @@ define <4 x i1> @t1_all_odd_eq(<4 x i32> %X) nounwind { ; CHECK-LABEL: t1_all_odd_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] ; CHECK-NEXT: mov w8, #43691 ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] ; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s -; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -39,15 +39,15 @@ define <4 x i1> @t1_all_odd_ne(<4 x i32> %X) nounwind { ; CHECK-LABEL: t1_all_odd_ne: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: mov w8, #43691 ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -58,14 +58,14 @@ define <8 x i1> @t2_narrow(<8 x i16> %X) nounwind { ; CHECK-LABEL: t2_narrow: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #43691 +; CHECK-NEXT: dup v1.8h, w8 ; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: dup v2.8h, w8 -; CHECK-NEXT: mul v0.8h, v0.8h, v2.8h ; CHECK-NEXT: cmhs v0.8h, v1.8h, v0.8h -; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <8 x i16> %X, @@ -76,19 +76,19 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind { ; CHECK-LABEL: t3_wide: 
; CHECK: // %bb.0: -; CHECK-NEXT: mov x9, #-6148914691236517206 -; CHECK-NEXT: adrp x11, .LCPI4_0 -; CHECK-NEXT: mov x8, v0.d[1] -; CHECK-NEXT: movk x9, #43691 -; CHECK-NEXT: fmov x10, d0 -; CHECK-NEXT: ldr q0, [x11, :lo12:.LCPI4_0] -; CHECK-NEXT: mul x10, x10, x9 -; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: fmov d1, x10 -; CHECK-NEXT: mov v1.d[1], x8 -; CHECK-NEXT: cmhs v0.2d, v0.2d, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: mov x8, #-6148914691236517206 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: movk x8, #43691 +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: mul x9, x9, x8 +; CHECK-NEXT: mul x8, x10, x8 +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: adrp x9, .LCPI4_0 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI4_0] +; CHECK-NEXT: cmhs v0.2d, v1.2d, v0.2d ; CHECK-NEXT: movi d1, #0xffffffff00000000 +; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <2 x i64> %X, diff --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll b/llvm/test/CodeGen/AArch64/urem-seteq.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll @@ -9,10 +9,10 @@ ; CHECK-LABEL: test_urem_odd: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #52429 -; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: mov w9, #13108 -; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: movk w9, #13107, lsl #16 +; CHECK-NEXT: mul w8, w0, w8 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -26,10 +26,10 @@ ; CHECK-LABEL: test_urem_odd_25: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #23593 -; CHECK-NEXT: movk w8, #49807, lsl #16 ; CHECK-NEXT: mov w9, #28836 -; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: movk w8, #49807, lsl #16 ; CHECK-NEXT: movk w9, #2621, lsl #16 +; CHECK-NEXT: mul w8, w0, w8 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -97,11 +97,11 @@ ; CHECK-LABEL: test_urem_even_100: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #23593 +; CHECK-NEXT: mov w9, #23593 ; CHECK-NEXT: movk w8, #49807, lsl #16 +; CHECK-NEXT: movk w9, #655, lsl #16 ; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: mov w9, #23593 ; CHECK-NEXT: ror w8, w8, #2 -; CHECK-NEXT: movk w9, #655, lsl #16 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -154,9 +154,9 @@ ; CHECK-LABEL: test_urem_odd_setne: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #52429 +; CHECK-NEXT: mov w9, #858993459 ; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: mul w8, w0, w8 -; CHECK-NEXT: mov w9, #858993459 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, hi ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll --- a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll @@ -4,42 +4,42 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w11, #33437 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: movk w11, #21399, lsl #16 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: mov w9, #16913 -; CHECK-NEXT: mov w12, #98 -; CHECK-NEXT: lsr x11, x11, #37 -; CHECK-NEXT: movk w9, #8456, lsl #16 -; CHECK-NEXT: msub w10, w11, w12, w10 -; CHECK-NEXT: ubfx w12, w8, #2, #14 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: mov w11, #124 -; CHECK-NEXT: lsr x9, x9, #34 -; CHECK-NEXT: msub w8, w9, w11, w8 +; CHECK-NEXT: umov w8, v0.h[0] ; CHECK-NEXT: mov w9, #8969 -; 
CHECK-NEXT: umov w12, v0.h[0] ; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w12, w9 +; CHECK-NEXT: umov w10, v0.h[1] +; CHECK-NEXT: mov w12, #16913 +; CHECK-NEXT: mov w13, #95 +; CHECK-NEXT: movk w12, #8456, lsl #16 +; CHECK-NEXT: umull x9, w8, w9 +; CHECK-NEXT: ubfx w14, w10, #2, #14 ; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w11, w12, w9 +; CHECK-NEXT: sub w11, w8, w9 +; CHECK-NEXT: umull x12, w14, w12 ; CHECK-NEXT: add w9, w9, w11, lsr #1 -; CHECK-NEXT: mov w11, #95 +; CHECK-NEXT: umov w11, v0.h[2] ; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w9, w9, w11, w12 -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov w9, #2287 -; CHECK-NEXT: movk w9, #16727, lsl #16 +; CHECK-NEXT: lsr x12, x12, #34 +; CHECK-NEXT: msub w8, w9, w13, w8 +; CHECK-NEXT: mov w9, #33437 +; CHECK-NEXT: movk w9, #21399, lsl #16 +; CHECK-NEXT: mov w13, #124 ; CHECK-NEXT: umull x9, w11, w9 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #1003 +; CHECK-NEXT: msub w10, w12, w13, w10 +; CHECK-NEXT: umov w12, v0.h[3] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: mov w13, #2287 +; CHECK-NEXT: lsr x8, x9, #37 +; CHECK-NEXT: mov w9, #98 +; CHECK-NEXT: movk w13, #16727, lsl #16 +; CHECK-NEXT: msub w8, w8, w9, w11 +; CHECK-NEXT: mov v0.h[1], w10 +; CHECK-NEXT: umull x9, w12, w13 +; CHECK-NEXT: mov w10, #1003 ; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: mov v0.h[2], w10 -; CHECK-NEXT: msub w8, w9, w8, w11 +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: msub w8, w9, w10, w12 ; CHECK-NEXT: mov v0.h[3], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret @@ -50,41 +50,41 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #8969 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: movk w9, #22765, lsl #16 ; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: umull x13, w8, w9 -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: umull x14, w10, w9 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: umov w12, v0.h[3] -; CHECK-NEXT: umull x15, w11, w9 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: sub w16, w8, w13 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w13, w13, w16, lsr #1 -; CHECK-NEXT: sub w16, w10, w14 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w14, w14, w16, lsr #1 -; CHECK-NEXT: sub w16, w11, w15 -; CHECK-NEXT: add w15, w15, w16, lsr #1 -; CHECK-NEXT: sub w16, w12, w9 -; CHECK-NEXT: add w9, w9, w16, lsr #1 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: lsr w13, w13, #6 -; CHECK-NEXT: msub w8, w13, w16, w8 -; CHECK-NEXT: lsr w13, w14, #6 -; CHECK-NEXT: msub w10, w13, w16, w10 -; CHECK-NEXT: lsr w13, w15, #6 +; CHECK-NEXT: mov w8, #8969 +; CHECK-NEXT: umov w9, v0.h[1] +; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: umov w15, v0.h[2] +; CHECK-NEXT: umov w16, v0.h[3] +; CHECK-NEXT: umull x12, w10, w8 +; CHECK-NEXT: umull x11, w9, w8 +; CHECK-NEXT: lsr x12, x12, #32 +; CHECK-NEXT: lsr x11, x11, #32 +; CHECK-NEXT: sub w14, w10, w12 +; CHECK-NEXT: sub w13, w9, w11 +; CHECK-NEXT: add w12, w12, w14, lsr #1 +; CHECK-NEXT: umull x14, w15, w8 +; CHECK-NEXT: add w11, w11, w13, lsr #1 +; CHECK-NEXT: mov w13, #95 +; CHECK-NEXT: lsr w12, w12, #6 +; CHECK-NEXT: lsr w11, w11, #6 +; CHECK-NEXT: umull x8, w16, w8 +; CHECK-NEXT: msub w10, w12, w13, w10 +; CHECK-NEXT: lsr x12, x14, #32 +; CHECK-NEXT: msub w9, w11, w13, w9 +; CHECK-NEXT: sub w11, w15, w12 +; CHECK-NEXT: lsr x8, x8, #32 ; CHECK-NEXT: fmov s0, w10 -; 
CHECK-NEXT: msub w11, w13, w16, w11 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w9, w16, w12 +; CHECK-NEXT: add w10, w12, w11, lsr #1 +; CHECK-NEXT: lsr w10, w10, #6 +; CHECK-NEXT: sub w11, w16, w8 +; CHECK-NEXT: mov v0.h[1], w9 +; CHECK-NEXT: msub w9, w10, w13, w15 +; CHECK-NEXT: add w8, w8, w11, lsr #1 +; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: mov v0.h[2], w9 +; CHECK-NEXT: msub w8, w8, w13, w16 ; CHECK-NEXT: mov v0.h[3], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret @@ -97,47 +97,47 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; CHECK-LABEL: combine_urem_udiv: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #8969 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w9, v0.h[0] +; CHECK-NEXT: mov w8, #8969 ; CHECK-NEXT: movk w8, #22765, lsl #16 -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: umull x13, w9, w8 +; CHECK-NEXT: umov w10, v0.h[1] ; CHECK-NEXT: umov w11, v0.h[2] +; CHECK-NEXT: mov w15, #95 +; CHECK-NEXT: umov w13, v0.h[3] +; CHECK-NEXT: umull x12, w9, w8 ; CHECK-NEXT: umull x14, w10, w8 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: umov w12, v0.h[3] -; CHECK-NEXT: umull x15, w11, w8 +; CHECK-NEXT: lsr x12, x12, #32 +; CHECK-NEXT: umull x17, w11, w8 +; CHECK-NEXT: sub w16, w9, w12 ; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: sub w16, w9, w13 -; CHECK-NEXT: umull x8, w12, w8 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w13, w13, w16, lsr #1 +; CHECK-NEXT: lsr x17, x17, #32 +; CHECK-NEXT: umull x8, w13, w8 +; CHECK-NEXT: add w12, w12, w16, lsr #1 ; CHECK-NEXT: sub w16, w10, w14 +; CHECK-NEXT: lsr w12, w12, #6 ; CHECK-NEXT: lsr x8, x8, #32 ; CHECK-NEXT: add w14, w14, w16, lsr #1 -; CHECK-NEXT: sub w16, w11, w15 -; CHECK-NEXT: add w15, w15, w16, lsr #1 -; CHECK-NEXT: sub w16, w12, w8 -; CHECK-NEXT: add w8, w8, w16, lsr #1 -; CHECK-NEXT: mov w16, #95 +; CHECK-NEXT: sub w16, w11, w17 +; CHECK-NEXT: msub w9, w12, w15, w9 ; CHECK-NEXT: lsr w14, w14, #6 -; CHECK-NEXT: lsr w13, w13, #6 -; CHECK-NEXT: msub w10, w14, w16, w10 -; CHECK-NEXT: lsr w15, w15, #6 -; CHECK-NEXT: msub w9, w13, w16, w9 -; CHECK-NEXT: fmov s0, w14 -; CHECK-NEXT: fmov s1, w10 +; CHECK-NEXT: add w16, w17, w16, lsr #1 +; CHECK-NEXT: fmov s1, w12 +; CHECK-NEXT: msub w10, w14, w15, w10 +; CHECK-NEXT: sub w17, w13, w8 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: lsr w9, w16, #6 +; CHECK-NEXT: mov v1.h[1], w14 +; CHECK-NEXT: add w8, w8, w17, lsr #1 +; CHECK-NEXT: msub w11, w9, w15, w11 ; CHECK-NEXT: lsr w8, w8, #6 -; CHECK-NEXT: msub w11, w15, w16, w11 -; CHECK-NEXT: mov v0.h[1], w13 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: msub w12, w8, w16, w12 -; CHECK-NEXT: mov v0.h[2], w15 -; CHECK-NEXT: mov v1.h[2], w11 -; CHECK-NEXT: mov v1.h[3], w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; CHECK-NEXT: mov v0.h[1], w10 +; CHECK-NEXT: msub w10, w8, w15, w13 +; CHECK-NEXT: mov v1.h[2], w9 +; CHECK-NEXT: mov v0.h[2], w11 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: mov v0.h[3], w10 +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, @@ -150,26 +150,26 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_urem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #8969 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, 
#32 -; CHECK-NEXT: sub w10, w8, w9 -; CHECK-NEXT: add w9, w9, w10, lsr #1 -; CHECK-NEXT: mov w10, #95 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: and w9, w9, #0x3f -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: and w10, w10, #0x1f -; CHECK-NEXT: and w9, w9, #0x7 -; CHECK-NEXT: mov v1.h[1], w10 -; CHECK-NEXT: mov v1.h[2], w9 +; CHECK-NEXT: umov w10, v0.h[0] +; CHECK-NEXT: umov w9, v0.h[3] +; CHECK-NEXT: mov w8, #8969 +; CHECK-NEXT: umov w11, v0.h[1] +; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: and w10, w10, #0x3f +; CHECK-NEXT: umull x8, w9, w8 +; CHECK-NEXT: and w11, w11, #0x1f +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: fmov s1, w10 +; CHECK-NEXT: umov w10, v0.h[2] +; CHECK-NEXT: sub w12, w9, w8 +; CHECK-NEXT: mov v1.h[1], w11 +; CHECK-NEXT: add w8, w8, w12, lsr #1 +; CHECK-NEXT: and w10, w10, #0x7 +; CHECK-NEXT: lsr w8, w8, #6 +; CHECK-NEXT: mov w11, #95 +; CHECK-NEXT: msub w8, w8, w11, w9 +; CHECK-NEXT: mov v1.h[2], w10 ; CHECK-NEXT: mov v1.h[3], w8 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret @@ -181,32 +181,32 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #17097 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: movk w9, #45590, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: mov w10, #23 -; CHECK-NEXT: lsr x9, x9, #36 -; CHECK-NEXT: umov w11, v0.h[1] -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: mov w9, #30865 -; CHECK-NEXT: movk w9, #51306, lsl #16 -; CHECK-NEXT: ubfx w10, w11, #1, #15 -; CHECK-NEXT: umull x9, w10, w9 -; CHECK-NEXT: mov w10, #654 -; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: msub w9, w9, w10, w11 -; CHECK-NEXT: mov w11, #47143 -; CHECK-NEXT: umov w10, v0.h[3] -; CHECK-NEXT: movk w11, #24749, lsl #16 +; CHECK-NEXT: umov w9, v0.h[1] +; CHECK-NEXT: mov w8, #30865 +; CHECK-NEXT: movk w8, #51306, lsl #16 +; CHECK-NEXT: umov w11, v0.h[2] +; CHECK-NEXT: mov w12, #654 ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: mov w9, #5423 -; CHECK-NEXT: lsr x11, x11, #43 -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: msub w8, w11, w9, w10 +; CHECK-NEXT: mov w13, #47143 +; CHECK-NEXT: ubfx w10, w9, #1, #15 +; CHECK-NEXT: movk w13, #24749, lsl #16 +; CHECK-NEXT: umull x8, w10, w8 +; CHECK-NEXT: mov w10, #17097 +; CHECK-NEXT: movk w10, #45590, lsl #16 +; CHECK-NEXT: lsr x8, x8, #40 +; CHECK-NEXT: umull x10, w11, w10 +; CHECK-NEXT: msub w8, w8, w12, w9 +; CHECK-NEXT: umov w9, v0.h[3] +; CHECK-NEXT: lsr x10, x10, #36 +; CHECK-NEXT: mov w12, #23 +; CHECK-NEXT: msub w10, w10, w12, w11 +; CHECK-NEXT: mov w11, #5423 +; CHECK-NEXT: mov v1.h[1], w8 +; CHECK-NEXT: umull x8, w9, w13 +; CHECK-NEXT: lsr x8, x8, #43 +; CHECK-NEXT: mov v1.h[2], w10 +; CHECK-NEXT: msub w8, w8, w11, w9 ; CHECK-NEXT: mov v1.h[3], w8 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret @@ -227,39 +227,39 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { ; CHECK-LABEL: dont_fold_urem_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x10, #12109 -; CHECK-NEXT: movk x10, #52170, lsl #16 -; CHECK-NEXT: movk x10, #28749, lsl #32 -; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: movk x10, #49499, lsl #48 -; CHECK-NEXT: umulh x10, x8, x10 -; CHECK-NEXT: mov w11, #5423 -; CHECK-NEXT: lsr x10, x10, #12 -; CHECK-NEXT: msub x8, x10, x11, x8 -; CHECK-NEXT: mov x10, #21445 -; CHECK-NEXT: movk 
x10, #1603, lsl #16 -; CHECK-NEXT: mov x12, v0.d[1] -; CHECK-NEXT: movk x10, #15432, lsl #32 -; CHECK-NEXT: movk x10, #25653, lsl #48 -; CHECK-NEXT: lsr x11, x12, #1 -; CHECK-NEXT: umulh x10, x11, x10 -; CHECK-NEXT: mov w11, #654 -; CHECK-NEXT: lsr x10, x10, #7 -; CHECK-NEXT: msub x10, x10, x11, x12 -; CHECK-NEXT: mov x11, #17097 -; CHECK-NEXT: movk x11, #45590, lsl #16 -; CHECK-NEXT: movk x11, #34192, lsl #32 +; CHECK-NEXT: mov x8, #17097 ; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: movk x11, #25644, lsl #48 -; CHECK-NEXT: umulh x11, x9, x11 -; CHECK-NEXT: sub x12, x9, x11 -; CHECK-NEXT: add x11, x11, x12, lsr #1 -; CHECK-NEXT: mov w12, #23 -; CHECK-NEXT: lsr x11, x11, #4 -; CHECK-NEXT: msub x9, x11, x12, x9 +; CHECK-NEXT: movk x8, #45590, lsl #16 +; CHECK-NEXT: mov x13, #21445 +; CHECK-NEXT: movk x8, #34192, lsl #32 +; CHECK-NEXT: movk x13, #1603, lsl #16 +; CHECK-NEXT: movk x8, #25644, lsl #48 +; CHECK-NEXT: movk x13, #15432, lsl #32 +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: movk x13, #25653, lsl #48 +; CHECK-NEXT: umulh x8, x9, x8 +; CHECK-NEXT: mov x11, v1.d[1] +; CHECK-NEXT: sub x12, x9, x8 +; CHECK-NEXT: lsr x14, x10, #1 +; CHECK-NEXT: add x8, x8, x12, lsr #1 +; CHECK-NEXT: mov x12, #12109 +; CHECK-NEXT: movk x12, #52170, lsl #16 +; CHECK-NEXT: umulh x13, x14, x13 +; CHECK-NEXT: movk x12, #28749, lsl #32 +; CHECK-NEXT: mov w14, #23 +; CHECK-NEXT: movk x12, #49499, lsl #48 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: lsr x13, x13, #7 +; CHECK-NEXT: umulh x12, x11, x12 +; CHECK-NEXT: msub x8, x8, x14, x9 +; CHECK-NEXT: mov w9, #5423 +; CHECK-NEXT: lsr x12, x12, #12 +; CHECK-NEXT: mov w14, #654 ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: fmov d1, x9 -; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: msub x9, x12, x9, x11 +; CHECK-NEXT: msub x10, x13, x14, x10 +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: mov v1.d[1], x9 ; CHECK-NEXT: mov v0.d[1], x10 ; CHECK-NEXT: ret %1 = urem <4 x i64> %x, diff --git a/llvm/test/CodeGen/AArch64/use-cr-result-of-dom-icmp-st.ll b/llvm/test/CodeGen/AArch64/use-cr-result-of-dom-icmp-st.ll --- a/llvm/test/CodeGen/AArch64/use-cr-result-of-dom-icmp-st.ll +++ b/llvm/test/CodeGen/AArch64/use-cr-result-of-dom-icmp-st.ll @@ -44,8 +44,8 @@ ; CHECK-NEXT: lsl x8, x0, x1 ; CHECK-NEXT: cmn x8, #1 ; CHECK-NEXT: csinc x9, x1, xzr, eq -; CHECK-NEXT: mul x9, x9, x0 ; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: mul x9, x9, x0 ; CHECK-NEXT: csel x0, x1, x9, ge ; CHECK-NEXT: ret entry: @@ -162,8 +162,8 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmn x0, #1 ; CHECK-NEXT: csinc x8, x1, xzr, eq -; CHECK-NEXT: mul x8, x8, x0 ; CHECK-NEXT: cmp x0, #0 +; CHECK-NEXT: mul x8, x8, x0 ; CHECK-NEXT: csel x0, x1, x8, ge ; CHECK-NEXT: ret entry: @@ -279,8 +279,8 @@ ; CHECK-NEXT: lsl w8, w0, w1 ; CHECK-NEXT: cmn w8, #1 ; CHECK-NEXT: csinc w9, w1, wzr, eq -; CHECK-NEXT: mul w9, w9, w0 ; CHECK-NEXT: cmp w8, #0 +; CHECK-NEXT: mul w9, w9, w0 ; CHECK-NEXT: csel w8, w1, w9, ge ; CHECK-NEXT: sxtw x0, w8 ; CHECK-NEXT: ret @@ -412,8 +412,8 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmn w0, #1 ; CHECK-NEXT: csinc w8, w1, wzr, eq -; CHECK-NEXT: mul w8, w8, w0 ; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: mul w8, w8, w0 ; CHECK-NEXT: csel w8, w1, w8, ge ; CHECK-NEXT: sxtw x0, w8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/usub_sat_plus.ll b/llvm/test/CodeGen/AArch64/usub_sat_plus.ll --- a/llvm/test/CodeGen/AArch64/usub_sat_plus.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_plus.ll @@ -33,9 +33,9 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; CHECK-LABEL: func16: ; CHECK: // 
%bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: mul w9, w1, w2 -; CHECK-NEXT: subs w8, w8, w9, uxth +; CHECK-NEXT: mul w8, w1, w2 +; CHECK-NEXT: and w9, w0, #0xffff +; CHECK-NEXT: subs w8, w9, w8, uxth ; CHECK-NEXT: csel w0, wzr, w8, lo ; CHECK-NEXT: ret %a = mul i16 %y, %z @@ -46,9 +46,9 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind { ; CHECK-LABEL: func8: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: mul w9, w1, w2 -; CHECK-NEXT: subs w8, w8, w9, uxtb +; CHECK-NEXT: mul w8, w1, w2 +; CHECK-NEXT: and w9, w0, #0xff +; CHECK-NEXT: subs w8, w9, w8, uxtb ; CHECK-NEXT: csel w0, wzr, w8, lo ; CHECK-NEXT: ret %a = mul i8 %y, %z @@ -59,10 +59,10 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind { ; CHECK-LABEL: func4: ; CHECK: // %bb.0: -; CHECK-NEXT: mul w9, w1, w2 -; CHECK-NEXT: and w8, w0, #0xf -; CHECK-NEXT: and w9, w9, #0xf -; CHECK-NEXT: subs w8, w8, w9 +; CHECK-NEXT: mul w8, w1, w2 +; CHECK-NEXT: and w9, w0, #0xf +; CHECK-NEXT: and w8, w8, #0xf +; CHECK-NEXT: subs w8, w9, w8 ; CHECK-NEXT: csel w0, wzr, w8, lo ; CHECK-NEXT: ret %a = mul i4 %y, %z diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -55,9 +55,9 @@ define <64 x i8> @v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; CHECK-LABEL: v64i8: ; CHECK: // %bb.0: +; CHECK-NEXT: uqsub v2.16b, v2.16b, v6.16b ; CHECK-NEXT: uqsub v0.16b, v0.16b, v4.16b ; CHECK-NEXT: uqsub v1.16b, v1.16b, v5.16b -; CHECK-NEXT: uqsub v2.16b, v2.16b, v6.16b ; CHECK-NEXT: uqsub v3.16b, v3.16b, v7.16b ; CHECK-NEXT: ret %z = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %x, <64 x i8> %y) @@ -86,9 +86,9 @@ define <32 x i16> @v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; CHECK-LABEL: v32i16: ; CHECK: // %bb.0: +; CHECK-NEXT: uqsub v2.8h, v2.8h, v6.8h ; CHECK-NEXT: uqsub v0.8h, v0.8h, v4.8h ; CHECK-NEXT: uqsub v1.8h, v1.8h, v5.8h -; CHECK-NEXT: uqsub v2.8h, v2.8h, v6.8h ; CHECK-NEXT: uqsub v3.8h, v3.8h, v7.8h ; CHECK-NEXT: ret %z = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %x, <32 x i16> %y) @@ -131,14 +131,14 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { ; CHECK-LABEL: v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x1] -; CHECK-NEXT: ldrb w10, [x0, #1] -; CHECK-NEXT: ldrb w11, [x1, #1] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: mov v1.s[1], w11 +; CHECK-NEXT: ldrb w8, [x1] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: ldrb w10, [x1, #1] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 @@ -170,14 +170,14 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { ; CHECK-LABEL: v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x1] -; CHECK-NEXT: ldrh w10, [x0, #2] -; CHECK-NEXT: ldrh w11, [x1, #2] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: mov v1.s[1], w11 +; CHECK-NEXT: ldrh w8, [x1] +; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: ldrh w10, [x1, #2] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: mov v1.s[1], w10 +; CHECK-NEXT: mov v0.s[1], w9 ; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-NEXT: mov w8, v0.s[1] ; 
CHECK-NEXT: fmov w9, s0 @@ -203,10 +203,10 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind { ; CHECK-LABEL: v12i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q3, q2, [x1] -; CHECK-NEXT: uqsub v1.8h, v1.8h, v2.8h -; CHECK-NEXT: uqsub v0.8h, v0.8h, v3.8h +; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: uqsub v0.8h, v1.8h, v0.8h +; CHECK-NEXT: uqsub v1.8h, v2.8h, v3.8h ; CHECK-NEXT: str q0, [x2] ; CHECK-NEXT: str d1, [x2, #16] ; CHECK-NEXT: ret @@ -301,9 +301,9 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; CHECK-LABEL: v16i32: ; CHECK: // %bb.0: +; CHECK-NEXT: uqsub v2.4s, v2.4s, v6.4s ; CHECK-NEXT: uqsub v0.4s, v0.4s, v4.4s ; CHECK-NEXT: uqsub v1.4s, v1.4s, v5.4s -; CHECK-NEXT: uqsub v2.4s, v2.4s, v6.4s ; CHECK-NEXT: uqsub v3.4s, v3.4s, v7.4s ; CHECK-NEXT: ret %z = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %x, <16 x i32> %y) @@ -332,9 +332,9 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; CHECK-LABEL: v8i64: ; CHECK: // %bb.0: +; CHECK-NEXT: uqsub v2.2d, v2.2d, v6.2d ; CHECK-NEXT: uqsub v0.2d, v0.2d, v4.2d ; CHECK-NEXT: uqsub v1.2d, v1.2d, v5.2d -; CHECK-NEXT: uqsub v2.2d, v2.2d, v6.2d ; CHECK-NEXT: uqsub v3.2d, v3.2d, v7.2d ; CHECK-NEXT: ret %z = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> %x, <8 x i64> %y) diff --git a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll --- a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll +++ b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll @@ -7,10 +7,10 @@ ; CHECK-NEXT: ldp q0, q1, [x0] ; CHECK-NEXT: fadd v0.4s, v0.4s, v0.4s ; CHECK-NEXT: fadd v1.4s, v1.4s, v1.4s -; CHECK-NEXT: fcvtzs v1.4s, v1.4s ; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: fcvtzs v1.4s, v1.4s ; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: xtn v1.4h, v1.4s ; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %l = load <8 x float>, <8 x float>* %in diff --git a/llvm/test/CodeGen/AArch64/vec-libcalls.ll b/llvm/test/CodeGen/AArch64/vec-libcalls.ll --- a/llvm/test/CodeGen/AArch64/vec-libcalls.ll +++ b/llvm/test/CodeGen/AArch64/vec-libcalls.ll @@ -61,8 +61,8 @@ ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v0.s[1], v1.s[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: add sp, sp, #48 @@ -91,8 +91,8 @@ ; CHECK-NEXT: mov s0, v0.s[2] ; CHECK-NEXT: bl sinf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: add sp, sp, #48 @@ -124,12 +124,12 @@ ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-NEXT: bl sinf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; 
CHECK-NEXT: mov v1.s[3], v0.s[0] ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: add sp, sp, #48 @@ -143,11 +143,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str d12, [sp, #-48]! // 8-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #8] // 16-byte Folded Spill +; CHECK-NEXT: fmov s10, s2 ; CHECK-NEXT: stp d9, d8, [sp, #24] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #40] // 8-byte Folded Spill ; CHECK-NEXT: fmov s8, s4 +; CHECK-NEXT: str x30, [sp, #40] // 8-byte Folded Spill ; CHECK-NEXT: fmov s9, s3 -; CHECK-NEXT: fmov s10, s2 ; CHECK-NEXT: fmov s11, s1 ; CHECK-NEXT: bl sinf ; CHECK-NEXT: fmov s12, s0 @@ -167,9 +167,9 @@ ; CHECK-NEXT: fmov s3, s9 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: fmov s4, s0 ; CHECK-NEXT: fmov s0, s12 +; CHECK-NEXT: ldp d11, d10, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: ldr d12, [sp], #48 // 8-byte Folded Reload ; CHECK-NEXT: ret %r = call <5 x float> @llvm.sin.v5f32(<5 x float> %x) @@ -181,11 +181,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: stp d13, d12, [sp, #-64]! // 16-byte Folded Spill ; CHECK-NEXT: stp d11, d10, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: fmov s10, s3 ; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill ; CHECK-NEXT: fmov s8, s5 +; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill ; CHECK-NEXT: fmov s9, s4 -; CHECK-NEXT: fmov s10, s3 ; CHECK-NEXT: fmov s11, s2 ; CHECK-NEXT: fmov s12, s1 ; CHECK-NEXT: bl sinf @@ -209,9 +209,9 @@ ; CHECK-NEXT: fmov s4, s9 ; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: fmov s5, s0 ; CHECK-NEXT: fmov s0, s13 +; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: fmov s1, s12 ; CHECK-NEXT: ldp d13, d12, [sp], #64 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -224,8 +224,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: str d10, [sp, #-32]! 
// 8-byte Folded Spill ; CHECK-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #24] // 8-byte Folded Spill ; CHECK-NEXT: fmov d8, d2 +; CHECK-NEXT: str x30, [sp, #24] // 8-byte Folded Spill ; CHECK-NEXT: fmov d9, d1 ; CHECK-NEXT: bl sin ; CHECK-NEXT: fmov d10, d0 @@ -283,8 +283,8 @@ ; CHECK-NEXT: mov s0, v0.s[2] ; CHECK-NEXT: bl cosf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: add sp, sp, #48 @@ -313,8 +313,8 @@ ; CHECK-NEXT: mov s0, v0.s[2] ; CHECK-NEXT: bl expf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: add sp, sp, #48 @@ -343,8 +343,8 @@ ; CHECK-NEXT: mov s0, v0.s[2] ; CHECK-NEXT: bl exp2f ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: add sp, sp, #48 @@ -382,8 +382,8 @@ ; CHECK-NEXT: mov s0, v0.s[2] ; CHECK-NEXT: bl logf ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: add sp, sp, #48 @@ -412,8 +412,8 @@ ; CHECK-NEXT: mov s0, v0.s[2] ; CHECK-NEXT: bl log10f ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: add sp, sp, #48 @@ -442,8 +442,8 @@ ; CHECK-NEXT: mov s0, v0.s[2] ; CHECK-NEXT: bl log2f ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: add sp, sp, #48 diff --git a/llvm/test/CodeGen/AArch64/vec_cttz.ll b/llvm/test/CodeGen/AArch64/vec_cttz.ll --- a/llvm/test/CodeGen/AArch64/vec_cttz.ll +++ b/llvm/test/CodeGen/AArch64/vec_cttz.ll @@ -29,8 +29,8 @@ ; CHECK-NEXT: movi v1.4h, #1 ; CHECK-NEXT: sub v1.4h, v0.4h, v1.4h ; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b -; CHECK-NEXT: clz v0.4h, v0.4h ; CHECK-NEXT: movi v1.4h, #16 +; CHECK-NEXT: clz v0.4h, v0.4h ; CHECK-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-NEXT: ret %b = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true) @@ -43,8 +43,8 @@ ; CHECK-NEXT: movi v1.2s, #1 ; CHECK-NEXT: sub v1.2s, v0.2s, v1.2s ; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b -; CHECK-NEXT: clz v0.2s, v0.2s ; CHECK-NEXT: movi v1.2s, #32 +; CHECK-NEXT: clz v0.2s, v0.2s ; CHECK-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ret %b = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true) @@ -85,8 +85,8 @@ ; CHECK-NEXT: movi v1.8h, #1 ; CHECK-NEXT: sub v1.8h, v0.8h, 
v1.8h ; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b -; CHECK-NEXT: clz v0.8h, v0.8h ; CHECK-NEXT: movi v1.8h, #16 +; CHECK-NEXT: clz v0.8h, v0.8h ; CHECK-NEXT: sub v0.8h, v1.8h, v0.8h ; CHECK-NEXT: ret %b = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true) @@ -99,8 +99,8 @@ ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: sub v1.4s, v0.4s, v1.4s ; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b -; CHECK-NEXT: clz v0.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #32 +; CHECK-NEXT: clz v0.4s, v0.4s ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %b = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true) diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll --- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll +++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll @@ -81,34 +81,34 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { ; CHECK-LABEL: uaddo_v6i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov s2, w6 -; CHECK-NEXT: ldr s0, [sp, #16] -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mov v2.s[1], w7 -; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: fmov s0, w6 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: add x9, sp, #8 +; CHECK-NEXT: ldr s2, [sp, #16] +; CHECK-NEXT: fmov s3, w4 +; CHECK-NEXT: mov v0.s[1], w7 +; CHECK-NEXT: mov v1.s[1], w1 +; CHECK-NEXT: mov v3.s[1], w5 +; CHECK-NEXT: ld1 { v0.s }[2], [x8] ; CHECK-NEXT: add x8, sp, #24 -; CHECK-NEXT: add x10, sp, #8 -; CHECK-NEXT: ld1 { v0.s }[1], [x8] -; CHECK-NEXT: fmov s3, w0 -; CHECK-NEXT: ldr x11, [sp, #32] -; CHECK-NEXT: ld1 { v2.s }[3], [x10] -; CHECK-NEXT: fmov s1, w4 -; CHECK-NEXT: mov v3.s[1], w1 -; CHECK-NEXT: mov v1.s[1], w5 -; CHECK-NEXT: mov v3.s[2], w2 -; CHECK-NEXT: mov v3.s[3], w3 +; CHECK-NEXT: mov v1.s[2], w2 +; CHECK-NEXT: ld1 { v2.s }[1], [x8] +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: mov v1.s[3], w3 +; CHECK-NEXT: ldr x8, [sp, #32] +; CHECK-NEXT: add v2.4s, v3.4s, v2.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: cmhi v3.4s, v3.4s, v2.4s +; CHECK-NEXT: str d2, [x8, #16] ; CHECK-NEXT: cmhi v1.4s, v1.4s, v0.4s -; CHECK-NEXT: str d0, [x11, #16] -; CHECK-NEXT: add v0.4s, v3.4s, v2.4s -; CHECK-NEXT: cmhi v2.4s, v3.4s, v0.4s -; CHECK-NEXT: mov w5, v1.s[1] -; CHECK-NEXT: mov w1, v2.s[1] -; CHECK-NEXT: mov w2, v2.s[2] -; CHECK-NEXT: mov w3, v2.s[3] -; CHECK-NEXT: fmov w4, s1 -; CHECK-NEXT: fmov w0, s2 -; CHECK-NEXT: str q0, [x11] +; CHECK-NEXT: str q0, [x8] +; CHECK-NEXT: mov w5, v3.s[1] +; CHECK-NEXT: fmov w4, s3 +; CHECK-NEXT: mov w1, v1.s[1] +; CHECK-NEXT: mov w2, v1.s[2] +; CHECK-NEXT: mov w3, v1.s[3] +; CHECK-NEXT: fmov w0, s1 ; CHECK-NEXT: ret %t = call {<6 x i32>, <6 x i1>} @llvm.uadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 @@ -121,10 +121,10 @@ define <8 x i32> @uaddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { ; CHECK-LABEL: uaddo_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: add v2.4s, v0.4s, v2.4s ; CHECK-NEXT: add v3.4s, v1.4s, v3.4s -; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s ; CHECK-NEXT: cmhi v1.4s, v1.4s, v3.4s +; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s ; CHECK-NEXT: stp q2, q3, [x0] ; CHECK-NEXT: ret %t = call {<8 x i32>, <8 x i1>} @llvm.uadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) @@ -140,24 +140,24 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: add v4.16b, v0.16b, v1.16b ; CHECK-NEXT: cmhi v0.16b, v0.16b, v4.16b -; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b -; CHECK-NEXT: zip2 v2.8b, v0.8b, v0.8b -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, 
#8 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: zip1 v3.8b, v0.8b, v0.8b +; CHECK-NEXT: str q4, [x0] +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b ; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b -; CHECK-NEXT: shl v1.4s, v1.4s, #31 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: zip1 v3.8b, v1.8b, v0.8b +; CHECK-NEXT: zip2 v1.8b, v1.8b, v0.8b +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: shl v2.4s, v2.4s, #31 ; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: sshr v0.4s, v1.4s, #31 -; CHECK-NEXT: sshr v1.4s, v2.4s, #31 -; CHECK-NEXT: shl v2.4s, v3.4s, #31 -; CHECK-NEXT: shl v3.4s, v5.4s, #31 -; CHECK-NEXT: sshr v2.4s, v2.4s, #31 -; CHECK-NEXT: sshr v3.4s, v3.4s, #31 -; CHECK-NEXT: str q4, [x0] +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: shl v5.4s, v0.4s, #31 +; CHECK-NEXT: sshr v0.4s, v2.4s, #31 +; CHECK-NEXT: shl v3.4s, v3.4s, #31 +; CHECK-NEXT: shl v6.4s, v1.4s, #31 +; CHECK-NEXT: sshr v1.4s, v5.4s, #31 +; CHECK-NEXT: sshr v2.4s, v3.4s, #31 +; CHECK-NEXT: sshr v3.4s, v6.4s, #31 ; CHECK-NEXT: ret %t = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 @@ -172,6 +172,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: add v2.8h, v0.8h, v1.8h ; CHECK-NEXT: cmhi v0.8h, v0.8h, v2.8h +; CHECK-NEXT: str q2, [x0] ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b ; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b @@ -181,7 +182,6 @@ ; CHECK-NEXT: shl v3.4s, v0.4s, #31 ; CHECK-NEXT: sshr v0.4s, v1.4s, #31 ; CHECK-NEXT: sshr v1.4s, v3.4s, #31 -; CHECK-NEXT: str q2, [x0] ; CHECK-NEXT: ret %t = call {<8 x i16>, <8 x i1>} @llvm.uadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0 @@ -196,8 +196,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: add v1.2d, v0.2d, v1.2d ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d -; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: ret %t = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 @@ -215,24 +215,24 @@ ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov w8, v0.s[3] -; CHECK-NEXT: bic v1.4s, #255, lsl #24 ; CHECK-NEXT: mov w9, v0.s[2] ; CHECK-NEXT: mov w10, v0.s[1] +; CHECK-NEXT: fmov w11, s0 +; CHECK-NEXT: bic v1.4s, #255, lsl #24 ; CHECK-NEXT: sturh w8, [x0, #9] ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: cmeq v1.4s, v1.4s, v0.4s -; CHECK-NEXT: fmov w11, s0 ; CHECK-NEXT: strh w9, [x0, #6] ; CHECK-NEXT: sturh w10, [x0, #3] ; CHECK-NEXT: lsr w9, w9, #16 -; CHECK-NEXT: lsr w10, w10, #16 -; CHECK-NEXT: strb w8, [x0, #11] -; CHECK-NEXT: mvn v0.16b, v1.16b -; CHECK-NEXT: lsr w8, w11, #16 ; CHECK-NEXT: strh w11, [x0] +; CHECK-NEXT: cmeq v1.4s, v1.4s, v0.4s +; CHECK-NEXT: strb w8, [x0, #11] +; CHECK-NEXT: lsr w8, w10, #16 +; CHECK-NEXT: lsr w10, w11, #16 ; CHECK-NEXT: strb w9, [x0, #8] -; CHECK-NEXT: strb w10, [x0, #5] -; CHECK-NEXT: strb w8, [x0, #2] +; CHECK-NEXT: mvn v0.16b, v1.16b +; CHECK-NEXT: strb w8, [x0, #5] +; CHECK-NEXT: strb w10, [x0, #2] ; CHECK-NEXT: ret %t = call {<4 x i24>, <4 x i1>} @llvm.uadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 @@ -248,21 +248,21 @@ ; CHECK-NEXT: movi v2.4h, #1 ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b ; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: add 
v1.4h, v0.4h, v1.4h -; CHECK-NEXT: umov w9, v1.h[1] -; CHECK-NEXT: umov w8, v1.h[0] -; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: bfi w8, w9, #1, #1 -; CHECK-NEXT: umov w9, v1.h[2] -; CHECK-NEXT: and v0.8b, v1.8b, v2.8b +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: and v1.8b, v0.8b, v2.8b +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: umov w9, v0.h[2] +; CHECK-NEXT: umov w10, v0.h[0] +; CHECK-NEXT: umov w11, v0.h[3] +; CHECK-NEXT: cmeq v1.4h, v1.4h, v0.4h +; CHECK-NEXT: and w8, w8, #0x1 ; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: cmeq v0.4h, v0.4h, v1.4h -; CHECK-NEXT: bfi w8, w9, #2, #1 -; CHECK-NEXT: umov w9, v1.h[3] -; CHECK-NEXT: mvn v0.8b, v0.8b -; CHECK-NEXT: bfi w8, w9, #3, #29 -; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: and w8, w8, #0xf +; CHECK-NEXT: bfi w10, w8, #1, #1 +; CHECK-NEXT: mvn v1.8b, v1.8b +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: bfi w10, w11, #3, #29 +; CHECK-NEXT: and w8, w10, #0xf +; CHECK-NEXT: sshll v0.4s, v1.4h, #0 ; CHECK-NEXT: strb w8, [x0] ; CHECK-NEXT: ret %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) @@ -276,27 +276,27 @@ define <2 x i32> @uaddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind { ; CHECK-LABEL: uaddo_v2i128: ; CHECK: // %bb.0: -; CHECK-NEXT: adds x9, x2, x6 -; CHECK-NEXT: adcs x10, x3, x7 -; CHECK-NEXT: cmp x9, x2 +; CHECK-NEXT: adds x8, x2, x6 +; CHECK-NEXT: adcs x9, x3, x7 +; CHECK-NEXT: cmp x8, x2 +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: cmp x9, x3 ; CHECK-NEXT: cset w11, lo -; CHECK-NEXT: cmp x10, x3 -; CHECK-NEXT: cset w12, lo -; CHECK-NEXT: csel w11, w11, w12, eq -; CHECK-NEXT: adds x12, x0, x4 -; CHECK-NEXT: adcs x13, x1, x5 -; CHECK-NEXT: cmp x12, x0 +; CHECK-NEXT: csel w10, w10, w11, eq +; CHECK-NEXT: adds x11, x0, x4 +; CHECK-NEXT: adcs x12, x1, x5 +; CHECK-NEXT: cmp x11, x0 +; CHECK-NEXT: cset w13, lo +; CHECK-NEXT: cmp x12, x1 ; CHECK-NEXT: cset w14, lo -; CHECK-NEXT: cmp x13, x1 -; CHECK-NEXT: cset w15, lo -; CHECK-NEXT: csel w14, w14, w15, eq -; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: fmov s0, w14 -; CHECK-NEXT: mov v0.s[1], w11 +; CHECK-NEXT: csel w13, w13, w14, eq +; CHECK-NEXT: fmov s0, w13 +; CHECK-NEXT: mov v0.s[1], w10 +; CHECK-NEXT: ldr x10, [sp] +; CHECK-NEXT: stp x8, x9, [x10, #16] ; CHECK-NEXT: shl v0.2s, v0.2s, #31 +; CHECK-NEXT: stp x11, x12, [x10] ; CHECK-NEXT: sshr v0.2s, v0.2s, #31 -; CHECK-NEXT: stp x9, x10, [x8, #16] -; CHECK-NEXT: stp x12, x13, [x8] ; CHECK-NEXT: ret %t = call {<2 x i128>, <2 x i1>} @llvm.uadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll --- a/llvm/test/CodeGen/AArch64/vec_umulo.ll +++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll @@ -21,9 +21,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: shrn v0.2s, v1.2d, #32 -; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s ; CHECK-NEXT: xtn v1.2s, v1.2d ; CHECK-NEXT: str s1, [x0] +; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s ; CHECK-NEXT: ret %t = call {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 @@ -38,9 +38,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: shrn v0.2s, v1.2d, #32 -; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s ; CHECK-NEXT: xtn v1.2s, v1.2d ; CHECK-NEXT: str d1, [x0] +; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s ; CHECK-NEXT: ret %t = call {<2 x i32>, <2 x i1>} 
@llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 @@ -54,13 +54,14 @@ ; CHECK-LABEL: umulo_v3i32: ; CHECK: // %bb.0: ; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: add x8, x0, #8 ; CHECK-NEXT: umull v3.2d, v0.2s, v1.2s ; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s -; CHECK-NEXT: uzp2 v0.4s, v3.4s, v2.4s -; CHECK-NEXT: add x8, x0, #8 -; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s +; CHECK-NEXT: uzp2 v2.4s, v3.4s, v2.4s ; CHECK-NEXT: st1 { v1.s }[2], [x8] ; CHECK-NEXT: str d1, [x0] +; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 @@ -75,10 +76,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s ; CHECK-NEXT: umull v3.2d, v0.2s, v1.2s +; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s ; CHECK-NEXT: uzp2 v2.4s, v3.4s, v2.4s +; CHECK-NEXT: str q1, [x0] ; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s -; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret %t = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) @@ -92,40 +93,40 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { ; CHECK-LABEL: umulo_v6i32: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov s2, w6 -; CHECK-NEXT: ldr s0, [sp, #16] -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mov v2.s[1], w7 -; CHECK-NEXT: ld1 { v2.s }[2], [x9] +; CHECK-NEXT: fmov s0, w6 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: add x9, sp, #8 +; CHECK-NEXT: ldr s2, [sp, #16] +; CHECK-NEXT: fmov s3, w4 +; CHECK-NEXT: mov v0.s[1], w7 +; CHECK-NEXT: mov v1.s[1], w1 +; CHECK-NEXT: mov v3.s[1], w5 +; CHECK-NEXT: ld1 { v0.s }[2], [x8] ; CHECK-NEXT: add x8, sp, #24 -; CHECK-NEXT: add x10, sp, #8 -; CHECK-NEXT: ld1 { v0.s }[1], [x8] -; CHECK-NEXT: fmov s3, w0 -; CHECK-NEXT: ldr x11, [sp, #32] -; CHECK-NEXT: ld1 { v2.s }[3], [x10] -; CHECK-NEXT: fmov s1, w4 -; CHECK-NEXT: mov v3.s[1], w1 -; CHECK-NEXT: mov v1.s[1], w5 -; CHECK-NEXT: mov v3.s[2], w2 -; CHECK-NEXT: mov v3.s[3], w3 +; CHECK-NEXT: mov v1.s[2], w2 +; CHECK-NEXT: ld1 { v2.s }[1], [x8] +; CHECK-NEXT: ld1 { v0.s }[3], [x9] +; CHECK-NEXT: mov v1.s[3], w3 +; CHECK-NEXT: ldr x8, [sp, #32] +; CHECK-NEXT: umull2 v6.2d, v3.4s, v2.4s +; CHECK-NEXT: umull v7.2d, v3.2s, v2.2s ; CHECK-NEXT: umull2 v4.2d, v1.4s, v0.4s ; CHECK-NEXT: umull v5.2d, v1.2s, v0.2s -; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp2 v1.4s, v5.4s, v4.4s -; CHECK-NEXT: str d0, [x11, #16] -; CHECK-NEXT: umull2 v0.2d, v3.4s, v2.4s -; CHECK-NEXT: umull v4.2d, v3.2s, v2.2s -; CHECK-NEXT: uzp2 v0.4s, v4.4s, v0.4s -; CHECK-NEXT: cmtst v1.4s, v1.4s, v1.4s -; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s ; CHECK-NEXT: mul v2.4s, v3.4s, v2.4s -; CHECK-NEXT: mov w5, v1.s[1] -; CHECK-NEXT: mov w1, v0.s[1] -; CHECK-NEXT: mov w2, v0.s[2] -; CHECK-NEXT: mov w3, v0.s[3] -; CHECK-NEXT: fmov w4, s1 -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: str q2, [x11] +; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp2 v4.4s, v5.4s, v4.4s +; CHECK-NEXT: uzp2 v5.4s, v7.4s, v6.4s +; CHECK-NEXT: str d2, [x8, #16] +; CHECK-NEXT: str q0, [x8] +; CHECK-NEXT: cmtst v4.4s, v4.4s, v4.4s +; CHECK-NEXT: cmtst v3.4s, v5.4s, v5.4s +; CHECK-NEXT: mov w1, v4.s[1] +; CHECK-NEXT: mov w2, v4.s[2] +; CHECK-NEXT: mov w3, v4.s[3] +; CHECK-NEXT: mov w5, v3.s[1] +; CHECK-NEXT: fmov w0, s4 +; CHECK-NEXT: fmov w4, s3 ; 
CHECK-NEXT: ret %t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 @@ -138,17 +139,19 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { ; CHECK-LABEL: umulo_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: umull2 v4.2d, v0.4s, v2.4s -; CHECK-NEXT: umull v5.2d, v0.2s, v2.2s -; CHECK-NEXT: umull2 v6.2d, v1.4s, v3.4s -; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s -; CHECK-NEXT: umull v0.2d, v1.2s, v3.2s +; CHECK-NEXT: umull2 v4.2d, v1.4s, v3.4s +; CHECK-NEXT: umull2 v5.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v6.2d, v0.2s, v2.2s +; CHECK-NEXT: umull v7.2d, v1.2s, v3.2s ; CHECK-NEXT: mul v3.4s, v1.4s, v3.4s -; CHECK-NEXT: uzp2 v1.4s, v5.4s, v4.4s -; CHECK-NEXT: uzp2 v4.4s, v0.4s, v6.4s -; CHECK-NEXT: cmtst v0.4s, v1.4s, v1.4s -; CHECK-NEXT: cmtst v1.4s, v4.4s, v4.4s +; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s +; CHECK-NEXT: uzp2 v5.4s, v6.4s, v5.4s +; CHECK-NEXT: uzp2 v6.4s, v7.4s, v4.4s ; CHECK-NEXT: stp q2, q3, [x0] +; CHECK-NEXT: cmtst v4.4s, v5.4s, v5.4s +; CHECK-NEXT: cmtst v5.4s, v6.4s, v6.4s +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b ; CHECK-NEXT: ret %t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 @@ -163,27 +166,29 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b ; CHECK-NEXT: umull v3.8h, v0.8b, v1.8b -; CHECK-NEXT: mul v4.16b, v0.16b, v1.16b -; CHECK-NEXT: uzp2 v0.16b, v3.16b, v2.16b -; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b -; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b -; CHECK-NEXT: zip2 v2.8b, v0.8b, v0.8b -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: uzp2 v2.16b, v3.16b, v2.16b +; CHECK-NEXT: cmtst v2.16b, v2.16b, v2.16b +; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: zip1 v4.8b, v2.8b, v0.8b +; CHECK-NEXT: zip2 v2.8b, v2.8b, v0.8b +; CHECK-NEXT: zip1 v5.8b, v3.8b, v0.8b +; CHECK-NEXT: zip2 v3.8b, v3.8b, v0.8b +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 ; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: zip1 v3.8b, v0.8b, v0.8b -; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b -; CHECK-NEXT: shl v1.4s, v1.4s, #31 -; CHECK-NEXT: shl v2.4s, v2.4s, #31 +; CHECK-NEXT: ushll v5.4s, v5.4h, #0 ; CHECK-NEXT: ushll v3.4s, v3.4h, #0 -; CHECK-NEXT: ushll v5.4s, v0.4h, #0 -; CHECK-NEXT: sshr v0.4s, v1.4s, #31 -; CHECK-NEXT: sshr v1.4s, v2.4s, #31 -; CHECK-NEXT: shl v2.4s, v3.4s, #31 -; CHECK-NEXT: shl v3.4s, v5.4s, #31 -; CHECK-NEXT: sshr v2.4s, v2.4s, #31 +; CHECK-NEXT: shl v4.4s, v4.4s, #31 +; CHECK-NEXT: shl v2.4s, v2.4s, #31 +; CHECK-NEXT: shl v6.4s, v5.4s, #31 +; CHECK-NEXT: shl v3.4s, v3.4s, #31 +; CHECK-NEXT: sshr v4.4s, v4.4s, #31 +; CHECK-NEXT: sshr v5.4s, v2.4s, #31 +; CHECK-NEXT: sshr v2.4s, v6.4s, #31 ; CHECK-NEXT: sshr v3.4s, v3.4s, #31 -; CHECK-NEXT: str q4, [x0] +; CHECK-NEXT: mul v6.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b +; CHECK-NEXT: str q6, [x0] ; CHECK-NEXT: ret %t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 @@ -198,18 +203,20 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h ; CHECK-NEXT: umull v3.4s, v0.4h, v1.4h +; CHECK-NEXT: uzp2 v2.8h, v3.8h, v2.8h +; CHECK-NEXT: cmtst v2.8h, v2.8h, v2.8h +; CHECK-NEXT: xtn v2.8b, v2.8h +; CHECK-NEXT: zip1 v3.8b, v2.8b, v0.8b +; CHECK-NEXT: zip2 v2.8b, v2.8b, v0.8b +; CHECK-NEXT: ushll v3.4s, 
v3.4h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: shl v3.4s, v3.4s, #31 +; CHECK-NEXT: shl v4.4s, v2.4s, #31 +; CHECK-NEXT: sshr v2.4s, v3.4s, #31 +; CHECK-NEXT: sshr v3.4s, v4.4s, #31 ; CHECK-NEXT: mul v4.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp2 v0.8h, v3.8h, v2.8h -; CHECK-NEXT: cmtst v0.8h, v0.8h, v0.8h -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b -; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: shl v1.4s, v1.4s, #31 -; CHECK-NEXT: shl v2.4s, v0.4s, #31 -; CHECK-NEXT: sshr v0.4s, v1.4s, #31 -; CHECK-NEXT: sshr v1.4s, v2.4s, #31 +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: mov v1.16b, v3.16b ; CHECK-NEXT: str q4, [x0] ; CHECK-NEXT: ret %t = call {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) @@ -224,23 +231,23 @@ ; CHECK-LABEL: umulo_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: mov x9, v0.d[1] ; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: umulh x12, x11, x9 -; CHECK-NEXT: mul x9, x11, x9 -; CHECK-NEXT: umulh x11, x10, x8 -; CHECK-NEXT: cmp xzr, x11 -; CHECK-NEXT: csetm x11, ne +; CHECK-NEXT: umulh x12, x9, x8 +; CHECK-NEXT: umulh x13, x11, x10 ; CHECK-NEXT: cmp xzr, x12 +; CHECK-NEXT: mul x10, x11, x10 ; CHECK-NEXT: csetm x12, ne -; CHECK-NEXT: fmov d0, x12 -; CHECK-NEXT: mul x8, x10, x8 -; CHECK-NEXT: fmov d1, x9 -; CHECK-NEXT: mov v0.d[1], x11 -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: cmp xzr, x13 +; CHECK-NEXT: csetm x13, ne +; CHECK-NEXT: mul x8, x9, x8 +; CHECK-NEXT: fmov d1, x10 +; CHECK-NEXT: fmov d0, x13 ; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.d[1], x12 ; CHECK-NEXT: str q1, [x0] +; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: ret %t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 @@ -263,22 +270,22 @@ ; CHECK-NEXT: mov w8, v0.s[3] ; CHECK-NEXT: mov w9, v0.s[2] ; CHECK-NEXT: mov w10, v0.s[1] +; CHECK-NEXT: cmeq v1.4s, v1.4s, #0 ; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: cmeq v0.4s, v1.4s, #0 -; CHECK-NEXT: cmtst v1.4s, v2.4s, v2.4s +; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s ; CHECK-NEXT: sturh w8, [x0, #9] ; CHECK-NEXT: lsr w8, w8, #16 ; CHECK-NEXT: strh w9, [x0, #6] -; CHECK-NEXT: sturh w10, [x0, #3] ; CHECK-NEXT: lsr w9, w9, #16 -; CHECK-NEXT: lsr w10, w10, #16 -; CHECK-NEXT: orn v0.16b, v1.16b, v0.16b +; CHECK-NEXT: sturh w10, [x0, #3] +; CHECK-NEXT: orn v0.16b, v2.16b, v1.16b ; CHECK-NEXT: strb w8, [x0, #11] -; CHECK-NEXT: lsr w8, w11, #16 +; CHECK-NEXT: lsr w8, w10, #16 +; CHECK-NEXT: lsr w10, w11, #16 ; CHECK-NEXT: strh w11, [x0] ; CHECK-NEXT: strb w9, [x0, #8] -; CHECK-NEXT: strb w10, [x0, #5] -; CHECK-NEXT: strb w8, [x0, #2] +; CHECK-NEXT: strb w8, [x0, #5] +; CHECK-NEXT: strb w10, [x0, #2] ; CHECK-NEXT: ret %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 @@ -291,18 +298,19 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { ; CHECK-LABEL: umulo_v4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: bfi w8, w9, #1, #1 -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: bfi w8, w9, #2, #1 -; CHECK-NEXT: umov w9, v0.h[3] -; CHECK-NEXT: bfi w8, 
w9, #3, #29 -; CHECK-NEXT: and w8, w8, #0xf +; CHECK-NEXT: fmov d2, d0 ; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: and v1.8b, v2.8b, v1.8b +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: umov w10, v1.h[0] +; CHECK-NEXT: umov w11, v1.h[3] +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w10, w8, #1, #1 +; CHECK-NEXT: bfi w10, w9, #2, #1 +; CHECK-NEXT: bfi w10, w11, #3, #29 +; CHECK-NEXT: and w8, w10, #0xf ; CHECK-NEXT: strb w8, [x0] ; CHECK-NEXT: ret %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) @@ -318,51 +326,51 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: cmp x7, #0 ; CHECK-NEXT: umulh x8, x3, x6 -; CHECK-NEXT: cset w13, ne -; CHECK-NEXT: cmp x3, #0 -; CHECK-NEXT: umulh x9, x7, x2 ; CHECK-NEXT: mul x10, x7, x2 -; CHECK-NEXT: cset w14, ne -; CHECK-NEXT: cmp xzr, x8 -; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: umulh x11, x2, x6 +; CHECK-NEXT: cset w9, ne +; CHECK-NEXT: cmp x3, #0 +; CHECK-NEXT: umulh x11, x7, x2 +; CHECK-NEXT: cset w12, ne ; CHECK-NEXT: madd x10, x3, x6, x10 -; CHECK-NEXT: and w13, w14, w13 -; CHECK-NEXT: cset w14, ne -; CHECK-NEXT: cmp xzr, x9 -; CHECK-NEXT: orr w13, w13, w14 -; CHECK-NEXT: cset w14, ne -; CHECK-NEXT: adds x10, x11, x10 -; CHECK-NEXT: mul x12, x2, x6 -; CHECK-NEXT: orr w13, w13, w14 -; CHECK-NEXT: cset w14, hs +; CHECK-NEXT: cmp xzr, x8 +; CHECK-NEXT: umulh x8, x2, x6 +; CHECK-NEXT: cset w13, ne +; CHECK-NEXT: cmp xzr, x11 +; CHECK-NEXT: cset w11, ne +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: cset w10, hs ; CHECK-NEXT: cmp x5, #0 -; CHECK-NEXT: umulh x17, x1, x4 -; CHECK-NEXT: stp x12, x10, [x8, #16] -; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: cset w14, ne ; CHECK-NEXT: cmp x1, #0 -; CHECK-NEXT: umulh x9, x5, x0 -; CHECK-NEXT: mul x11, x5, x0 -; CHECK-NEXT: cset w12, ne -; CHECK-NEXT: cmp xzr, x17 -; CHECK-NEXT: umulh x15, x0, x4 -; CHECK-NEXT: madd x11, x1, x4, x11 -; CHECK-NEXT: and w10, w12, w10 -; CHECK-NEXT: cset w12, ne -; CHECK-NEXT: cmp xzr, x9 -; CHECK-NEXT: orr w9, w10, w12 -; CHECK-NEXT: cset w10, ne -; CHECK-NEXT: adds x11, x15, x11 -; CHECK-NEXT: orr w9, w9, w10 -; CHECK-NEXT: cset w10, hs +; CHECK-NEXT: umulh x15, x1, x4 +; CHECK-NEXT: cset w16, ne +; CHECK-NEXT: mul x17, x5, x0 +; CHECK-NEXT: and w14, w16, w14 +; CHECK-NEXT: umulh x16, x5, x0 +; CHECK-NEXT: cmp xzr, x15 +; CHECK-NEXT: madd x15, x1, x4, x17 +; CHECK-NEXT: cset w17, ne +; CHECK-NEXT: umulh x18, x0, x4 +; CHECK-NEXT: cmp xzr, x16 +; CHECK-NEXT: orr w14, w14, w17 +; CHECK-NEXT: cset w16, ne +; CHECK-NEXT: adds x15, x18, x15 +; CHECK-NEXT: orr w14, w14, w16 +; CHECK-NEXT: cset w16, hs +; CHECK-NEXT: and w9, w12, w9 +; CHECK-NEXT: orr w12, w14, w16 +; CHECK-NEXT: orr w9, w9, w13 +; CHECK-NEXT: orr w9, w9, w11 +; CHECK-NEXT: mul x11, x0, x4 ; CHECK-NEXT: orr w9, w9, w10 -; CHECK-NEXT: orr w10, w13, w14 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov v0.s[1], w10 +; CHECK-NEXT: ldr x10, [sp] +; CHECK-NEXT: fmov s0, w12 +; CHECK-NEXT: stp x11, x15, [x10] +; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: mul x9, x2, x6 ; CHECK-NEXT: shl v0.2s, v0.2s, #31 -; CHECK-NEXT: mul x16, x0, x4 +; CHECK-NEXT: stp x9, x8, [x10, #16] ; CHECK-NEXT: sshr v0.2s, v0.2s, #31 -; CHECK-NEXT: stp x16, x11, [x8] ; CHECK-NEXT: ret %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll 
--- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -96,8 +96,9 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK-LABEL: test_v9i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-1 ; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: umov w12, v0.b[4] ; CHECK-NEXT: mov v1.b[9], w8 ; CHECK-NEXT: mov v1.b[10], w8 ; CHECK-NEXT: mov v1.b[11], w8 @@ -106,19 +107,18 @@ ; CHECK-NEXT: and v1.8b, v0.8b, v1.8b ; CHECK-NEXT: umov w8, v1.b[1] ; CHECK-NEXT: umov w9, v1.b[0] +; CHECK-NEXT: umov w10, v1.b[2] +; CHECK-NEXT: umov w11, v1.b[3] ; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: umov w9, v1.b[2] -; CHECK-NEXT: and w8, w8, w9 -; CHECK-NEXT: umov w9, v1.b[3] -; CHECK-NEXT: and w8, w8, w9 -; CHECK-NEXT: umov w9, v0.b[4] -; CHECK-NEXT: and w8, w8, w9 ; CHECK-NEXT: umov w9, v0.b[5] +; CHECK-NEXT: and w8, w8, w10 +; CHECK-NEXT: umov w10, v0.b[6] +; CHECK-NEXT: and w8, w8, w11 +; CHECK-NEXT: umov w11, v0.b[7] +; CHECK-NEXT: and w8, w8, w12 ; CHECK-NEXT: and w8, w8, w9 -; CHECK-NEXT: umov w9, v0.b[6] -; CHECK-NEXT: and w8, w8, w9 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: and w0, w8, w9 +; CHECK-NEXT: and w8, w8, w10 +; CHECK-NEXT: and w0, w8, w11 ; CHECK-NEXT: ret %b = call i8 @llvm.vector.reduce.and.v9i8(<9 x i8> %a) ret i8 %b @@ -128,8 +128,8 @@ ; CHECK-LABEL: test_v3i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v1.8b, v0.8b, v1.8b ; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: and v1.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: and w0, w9, w8 ; CHECK-NEXT: ret @@ -141,13 +141,13 @@ ; CHECK-LABEL: test_v4i1: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: and w10, w11, w10 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: and w9, w10, w9 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: umov w9, v0.h[0] +; CHECK-NEXT: umov w10, v0.h[2] +; CHECK-NEXT: umov w11, v0.h[3] ; CHECK-NEXT: and w8, w9, w8 +; CHECK-NEXT: and w8, w8, w10 +; CHECK-NEXT: and w8, w8, w11 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll @@ -94,8 +94,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: fadd s1, s1, s0 ; CHECK-NEXT: mov s2, v0.s[1] -; CHECK-NEXT: fadd s1, s1, s2 ; CHECK-NEXT: mov s0, v0.s[2] +; CHECK-NEXT: fadd s1, s1, s2 ; CHECK-NEXT: fadd s0, s1, s0 ; CHECK-NEXT: ret %b = call float @llvm.vector.reduce.fadd.f32.v3f32(float %s, <3 x float> %a) @@ -144,8 +144,8 @@ ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill ; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload @@ -166,34 +166,34 @@ define float @test_v16f32(<16 x float> %a, float %s) nounwind { ; CHECK-LABEL: test_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov s22, v0.s[3] -; CHECK-NEXT: mov s23, v0.s[2] -; CHECK-NEXT: mov s24, v0.s[1] +; CHECK-NEXT: mov s6, v0.s[1] +; CHECK-NEXT: fadd s4, s4, s0 +; 
CHECK-NEXT: mov s7, v0.s[2] +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: mov s5, v3.s[1] +; CHECK-NEXT: fadd s4, s4, s6 +; CHECK-NEXT: mov s6, v1.s[2] +; CHECK-NEXT: fadd s4, s4, s7 ; CHECK-NEXT: fadd s0, s4, s0 -; CHECK-NEXT: fadd s0, s0, s24 -; CHECK-NEXT: fadd s0, s0, s23 -; CHECK-NEXT: fadd s0, s0, s22 -; CHECK-NEXT: mov s21, v1.s[1] +; CHECK-NEXT: mov s4, v1.s[1] +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: mov s1, v1.s[3] +; CHECK-NEXT: fadd s0, s0, s4 +; CHECK-NEXT: mov s4, v2.s[2] +; CHECK-NEXT: fadd s0, s0, s6 ; CHECK-NEXT: fadd s0, s0, s1 -; CHECK-NEXT: mov s20, v1.s[2] -; CHECK-NEXT: fadd s0, s0, s21 -; CHECK-NEXT: mov s19, v1.s[3] -; CHECK-NEXT: fadd s0, s0, s20 -; CHECK-NEXT: fadd s0, s0, s19 -; CHECK-NEXT: mov s18, v2.s[1] +; CHECK-NEXT: mov s1, v2.s[1] ; CHECK-NEXT: fadd s0, s0, s2 -; CHECK-NEXT: mov s17, v2.s[2] -; CHECK-NEXT: fadd s0, s0, s18 -; CHECK-NEXT: mov s16, v2.s[3] -; CHECK-NEXT: fadd s0, s0, s17 -; CHECK-NEXT: fadd s0, s0, s16 -; CHECK-NEXT: mov s7, v3.s[1] +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: mov s1, v2.s[3] +; CHECK-NEXT: mov s2, v3.s[3] +; CHECK-NEXT: fadd s0, s0, s4 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: mov s1, v3.s[2] ; CHECK-NEXT: fadd s0, s0, s3 -; CHECK-NEXT: mov s6, v3.s[2] -; CHECK-NEXT: fadd s0, s0, s7 -; CHECK-NEXT: mov s5, v3.s[3] -; CHECK-NEXT: fadd s0, s0, s6 ; CHECK-NEXT: fadd s0, s0, s5 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: fadd s0, s0, s2 ; CHECK-NEXT: ret %b = call float @llvm.vector.reduce.fadd.f32.v16f32(float %s, <16 x float> %a) ret float %b @@ -202,32 +202,32 @@ define float @test_v16f32_neutral(<16 x float> %a) nounwind { ; CHECK-LABEL: test_v16f32_neutral: ; CHECK: // %bb.0: -; CHECK-NEXT: mov s21, v0.s[3] -; CHECK-NEXT: mov s22, v0.s[2] -; CHECK-NEXT: faddp s0, v0.2s -; CHECK-NEXT: fadd s0, s0, s22 -; CHECK-NEXT: fadd s0, s0, s21 -; CHECK-NEXT: mov s20, v1.s[1] +; CHECK-NEXT: mov s5, v0.s[2] +; CHECK-NEXT: faddp s6, v0.2s +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: mov s4, v2.s[1] +; CHECK-NEXT: fadd s5, s6, s5 +; CHECK-NEXT: mov s6, v1.s[2] +; CHECK-NEXT: fadd s0, s5, s0 +; CHECK-NEXT: mov s5, v1.s[1] ; CHECK-NEXT: fadd s0, s0, s1 -; CHECK-NEXT: mov s19, v1.s[2] -; CHECK-NEXT: fadd s0, s0, s20 -; CHECK-NEXT: mov s18, v1.s[3] -; CHECK-NEXT: fadd s0, s0, s19 -; CHECK-NEXT: fadd s0, s0, s18 -; CHECK-NEXT: mov s17, v2.s[1] -; CHECK-NEXT: fadd s0, s0, s2 -; CHECK-NEXT: mov s16, v2.s[2] -; CHECK-NEXT: fadd s0, s0, s17 -; CHECK-NEXT: mov s7, v2.s[3] -; CHECK-NEXT: fadd s0, s0, s16 -; CHECK-NEXT: fadd s0, s0, s7 -; CHECK-NEXT: mov s6, v3.s[1] -; CHECK-NEXT: fadd s0, s0, s3 -; CHECK-NEXT: mov s5, v3.s[2] -; CHECK-NEXT: fadd s0, s0, s6 -; CHECK-NEXT: mov s4, v3.s[3] +; CHECK-NEXT: mov s1, v1.s[3] ; CHECK-NEXT: fadd s0, s0, s5 +; CHECK-NEXT: fadd s0, s0, s6 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: mov s1, v2.s[2] +; CHECK-NEXT: fadd s0, s0, s2 +; CHECK-NEXT: mov s2, v2.s[3] ; CHECK-NEXT: fadd s0, s0, s4 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: mov s1, v3.s[1] +; CHECK-NEXT: fadd s0, s0, s2 +; CHECK-NEXT: mov s2, v3.s[2] +; CHECK-NEXT: fadd s0, s0, s3 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: mov s1, v3.s[3] +; CHECK-NEXT: fadd s0, s0, s2 +; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret %b = call float @llvm.vector.reduce.fadd.f32.v16f32(float -0.0, <16 x float> %a) ret float %b diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ 
b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll @@ -65,12 +65,12 @@ ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-NEXT: movi v5.4s, #128, lsl #24 -; CHECK-NEXT: mov v0.s[1], v1.s[0] -; CHECK-NEXT: mov v0.s[2], v2.s[0] ; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-NEXT: mov v0.s[3], v3.s[0] +; CHECK-NEXT: mov v0.s[1], v1.s[0] ; CHECK-NEXT: mov v5.s[0], v4.s[0] +; CHECK-NEXT: mov v0.s[2], v2.s[0] +; CHECK-NEXT: mov v0.s[3], v3.s[0] ; CHECK-NEXT: fadd v0.4s, v0.4s, v5.4s ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s ; CHECK-NEXT: faddp s0, v0.2s diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll @@ -26,20 +26,20 @@ ; CHECKNOFP16-LABEL: add_HalfH: ; CHECKNOFP16: // %bb.0: ; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECKNOFP16-NEXT: mov h3, v0.h[1] -; CHECKNOFP16-NEXT: mov h1, v0.h[3] +; CHECKNOFP16-NEXT: mov h1, v0.h[1] +; CHECKNOFP16-NEXT: fcvt s2, h0 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s1, s2, s1 ; CHECKNOFP16-NEXT: mov h2, v0.h[2] -; CHECKNOFP16-NEXT: fcvt s0, h0 -; CHECKNOFP16-NEXT: fcvt s3, h3 -; CHECKNOFP16-NEXT: fadd s0, s0, s3 -; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: mov h0, v0.h[3] +; CHECKNOFP16-NEXT: fcvt h1, s1 ; CHECKNOFP16-NEXT: fcvt s2, h2 ; CHECKNOFP16-NEXT: fcvt s0, h0 -; CHECKNOFP16-NEXT: fadd s0, s0, s2 -; CHECKNOFP16-NEXT: fcvt h0, s0 -; CHECKNOFP16-NEXT: fcvt s0, h0 ; CHECKNOFP16-NEXT: fcvt s1, h1 -; CHECKNOFP16-NEXT: fadd s0, s0, s1 +; CHECKNOFP16-NEXT: fadd s1, s1, s2 +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s0, s1, s0 ; CHECKNOFP16-NEXT: fcvt h0, s0 ; CHECKNOFP16-NEXT: ret %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx) @@ -57,40 +57,40 @@ ; ; CHECKNOFP16-LABEL: add_H: ; CHECKNOFP16: // %bb.0: -; CHECKNOFP16-NEXT: mov h7, v0.h[1] -; CHECKNOFP16-NEXT: mov h1, v0.h[7] +; CHECKNOFP16-NEXT: mov h1, v0.h[1] +; CHECKNOFP16-NEXT: fcvt s2, h0 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s1, s2, s1 +; CHECKNOFP16-NEXT: mov h2, v0.h[2] +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s1, s1, s2 +; CHECKNOFP16-NEXT: mov h2, v0.h[3] +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s1, s1, s2 +; CHECKNOFP16-NEXT: mov h2, v0.h[4] +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s1, s1, s2 +; CHECKNOFP16-NEXT: mov h2, v0.h[5] +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s2, h2 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s1, s1, s2 ; CHECKNOFP16-NEXT: mov h2, v0.h[6] -; CHECKNOFP16-NEXT: mov h3, v0.h[5] -; CHECKNOFP16-NEXT: mov h4, v0.h[4] -; CHECKNOFP16-NEXT: mov h5, v0.h[3] -; CHECKNOFP16-NEXT: mov h6, v0.h[2] -; CHECKNOFP16-NEXT: fcvt s0, h0 -; CHECKNOFP16-NEXT: fcvt s7, h7 -; CHECKNOFP16-NEXT: fadd s0, s0, s7 -; CHECKNOFP16-NEXT: fcvt h0, s0 -; CHECKNOFP16-NEXT: fcvt s6, h6 -; CHECKNOFP16-NEXT: fcvt s0, h0 -; CHECKNOFP16-NEXT: fadd s0, s0, s6 -; CHECKNOFP16-NEXT: fcvt h0, s0 -; CHECKNOFP16-NEXT: fcvt s5, h5 -; CHECKNOFP16-NEXT: fcvt s0, h0 -; CHECKNOFP16-NEXT: fadd s0, s0, s5 -; CHECKNOFP16-NEXT: 
fcvt h0, s0 -; CHECKNOFP16-NEXT: fcvt s4, h4 -; CHECKNOFP16-NEXT: fcvt s0, h0 -; CHECKNOFP16-NEXT: fadd s0, s0, s4 -; CHECKNOFP16-NEXT: fcvt h0, s0 -; CHECKNOFP16-NEXT: fcvt s3, h3 -; CHECKNOFP16-NEXT: fcvt s0, h0 -; CHECKNOFP16-NEXT: fadd s0, s0, s3 -; CHECKNOFP16-NEXT: fcvt h0, s0 +; CHECKNOFP16-NEXT: mov h0, v0.h[7] +; CHECKNOFP16-NEXT: fcvt h1, s1 ; CHECKNOFP16-NEXT: fcvt s2, h2 ; CHECKNOFP16-NEXT: fcvt s0, h0 -; CHECKNOFP16-NEXT: fadd s0, s0, s2 -; CHECKNOFP16-NEXT: fcvt h0, s0 -; CHECKNOFP16-NEXT: fcvt s0, h0 ; CHECKNOFP16-NEXT: fcvt s1, h1 -; CHECKNOFP16-NEXT: fadd s0, s0, s1 +; CHECKNOFP16-NEXT: fadd s1, s1, s2 +; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fadd s0, s1, s0 ; CHECKNOFP16-NEXT: fcvt h0, s0 ; CHECKNOFP16-NEXT: ret %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx) @@ -140,76 +140,76 @@ ; CHECKNOFP16: // %bb.0: ; CHECKNOFP16-NEXT: mov h2, v1.h[1] ; CHECKNOFP16-NEXT: mov h3, v0.h[1] -; CHECKNOFP16-NEXT: mov h6, v1.h[2] -; CHECKNOFP16-NEXT: mov h7, v0.h[2] -; CHECKNOFP16-NEXT: mov h16, v1.h[3] -; CHECKNOFP16-NEXT: mov h17, v0.h[3] ; CHECKNOFP16-NEXT: fcvt s4, h1 ; CHECKNOFP16-NEXT: fcvt s5, h0 ; CHECKNOFP16-NEXT: fcvt s2, h2 ; CHECKNOFP16-NEXT: fcvt s3, h3 -; CHECKNOFP16-NEXT: fcvt s6, h6 -; CHECKNOFP16-NEXT: fcvt s7, h7 -; CHECKNOFP16-NEXT: fcvt s16, h16 -; CHECKNOFP16-NEXT: fcvt s17, h17 ; CHECKNOFP16-NEXT: fadd s4, s5, s4 -; CHECKNOFP16-NEXT: mov h5, v1.h[4] +; CHECKNOFP16-NEXT: mov h5, v0.h[2] ; CHECKNOFP16-NEXT: fadd s2, s3, s2 -; CHECKNOFP16-NEXT: mov h3, v0.h[4] -; CHECKNOFP16-NEXT: fadd s6, s7, s6 -; CHECKNOFP16-NEXT: mov h7, v1.h[5] -; CHECKNOFP16-NEXT: fadd s16, s17, s16 -; CHECKNOFP16-NEXT: mov h17, v0.h[5] +; CHECKNOFP16-NEXT: mov h3, v1.h[2] +; CHECKNOFP16-NEXT: fcvt h4, s4 ; CHECKNOFP16-NEXT: fcvt s5, h5 -; CHECKNOFP16-NEXT: fcvt s3, h3 -; CHECKNOFP16-NEXT: fcvt s7, h7 -; CHECKNOFP16-NEXT: fcvt s17, h17 -; CHECKNOFP16-NEXT: fadd s3, s3, s5 -; CHECKNOFP16-NEXT: mov h5, v1.h[6] -; CHECKNOFP16-NEXT: fadd s7, s17, s7 -; CHECKNOFP16-NEXT: mov h17, v0.h[6] -; CHECKNOFP16-NEXT: mov h1, v1.h[7] -; CHECKNOFP16-NEXT: mov h0, v0.h[7] -; CHECKNOFP16-NEXT: fcvt s1, h1 -; CHECKNOFP16-NEXT: fcvt s0, h0 -; CHECKNOFP16-NEXT: fadd s0, s0, s1 -; CHECKNOFP16-NEXT: fcvt h1, s4 ; CHECKNOFP16-NEXT: fcvt h2, s2 -; CHECKNOFP16-NEXT: fcvt s1, h1 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fcvt s4, h4 ; CHECKNOFP16-NEXT: fcvt s2, h2 -; CHECKNOFP16-NEXT: fadd s1, s1, s2 -; CHECKNOFP16-NEXT: fcvt h2, s6 -; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fadd s3, s5, s3 +; CHECKNOFP16-NEXT: mov h5, v0.h[3] +; CHECKNOFP16-NEXT: fadd s2, s4, s2 +; CHECKNOFP16-NEXT: mov h4, v1.h[3] +; CHECKNOFP16-NEXT: fcvt h3, s3 +; CHECKNOFP16-NEXT: fcvt s5, h5 +; CHECKNOFP16-NEXT: fcvt h2, s2 +; CHECKNOFP16-NEXT: fcvt s4, h4 +; CHECKNOFP16-NEXT: fcvt s3, h3 ; CHECKNOFP16-NEXT: fcvt s2, h2 -; CHECKNOFP16-NEXT: fcvt s1, h1 -; CHECKNOFP16-NEXT: fadd s1, s1, s2 -; CHECKNOFP16-NEXT: fcvt h2, s16 -; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: fadd s4, s5, s4 +; CHECKNOFP16-NEXT: mov h5, v0.h[4] +; CHECKNOFP16-NEXT: fadd s2, s2, s3 +; CHECKNOFP16-NEXT: mov h3, v1.h[4] +; CHECKNOFP16-NEXT: fcvt h4, s4 +; CHECKNOFP16-NEXT: fcvt s5, h5 +; CHECKNOFP16-NEXT: fcvt h2, s2 +; CHECKNOFP16-NEXT: fcvt s3, h3 +; CHECKNOFP16-NEXT: fcvt s4, h4 ; CHECKNOFP16-NEXT: fcvt s2, h2 -; CHECKNOFP16-NEXT: fcvt s1, h1 -; CHECKNOFP16-NEXT: fadd s1, s1, s2 -; CHECKNOFP16-NEXT: fcvt h2, s3 -; CHECKNOFP16-NEXT: fcvt h1, s1 +; CHECKNOFP16-NEXT: 
fadd s3, s5, s3
+; CHECKNOFP16-NEXT: mov h5, v0.h[5]
+; CHECKNOFP16-NEXT: fadd s2, s2, s4
+; CHECKNOFP16-NEXT: mov h4, v1.h[5]
+; CHECKNOFP16-NEXT: fcvt h3, s3
+; CHECKNOFP16-NEXT: fcvt s5, h5
+; CHECKNOFP16-NEXT: fcvt h2, s2
+; CHECKNOFP16-NEXT: fcvt s4, h4
+; CHECKNOFP16-NEXT: fcvt s3, h3
 ; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s1, s1, s2
-; CHECKNOFP16-NEXT: fcvt h3, s7
-; CHECKNOFP16-NEXT: fcvt h1, s1
+; CHECKNOFP16-NEXT: fadd s4, s5, s4
+; CHECKNOFP16-NEXT: mov h5, v0.h[6]
+; CHECKNOFP16-NEXT: mov h0, v0.h[7]
+; CHECKNOFP16-NEXT: fadd s2, s2, s3
+; CHECKNOFP16-NEXT: mov h3, v1.h[6]
+; CHECKNOFP16-NEXT: fcvt h4, s4
 ; CHECKNOFP16-NEXT: fcvt s5, h5
-; CHECKNOFP16-NEXT: fcvt s17, h17
+; CHECKNOFP16-NEXT: mov h1, v1.h[7]
+; CHECKNOFP16-NEXT: fcvt s0, h0
+; CHECKNOFP16-NEXT: fcvt h2, s2
 ; CHECKNOFP16-NEXT: fcvt s3, h3
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s5, s17, s5
-; CHECKNOFP16-NEXT: fadd s1, s1, s3
-; CHECKNOFP16-NEXT: fcvt h4, s5
-; CHECKNOFP16-NEXT: fcvt h1, s1
 ; CHECKNOFP16-NEXT: fcvt s4, h4
 ; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s1, s1, s4
+; CHECKNOFP16-NEXT: fcvt s2, h2
+; CHECKNOFP16-NEXT: fadd s3, s5, s3
+; CHECKNOFP16-NEXT: fadd s0, s0, s1
+; CHECKNOFP16-NEXT: fadd s2, s2, s4
+; CHECKNOFP16-NEXT: fcvt h3, s3
 ; CHECKNOFP16-NEXT: fcvt h0, s0
-; CHECKNOFP16-NEXT: fcvt h1, s1
-; CHECKNOFP16-NEXT: fcvt s1, h1
+; CHECKNOFP16-NEXT: fcvt h2, s2
+; CHECKNOFP16-NEXT: fcvt s3, h3
 ; CHECKNOFP16-NEXT: fcvt s0, h0
+; CHECKNOFP16-NEXT: fcvt s2, h2
+; CHECKNOFP16-NEXT: fadd s2, s2, s3
+; CHECKNOFP16-NEXT: fcvt h1, s2
+; CHECKNOFP16-NEXT: fcvt s1, h1
 ; CHECKNOFP16-NEXT: fadd s0, s1, s0
 ; CHECKNOFP16-NEXT: fcvt h0, s0
 ; CHECKNOFP16-NEXT: ret
@@ -257,8 +257,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT: mov w8, #1109917696
-; CHECK-NEXT: faddp s0, v0.2s
 ; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: faddp s0, v0.2s
 ; CHECK-NEXT: fadd s0, s0, s1
 ; CHECK-NEXT: ret
 ;
@@ -266,8 +266,8 @@
 ; CHECKNOFP16: // %bb.0:
 ; CHECKNOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
 ; CHECKNOFP16-NEXT: mov w8, #1109917696
-; CHECKNOFP16-NEXT: faddp s0, v0.2s
 ; CHECKNOFP16-NEXT: fmov s1, w8
+; CHECKNOFP16-NEXT: faddp s0, v0.2s
 ; CHECKNOFP16-NEXT: fadd s0, s0, s1
 ; CHECKNOFP16-NEXT: ret
 %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
@@ -51,20 +51,20 @@
 ; CHECK-NOFP-LABEL: test_v4f16:
 ; CHECK-NOFP: // %bb.0:
 ; CHECK-NOFP-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NOFP-NEXT: mov h3, v0.h[1]
-; CHECK-NOFP-NEXT: mov h1, v0.h[3]
+; CHECK-NOFP-NEXT: mov h1, v0.h[1]
+; CHECK-NOFP-NEXT: fcvt s2, h0
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmaxnm s1, s2, s1
 ; CHECK-NOFP-NEXT: mov h2, v0.h[2]
-; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcvt s3, h3
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s3
-; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: mov h0, v0.h[3]
+; CHECK-NOFP-NEXT: fcvt h1, s1
 ; CHECK-NOFP-NEXT: fcvt s2, h2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s2
-; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcvt s0, h0
 ; CHECK-NOFP-NEXT: fcvt s1, h1
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
+; CHECK-NOFP-NEXT: fmaxnm s1, s1, s2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmaxnm s0, s1, s0
 ; CHECK-NOFP-NEXT: fcvt h0, s0
 ; CHECK-NOFP-NEXT: ret
 ;
@@ -80,20 +80,20 @@
 ; CHECK-NOFP-LABEL: test_v4f16_ninf:
 ; CHECK-NOFP: // %bb.0:
 ; CHECK-NOFP-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NOFP-NEXT: mov h3, v0.h[1]
-; CHECK-NOFP-NEXT: mov h1, v0.h[3]
+; CHECK-NOFP-NEXT: mov h1, v0.h[1]
+; CHECK-NOFP-NEXT: fcvt s2, h0
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmaxnm s1, s2, s1
 ; CHECK-NOFP-NEXT: mov h2, v0.h[2]
-; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcvt s3, h3
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s3
-; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: mov h0, v0.h[3]
+; CHECK-NOFP-NEXT: fcvt h1, s1
 ; CHECK-NOFP-NEXT: fcvt s2, h2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s2
-; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcvt s0, h0
 ; CHECK-NOFP-NEXT: fcvt s1, h1
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
+; CHECK-NOFP-NEXT: fmaxnm s1, s1, s2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmaxnm s0, s1, s0
 ; CHECK-NOFP-NEXT: fcvt h0, s0
 ; CHECK-NOFP-NEXT: ret
 ;
@@ -108,104 +108,104 @@
 define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-NOFP-LABEL: test_v11f16:
 ; CHECK-NOFP: // %bb.0:
-; CHECK-NOFP-NEXT: ldr h18, [sp, #8]
-; CHECK-NOFP-NEXT: ldr h17, [sp]
-; CHECK-NOFP-NEXT: ldr h16, [sp, #16]
+; CHECK-NOFP-NEXT: ldr h16, [sp, #8]
 ; CHECK-NOFP-NEXT: fcvt s1, h1
-; CHECK-NOFP-NEXT: fcvt s18, h18
+; CHECK-NOFP-NEXT: ldr h17, [sp]
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcmp s1, s18
-; CHECK-NOFP-NEXT: fcvt s17, h17
+; CHECK-NOFP-NEXT: fcvt s2, h2
 ; CHECK-NOFP-NEXT: adrp x8, .LCPI6_0
-; CHECK-NOFP-NEXT: fcsel s1, s1, s18, gt
+; CHECK-NOFP-NEXT: fcvt s16, h16
+; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcvt s17, h17
+; CHECK-NOFP-NEXT: fcmp s1, s16
+; CHECK-NOFP-NEXT: fcsel s1, s1, s16, gt
 ; CHECK-NOFP-NEXT: fcmp s0, s17
-; CHECK-NOFP-NEXT: ldr h18, [x8, :lo12:.LCPI6_0]
+; CHECK-NOFP-NEXT: ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT: fcsel s0, s0, s17, gt
-; CHECK-NOFP-NEXT: fcvt s2, h2
-; CHECK-NOFP-NEXT: fcvt s16, h16
 ; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s16, h16
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s2, s16
 ; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fcmp s2, s16
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s2, s2, s16, gt
 ; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcsel s1, s2, s16, gt
+; CHECK-NOFP-NEXT: ldr h2, [x8, :lo12:.LCPI6_0]
 ; CHECK-NOFP-NEXT: mov w8, #-8388608
-; CHECK-NOFP-NEXT: fcvt s18, h18
-; CHECK-NOFP-NEXT: fcvt h2, s2
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fmov s17, w8
-; CHECK-NOFP-NEXT: fcmp s3, s18
-; CHECK-NOFP-NEXT: fcvt s1, h2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s2, h2
+; CHECK-NOFP-NEXT: fmov s16, w8
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s3, s3, s17, gt
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s4, h4
-; CHECK-NOFP-NEXT: fcvt h2, s3
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, gt
+; CHECK-NOFP-NEXT: fcvt s3, h4
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s4, s18
-; CHECK-NOFP-NEXT: fcvt s2, h2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s3, s4, s17, gt
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s2
-; CHECK-NOFP-NEXT: fcvt s5, h5
-; CHECK-NOFP-NEXT: fcvt h3, s3
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, gt
+; CHECK-NOFP-NEXT: fcvt s3, h5
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s5, s18
-; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s4, s5, s17, gt
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s3
-; CHECK-NOFP-NEXT: fcvt s6, h6
-; CHECK-NOFP-NEXT: fcvt h4, s4
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, gt
+; CHECK-NOFP-NEXT: fcvt s3, h6
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s6, s18
-; CHECK-NOFP-NEXT: fcvt s1, h4
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s5, s6, s17, gt
+; CHECK-NOFP-NEXT: fcvt s1, h1
 ; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s7, h7
-; CHECK-NOFP-NEXT: fcvt h4, s5
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, gt
+; CHECK-NOFP-NEXT: fcvt s3, h7
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s7, s18
-; CHECK-NOFP-NEXT: fcvt s4, h4
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s5, s7, s17, gt
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s4
-; CHECK-NOFP-NEXT: fcvt h5, s5
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, gt
 ; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: fcvt h1, s1
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcvt s1, h5
+; CHECK-NOFP-NEXT: fcvt s1, h1
 ; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
 ; CHECK-NOFP-NEXT: fcvt h0, s0
 ; CHECK-NOFP-NEXT: ret
 ;
 ; CHECK-FP-LABEL: test_v11f16:
 ; CHECK-FP: // %bb.0:
-; CHECK-FP-NEXT: movi v16.8h, #252, lsl #8
-; CHECK-FP-NEXT: mov x8, sp
-; CHECK-FP-NEXT: ld1 { v16.h }[0], [x8]
-; CHECK-FP-NEXT: add x8, sp, #8
 ; CHECK-FP-NEXT: // kill: def $h0 killed $h0 def $q0
 ; CHECK-FP-NEXT: // kill: def $h1 killed $h1 def $q1
 ; CHECK-FP-NEXT: // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT: // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT: // kill: def $h4 killed $h4 def $q4
+; CHECK-FP-NEXT: mov x8, sp
 ; CHECK-FP-NEXT: // kill: def $h5 killed $h5 def $q5
 ; CHECK-FP-NEXT: // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT: // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-FP-NEXT: ld1 { v16.h }[1], [x8]
+; CHECK-FP-NEXT: movi v1.8h, #252, lsl #8
 ; CHECK-FP-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-FP-NEXT: ld1 { v1.h }[0], [x8]
+; CHECK-FP-NEXT: add x8, sp, #8
 ; CHECK-FP-NEXT: mov v0.h[3], v3.h[0]
+; CHECK-FP-NEXT: ld1 { v1.h }[1], [x8]
 ; CHECK-FP-NEXT: add x8, sp, #16
 ; CHECK-FP-NEXT: mov v0.h[4], v4.h[0]
-; CHECK-FP-NEXT: ld1 { v16.h }[2], [x8]
+; CHECK-FP-NEXT: ld1 { v1.h }[2], [x8]
 ; CHECK-FP-NEXT: mov v0.h[5], v5.h[0]
 ; CHECK-FP-NEXT: mov v0.h[6], v6.h[0]
 ; CHECK-FP-NEXT: mov v0.h[7], v7.h[0]
-; CHECK-FP-NEXT: fmaxnm v0.8h, v0.8h, v16.8h
+; CHECK-FP-NEXT: fmaxnm v0.8h, v0.8h, v1.8h
 ; CHECK-FP-NEXT: fmaxnmv h0, v0.8h
 ; CHECK-FP-NEXT: ret
 %b = call nnan half @llvm.vector.reduce.fmax.v11f16(<11 x half> %a)
@@ -215,105 +215,105 @@
 define half @test_v11f16_ninf(<11 x half> %a) nounwind {
 ; CHECK-NOFP-LABEL: test_v11f16_ninf:
 ; CHECK-NOFP: // %bb.0:
-; CHECK-NOFP-NEXT: ldr h18, [sp, #8]
-; CHECK-NOFP-NEXT: ldr h17, [sp]
-; CHECK-NOFP-NEXT: ldr h16, [sp, #16]
+; CHECK-NOFP-NEXT: ldr h16, [sp, #8]
 ; CHECK-NOFP-NEXT: fcvt s1, h1
-; CHECK-NOFP-NEXT: fcvt s18, h18
+; CHECK-NOFP-NEXT: ldr h17, [sp]
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcmp s1, s18
-; CHECK-NOFP-NEXT: fcvt s17, h17
+; CHECK-NOFP-NEXT: fcvt s2, h2
 ; CHECK-NOFP-NEXT: adrp x8, .LCPI7_0
-; CHECK-NOFP-NEXT: fcsel s1, s1, s18, gt
+; CHECK-NOFP-NEXT: fcvt s16, h16
+; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcvt s17, h17
+; CHECK-NOFP-NEXT: fcmp s1, s16
+; CHECK-NOFP-NEXT: fcsel s1, s1, s16, gt
 ; CHECK-NOFP-NEXT: fcmp s0, s17
-; CHECK-NOFP-NEXT: ldr h18, [x8, :lo12:.LCPI7_0]
+; CHECK-NOFP-NEXT: ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT: fcsel s0, s0, s17, gt
-; CHECK-NOFP-NEXT: fcvt s2, h2
-; CHECK-NOFP-NEXT: fcvt s16, h16
 ; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s16, h16
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s2, s16
 ; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fcmp s2, s16
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: mov w8, #57344
-; CHECK-NOFP-NEXT: fcsel s2, s2, s16, gt
 ; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcsel s1, s2, s16, gt
+; CHECK-NOFP-NEXT: ldr h2, [x8, :lo12:.LCPI7_0]
+; CHECK-NOFP-NEXT: mov w8, #57344
 ; CHECK-NOFP-NEXT: movk w8, #51071, lsl #16
-; CHECK-NOFP-NEXT: fcvt s18, h18
-; CHECK-NOFP-NEXT: fcvt h2, s2
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fmov s17, w8
-; CHECK-NOFP-NEXT: fcmp s3, s18
-; CHECK-NOFP-NEXT: fcvt s1, h2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s2, h2
+; CHECK-NOFP-NEXT: fmov s16, w8
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s3, s3, s17, gt
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s4, h4
-; CHECK-NOFP-NEXT: fcvt h2, s3
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, gt
+; CHECK-NOFP-NEXT: fcvt s3, h4
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s4, s18
-; CHECK-NOFP-NEXT: fcvt s2, h2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s3, s4, s17, gt
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s2
-; CHECK-NOFP-NEXT: fcvt s5, h5
-; CHECK-NOFP-NEXT: fcvt h3, s3
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, gt
+; CHECK-NOFP-NEXT: fcvt s3, h5
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s5, s18
-; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s4, s5, s17, gt
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s3
-; CHECK-NOFP-NEXT: fcvt s6, h6
-; CHECK-NOFP-NEXT: fcvt h4, s4
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, gt
+; CHECK-NOFP-NEXT: fcvt s3, h6
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s6, s18
-; CHECK-NOFP-NEXT: fcvt s1, h4
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s5, s6, s17, gt
+; CHECK-NOFP-NEXT: fcvt s1, h1
 ; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s7, h7
-; CHECK-NOFP-NEXT: fcvt h4, s5
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, gt
+; CHECK-NOFP-NEXT: fcvt s3, h7
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s7, s18
-; CHECK-NOFP-NEXT: fcvt s4, h4
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s5, s7, s17, gt
-; CHECK-NOFP-NEXT: fmaxnm s0, s0, s4
-; CHECK-NOFP-NEXT: fcvt h5, s5
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, gt
 ; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: fcvt h1, s1
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcvt s1, h5
+; CHECK-NOFP-NEXT: fcvt s1, h1
 ; CHECK-NOFP-NEXT: fmaxnm s0, s0, s1
 ; CHECK-NOFP-NEXT: fcvt h0, s0
 ; CHECK-NOFP-NEXT: ret
 ;
 ; CHECK-FP-LABEL: test_v11f16_ninf:
 ; CHECK-FP: // %bb.0:
-; CHECK-FP-NEXT: mvni v16.8h, #4, lsl #8
-; CHECK-FP-NEXT: mov x8, sp
-; CHECK-FP-NEXT: ld1 { v16.h }[0], [x8]
-; CHECK-FP-NEXT: add x8, sp, #8
 ; CHECK-FP-NEXT: // kill: def $h0 killed $h0 def $q0
 ; CHECK-FP-NEXT: // kill: def $h1 killed $h1 def $q1
 ; CHECK-FP-NEXT: // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT: // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT: // kill: def $h4 killed $h4 def $q4
+; CHECK-FP-NEXT: mov x8, sp
 ; CHECK-FP-NEXT: // kill: def $h5 killed $h5 def $q5
 ; CHECK-FP-NEXT: // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT: // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-FP-NEXT: ld1 { v16.h }[1], [x8]
+; CHECK-FP-NEXT: mvni v1.8h, #4, lsl #8
 ; CHECK-FP-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-FP-NEXT: ld1 { v1.h }[0], [x8]
+; CHECK-FP-NEXT: add x8, sp, #8
 ; CHECK-FP-NEXT: mov v0.h[3], v3.h[0]
+; CHECK-FP-NEXT: ld1 { v1.h }[1], [x8]
 ; CHECK-FP-NEXT: add x8, sp, #16
 ; CHECK-FP-NEXT: mov v0.h[4], v4.h[0]
-; CHECK-FP-NEXT: ld1 { v16.h }[2], [x8]
+; CHECK-FP-NEXT: ld1 { v1.h }[2], [x8]
 ; CHECK-FP-NEXT: mov v0.h[5], v5.h[0]
 ; CHECK-FP-NEXT: mov v0.h[6], v6.h[0]
 ; CHECK-FP-NEXT: mov v0.h[7], v7.h[0]
-; CHECK-FP-NEXT: fmaxnm v0.8h, v0.8h, v16.8h
+; CHECK-FP-NEXT: fmaxnm v0.8h, v0.8h, v1.8h
 ; CHECK-FP-NEXT: fmaxnmv h0, v0.8h
 ; CHECK-FP-NEXT: ret
 %b = call nnan ninf half @llvm.vector.reduce.fmax.v11f16(<11 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
@@ -51,20 +51,20 @@
 ; CHECK-NOFP-LABEL: test_v4f16:
 ; CHECK-NOFP: // %bb.0:
 ; CHECK-NOFP-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NOFP-NEXT: mov h3, v0.h[1]
-; CHECK-NOFP-NEXT: mov h1, v0.h[3]
+; CHECK-NOFP-NEXT: mov h1, v0.h[1]
+; CHECK-NOFP-NEXT: fcvt s2, h0
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fminnm s1, s2, s1
 ; CHECK-NOFP-NEXT: mov h2, v0.h[2]
-; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcvt s3, h3
-; CHECK-NOFP-NEXT: fminnm s0, s0, s3
-; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: mov h0, v0.h[3]
+; CHECK-NOFP-NEXT: fcvt h1, s1
 ; CHECK-NOFP-NEXT: fcvt s2, h2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fminnm s0, s0, s2
-; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcvt s0, h0
 ; CHECK-NOFP-NEXT: fcvt s1, h1
-; CHECK-NOFP-NEXT: fminnm s0, s0, s1
+; CHECK-NOFP-NEXT: fminnm s1, s1, s2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fminnm s0, s1, s0
 ; CHECK-NOFP-NEXT: fcvt h0, s0
 ; CHECK-NOFP-NEXT: ret
 ;
@@ -80,20 +80,20 @@
 ; CHECK-NOFP-LABEL: test_v4f16_ninf:
 ; CHECK-NOFP: // %bb.0:
 ; CHECK-NOFP-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NOFP-NEXT: mov h3, v0.h[1]
-; CHECK-NOFP-NEXT: mov h1, v0.h[3]
+; CHECK-NOFP-NEXT: mov h1, v0.h[1]
+; CHECK-NOFP-NEXT: fcvt s2, h0
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fminnm s1, s2, s1
 ; CHECK-NOFP-NEXT: mov h2, v0.h[2]
-; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcvt s3, h3
-; CHECK-NOFP-NEXT: fminnm s0, s0, s3
-; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: mov h0, v0.h[3]
+; CHECK-NOFP-NEXT: fcvt h1, s1
 ; CHECK-NOFP-NEXT: fcvt s2, h2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fminnm s0, s0, s2
-; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcvt s0, h0
 ; CHECK-NOFP-NEXT: fcvt s1, h1
-; CHECK-NOFP-NEXT: fminnm s0, s0, s1
+; CHECK-NOFP-NEXT: fminnm s1, s1, s2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fminnm s0, s1, s0
 ; CHECK-NOFP-NEXT: fcvt h0, s0
 ; CHECK-NOFP-NEXT: ret
 ;
@@ -108,104 +108,104 @@
 define half @test_v11f16(<11 x half> %a) nounwind {
 ; CHECK-NOFP-LABEL: test_v11f16:
 ; CHECK-NOFP: // %bb.0:
-; CHECK-NOFP-NEXT: ldr h18, [sp, #8]
-; CHECK-NOFP-NEXT: ldr h17, [sp]
-; CHECK-NOFP-NEXT: ldr h16, [sp, #16]
+; CHECK-NOFP-NEXT: ldr h16, [sp, #8]
 ; CHECK-NOFP-NEXT: fcvt s1, h1
-; CHECK-NOFP-NEXT: fcvt s18, h18
+; CHECK-NOFP-NEXT: ldr h17, [sp]
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcmp s1, s18
-; CHECK-NOFP-NEXT: fcvt s17, h17
+; CHECK-NOFP-NEXT: fcvt s2, h2
 ; CHECK-NOFP-NEXT: adrp x8, .LCPI6_0
-; CHECK-NOFP-NEXT: fcsel s1, s1, s18, lt
+; CHECK-NOFP-NEXT: fcvt s16, h16
+; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcvt s17, h17
+; CHECK-NOFP-NEXT: fcmp s1, s16
+; CHECK-NOFP-NEXT: fcsel s1, s1, s16, lt
 ; CHECK-NOFP-NEXT: fcmp s0, s17
-; CHECK-NOFP-NEXT: ldr h18, [x8, :lo12:.LCPI6_0]
+; CHECK-NOFP-NEXT: ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT: fcsel s0, s0, s17, lt
-; CHECK-NOFP-NEXT: fcvt s2, h2
-; CHECK-NOFP-NEXT: fcvt s16, h16
 ; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s16, h16
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s2, s16
 ; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fcmp s2, s16
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s2, s2, s16, lt
 ; CHECK-NOFP-NEXT: fminnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcsel s1, s2, s16, lt
+; CHECK-NOFP-NEXT: ldr h2, [x8, :lo12:.LCPI6_0]
 ; CHECK-NOFP-NEXT: mov w8, #2139095040
-; CHECK-NOFP-NEXT: fcvt s18, h18
-; CHECK-NOFP-NEXT: fcvt h2, s2
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fmov s17, w8
-; CHECK-NOFP-NEXT: fcmp s3, s18
-; CHECK-NOFP-NEXT: fcvt s1, h2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s2, h2
+; CHECK-NOFP-NEXT: fmov s16, w8
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s3, s3, s17, lt
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fminnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s4, h4
-; CHECK-NOFP-NEXT: fcvt h2, s3
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, lt
+; CHECK-NOFP-NEXT: fcvt s3, h4
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s4, s18
-; CHECK-NOFP-NEXT: fcvt s2, h2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s3, s4, s17, lt
-; CHECK-NOFP-NEXT: fminnm s0, s0, s2
-; CHECK-NOFP-NEXT: fcvt s5, h5
-; CHECK-NOFP-NEXT: fcvt h3, s3
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fminnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, lt
+; CHECK-NOFP-NEXT: fcvt s3, h5
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s5, s18
-; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s4, s5, s17, lt
-; CHECK-NOFP-NEXT: fminnm s0, s0, s3
-; CHECK-NOFP-NEXT: fcvt s6, h6
-; CHECK-NOFP-NEXT: fcvt h4, s4
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fminnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, lt
+; CHECK-NOFP-NEXT: fcvt s3, h6
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s6, s18
-; CHECK-NOFP-NEXT: fcvt s1, h4
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s5, s6, s17, lt
+; CHECK-NOFP-NEXT: fcvt s1, h1
 ; CHECK-NOFP-NEXT: fminnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s7, h7
-; CHECK-NOFP-NEXT: fcvt h4, s5
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, lt
+; CHECK-NOFP-NEXT: fcvt s3, h7
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s7, s18
-; CHECK-NOFP-NEXT: fcvt s4, h4
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s5, s7, s17, lt
-; CHECK-NOFP-NEXT: fminnm s0, s0, s4
-; CHECK-NOFP-NEXT: fcvt h5, s5
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fminnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, lt
 ; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: fcvt h1, s1
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcvt s1, h5
+; CHECK-NOFP-NEXT: fcvt s1, h1
 ; CHECK-NOFP-NEXT: fminnm s0, s0, s1
 ; CHECK-NOFP-NEXT: fcvt h0, s0
 ; CHECK-NOFP-NEXT: ret
 ;
 ; CHECK-FP-LABEL: test_v11f16:
 ; CHECK-FP: // %bb.0:
-; CHECK-FP-NEXT: movi v16.8h, #124, lsl #8
-; CHECK-FP-NEXT: mov x8, sp
-; CHECK-FP-NEXT: ld1 { v16.h }[0], [x8]
-; CHECK-FP-NEXT: add x8, sp, #8
 ; CHECK-FP-NEXT: // kill: def $h0 killed $h0 def $q0
 ; CHECK-FP-NEXT: // kill: def $h1 killed $h1 def $q1
 ; CHECK-FP-NEXT: // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT: // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT: // kill: def $h4 killed $h4 def $q4
+; CHECK-FP-NEXT: mov x8, sp
 ; CHECK-FP-NEXT: // kill: def $h5 killed $h5 def $q5
 ; CHECK-FP-NEXT: // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT: // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-FP-NEXT: ld1 { v16.h }[1], [x8]
+; CHECK-FP-NEXT: movi v1.8h, #124, lsl #8
 ; CHECK-FP-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-FP-NEXT: ld1 { v1.h }[0], [x8]
+; CHECK-FP-NEXT: add x8, sp, #8
 ; CHECK-FP-NEXT: mov v0.h[3], v3.h[0]
+; CHECK-FP-NEXT: ld1 { v1.h }[1], [x8]
 ; CHECK-FP-NEXT: add x8, sp, #16
 ; CHECK-FP-NEXT: mov v0.h[4], v4.h[0]
-; CHECK-FP-NEXT: ld1 { v16.h }[2], [x8]
+; CHECK-FP-NEXT: ld1 { v1.h }[2], [x8]
 ; CHECK-FP-NEXT: mov v0.h[5], v5.h[0]
 ; CHECK-FP-NEXT: mov v0.h[6], v6.h[0]
 ; CHECK-FP-NEXT: mov v0.h[7], v7.h[0]
-; CHECK-FP-NEXT: fminnm v0.8h, v0.8h, v16.8h
+; CHECK-FP-NEXT: fminnm v0.8h, v0.8h, v1.8h
 ; CHECK-FP-NEXT: fminnmv h0, v0.8h
 ; CHECK-FP-NEXT: ret
 %b = call nnan half @llvm.vector.reduce.fmin.v11f16(<11 x half> %a)
@@ -215,105 +215,105 @@
 define half @test_v11f16_ninf(<11 x half> %a) nounwind {
 ; CHECK-NOFP-LABEL: test_v11f16_ninf:
 ; CHECK-NOFP: // %bb.0:
-; CHECK-NOFP-NEXT: ldr h18, [sp, #8]
-; CHECK-NOFP-NEXT: ldr h17, [sp]
-; CHECK-NOFP-NEXT: ldr h16, [sp, #16]
+; CHECK-NOFP-NEXT: ldr h16, [sp, #8]
 ; CHECK-NOFP-NEXT: fcvt s1, h1
-; CHECK-NOFP-NEXT: fcvt s18, h18
+; CHECK-NOFP-NEXT: ldr h17, [sp]
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcmp s1, s18
-; CHECK-NOFP-NEXT: fcvt s17, h17
+; CHECK-NOFP-NEXT: fcvt s2, h2
 ; CHECK-NOFP-NEXT: adrp x8, .LCPI7_0
-; CHECK-NOFP-NEXT: fcsel s1, s1, s18, lt
+; CHECK-NOFP-NEXT: fcvt s16, h16
+; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcvt s17, h17
+; CHECK-NOFP-NEXT: fcmp s1, s16
+; CHECK-NOFP-NEXT: fcsel s1, s1, s16, lt
 ; CHECK-NOFP-NEXT: fcmp s0, s17
-; CHECK-NOFP-NEXT: ldr h18, [x8, :lo12:.LCPI7_0]
+; CHECK-NOFP-NEXT: ldr h16, [sp, #16]
 ; CHECK-NOFP-NEXT: fcsel s0, s0, s17, lt
-; CHECK-NOFP-NEXT: fcvt s2, h2
-; CHECK-NOFP-NEXT: fcvt s16, h16
 ; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s16, h16
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s2, s16
 ; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fcmp s2, s16
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: mov w8, #57344
-; CHECK-NOFP-NEXT: fcsel s2, s2, s16, lt
 ; CHECK-NOFP-NEXT: fminnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcsel s1, s2, s16, lt
+; CHECK-NOFP-NEXT: ldr h2, [x8, :lo12:.LCPI7_0]
+; CHECK-NOFP-NEXT: mov w8, #57344
 ; CHECK-NOFP-NEXT: movk w8, #18303, lsl #16
-; CHECK-NOFP-NEXT: fcvt s18, h18
-; CHECK-NOFP-NEXT: fcvt h2, s2
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fmov s17, w8
-; CHECK-NOFP-NEXT: fcmp s3, s18
-; CHECK-NOFP-NEXT: fcvt s1, h2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcvt s2, h2
+; CHECK-NOFP-NEXT: fmov s16, w8
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s3, s3, s17, lt
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fminnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s4, h4
-; CHECK-NOFP-NEXT: fcvt h2, s3
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, lt
+; CHECK-NOFP-NEXT: fcvt s3, h4
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s4, s18
-; CHECK-NOFP-NEXT: fcvt s2, h2
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s3, s4, s17, lt
-; CHECK-NOFP-NEXT: fminnm s0, s0, s2
-; CHECK-NOFP-NEXT: fcvt s5, h5
-; CHECK-NOFP-NEXT: fcvt h3, s3
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fminnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, lt
+; CHECK-NOFP-NEXT: fcvt s3, h5
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s5, s18
-; CHECK-NOFP-NEXT: fcvt s3, h3
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s4, s5, s17, lt
-; CHECK-NOFP-NEXT: fminnm s0, s0, s3
-; CHECK-NOFP-NEXT: fcvt s6, h6
-; CHECK-NOFP-NEXT: fcvt h4, s4
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fminnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, lt
+; CHECK-NOFP-NEXT: fcvt s3, h6
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s6, s18
-; CHECK-NOFP-NEXT: fcvt s1, h4
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s5, s6, s17, lt
+; CHECK-NOFP-NEXT: fcvt s1, h1
 ; CHECK-NOFP-NEXT: fminnm s0, s0, s1
-; CHECK-NOFP-NEXT: fcvt s7, h7
-; CHECK-NOFP-NEXT: fcvt h4, s5
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, lt
+; CHECK-NOFP-NEXT: fcvt s3, h7
 ; CHECK-NOFP-NEXT: fcvt h0, s0
-; CHECK-NOFP-NEXT: fcmp s7, s18
-; CHECK-NOFP-NEXT: fcvt s4, h4
+; CHECK-NOFP-NEXT: fcvt h1, s1
+; CHECK-NOFP-NEXT: fcmp s3, s2
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcsel s5, s7, s17, lt
-; CHECK-NOFP-NEXT: fminnm s0, s0, s4
-; CHECK-NOFP-NEXT: fcvt h5, s5
+; CHECK-NOFP-NEXT: fcvt s1, h1
+; CHECK-NOFP-NEXT: fminnm s0, s0, s1
+; CHECK-NOFP-NEXT: fcsel s1, s3, s16, lt
 ; CHECK-NOFP-NEXT: fcvt h0, s0
+; CHECK-NOFP-NEXT: fcvt h1, s1
 ; CHECK-NOFP-NEXT: fcvt s0, h0
-; CHECK-NOFP-NEXT: fcvt s1, h5
+; CHECK-NOFP-NEXT: fcvt s1, h1
 ; CHECK-NOFP-NEXT: fminnm s0, s0, s1
 ; CHECK-NOFP-NEXT: fcvt h0, s0
 ; CHECK-NOFP-NEXT: ret
 ;
 ; CHECK-FP-LABEL: test_v11f16_ninf:
 ; CHECK-FP: // %bb.0:
-; CHECK-FP-NEXT: mvni v16.8h, #132, lsl #8
-; CHECK-FP-NEXT: mov x8, sp
-; CHECK-FP-NEXT: ld1 { v16.h }[0], [x8]
-; CHECK-FP-NEXT: add x8, sp, #8
 ; CHECK-FP-NEXT: // kill: def $h0 killed $h0 def $q0
 ; CHECK-FP-NEXT: // kill: def $h1 killed $h1 def $q1
 ; CHECK-FP-NEXT: // kill: def $h2 killed $h2 def $q2
 ; CHECK-FP-NEXT: // kill: def $h3 killed $h3 def $q3
 ; CHECK-FP-NEXT: // kill: def $h4 killed $h4 def $q4
+; CHECK-FP-NEXT: mov x8, sp
 ; CHECK-FP-NEXT: // kill: def $h5 killed $h5 def $q5
 ; CHECK-FP-NEXT: // kill: def $h6 killed $h6 def $q6
 ; CHECK-FP-NEXT: // kill: def $h7 killed $h7 def $q7
 ; CHECK-FP-NEXT: mov v0.h[1], v1.h[0]
-; CHECK-FP-NEXT: ld1 { v16.h }[1], [x8]
+; CHECK-FP-NEXT: mvni v1.8h, #132, lsl #8
 ; CHECK-FP-NEXT: mov v0.h[2], v2.h[0]
+; CHECK-FP-NEXT: ld1 { v1.h }[0], [x8]
+; CHECK-FP-NEXT: add x8, sp, #8
 ; CHECK-FP-NEXT: mov v0.h[3], v3.h[0]
+; CHECK-FP-NEXT: ld1 { v1.h }[1], [x8]
 ; CHECK-FP-NEXT: add x8, sp, #16
 ; CHECK-FP-NEXT: mov v0.h[4], v4.h[0]
-; CHECK-FP-NEXT: ld1 { v16.h }[2], [x8]
+; CHECK-FP-NEXT: ld1 { v1.h }[2], [x8]
 ; CHECK-FP-NEXT: mov v0.h[5], v5.h[0]
 ; CHECK-FP-NEXT: mov v0.h[6], v6.h[0]
 ; CHECK-FP-NEXT: mov v0.h[7], v7.h[0]
-; CHECK-FP-NEXT: fminnm v0.8h, v0.8h, v16.8h
+; CHECK-FP-NEXT: fminnm v0.8h, v0.8h, v1.8h
 ; CHECK-FP-NEXT: fminnmv h0, v0.8h
 ; CHECK-FP-NEXT: ret
 %b = call nnan ninf half @llvm.vector.reduce.fmin.v11f16(<11 x half> %a)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -143,13 +143,13 @@
 ; CHECK-LABEL: test_v4i1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: orr w10, w11, w10
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: orr w9, w10, w9
+; CHECK-NEXT: umov w8, v0.h[1]
+; CHECK-NEXT: umov w9, v0.h[0]
+; CHECK-NEXT: umov w10, v0.h[2]
+; CHECK-NEXT: umov w11, v0.h[3]
 ; CHECK-NEXT: orr w8, w9, w8
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: orr w8, w8, w11
 ; CHECK-NEXT: and w0, w8, #0x1
 ; CHECK-NEXT: ret
 %b = call i1 @llvm.vector.reduce.umax.v4i1(<4 x i1> %a)
@@ -174,8 +174,8 @@
 ; CHECK-NEXT: csel x8, x0, x2, hi
 ; CHECK-NEXT: cmp x1, x3
 ; CHECK-NEXT: csel x9, x0, x2, hi
-; CHECK-NEXT: csel x0, x8, x9, eq
 ; CHECK-NEXT: csel x1, x1, x3, hi
+; CHECK-NEXT: csel x0, x8, x9, eq
 ; CHECK-NEXT: ret
 %b = call i128 @llvm.vector.reduce.umax.v2i128(<2 x i128> %a)
 ret i128 %b
diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
--- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
@@ -56,8 +56,8 @@
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: movi.2d v2, #0000000000000000
 ; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: fneg.2d v2, v2
 ; CHECK-NEXT: ; kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: fneg.2d v2, v2
 ; CHECK-NEXT: bit.16b v0, v1, v2
 ; CHECK-NEXT: ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
@@ -82,8 +82,8 @@
 define <2 x float> @test_copysign_v2f32_v2f64(<2 x float> %a, <2 x double> %b) #0 {
 ; CHECK-LABEL: test_copysign_v2f32_v2f64:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: fcvtn v1.2s, v1.2d
 ; CHECK-NEXT: movi.2s v2, #128, lsl #24
+; CHECK-NEXT: fcvtn v1.2s, v1.2d
 ; CHECK-NEXT: bit.8b v0, v1, v2
 ; CHECK-NEXT: ret
 %tmp0 = fptrunc <2 x double> %b to <2 x float>
@@ -110,9 +110,9 @@
 ; CHECK-LABEL: test_copysign_v4f32_v4f64:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: fcvtn v1.2s, v1.2d
+; CHECK-NEXT: movi.4s v3, #128, lsl #24
 ; CHECK-NEXT: fcvtn2 v1.4s, v2.2d
-; CHECK-NEXT: movi.4s v2, #128, lsl #24
-; CHECK-NEXT: bit.16b v0, v1, v2
+; CHECK-NEXT: bit.16b v0, v1, v3
 ; CHECK-NEXT: ret
 %tmp0 = fptrunc <4 x double> %b to <4 x float>
 %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %tmp0)
@@ -155,12 +155,12 @@
 define <4 x double> @test_copysign_v4f64_v4f32(<4 x double> %a, <4 x float> %b) #0 {
 ; CHECK-LABEL: test_copysign_v4f64_v4f32:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.2d v4, #0000000000000000
-; CHECK-NEXT: fcvtl v3.2d, v2.2s
-; CHECK-NEXT: fcvtl2 v2.2d, v2.4s
-; CHECK-NEXT: fneg.2d v4, v4
-; CHECK-NEXT: bit.16b v1, v2, v4
-; CHECK-NEXT: bit.16b v0, v3, v4
+; CHECK-NEXT: movi.2d v3, #0000000000000000
+; CHECK-NEXT: fcvtl2 v4.2d, v2.4s
+; CHECK-NEXT: fcvtl v2.2d, v2.2s
+; CHECK-NEXT: fneg.2d v3, v3
+; CHECK-NEXT: bit.16b v1, v4, v3
+; CHECK-NEXT: bit.16b v0, v2, v3
 ; CHECK-NEXT: ret
 %tmp0 = fpext <4 x float> %b to <4 x double>
 %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %tmp0)
@@ -189,31 +189,31 @@
 ; NOFP16: ; %bb.0:
 ; NOFP16-NEXT: ; kill: def $d1 killed $d1 def $q1
 ; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0
-; NOFP16-NEXT: mov h2, v1[1]
-; NOFP16-NEXT: mov h3, v0[1]
-; NOFP16-NEXT: movi.4s v4, #128, lsl #24
+; NOFP16-NEXT: mov h3, v1[1]
+; NOFP16-NEXT: mov h4, v0[1]
+; NOFP16-NEXT: movi.4s v2, #128, lsl #24
 ; NOFP16-NEXT: fcvt s5, h1
 ; NOFP16-NEXT: fcvt s6, h0
-; NOFP16-NEXT: fcvt s2, h2
+; NOFP16-NEXT: mov h7, v1[2]
+; NOFP16-NEXT: mov h16, v0[2]
 ; NOFP16-NEXT: fcvt s3, h3
-; NOFP16-NEXT: bit.16b v6, v5, v4
-; NOFP16-NEXT: mov h5, v1[2]
-; NOFP16-NEXT: bit.16b v3, v2, v4
-; NOFP16-NEXT: mov h2, v0[2]
-; NOFP16-NEXT: fcvt s5, h5
-; NOFP16-NEXT: fcvt s2, h2
+; NOFP16-NEXT: fcvt s4, h4
 ; NOFP16-NEXT: mov h1, v1[3]
-; NOFP16-NEXT: mov h0, v0[3]
-; NOFP16-NEXT: bit.16b v2, v5, v4
-; NOFP16-NEXT: fcvt s1, h1
-; NOFP16-NEXT: fcvt s5, h0
+; NOFP16-NEXT: bit.16b v6, v5, v2
+; NOFP16-NEXT: fcvt s5, h7
+; NOFP16-NEXT: fcvt s7, h16
+; NOFP16-NEXT: bit.16b v4, v3, v2
+; NOFP16-NEXT: mov h3, v0[3]
 ; NOFP16-NEXT: fcvt h0, s6
-; NOFP16-NEXT: bit.16b v5, v1, v4
+; NOFP16-NEXT: fcvt s1, h1
+; NOFP16-NEXT: bit.16b v7, v5, v2
+; NOFP16-NEXT: fcvt h4, s4
+; NOFP16-NEXT: fcvt s3, h3
+; NOFP16-NEXT: fcvt h5, s7
+; NOFP16-NEXT: mov.h v0[1], v4[0]
+; NOFP16-NEXT: bit.16b v3, v1, v2
+; NOFP16-NEXT: mov.h v0[2], v5[0]
 ; NOFP16-NEXT: fcvt h1, s3
-; NOFP16-NEXT: fcvt h2, s2
-; NOFP16-NEXT: mov.h v0[1], v1[0]
-; NOFP16-NEXT: mov.h v0[2], v2[0]
-; NOFP16-NEXT: fcvt h1, s5
 ; NOFP16-NEXT: mov.h v0[3], v1[0]
 ; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0
 ; NOFP16-NEXT: ret
@@ -232,39 +232,39 @@
 ; NOFP16: ; %bb.0:
 ; NOFP16-NEXT: fcvtn v1.4h, v1.4s
 ; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0
-; NOFP16-NEXT: movi.4s v3, #128, lsl #24
-; NOFP16-NEXT: fcvt s4, h0
+; NOFP16-NEXT: mov h3, v0[1]
+; NOFP16-NEXT: movi.4s v2, #128, lsl #24
+; NOFP16-NEXT: fcvt s5, h0
+; NOFP16-NEXT: mov h7, v0[2]
+; NOFP16-NEXT: mov h4, v1[1]
 ; NOFP16-NEXT: fcvt s6, h1
-; NOFP16-NEXT: mov h2, v0[1]
-; NOFP16-NEXT: bit.16b v4, v6, v3
-; NOFP16-NEXT: mov h6, v1[1]
-; NOFP16-NEXT: fcvt s2, h2
-; NOFP16-NEXT: fcvt s6, h6
-; NOFP16-NEXT: mov h5, v0[2]
-; NOFP16-NEXT: bit.16b v2, v6, v3
-; NOFP16-NEXT: mov h6, v1[2]
-; NOFP16-NEXT: fcvt s5, h5
-; NOFP16-NEXT: fcvt s6, h6
-; NOFP16-NEXT: mov h0, v0[3]
-; NOFP16-NEXT: bit.16b v5, v6, v3
-; NOFP16-NEXT: fcvt s6, h0
-; NOFP16-NEXT: mov h0, v1[3]
-; NOFP16-NEXT: fcvt s1, h0
-; NOFP16-NEXT: fcvt h0, s4
-; NOFP16-NEXT: bit.16b v6, v1, v3
-; NOFP16-NEXT: fcvt h1, s2
-; NOFP16-NEXT: fcvt h2, s5
-; NOFP16-NEXT: mov.h v0[1], v1[0]
-; NOFP16-NEXT: mov.h v0[2], v2[0]
-; NOFP16-NEXT: fcvt h1, s6
+; NOFP16-NEXT: mov h16, v1[2]
+; NOFP16-NEXT: fcvt s3, h3
+; NOFP16-NEXT: mov h1, v1[3]
+; NOFP16-NEXT: fcvt s4, h4
+; NOFP16-NEXT: bit.16b v5, v6, v2
+; NOFP16-NEXT: fcvt s6, h7
+; NOFP16-NEXT: fcvt s7, h16
+; NOFP16-NEXT: fcvt s1, h1
+; NOFP16-NEXT: bit.16b v3, v4, v2
+; NOFP16-NEXT: mov h4, v0[3]
+; NOFP16-NEXT: fcvt h0, s5
+; NOFP16-NEXT: bit.16b v6, v7, v2
+; NOFP16-NEXT: fcvt h3, s3
+; NOFP16-NEXT: fcvt s4, h4
+; NOFP16-NEXT: fcvt h5, s6
+; NOFP16-NEXT: mov.h v0[1], v3[0]
+; NOFP16-NEXT: bit.16b v4, v1, v2
+; NOFP16-NEXT: mov.h v0[2], v5[0]
+; NOFP16-NEXT: fcvt h1, s4
 ; NOFP16-NEXT: mov.h v0[3], v1[0]
 ; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0
 ; NOFP16-NEXT: ret
 ;
 ; FP16-LABEL: test_copysign_v4f16_v4f32:
 ; FP16: ; %bb.0:
-; FP16-NEXT: fcvtn v1.4h, v1.4s
 ; FP16-NEXT: movi.4h v2, #128, lsl #8
+; FP16-NEXT: fcvtn v1.4h, v1.4s
 ; FP16-NEXT: bit.8b v0, v1, v2
 ; FP16-NEXT: ret
 %tmp0 = fptrunc <4 x float> %b to <4 x half>
@@ -277,46 +277,46 @@
 ; NOFP16: ; %bb.0:
 ; NOFP16-NEXT: ; kill: def $d0 killed $d0 def $q0
 ; NOFP16-NEXT: mov d4, v1[1]
-; NOFP16-NEXT: movi.4s v5, #128, lsl #24
+; NOFP16-NEXT: mov h5, v0[1]
+; NOFP16-NEXT: movi.4s v3, #128, lsl #24
 ; NOFP16-NEXT: fcvt s1, d1
 ; NOFP16-NEXT: fcvt s6, h0
-; NOFP16-NEXT: bit.16b v6, v1, v5
-; NOFP16-NEXT: mov h1, v0[1]
+; NOFP16-NEXT: mov h7, v0[2]
 ; NOFP16-NEXT: fcvt s4, d4
-; NOFP16-NEXT: fcvt s1, h1
-; NOFP16-NEXT: bit.16b v1, v4, v5
-; NOFP16-NEXT: mov h4, v0[2]
-; NOFP16-NEXT: mov d3, v2[1]
+; NOFP16-NEXT: fcvt s5, h5
+; NOFP16-NEXT: bit.16b v6, v1, v3
+; NOFP16-NEXT: fcvt s1, d2
+; NOFP16-NEXT: fcvt s7, h7
+; NOFP16-NEXT: bit.16b v5, v4, v3
+; NOFP16-NEXT: mov d2, v2[1]
+; NOFP16-NEXT: mov h4, v0[3]
+; NOFP16-NEXT: fcvt h0, s6
+; NOFP16-NEXT: bit.16b v7, v1, v3
+; NOFP16-NEXT: fcvt h1, s5
 ; NOFP16-NEXT: fcvt s2, d2
-; NOFP16-NEXT: mov h0, v0[3]
 ; NOFP16-NEXT: fcvt s4, h4
-; NOFP16-NEXT: fcvt s3, d3
-; NOFP16-NEXT: fcvt s7, h0
-; NOFP16-NEXT: fcvt h0, s6
-; NOFP16-NEXT: bit.16b v4, v2, v5
-; NOFP16-NEXT: fcvt h1, s1
-; NOFP16-NEXT: bit.16b v7, v3, v5
-; NOFP16-NEXT: fcvt h2, s4
+; NOFP16-NEXT: fcvt h5, s7
 ; NOFP16-NEXT: mov.h v0[1], v1[0]
-; NOFP16-NEXT: mov.h v0[2], v2[0]
-; NOFP16-NEXT: fcvt h1, s7
+; NOFP16-NEXT: bit.16b v4, v2, v3
+; NOFP16-NEXT: mov.h v0[2], v5[0]
+; NOFP16-NEXT: fcvt h1, s4
 ; NOFP16-NEXT: mov.h v0[3], v1[0]
 ; NOFP16-NEXT: ; kill: def $d0 killed $d0 killed $q0
 ; NOFP16-NEXT: ret
 ;
 ; FP16-LABEL: test_copysign_v4f16_v4f64:
 ; FP16: ; %bb.0:
-; FP16-NEXT: mov d3, v1[1]
+; FP16-NEXT: mov d4, v1[1]
 ; FP16-NEXT: fcvt h1, d1
-; FP16-NEXT: fcvt h3, d3
-; FP16-NEXT: mov.h v1[1], v3[0]
-; FP16-NEXT: fcvt h3, d2
+; FP16-NEXT: movi.4h v3, #128, lsl #8
+; FP16-NEXT: fcvt h4, d4
+; FP16-NEXT: mov.h v1[1], v4[0]
+; FP16-NEXT: fcvt h4, d2
 ; FP16-NEXT: mov d2, v2[1]
+; FP16-NEXT: mov.h v1[2], v4[0]
 ; FP16-NEXT: fcvt h2, d2
-; FP16-NEXT: mov.h v1[2], v3[0]
 ; FP16-NEXT: mov.h v1[3], v2[0]
-; FP16-NEXT: movi.4h v2, #128, lsl #8
-; FP16-NEXT: bit.8b v0, v1, v2
+; FP16-NEXT: bit.8b v0, v1, v3
 ; FP16-NEXT: ret
 %tmp0 = fptrunc <4 x double> %b to <4 x half>
 %r = call <4 x half> @llvm.copysign.v4f16(<4 x half> %a, <4 x half> %tmp0)
@@ -330,60 +330,61 @@
 define <8 x half> @test_copysign_v8f16_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; NOFP16-LABEL: test_copysign_v8f16_v8f16:
 ; NOFP16: ; %bb.0:
-; NOFP16-NEXT: mov h4, v1[1]
-; NOFP16-NEXT: mov h5, v0[1]
+; NOFP16-NEXT: mov h5, v1[1]
+; NOFP16-NEXT: mov h6, v0[1]
+; NOFP16-NEXT: movi.4s v3, #128, lsl #24
+; NOFP16-NEXT: fcvt s2, h1
+; NOFP16-NEXT: fcvt s4, h0
 ; NOFP16-NEXT: mov h7, v1[2]
 ; NOFP16-NEXT: mov h16, v0[2]
-; NOFP16-NEXT: mov h17, v1[3]
-; NOFP16-NEXT: mov h18, v0[3]
-; NOFP16-NEXT: movi.4s v2, #128, lsl #24
-; NOFP16-NEXT: fcvt s6, h1
-; NOFP16-NEXT: fcvt s3, h0
-; NOFP16-NEXT: fcvt s4, h4
 ; NOFP16-NEXT: fcvt s5, h5
+; NOFP16-NEXT: fcvt s6, h6
+; NOFP16-NEXT: mov h17, v0[3]
+; NOFP16-NEXT: bit.16b v4, v2, v3
+; NOFP16-NEXT: mov h2, v1[3]
 ; NOFP16-NEXT: fcvt s7, h7
 ; NOFP16-NEXT: fcvt s16, h16
+; NOFP16-NEXT: bit.16b v6, v5, v3
 ; NOFP16-NEXT: fcvt s17, h17
-; NOFP16-NEXT: fcvt s18, h18
-; NOFP16-NEXT: bit.16b v3, v6, v2
-; NOFP16-NEXT: mov h6, v1[4]
-; NOFP16-NEXT: bit.16b v5, v4, v2
-; NOFP16-NEXT: mov h4, v0[4]
-; NOFP16-NEXT: bit.16b v16, v7, v2
-; NOFP16-NEXT: mov h7, v1[5]
-; NOFP16-NEXT: bit.16b v18, v17, v2
-; NOFP16-NEXT: mov h17, v0[5]
-; NOFP16-NEXT: fcvt s6, h6
-; NOFP16-NEXT: fcvt s4, h4
+; NOFP16-NEXT: fcvt s18, h2
+; NOFP16-NEXT: mov h5, v1[4]
+; NOFP16-NEXT: fcvt h2, s4
+; NOFP16-NEXT: bit.16b v16, v7, v3
+; NOFP16-NEXT: mov h7, v0[4]
+; NOFP16-NEXT: fcvt h4, s6
+; NOFP16-NEXT: bit.16b v17, v18, v3
+; NOFP16-NEXT: mov h6, v1[5]
+; NOFP16-NEXT: mov h18, v0[5]
+; NOFP16-NEXT: fcvt s5, h5
 ; NOFP16-NEXT: fcvt s7, h7
-; NOFP16-NEXT: fcvt s17, h17
-; NOFP16-NEXT: bit.16b v4, v6, v2
-; NOFP16-NEXT: mov h6, v1[6]
-; NOFP16-NEXT: bit.16b v17, v7, v2
-; NOFP16-NEXT: mov h7, v0[6]
+; NOFP16-NEXT: mov.h v2[1], v4[0]
+; NOFP16-NEXT: fcvt h4, s16
 ; NOFP16-NEXT: fcvt s6, h6
-; NOFP16-NEXT: fcvt s7, h7
+; NOFP16-NEXT: fcvt s16, h18
+; NOFP16-NEXT: fcvt h17, s17
+; NOFP16-NEXT: bit.16b v7, v5, v3
+; NOFP16-NEXT: mov h5, v0[6]
+; NOFP16-NEXT: mov.h v2[2], v4[0]
+; NOFP16-NEXT: mov h4, v1[6]
+; NOFP16-NEXT: bit.16b v16, v6, v3
 ; NOFP16-NEXT: mov h1, v1[7]
+; NOFP16-NEXT: fcvt s5, h5
+; NOFP16-NEXT: mov.h v2[3], v17[0]
+; NOFP16-NEXT: fcvt h6, s7
+; NOFP16-NEXT: fcvt s4, h4
 ; NOFP16-NEXT: mov h0, v0[7]
-; NOFP16-NEXT: bit.16b v7, v6, v2
 ; NOFP16-NEXT: fcvt s1, h1
-; NOFP16-NEXT: fcvt s6, h0
-; NOFP16-NEXT: bit.16b v6, v1, v2
-; NOFP16-NEXT: fcvt h0, s3
-; NOFP16-NEXT: fcvt h1, s5
-; NOFP16-NEXT: mov.h v0[1], v1[0]
-; NOFP16-NEXT: fcvt h1, s16
-; NOFP16-NEXT: mov.h v0[2], v1[0]
-; NOFP16-NEXT: fcvt h1, s18
-; NOFP16-NEXT: fcvt h2, s4
-; NOFP16-NEXT: mov.h v0[3], v1[0]
-; NOFP16-NEXT: fcvt h3, s17
-; NOFP16-NEXT: mov.h v0[4], v2[0]
-; NOFP16-NEXT: fcvt h4, s7
-; NOFP16-NEXT: mov.h v0[5], v3[0]
-; NOFP16-NEXT: mov.h v0[6], v4[0]
-; NOFP16-NEXT: fcvt h1, s6
-; NOFP16-NEXT: mov.h v0[7], v1[0]
+; NOFP16-NEXT: mov.h v2[4], v6[0]
+; NOFP16-NEXT: bit.16b v5, v4, v3
+; NOFP16-NEXT: fcvt h4, s16
+; NOFP16-NEXT: fcvt s0, h0
+; NOFP16-NEXT: fcvt h5, s5
+; NOFP16-NEXT: mov.h v2[5], v4[0]
+; NOFP16-NEXT: bit.16b v0, v1, v3
+; NOFP16-NEXT: mov.h v2[6], v5[0]
+; NOFP16-NEXT: fcvt h0, s0
+; NOFP16-NEXT: mov.h v2[7], v0[0]
+; NOFP16-NEXT: mov.16b v0, v2
 ; NOFP16-NEXT: ret
 ;
 ; FP16-LABEL: test_copysign_v8f16_v8f16:
@@ -398,70 +398,71 @@
 define <8 x half> @test_copysign_v8f16_v8f32(<8 x half> %a, <8 x float> %b) #0 {
 ; NOFP16-LABEL: test_copysign_v8f16_v8f32:
 ; NOFP16: ; %bb.0:
-; NOFP16-NEXT: fcvtn v4.4h, v1.4s
+; NOFP16-NEXT: fcvtn v1.4h, v1.4s
 ; NOFP16-NEXT: fcvtn v2.4h, v2.4s
-; NOFP16-NEXT: movi.4s v1, #128, lsl #24
-; NOFP16-NEXT: fcvt s3, h0
-; NOFP16-NEXT: mov h16, v0[4]
-; NOFP16-NEXT: fcvt s18, h4
-; NOFP16-NEXT: fcvt s16, h16
-; NOFP16-NEXT: bit.16b v3, v18, v1
-; NOFP16-NEXT: fcvt s18, h2
-; NOFP16-NEXT: mov h5, v0[1]
-; NOFP16-NEXT: bit.16b v16, v18, v1
-; NOFP16-NEXT: mov h18, v4[1]
-; NOFP16-NEXT: fcvt s5, h5
-; NOFP16-NEXT: fcvt s18, h18
-; NOFP16-NEXT: mov h7, v0[3]
-; NOFP16-NEXT: bit.16b v5, v18, v1
-; NOFP16-NEXT: mov h18, v4[2]
-; NOFP16-NEXT: mov h4, v4[3]
-; NOFP16-NEXT: fcvt s7, h7
+; NOFP16-NEXT: movi.4s v3, #128, lsl #24
+; NOFP16-NEXT: mov h4, v0[1]
+; NOFP16-NEXT: mov h5, v0[4]
+; NOFP16-NEXT: fcvt s7, h0
+; NOFP16-NEXT: mov h17, v0[2]
+; NOFP16-NEXT: mov h6, v1[1]
+; NOFP16-NEXT: fcvt s16, h1
 ; NOFP16-NEXT: fcvt s4, h4
+; NOFP16-NEXT: mov h18, v1[2]
+; NOFP16-NEXT: fcvt s5, h5
 ; NOFP16-NEXT: fcvt s17, h17
+; NOFP16-NEXT: fcvt s6, h6
+; NOFP16-NEXT: bit.16b v7, v16, v3
+; NOFP16-NEXT: fcvt s16, h2
 ; NOFP16-NEXT: fcvt s18, h18
+; NOFP16-NEXT: bit.16b v4, v6, v3
+; NOFP16-NEXT: mov h6, v0[3]
+; NOFP16-NEXT: bit.16b v5, v16, v3
+; NOFP16-NEXT: mov h16, v1[3]
+; NOFP16-NEXT: fcvt h1, s7
+; NOFP16-NEXT: mov h7, v0[5]
+; NOFP16-NEXT: bit.16b v17, v18, v3
+; NOFP16-NEXT: fcvt h4, s4
+; NOFP16-NEXT: fcvt s6, h6
+; NOFP16-NEXT: fcvt s16, h16
+; NOFP16-NEXT: mov h18, v2[1]
+; NOFP16-NEXT: fcvt s7, h7
+; NOFP16-NEXT: fcvt h5, s5
+; NOFP16-NEXT: mov.h v1[1], v4[0]
+; NOFP16-NEXT: fcvt h4, s17
+; NOFP16-NEXT: bit.16b v6, v16, v3
+; NOFP16-NEXT: fcvt s17, h18
+; NOFP16-NEXT: mov h16, v2[2]
+; NOFP16-NEXT: mov.h v1[2], v4[0]
+; NOFP16-NEXT: mov h4, v0[6]
 ; NOFP16-NEXT: mov h0, v0[7]
-; NOFP16-NEXT: bit.16b v18, v4, v1
-; NOFP16-NEXT: fcvt s4, h0
-; NOFP16-NEXT: mov h0, v2[3]
+; NOFP16-NEXT: fcvt h6, s6
+; NOFP16-NEXT: mov h2, v2[3]
+; NOFP16-NEXT: bit.16b v7, v17, v3
+; NOFP16-NEXT: fcvt s16, h16
+; NOFP16-NEXT: fcvt s4, h4
 ; NOFP16-NEXT: fcvt s0, h0
-; NOFP16-NEXT: bit.16b v4, v0, v1
-; NOFP16-NEXT: fcvt h0, s3
-; NOFP16-NEXT: fcvt h1, s5
-; NOFP16-NEXT: mov.h v0[1], v1[0]
-; NOFP16-NEXT: fcvt h2, s6
-; NOFP16-NEXT: fcvt h3, s7
-; NOFP16-NEXT: mov.h v0[2], v2[0]
-; NOFP16-NEXT: fcvt h1, s16
-; NOFP16-NEXT: mov.h v0[3], v3[0]
-; NOFP16-NEXT: fcvt h5, s17
-; NOFP16-NEXT: mov.h v0[4], v1[0]
-; NOFP16-NEXT: fcvt h6, s18
-; NOFP16-NEXT: mov.h v0[5], v5[0]
-; NOFP16-NEXT: mov.h v0[6], v6[0]
-; NOFP16-NEXT: fcvt h1, s4
-; NOFP16-NEXT: mov.h v0[7], v1[0]
+; NOFP16-NEXT: mov.h v1[3], v6[0]
+; NOFP16-NEXT: fcvt s2, h2
+; NOFP16-NEXT: bit.16b v4, v16, v3
+; NOFP16-NEXT: mov.h v1[4], v5[0]
+; NOFP16-NEXT: fcvt h5, s7
+; NOFP16-NEXT: bit.16b v0, v2, v3
+; NOFP16-NEXT: fcvt h4, s4
+; NOFP16-NEXT: mov.h v1[5], v5[0]
+; NOFP16-NEXT: fcvt h0, s0
+; NOFP16-NEXT: mov.h v1[6], v4[0]
+; NOFP16-NEXT: mov.h v1[7], v0[0]
+; NOFP16-NEXT: mov.16b v0, v1
 ; NOFP16-NEXT: ret
 ;
 ; FP16-LABEL: test_copysign_v8f16_v8f32:
 ; FP16: ; %bb.0:
 ; FP16-NEXT: fcvtn v2.4h, v2.4s
 ; FP16-NEXT: fcvtn v1.4h, v1.4s
+; FP16-NEXT: movi.8h v3, #128, lsl #8
 ; FP16-NEXT: mov.d v1[1], v2[0]
-; FP16-NEXT: movi.8h v2, #128, lsl #8
-; FP16-NEXT: bit.16b v0, v1, v2
+; FP16-NEXT: bit.16b v0, v1, v3
 ; FP16-NEXT: ret
 %tmp0 = fptrunc <8 x float> %b to <8 x half>
 %r = call <8 x half> @llvm.copysign.v8f16(<8 x half> %a, <8 x half> %tmp0)
diff --git a/llvm/test/CodeGen/AArch64/vector-gep.ll b/llvm/test/CodeGen/AArch64/vector-gep.ll
--- a/llvm/test/CodeGen/AArch64/vector-gep.ll
+++ b/llvm/test/CodeGen/AArch64/vector-gep.ll
@@ -10,9 +10,9 @@
 define <2 x i8*> @vector_gep(<2 x i8*> %0) {
 ; CHECK-LABEL: vector_gep:
 ; CHECK: adrp x[[REG8:[123]?[0-9]]], lCPI0_0@PAGE
-; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG8]], lCPI0_0@PAGEOFF]
-; CHECK: add v[[REG0:[0-9]+]].2d, v[[REG0]].2d, v[[REG1]].2d
-; CHECK: movi v[[REG1]].2d, #0x000000ffffffff
+; CHECK: movi v[[REG1:[0-9]+]].2d, #0x000000ffffffff
+; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG8]], lCPI0_0@PAGEOFF]
+; CHECK: add v[[REG0:[0-9]+]].2d, v[[REG0]].2d, v[[REG2]].2d
 ; CHECK: and v[[REG0]].16b, v[[REG0]].16b, v[[REG1]].16b
 ; CHECK: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/AArch64/vector-popcnt-128-ult-ugt.ll
--- a/llvm/test/CodeGen/AArch64/vector-popcnt-128-ult-ugt.ll
+++ b/llvm/test/CodeGen/AArch64/vector-popcnt-128-ult-ugt.ll
@@ -4,8 +4,8 @@
 define <16 x i8> @ugt_1_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ugt_1_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #1
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -17,8 +17,8 @@
 define <16 x i8> @ult_2_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ult_2_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #2
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -30,8 +30,8 @@
 define <16 x i8> @ugt_2_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ugt_2_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #2
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -43,8 +43,8 @@
 define <16 x i8> @ult_3_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ult_3_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #3
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -56,8 +56,8 @@
 define <16 x i8> @ugt_3_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ugt_3_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #3
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -69,8 +69,8 @@
 define <16 x i8> @ult_4_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ult_4_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #4
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -82,8 +82,8 @@
 define <16 x i8> @ugt_4_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ugt_4_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #4
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -95,8 +95,8 @@
 define <16 x i8> @ult_5_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ult_5_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #5
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -108,8 +108,8 @@
 define <16 x i8> @ugt_5_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ugt_5_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #5
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -121,8 +121,8 @@
 define <16 x i8> @ult_6_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ult_6_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #6
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -134,8 +134,8 @@
 define <16 x i8> @ugt_6_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ugt_6_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #6
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -147,8 +147,8 @@
 define <16 x i8> @ult_7_v16i8(<16 x i8> %0) {
 ; CHECK-LABEL: ult_7_v16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: movi v1.16b, #7
+; CHECK-NEXT: cnt v0.16b, v0.16b
 ; CHECK-NEXT: cmhi v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT: ret
 %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0)
@@ -188,8 +188,8 @@
 ; CHECK-LABEL: ugt_2_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #2
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -202,8 +202,8 @@
 ; CHECK-LABEL: ult_3_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #3
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -216,8 +216,8 @@
 ; CHECK-LABEL: ugt_3_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #3
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -230,8 +230,8 @@
 ; CHECK-LABEL: ult_4_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #4
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -244,8 +244,8 @@
 ; CHECK-LABEL: ugt_4_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #4
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -258,8 +258,8 @@
 ; CHECK-LABEL: ult_5_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #5
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -272,8 +272,8 @@
 ; CHECK-LABEL: ugt_5_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #5
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -286,8 +286,8 @@
 ; CHECK-LABEL: ult_6_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #6
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -300,8 +300,8 @@
 ; CHECK-LABEL: ugt_6_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #6
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -314,8 +314,8 @@
 ; CHECK-LABEL: ult_7_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #7
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -328,8 +328,8 @@
 ; CHECK-LABEL: ugt_7_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #7
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -342,8 +342,8 @@
 ; CHECK-LABEL: ult_8_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -356,8 +356,8 @@
 ; CHECK-LABEL: ugt_8_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -370,8 +370,8 @@
 ; CHECK-LABEL: ult_9_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #9
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -384,8 +384,8 @@
 ; CHECK-LABEL: ugt_9_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #9
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -398,8 +398,8 @@
 ; CHECK-LABEL: ult_10_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #10
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -412,8 +412,8 @@
 ; CHECK-LABEL: ugt_10_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #10
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -426,8 +426,8 @@
 ; CHECK-LABEL: ult_11_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #11
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -440,8 +440,8 @@
 ; CHECK-LABEL: ugt_11_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #11
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -454,8 +454,8 @@
 ; CHECK-LABEL: ult_12_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #12
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -468,8 +468,8 @@
 ; CHECK-LABEL: ugt_12_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #12
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -482,8 +482,8 @@
 ; CHECK-LABEL: ult_13_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #13
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -496,8 +496,8 @@
 ; CHECK-LABEL: ugt_13_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #13
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -510,8 +510,8 @@
 ; CHECK-LABEL: ult_14_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #14
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -524,8 +524,8 @@
 ; CHECK-LABEL: ugt_14_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #14
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -538,8 +538,8 @@
 ; CHECK-LABEL: ult_15_v8i16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: movi v1.8h, #15
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: cmhi v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT: ret
 %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0)
@@ -579,9 +579,9 @@
 ; CHECK-LABEL: ugt_2_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #2
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #2
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -594,9 +594,9 @@
 ; CHECK-LABEL: ult_3_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #3
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #3
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -609,9 +609,9 @@
 ; CHECK-LABEL: ugt_3_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #3
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #3
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -624,9 +624,9 @@
 ; CHECK-LABEL: ult_4_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #4
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #4
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -639,9 +639,9 @@
 ; CHECK-LABEL: ugt_4_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #4
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #4
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -654,9 +654,9 @@
 ; CHECK-LABEL: ult_5_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #5
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #5
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -669,9 +669,9 @@
 ; CHECK-LABEL: ugt_5_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #5
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #5
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -684,9 +684,9 @@
 ; CHECK-LABEL: ult_6_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #6
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #6
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -699,9 +699,9 @@
 ; CHECK-LABEL: ugt_6_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #6
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #6
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -714,9 +714,9 @@
 ; CHECK-LABEL: ult_7_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #7
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #7
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -729,9 +729,9 @@
 ; CHECK-LABEL: ugt_7_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #7
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #7
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -744,9 +744,9 @@
 ; CHECK-LABEL: ult_8_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #8
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #8
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -759,9 +759,9 @@
 ; CHECK-LABEL: ugt_8_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #8
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #8
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -774,9 +774,9 @@
 ; CHECK-LABEL: ult_9_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #9
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #9
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -789,9 +789,9 @@
 ; CHECK-LABEL: ugt_9_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #9
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #9
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -804,9 +804,9 @@
 ; CHECK-LABEL: ult_10_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #10
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #10
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -819,9 +819,9 @@
 ; CHECK-LABEL: ugt_10_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #10
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #10
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -834,9 +834,9 @@
 ; CHECK-LABEL: ult_11_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #11
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #11
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -849,9 +849,9 @@
 ; CHECK-LABEL: ugt_11_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #11
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #11
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -864,9 +864,9 @@
 ; CHECK-LABEL: ult_12_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #12
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #12
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -879,9 +879,9 @@
 ; CHECK-LABEL: ugt_12_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #12
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #12
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -894,9 +894,9 @@
 ; CHECK-LABEL: ult_13_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #13
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #13
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -909,9 +909,9 @@
 ; CHECK-LABEL: ugt_13_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #13
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #13
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -924,9 +924,9 @@
 ; CHECK-LABEL: ult_14_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #14
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #14
 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -939,9 +939,9 @@
 ; CHECK-LABEL: ugt_14_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: movi v1.4s, #14
 ; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: movi v1.4s, #14
 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: ret
 %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0)
@@ -954,9 +954,9 @@
; CHECK-LABEL: ult_15_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #15 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #15 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -969,9 +969,9 @@ ; CHECK-LABEL: ugt_15_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #15 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #15 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -984,9 +984,9 @@ ; CHECK-LABEL: ult_16_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #16 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #16 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -999,9 +999,9 @@ ; CHECK-LABEL: ugt_16_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #16 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #16 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1014,9 +1014,9 @@ ; CHECK-LABEL: ult_17_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #17 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #17 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1029,9 +1029,9 @@ ; CHECK-LABEL: ugt_17_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #17 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #17 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1044,9 +1044,9 @@ ; CHECK-LABEL: ult_18_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #18 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #18 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1059,9 +1059,9 @@ ; CHECK-LABEL: ugt_18_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #18 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #18 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1074,9 +1074,9 @@ ; CHECK-LABEL: ult_19_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #19 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #19 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1089,9 +1089,9 @@ ; CHECK-LABEL: ugt_19_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #19 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #19 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1104,9 +1104,9 @@ ; CHECK-LABEL: ult_20_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b 
+; CHECK-NEXT: movi v1.4s, #20 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #20 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1119,9 +1119,9 @@ ; CHECK-LABEL: ugt_20_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #20 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #20 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1134,9 +1134,9 @@ ; CHECK-LABEL: ult_21_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #21 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #21 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1149,9 +1149,9 @@ ; CHECK-LABEL: ugt_21_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #21 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #21 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1164,9 +1164,9 @@ ; CHECK-LABEL: ult_22_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #22 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #22 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1179,9 +1179,9 @@ ; CHECK-LABEL: ugt_22_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #22 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #22 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1194,9 +1194,9 @@ ; CHECK-LABEL: ult_23_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #23 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #23 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1209,9 +1209,9 @@ ; CHECK-LABEL: ugt_23_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #23 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #23 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1224,9 +1224,9 @@ ; CHECK-LABEL: ult_24_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #24 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #24 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1239,9 +1239,9 @@ ; CHECK-LABEL: ugt_24_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #24 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #24 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1254,9 +1254,9 @@ ; CHECK-LABEL: ult_25_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #25 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; 
CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #25 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1269,9 +1269,9 @@ ; CHECK-LABEL: ugt_25_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #25 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #25 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1284,9 +1284,9 @@ ; CHECK-LABEL: ult_26_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #26 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #26 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1299,9 +1299,9 @@ ; CHECK-LABEL: ugt_26_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #26 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #26 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1314,9 +1314,9 @@ ; CHECK-LABEL: ult_27_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #27 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #27 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1329,9 +1329,9 @@ ; CHECK-LABEL: ugt_27_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #27 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #27 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1344,9 +1344,9 @@ ; CHECK-LABEL: ult_28_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #28 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #28 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1359,9 +1359,9 @@ ; CHECK-LABEL: ugt_28_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #28 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #28 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1374,9 +1374,9 @@ ; CHECK-LABEL: ult_29_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #29 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #29 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1389,9 +1389,9 @@ ; CHECK-LABEL: ugt_29_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #29 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #29 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1404,9 +1404,9 @@ ; CHECK-LABEL: ult_30_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #30 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #30 ; 
CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1419,9 +1419,9 @@ ; CHECK-LABEL: ugt_30_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #30 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #30 ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1434,9 +1434,9 @@ ; CHECK-LABEL: ult_31_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: movi v1.4s, #31 ; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h -; CHECK-NEXT: movi v1.4s, #31 ; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret %2 = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %0) @@ -1476,11 +1476,11 @@ ; CHECK-LABEL: ugt_2_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1493,11 +1493,11 @@ ; CHECK-LABEL: ult_3_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #3 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1510,11 +1510,11 @@ ; CHECK-LABEL: ugt_3_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #3 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1527,11 +1527,11 @@ ; CHECK-LABEL: ult_4_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1544,11 +1544,11 @@ ; CHECK-LABEL: ugt_4_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1561,11 +1561,11 @@ ; CHECK-LABEL: ult_5_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #5 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1578,11 +1578,11 @@ ; CHECK-LABEL: ugt_5_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, 
v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #5 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1595,11 +1595,11 @@ ; CHECK-LABEL: ult_6_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #6 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1612,11 +1612,11 @@ ; CHECK-LABEL: ugt_6_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #6 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1629,11 +1629,11 @@ ; CHECK-LABEL: ult_7_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #7 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1646,11 +1646,11 @@ ; CHECK-LABEL: ugt_7_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #7 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1663,11 +1663,11 @@ ; CHECK-LABEL: ult_8_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1680,11 +1680,11 @@ ; CHECK-LABEL: ugt_8_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1697,11 +1697,11 @@ ; CHECK-LABEL: ult_9_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #9 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1714,11 +1714,11 @@ ; CHECK-LABEL: ugt_9_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: 
uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #9 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1731,11 +1731,11 @@ ; CHECK-LABEL: ult_10_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #10 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1748,11 +1748,11 @@ ; CHECK-LABEL: ugt_10_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #10 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1765,11 +1765,11 @@ ; CHECK-LABEL: ult_11_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #11 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1782,11 +1782,11 @@ ; CHECK-LABEL: ugt_11_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #11 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1799,11 +1799,11 @@ ; CHECK-LABEL: ult_12_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #12 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1816,11 +1816,11 @@ ; CHECK-LABEL: ugt_12_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #12 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1833,11 +1833,11 @@ ; CHECK-LABEL: ult_13_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #13 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1850,11 +1850,11 @@ ; CHECK-LABEL: ugt_13_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp 
v0.8h, v0.16b ; CHECK-NEXT: mov w8, #13 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1867,11 +1867,11 @@ ; CHECK-LABEL: ult_14_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #14 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1884,11 +1884,11 @@ ; CHECK-LABEL: ugt_14_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #14 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1901,11 +1901,11 @@ ; CHECK-LABEL: ult_15_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #15 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1918,11 +1918,11 @@ ; CHECK-LABEL: ugt_15_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #15 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1935,11 +1935,11 @@ ; CHECK-LABEL: ult_16_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1952,11 +1952,11 @@ ; CHECK-LABEL: ugt_16_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #16 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1969,11 +1969,11 @@ ; CHECK-LABEL: ult_17_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #17 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -1986,11 +1986,11 @@ ; CHECK-LABEL: ugt_17_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, 
v0.16b ; CHECK-NEXT: mov w8, #17 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2003,11 +2003,11 @@ ; CHECK-LABEL: ult_18_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #18 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2020,11 +2020,11 @@ ; CHECK-LABEL: ugt_18_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #18 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2037,11 +2037,11 @@ ; CHECK-LABEL: ult_19_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #19 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2054,11 +2054,11 @@ ; CHECK-LABEL: ugt_19_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #19 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2071,11 +2071,11 @@ ; CHECK-LABEL: ult_20_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #20 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2088,11 +2088,11 @@ ; CHECK-LABEL: ugt_20_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #20 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2105,11 +2105,11 @@ ; CHECK-LABEL: ult_21_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #21 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2122,11 +2122,11 @@ ; CHECK-LABEL: ugt_21_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; 
CHECK-NEXT: mov w8, #21 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2139,11 +2139,11 @@ ; CHECK-LABEL: ult_22_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #22 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2156,11 +2156,11 @@ ; CHECK-LABEL: ugt_22_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #22 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2173,11 +2173,11 @@ ; CHECK-LABEL: ult_23_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #23 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2190,11 +2190,11 @@ ; CHECK-LABEL: ugt_23_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #23 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2207,11 +2207,11 @@ ; CHECK-LABEL: ult_24_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #24 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2224,11 +2224,11 @@ ; CHECK-LABEL: ugt_24_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #24 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2241,11 +2241,11 @@ ; CHECK-LABEL: ult_25_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #25 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2258,11 +2258,11 @@ ; CHECK-LABEL: ugt_25_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; 
CHECK-NEXT: mov w8, #25 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2275,11 +2275,11 @@ ; CHECK-LABEL: ult_26_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #26 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2292,11 +2292,11 @@ ; CHECK-LABEL: ugt_26_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #26 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2309,11 +2309,11 @@ ; CHECK-LABEL: ult_27_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #27 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2326,11 +2326,11 @@ ; CHECK-LABEL: ugt_27_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #27 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2343,11 +2343,11 @@ ; CHECK-LABEL: ult_28_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #28 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2360,11 +2360,11 @@ ; CHECK-LABEL: ugt_28_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #28 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2377,11 +2377,11 @@ ; CHECK-LABEL: ult_29_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #29 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2394,11 +2394,11 @@ ; CHECK-LABEL: ugt_29_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; 
CHECK-NEXT: mov w8, #29 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2411,11 +2411,11 @@ ; CHECK-LABEL: ult_30_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #30 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2428,11 +2428,11 @@ ; CHECK-LABEL: ugt_30_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #30 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2445,11 +2445,11 @@ ; CHECK-LABEL: ult_31_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2462,11 +2462,11 @@ ; CHECK-LABEL: ugt_31_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #31 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2479,11 +2479,11 @@ ; CHECK-LABEL: ult_32_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2496,11 +2496,11 @@ ; CHECK-LABEL: ugt_32_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #32 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2513,11 +2513,11 @@ ; CHECK-LABEL: ult_33_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #33 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2530,11 +2530,11 @@ ; CHECK-LABEL: ugt_33_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; 
CHECK-NEXT: mov w8, #33 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2547,11 +2547,11 @@ ; CHECK-LABEL: ult_34_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #34 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2564,11 +2564,11 @@ ; CHECK-LABEL: ugt_34_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #34 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2581,11 +2581,11 @@ ; CHECK-LABEL: ult_35_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #35 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2598,11 +2598,11 @@ ; CHECK-LABEL: ugt_35_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #35 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2615,11 +2615,11 @@ ; CHECK-LABEL: ult_36_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #36 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2632,11 +2632,11 @@ ; CHECK-LABEL: ugt_36_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #36 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2649,11 +2649,11 @@ ; CHECK-LABEL: ult_37_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #37 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2666,11 +2666,11 @@ ; CHECK-LABEL: ugt_37_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; 
CHECK-NEXT: mov w8, #37 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2683,11 +2683,11 @@ ; CHECK-LABEL: ult_38_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #38 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2700,11 +2700,11 @@ ; CHECK-LABEL: ugt_38_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #38 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2717,11 +2717,11 @@ ; CHECK-LABEL: ult_39_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #39 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2734,11 +2734,11 @@ ; CHECK-LABEL: ugt_39_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #39 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2751,11 +2751,11 @@ ; CHECK-LABEL: ult_40_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #40 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2768,11 +2768,11 @@ ; CHECK-LABEL: ugt_40_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #40 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2785,11 +2785,11 @@ ; CHECK-LABEL: ult_41_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2802,11 +2802,11 @@ ; CHECK-LABEL: ugt_41_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; 
CHECK-NEXT: mov w8, #41 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2819,11 +2819,11 @@ ; CHECK-LABEL: ult_42_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2836,11 +2836,11 @@ ; CHECK-LABEL: ugt_42_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2853,11 +2853,11 @@ ; CHECK-LABEL: ult_43_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #43 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2870,11 +2870,11 @@ ; CHECK-LABEL: ugt_43_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #43 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2887,11 +2887,11 @@ ; CHECK-LABEL: ult_44_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #44 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2904,11 +2904,11 @@ ; CHECK-LABEL: ugt_44_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #44 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2921,11 +2921,11 @@ ; CHECK-LABEL: ult_45_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: mov w8, #45 +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: uaddlp v0.2d, v0.4s -; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0) @@ -2938,11 +2938,11 @@ ; CHECK-LABEL: ugt_45_v2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlp v0.8h, v0.16b ; 
CHECK-NEXT: mov w8, #45
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -2955,11 +2955,11 @@
 ; CHECK-LABEL: ult_46_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #46
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -2972,11 +2972,11 @@
 ; CHECK-LABEL: ugt_46_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #46
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -2989,11 +2989,11 @@
 ; CHECK-LABEL: ult_47_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #47
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3006,11 +3006,11 @@
 ; CHECK-LABEL: ugt_47_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #47
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3023,11 +3023,11 @@
 ; CHECK-LABEL: ult_48_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #48
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3040,11 +3040,11 @@
 ; CHECK-LABEL: ugt_48_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #48
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3057,11 +3057,11 @@
 ; CHECK-LABEL: ult_49_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #49
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3074,11 +3074,11 @@
 ; CHECK-LABEL: ugt_49_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #49
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3091,11 +3091,11 @@
 ; CHECK-LABEL: ult_50_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #50
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3108,11 +3108,11 @@
 ; CHECK-LABEL: ugt_50_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #50
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3125,11 +3125,11 @@
 ; CHECK-LABEL: ult_51_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #51
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3142,11 +3142,11 @@
 ; CHECK-LABEL: ugt_51_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #51
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3159,11 +3159,11 @@
 ; CHECK-LABEL: ult_52_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #52
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3176,11 +3176,11 @@
 ; CHECK-LABEL: ugt_52_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #52
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3193,11 +3193,11 @@
 ; CHECK-LABEL: ult_53_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #53
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3210,11 +3210,11 @@
 ; CHECK-LABEL: ugt_53_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #53
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3227,11 +3227,11 @@
 ; CHECK-LABEL: ult_54_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #54
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3244,11 +3244,11 @@
 ; CHECK-LABEL: ugt_54_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #54
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3261,11 +3261,11 @@
 ; CHECK-LABEL: ult_55_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #55
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3278,11 +3278,11 @@
 ; CHECK-LABEL: ugt_55_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #55
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3295,11 +3295,11 @@
 ; CHECK-LABEL: ult_56_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #56
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3312,11 +3312,11 @@
 ; CHECK-LABEL: ugt_56_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #56
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3329,11 +3329,11 @@
 ; CHECK-LABEL: ult_57_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #57
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3346,11 +3346,11 @@
 ; CHECK-LABEL: ugt_57_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #57
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3363,11 +3363,11 @@
 ; CHECK-LABEL: ult_58_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #58
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3380,11 +3380,11 @@
 ; CHECK-LABEL: ugt_58_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #58
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3397,11 +3397,11 @@
 ; CHECK-LABEL: ult_59_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #59
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3414,11 +3414,11 @@
 ; CHECK-LABEL: ugt_59_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #59
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3431,11 +3431,11 @@
 ; CHECK-LABEL: ult_60_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #60
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3448,11 +3448,11 @@
 ; CHECK-LABEL: ugt_60_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #60
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3465,11 +3465,11 @@
 ; CHECK-LABEL: ult_61_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #61
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3482,11 +3482,11 @@
 ; CHECK-LABEL: ugt_61_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #61
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3499,11 +3499,11 @@
 ; CHECK-LABEL: ult_62_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #62
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3516,11 +3516,11 @@
 ; CHECK-LABEL: ugt_62_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #62
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
@@ -3533,11 +3533,11 @@
 ; CHECK-LABEL: ult_63_v2i64:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: mov w8, #63
+; CHECK-NEXT: dup v1.2d, x8
+; CHECK-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-NEXT: uaddlp v0.4s, v0.8h
 ; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: dup v1.2d, x8
 ; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
 ; CHECK-NEXT: ret
 %2 = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %0)
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -92,10 +92,10 @@
 ; CHECK-NEXT: .LBB2_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
+; CHECK-NEXT: fmul v4.4s, v0.4s, v0.4s
 ; CHECK-NEXT: add x9, x1, x8
 ; CHECK-NEXT: add x8, x8, #32
 ; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
-; CHECK-NEXT: fmul v4.4s, v0.4s, v0.4s
 ; CHECK-NEXT: fmla v4.4s, v1.4s, v1.4s
 ; CHECK-NEXT: fmul v5.4s, v2.4s, v2.4s
 ; CHECK-NEXT: fmla v5.4s, v3.4s, v3.4s
@@ -142,11 +142,11 @@
 ; CHECK-NEXT: .LBB3_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: add x9, x0, x8
-; CHECK-NEXT: add x10, x1, x8
 ; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x9]
-; CHECK-NEXT: ld2 { v2.4s, v3.4s }, [x10]
+; CHECK-NEXT: add x9, x1, x8
 ; CHECK-NEXT: add x8, x8, #32
 ; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
+; CHECK-NEXT: ld2 { v2.4s, v3.4s }, [x9]
 ; CHECK-NEXT: fmul v4.4s, v2.4s, v0.4s
 ; CHECK-NEXT: fmla v4.4s, v1.4s, v3.4s
 ; CHECK-NEXT: str q4, [x2], #16
diff --git a/llvm/test/CodeGen/AArch64/vselect-constants.ll b/llvm/test/CodeGen/AArch64/vselect-constants.ll
--- a/llvm/test/CodeGen/AArch64/vselect-constants.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-constants.ll
@@ -10,12 +10,12 @@
 define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_C1_or_C2_vec:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT: adrp x8, .LCPI0_0
 ; CHECK-NEXT: adrp x9, .LCPI0_1
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1]
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1]
 ; CHECK-NEXT: sshr v0.4s, v0.4s, #31
 ; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -28,10 +28,10 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, .LCPI1_0
 ; CHECK-NEXT: adrp x9, .LCPI1_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_0]
-; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI1_1]
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI1_1]
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT: ret
 %cond = icmp eq <4 x i32> %x, %y
 %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> 
@@ -41,12 +41,12 @@
 define <4 x i32> @sel_Cplus1_or_C_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_Cplus1_or_C_vec:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT: adrp x8, .LCPI2_0
 ; CHECK-NEXT: adrp x9, .LCPI2_1
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_1]
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_1]
 ; CHECK-NEXT: sshr v0.4s, v0.4s, #31
 ; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -59,10 +59,10 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, .LCPI3_0
 ; CHECK-NEXT: adrp x9, .LCPI3_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI3_1]
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_1]
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT: ret
 %cond = icmp eq <4 x i32> %x, %y
 %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> 
@@ -72,12 +72,12 @@
 define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
 ; CHECK-LABEL: sel_Cminus1_or_C_vec:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT: adrp x8, .LCPI4_0
 ; CHECK-NEXT: adrp x9, .LCPI4_1
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI4_1]
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
 ; CHECK-NEXT: shl v0.4s, v0.4s, #31
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI4_1]
 ; CHECK-NEXT: sshr v0.4s, v0.4s, #31
 ; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -90,10 +90,10 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, .LCPI5_0
 ; CHECK-NEXT: adrp x9, .LCPI5_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_0]
-; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI5_1]
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: bsl v0.16b, v3.16b, v2.16b
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI5_1]
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
 ; CHECK-NEXT: ret
 %cond = icmp eq <4 x i32> %x, %y
 %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> 
@@ -147,9 +147,9 @@
 ; CHECK-LABEL: sel_1_or_0_vec:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: shl v0.4s, v0.4s, #31
 ; CHECK-NEXT: sshr v0.4s, v0.4s, #31
-; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> 
@@ -159,9 +159,9 @@
 define <4 x i32> @cmp_sel_1_or_0_vec(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: cmp_sel_1_or_0_vec:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.4s, #1
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT: ret
 %cond = icmp eq <4 x i32> %x, %y
 %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> 
@@ -172,9 +172,9 @@
 ; CHECK-LABEL: sel_0_or_1_vec:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: shl v0.4s, v0.4s, #31
 ; CHECK-NEXT: cmge v0.4s, v0.4s, #0
-; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
 %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> 
@@ -184,9 +184,9 @@
 define <4 x i32> @cmp_sel_0_or_1_vec(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: cmp_sel_0_or_1_vec:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.4s, #1
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: bic v0.16b, v2.16b, v0.16b
 ; CHECK-NEXT: ret
 %cond = icmp eq <4 x i32> %x, %y
 %add = select <4 x i1> %cond, <4 x i32> , <4 x i32> 
diff --git a/llvm/test/CodeGen/AArch64/win-tls.ll b/llvm/test/CodeGen/AArch64/win-tls.ll
--- a/llvm/test/CodeGen/AArch64/win-tls.ll
+++ b/llvm/test/CodeGen/AArch64/win-tls.ll
@@ -30,8 +30,8 @@
 ; CHECK-LABEL: getVar
 ; CHECK: adrp [[TLS_INDEX_ADDR:x[0-9]+]], _tls_index
-; CHECK: ldr w[[TLS_INDEX:[0-9]+]], {{\[}}[[TLS_INDEX_ADDR]], :lo12:_tls_index]
 ; CHECK: ldr [[TLS_POINTER:x[0-9]+]], [x18, #88]
+; CHECK: ldr w[[TLS_INDEX:[0-9]+]], {{\[}}[[TLS_INDEX_ADDR]], :lo12:_tls_index]
 ; CHECK: ldr [[TLS:x[0-9]+]], {{\[}}[[TLS_POINTER]], x[[TLS_INDEX]], lsl #3]
 ; CHECK: add [[TLS]], [[TLS]], :secrel_hi12:tlsVar
@@ -39,8 +39,8 @@
 ; CHECK-LABEL: getPtr
 ; CHECK: adrp [[TLS_INDEX_ADDR:x[0-9]+]], _tls_index
-; CHECK: ldr w[[TLS_INDEX:[0-9]+]], {{\[}}[[TLS_INDEX_ADDR]], :lo12:_tls_index]
 ; CHECK: ldr [[TLS_POINTER:x[0-9]+]], [x18, #88]
+; CHECK: ldr w[[TLS_INDEX:[0-9]+]], {{\[}}[[TLS_INDEX_ADDR]], :lo12:_tls_index]
 ; CHECK: ldr [[TLS:x[0-9]+]], {{\[}}[[TLS_POINTER]], x[[TLS_INDEX]], lsl #3]
 ; CHECK: add [[TLS]], [[TLS]], :secrel_hi12:tlsVar
@@ -48,8 +48,8 @@
 ; CHECK-LABEL: setVar
 ; CHECK: adrp [[TLS_INDEX_ADDR:x[0-9]+]], _tls_index
-; CHECK: ldr w[[TLS_INDEX:[0-9]+]], {{\[}}[[TLS_INDEX_ADDR]], :lo12:_tls_index]
 ; CHECK: ldr [[TLS_POINTER:x[0-9]+]], [x18, #88]
+; CHECK: ldr w[[TLS_INDEX:[0-9]+]], {{\[}}[[TLS_INDEX_ADDR]], :lo12:_tls_index]
 ; CHECK: ldr [[TLS:x[0-9]+]], {{\[}}[[TLS_POINTER]], x[[TLS_INDEX]], lsl #3]
 ; CHECK: add [[TLS]], [[TLS]], :secrel_hi12:tlsVar
diff --git a/llvm/test/CodeGen/AArch64/win64_vararg.ll b/llvm/test/CodeGen/AArch64/win64_vararg.ll
--- a/llvm/test/CodeGen/AArch64/win64_vararg.ll
+++ b/llvm/test/CodeGen/AArch64/win64_vararg.ll
@@ -5,8 +5,8 @@
 ; CHECK: str x30, [sp, #-80]!
 ; CHECK: add x8, sp, #24
 ; CHECK: add x0, sp, #24
-; CHECK: stp x1, x2, [sp, #24]
 ; CHECK: stp x3, x4, [sp, #40]
+; CHECK: stp x1, x2, [sp, #24]
 ; CHECK: stp x5, x6, [sp, #56]
 ; CHECK: str x7, [sp, #72]
 ; CHECK: str x8, [sp, #8]
@@ -61,8 +61,8 @@
 ; CHECK-LABEL: f7:
 ; CHECK: sub sp, sp, #32
 ; CHECK: add x8, sp, #24
-; CHECK: str x7, [sp, #24]
 ; CHECK: add x0, sp, #24
+; CHECK: str x7, [sp, #24]
 ; CHECK: str x8, [sp, #8]
 ; CHECK: add sp, sp, #32
 ; CHECK: ret
@@ -78,8 +78,8 @@
 ; CHECK-LABEL: copy1:
 ; CHECK: sub sp, sp, #80
 ; CHECK: add x8, sp, #24
-; CHECK: stp x1, x2, [sp, #24]
 ; CHECK: stp x3, x4, [sp, #40]
+; CHECK: stp x1, x2, [sp, #24]
 ; CHECK: stp x5, x6, [sp, #56]
 ; CHECK: str x7, [sp, #72]
 ; CHECK: stp x8, x8, [sp], #80
@@ -120,9 +120,9 @@
 ; CHECK: add x5, x29, #32
 ; CHECK: mov x1, x21
 ; CHECK: mov x2, x20
-; CHECK: orr x0, x8, #0x2
 ; CHECK: mov x3, x19
 ; CHECK: mov x4, xzr
+; CHECK: orr x0, x8, #0x2
 ; CHECK: bl __stdio_common_vsprintf
 ; CHECK: cmp w0, #0
 ; CHECK: csinv w0, w0, wzr, ge
@@ -156,15 +156,15 @@
 ; CHECK: stp x29, x30, [sp, #40]
 ; CHECK: add x29, sp, #40
 ; CHECK: add x8, x29, #24
-; CHECK: str x8, [x29, #16]
-; CHECK: mov w8, w0
-; CHECK: add x8, x8, #15
-; CHECK: lsr x15, x8, #4
+; CHECK: mov w9, w0
 ; CHECK: mov x19, x1
 ; CHECK: mov [[REG2:x[0-9]+]], sp
-; CHECK: stp x2, x3, [x29, #24]
-; CHECK: stp x4, x5, [x29, #40]
-; CHECK: stp x6, x7, [x29, #56]
+; CHECK: stp x3, x4, [x29, #32]
+; CHECK: stp x8, x2, [x29, #16]
+; CHECK: add x8, x9, #15
+; CHECK: lsr x15, x8, #4
+; CHECK: stp x5, x6, [x29, #48]
+; CHECK: str x7, [x29, #64]
 ; CHECK: bl __chkstk
 ; CHECK: mov x8, sp
 ; CHECK: sub [[REG:x[0-9]+]], x8, x15, lsl #4
@@ -176,9 +176,9 @@
 ; CHECK: mov x1, [[REG]]
 ; CHECK: mov x2, [[REG4]]
 ; CHECK: mov x3, x19
-; CHECK: orr x0, x8, #0x2
 ; CHECK: mov x4, xzr
 ; CHECK: mov x5, [[REG3]]
+; CHECK: orr x0, x8, #0x2
 ; CHECK: bl __stdio_common_vsprintf
 ; CHECK: mov sp, [[REG2]]
 ; CHECK: sub sp, x29, #40
@@ -256,17 +256,17 @@
 ; CHECK-LABEL: fixed_params
 ; CHECK: sub sp, sp, #32
-; CHECK-DAG: mov w6, w3
-; CHECK-DAG: mov [[REG1:w[0-9]+]], w2
+; CHECK: mov w8, w4
+; CHECK: mov w6, w3
+; CHECK: mov w4, w2
 ; CHECK: mov w2, w1
 ; CHECK: fmov x1, d0
 ; CHECK: fmov x3, d1
 ; CHECK: fmov x5, d2
 ; CHECK: fmov x7, d3
-; CHECK: str w4, [sp]
-; CHECK: mov w4, [[REG1]]
 ; CHECK: str x30, [sp, #16]
 ; CHECK: str d4, [sp, #8]
+; CHECK: str w8, [sp]
 ; CHECK: bl varargs
 ; CHECK: ldr x30, [sp, #16]
 ; CHECK: add sp, sp, #32
diff --git a/llvm/test/CodeGen/AArch64/win64_vararg_float.ll b/llvm/test/CodeGen/AArch64/win64_vararg_float.ll
--- a/llvm/test/CodeGen/AArch64/win64_vararg_float.ll
+++ b/llvm/test/CodeGen/AArch64/win64_vararg_float.ll
@@ -16,9 +16,9 @@
 ; O0: str x8, [sp, #8]
 ; O0: ldr x0, [sp, #8]
 ; DAGISEL: add x0, sp, #16
-; DAGISEL: stp x2, x3, [sp, #16]
-; DAGISEL: stp x4, x5, [sp, #32]
-; DAGISEL: stp x6, x7, [sp, #48]
+; DAGISEL: stp x3, x4, [sp, #24]
+; DAGISEL: stp x5, x6, [sp, #40]
+; DAGISEL: stp x8, x2, [sp, #8]
 ; CHECK: bl f_va_list
 %ap = alloca i8*, align 8
 %0 = bitcast i8** %ap to i8*
@@ -51,9 +51,9 @@
 ; O0: str x8, [sp, #8]
 ; O0: ldr x0, [sp, #8]
 ; DAGISEL: add x0, sp, #16
-; DAGISEL: stp x2, x3, [sp, #16]
-; DAGISEL: stp x4, x5, [sp, #32]
-; DAGISEL: stp x6, x7, [sp, #48]
+; DAGISEL: stp x3, x4, [sp, #24]
+; DAGISEL: stp x5, x6, [sp, #40]
+; DAGISEL: stp x8, x2, [sp, #8]
 ; CHECK: bl d_va_list
 %ap = alloca i8*, align 8
 %0 = bitcast i8** %ap to i8*
diff --git a/llvm/test/CodeGen/AArch64/win64_vararg_float_cc.ll b/llvm/test/CodeGen/AArch64/win64_vararg_float_cc.ll
--- a/llvm/test/CodeGen/AArch64/win64_vararg_float_cc.ll
+++ b/llvm/test/CodeGen/AArch64/win64_vararg_float_cc.ll
@@ -16,9 +16,9 @@
 ; O0: str x8, [sp, #8]
 ; O0: ldr x0, [sp, #8]
 ; DAGISEL: add x0, sp, #32
-; DAGISEL: stp x2, x3, [sp, #32]
-; DAGISEL: stp x4, x5, [sp, #48]
-; DAGISEL: stp x6, x7, [sp, #64]
+; DAGISEL: stp x3, x4, [sp, #40]
+; DAGISEL: str x2, [sp, #32]
+; DAGISEL: stp x5, x6, [sp, #56]
 ; CHECK: bl f_va_list
 %ap = alloca i8*, align 8
 %0 = bitcast i8** %ap to i8*
@@ -51,9 +51,9 @@
 ; O0: str x8, [sp, #8]
 ; O0: ldr x0, [sp, #8]
 ; DAGISEL: add x0, sp, #32
-; DAGISEL: stp x2, x3, [sp, #32]
-; DAGISEL: stp x4, x5, [sp, #48]
-; DAGISEL: stp x6, x7, [sp, #64]
+; DAGISEL: stp x3, x4, [sp, #40]
+; DAGISEL: str x2, [sp, #32]
+; DAGISEL: stp x5, x6, [sp, #56]
 ; CHECK: bl d_va_list
 %ap = alloca i8*, align 8
 %0 = bitcast i8** %ap to i8*
diff --git a/llvm/test/CodeGen/AArch64/xor.ll b/llvm/test/CodeGen/AArch64/xor.ll
--- a/llvm/test/CodeGen/AArch64/xor.ll
+++ b/llvm/test/CodeGen/AArch64/xor.ll
@@ -62,9 +62,9 @@
 define <4 x i32> @vec_add_of_not_with_undef(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: vec_add_of_not_with_undef:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff
 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
 %t0 = sub <4 x i32> %x, %y
 %r = add <4 x i32> %t0, 
@@ -74,9 +74,9 @@
 define <4 x i32> @vec_add_of_not_with_undef_decrement(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: vec_add_of_not_with_undef_decrement:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.4s, #1
 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: movi v1.4s, #1
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT: ret
 %t0 = sub <4 x i32> %x, %y
 %r = add <4 x i32> %t0, 
diff --git a/llvm/test/MC/AArch64/elf-globaladdress.ll b/llvm/test/MC/AArch64/elf-globaladdress.ll
--- a/llvm/test/MC/AArch64/elf-globaladdress.ll
+++ b/llvm/test/MC/AArch64/elf-globaladdress.ll
@@ -42,12 +42,12 @@
 ; OBJ: Relocations [
 ; OBJ: Section {{.*}} .rela.text {
 ; OBJ: 0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 var8
-; OBJ: 0x{{[0-9,A-F]+}} R_AARCH64_LDST8_ABS_LO12_NC var8
 ; OBJ: 0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 var16
-; OBJ: 0x{{[0-9,A-F]+}} R_AARCH64_LDST16_ABS_LO12_NC var16
+; OBJ: 0x{{[0-9,A-F]+}} R_AARCH64_LDST8_ABS_LO12_NC var8
 ; OBJ: 0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 var32
-; OBJ: 0x{{[0-9,A-F]+}} R_AARCH64_LDST32_ABS_LO12_NC var32
+; OBJ: 0x{{[0-9,A-F]+}} R_AARCH64_LDST16_ABS_LO12_NC var16
 ; OBJ: 0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 var64
+; OBJ: 0x{{[0-9,A-F]+}} R_AARCH64_LDST32_ABS_LO12_NC var32
 ; OBJ: 0x{{[0-9,A-F]+}} R_AARCH64_LDST64_ABS_LO12_NC var64
 ; This is on the store, so not really important, but it stops the next
diff --git a/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll b/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll
--- a/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll
+++ b/llvm/test/Transforms/CanonicalizeFreezeInLoops/aarch64.ll
@@ -11,8 +11,8 @@
 ; CHECK-NEXT: .LBB0_1: // %loop
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: strb wzr, [x0, w8, sxtw]
-; CHECK-NEXT: subs w1, w1, #1
 ; CHECK-NEXT: add w8, w8, #1
+; CHECK-NEXT: subs w1, w1, #1
 ; CHECK-NEXT: b.ne .LBB0_1
 ; CHECK-NEXT: // %bb.2: // %exit
 ; CHECK-NEXT: ret
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
@@ -14,9 +14,9 @@
 ; CHECK-NEXT: b.ge .LBB0_2
 ; CHECK-NEXT: .LBB0_1: // %while_body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: str w8, [x9, #4]
-; CHECK-NEXT: add w8, w8, #1
-; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: add w10, w8, #1
+; CHECK-NEXT: stp w10, w8, [x9]
+; CHECK-NEXT: mov w8, w10
 ; CHECK-NEXT: cmp w8, w1
 ; CHECK-NEXT: b.lt .LBB0_1
 ; CHECK-NEXT: .LBB0_2: // %while_end
@@ -54,9 +54,9 @@
 ; CHECK-NEXT: b.ge .LBB1_3
 ; CHECK-NEXT: .LBB1_2: // %while_body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: str w8, [x9, #4]
-; CHECK-NEXT: add w8, w8, #1
-; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: add w10, w8, #1
+; CHECK-NEXT: stp w10, w8, [x9]
+; CHECK-NEXT: mov w8, w10
 ; CHECK-NEXT: cmp w8, w1
 ; CHECK-NEXT: b.lt .LBB1_2
 ; CHECK-NEXT: .LBB1_3: // %while_end
@@ -96,9 +96,9 @@
 ; CHECK-NEXT: b.ge .LBB2_3
 ; CHECK-NEXT: .LBB2_2: // %while_body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: str w8, [x9, #4]
-; CHECK-NEXT: add w8, w8, #1
-; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: add w10, w8, #1
+; CHECK-NEXT: stp w10, w8, [x9]
+; CHECK-NEXT: mov w8, w10
 ; CHECK-NEXT: cmp w8, w3
 ; CHECK-NEXT: b.lt .LBB2_2
 ; CHECK-NEXT: .LBB2_3: // %while_end
@@ -164,9 +164,9 @@
 ; CHECK-NEXT: b.ge .LBB3_4
 ; CHECK-NEXT: // %bb.3: // %while_body
 ; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1
-; CHECK-NEXT: str w20, [x8, #4]
-; CHECK-NEXT: add w20, w20, #1
-; CHECK-NEXT: str w20, [x8]
+; CHECK-NEXT: add w9, w20, #1
+; CHECK-NEXT: stp w9, w20, [x8]
+; CHECK-NEXT: mov w20, w9
 ; CHECK-NEXT: b .LBB3_1
 ; CHECK-NEXT: .LBB3_4: // %while_end
 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
@@ -222,9 +222,9 @@
 ; CHECK-NEXT: b.ge .LBB4_2
 ; CHECK-NEXT: .LBB4_1: // %while_body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: str w8, [x9, #4]
-; CHECK-NEXT: add w8, w8, #1
-; CHECK-NEXT: str w8, [x9]
+; CHECK-NEXT: add w10, w8, #1
+; CHECK-NEXT: stp w10, w8, [x9]
+; CHECK-NEXT: mov w8, w10
 ; CHECK-NEXT: cmp w8, w1
 ; CHECK-NEXT: b.lt .LBB4_1
 ; CHECK-NEXT: .LBB4_2: // %while_end
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-pre-inc-offset-check.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-pre-inc-offset-check.ll
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-pre-inc-offset-check.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-pre-inc-offset-check.ll
@@ -19,15 +19,15 @@
 define void @test_lsr_pre_inc_offset_check(%"Type"* %p) {
 ; CHECK-LABEL: test_lsr_pre_inc_offset_check:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #340
-; CHECK-NEXT: mov w9, #165
+; CHECK-NEXT: mov w8, #165
+; CHECK-NEXT: add x9, x0, #340
 ; CHECK-NEXT: mov w10, #2
 ; CHECK-NEXT: .LBB0_1: // %main
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: stur wzr, [x8, #-1]
-; CHECK-NEXT: strb w10, [x8]
-; CHECK-NEXT: subs x9, x9, #1
-; CHECK-NEXT: add x8, x8, #338
+; CHECK-NEXT: stur wzr, [x9, #-1]
+; CHECK-NEXT: subs x8, x8, #1
+; CHECK-NEXT: strb w10, [x9]
+; CHECK-NEXT: add x9, x9, #338
 ; CHECK-NEXT: b.ne .LBB0_1
 ; CHECK-NEXT: // %bb.2: // %exit
 ; CHECK-NEXT: ret
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
@@ -67,8 +67,8 @@
 ; CHECK-NEXT: add x8, x0, #28
 ; CHECK-NEXT: .LBB1_2: // %for.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr s1, [x8, x1, lsl #2]
 ; CHECK-NEXT: scvtf s2, x1
+; CHECK-NEXT: ldr s1, [x8, x1, lsl #2]
 ; CHECK-NEXT: fadd s2, s2, s0
 ; CHECK-NEXT: fcmp s1, s2
 ; CHECK-NEXT: b.gt .LBB1_5
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
@@ -90,7 +90,9 @@
 ; CHECK-NEXT: bl OUTLINED_FUNCTION_0
 ; CHECK-NEXT: .LBB0_5:
 ; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: b OUTLINED_FUNCTION_1
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
 ;
 ; CHECK-LABEL: main:
 ; CHECK: // %bb.0:
@@ -100,21 +102,23 @@
 ; CHECK-NEXT: .cfi_def_cfa w29, 16
 ; CHECK-NEXT: .cfi_offset w30, -8
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: adrp x8, x
-; CHECK-NEXT: mov w9, #1
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: adrp x9, x
 ; CHECK-NEXT: mov w10, #2
 ; CHECK-NEXT: mov w11, #3
-; CHECK-NEXT: mov w12, #4
-; CHECK-NEXT: stp w9, wzr, [x29, #-8]
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: str w8, [x9, :lo12:x]
+; CHECK-NEXT: mov w9, #4
+; CHECK-NEXT: stp w8, wzr, [x29, #-8]
 ; CHECK-NEXT: stur w10, [x29, #-12]
-; CHECK-NEXT: stp w12, w11, [sp, #12]
-; CHECK-NEXT: str w9, [x8, :lo12:x]
+; CHECK-NEXT: stp w9, w11, [sp, #12]
 ; CHECK-NEXT: //APP
 ; CHECK-NEXT: //NO_APP
-; CHECK-NEXT: stp w10, w9, [x29, #-12]
+; CHECK-NEXT: stp w10, w8, [x29, #-12]
 ; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: stp w12, w11, [sp, #12]
-; CHECK-NEXT: b OUTLINED_FUNCTION_1
+; CHECK-NEXT: stp w9, w11, [sp, #12]
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
 ;
 ; CHECK-LABEL: OUTLINED_FUNCTION_0:
 ; CHECK: // %bb.0:
@@ -124,9 +128,3 @@
 ; CHECK-NEXT: stp w9, w8, [x29, #-12]
 ; CHECK-NEXT: stp w11, w10, [sp, #12]
 ; CHECK-NEXT: ret
-;
-; CHECK-LABEL: OUTLINED_FUNCTION_1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: add sp, sp, #48
-; CHECK-NEXT: ret
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
@@ -31,7 +31,9 @@
 ; CHECK-NEXT: bl OUTLINED_FUNCTION_0
 ; CHECK-NEXT: .LBB0_5:
 ; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: b OUTLINED_FUNCTION_1
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
 %1 = alloca i32, align 4
 %2 = alloca i32, align 4
 %3 = alloca i32, align 4
@@ -77,21 +79,23 @@
 ; CHECK-NEXT: .cfi_def_cfa w29, 16
 ; CHECK-NEXT: .cfi_offset w30, -8
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: adrp x8, x
-; CHECK-NEXT: mov w9, #1
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: adrp x9, x
 ; CHECK-NEXT: mov w10, #2
 ; CHECK-NEXT: mov w11, #3
-; CHECK-NEXT: mov w12, #4
-; CHECK-NEXT: stp w9, wzr, [x29, #-8]
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: str w8, [x9, :lo12:x]
+; CHECK-NEXT: mov w9, #4
+; CHECK-NEXT: stp w8, wzr, [x29, #-8]
 ; CHECK-NEXT: stur w10, [x29, #-12]
-; CHECK-NEXT: stp w12, w11, [sp, #12]
-; CHECK-NEXT: str w9, [x8, :lo12:x]
+; CHECK-NEXT: stp w9, w11, [sp, #12]
 ; CHECK-NEXT: //APP
 ; CHECK-NEXT: //NO_APP
-; CHECK-NEXT: stp w10, w9, [x29, #-12]
+; CHECK-NEXT: stp w10, w8, [x29, #-12]
 ; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: stp w12, w11, [sp, #12]
-; CHECK-NEXT: b OUTLINED_FUNCTION_1
+; CHECK-NEXT: stp w9, w11, [sp, #12]
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
 %1 = alloca i32, align 4
 %2 = alloca i32, align 4
 %3 = alloca i32, align 4