diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17881,6 +17881,11 @@
   unsigned NumElts = VecVT.getVectorNumElements();
   unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
 
+  // Try to simplify the whole operation to a constant, or simplify its
+  // operands.
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
   // TODO: These transforms should not require the 'hasOneUse' restriction, but
   // there are regressions on multiple targets without it. We can end up with a
   // mess of scalar and vector code if we reduce only part of the DAG to scalar.
diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
--- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -57,8 +57,8 @@
 ; CHECK-LABEL: widen_f16_build_vector:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #13294
-; CHECK-NEXT: dup.4h v0, w8
-; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: movk w8, #13294, lsl #16
+; CHECK-NEXT: str w8, [x0]
 ; CHECK-NEXT: ret
   %1 = bitcast half* %addr to <2 x half>*
   store <2 x half> <half 0xH33EE, half 0xH33EE>, <2 x half>* %1, align 2
diff --git a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
--- a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
+++ b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
@@ -8,19 +8,14 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, .LCPI0_0
 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT: adrp x8, .LCPI0_1
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT: movi v2.4h, #1
 ; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: mov w1, wzr
 ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: movi v1.4h, #1
-; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: shl v0.4h, v0.4h, #15
-; CHECK-NEXT: sshr v0.4h, v0.4h, #15
+; CHECK-NEXT: cmgt v0.4h, v2.4h, v0.4h
 ; CHECK-NEXT: umov w0, v0.h[0]
 ; CHECK-NEXT: umov w3, v0.h[3]
-; CHECK-NEXT: mov w1, wzr
 ; CHECK-NEXT: mov w2, wzr
 ; CHECK-NEXT: b foo
   %tmp3 = shufflevector <4 x i16> %a1, <4 x i16> undef, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -74,8 +74,7 @@
 ; CHECK-NEXT: mov v1.16b, v0.16b
 ; CHECK-NEXT: fmov w0, s1
 ; CHECK-NEXT: // kill: def $x0 killed $w0
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: mov x1, v0.d[1]
+; CHECK-NEXT: mov x1, xzr
 ; CHECK-NEXT: ret
 Entry:
   %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -373,16 +373,16 @@
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x7, #0 // =0
+; CHECK-NEXT: cmp x5, #0 // =0
 ; CHECK-NEXT: cset w9, ge
 ; CHECK-NEXT: csinc w9, w9, wzr, ne
-; CHECK-NEXT: cmp x3, #0 // =0
+; CHECK-NEXT: cmp x1, #0 // =0
 ; CHECK-NEXT: cset w10, ge
 ; CHECK-NEXT: csinc w10, w10, wzr, ne
 ; CHECK-NEXT: cmp w10, w9
 ; CHECK-NEXT: cset w9, eq
-; CHECK-NEXT: adds x11, x2, x6
-; CHECK-NEXT: adcs x12, x3, x7
+; CHECK-NEXT: adds x11, x0, x4
+; CHECK-NEXT: adcs x12, x1, x5
 ; CHECK-NEXT: cmp x12, #0 // =0
 ; CHECK-NEXT: cset w13, ge
 ; CHECK-NEXT: mov x8, #9223372036854775807
@@ -392,31 +392,28 @@
 ; CHECK-NEXT: cset w13, ne
 ; CHECK-NEXT: asr x10, x12, #63
 ; CHECK-NEXT: tst w9, w13
-; CHECK-NEXT: csel x3, x14, x12, ne
-; CHECK-NEXT: csel x2, x10, x11, ne
-; CHECK-NEXT: cmp x5, #0 // =0
+; CHECK-NEXT: csel x1, x14, x12, ne
+; CHECK-NEXT: csel x0, x10, x11, ne
+; CHECK-NEXT: cmp x7, #0 // =0
 ; CHECK-NEXT: cset w9, ge
 ; CHECK-NEXT: csinc w9, w9, wzr, ne
-; CHECK-NEXT: cmp x1, #0 // =0
+; CHECK-NEXT: cmp x3, #0 // =0
 ; CHECK-NEXT: cset w10, ge
 ; CHECK-NEXT: csinc w10, w10, wzr, ne
 ; CHECK-NEXT: cmp w10, w9
 ; CHECK-NEXT: cset w9, eq
-; CHECK-NEXT: adds x11, x0, x4
-; CHECK-NEXT: adcs x12, x1, x5
+; CHECK-NEXT: adds x11, x2, x6
+; CHECK-NEXT: adcs x12, x3, x7
 ; CHECK-NEXT: cmp x12, #0 // =0
-; CHECK-NEXT: cset w13, ge
-; CHECK-NEXT: csinc w13, w13, wzr, ne
+; CHECK-NEXT: cset w14, ge
+; CHECK-NEXT: csinc w14, w14, wzr, ne
 ; CHECK-NEXT: cinv x8, x8, ge
-; CHECK-NEXT: cmp w10, w13
+; CHECK-NEXT: cmp w10, w14
 ; CHECK-NEXT: cset w10, ne
+; CHECK-NEXT: asr x13, x12, #63
 ; CHECK-NEXT: tst w9, w10
-; CHECK-NEXT: asr x9, x12, #63
-; CHECK-NEXT: csel x9, x9, x11, ne
-; CHECK-NEXT: csel x1, x8, x12, ne
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csel x2, x13, x11, ne
+; CHECK-NEXT: csel x3, x8, x12, ne
 ; CHECK-NEXT: ret
   %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -374,16 +374,16 @@
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x7, #0 // =0
+; CHECK-NEXT: cmp x5, #0 // =0
 ; CHECK-NEXT: cset w9, ge
 ; CHECK-NEXT: csinc w9, w9, wzr, ne
-; CHECK-NEXT: cmp x3, #0 // =0
+; CHECK-NEXT: cmp x1, #0 // =0
 ; CHECK-NEXT: cset w10, ge
 ; CHECK-NEXT: csinc w10, w10, wzr, ne
 ; CHECK-NEXT: cmp w10, w9
 ; CHECK-NEXT: cset w9, ne
-; CHECK-NEXT: subs x11, x2, x6
-; CHECK-NEXT: sbcs x12, x3, x7
+; CHECK-NEXT: subs x11, x0, x4
+; CHECK-NEXT: sbcs x12, x1, x5
 ; CHECK-NEXT: cmp x12, #0 // =0
 ; CHECK-NEXT: cset w13, ge
 ; CHECK-NEXT: mov x8, #9223372036854775807
@@ -393,31 +393,28 @@
 ; CHECK-NEXT: cset w13, ne
 ; CHECK-NEXT: asr x10, x12, #63
 ; CHECK-NEXT: tst w9, w13
-; CHECK-NEXT: csel x3, x14, x12, ne
-; CHECK-NEXT: csel x2, x10, x11, ne
-; CHECK-NEXT: cmp x5, #0 // =0
+; CHECK-NEXT: csel x1, x14, x12, ne
+; CHECK-NEXT: csel x0, x10, x11, ne
+; CHECK-NEXT: cmp x7, #0 // =0
 ; CHECK-NEXT: cset w9, ge
 ; CHECK-NEXT: csinc w9, w9, wzr, ne
-; CHECK-NEXT: cmp x1, #0 // =0
+; CHECK-NEXT: cmp x3, #0 // =0
 ; CHECK-NEXT: cset w10, ge
 ; CHECK-NEXT: csinc w10, w10, wzr, ne
 ; CHECK-NEXT: cmp w10, w9
 ; CHECK-NEXT: cset w9, ne
-; CHECK-NEXT: subs x11, x0, x4
-; CHECK-NEXT: sbcs x12, x1, x5
+; CHECK-NEXT: subs x11, x2, x6
+; CHECK-NEXT: sbcs x12, x3, x7
 ; CHECK-NEXT: cmp x12, #0 // =0
-; CHECK-NEXT: cset w13, ge
-; CHECK-NEXT: csinc w13, w13, wzr, ne
+; CHECK-NEXT: cset w14, ge
+; CHECK-NEXT: csinc w14, w14, wzr, ne
 ; CHECK-NEXT: cinv x8, x8, ge
-; CHECK-NEXT: cmp w10, w13
+; CHECK-NEXT: cmp w10, w14
 ; CHECK-NEXT: cset w10, ne
+; CHECK-NEXT: asr x13, x12, #63
 ; CHECK-NEXT: tst w9, w10
-; CHECK-NEXT: asr x9, x12, #63
-; CHECK-NEXT: csel x9, x9, x11, ne
-; CHECK-NEXT: csel x1, x8, x12, ne
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csel x2, x13, x11, ne
+; CHECK-NEXT: csel x3, x8, x12, ne
 ; CHECK-NEXT: ret
   %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -371,16 +371,6 @@
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: adds x8, x2, x6
-; CHECK-NEXT: adcs x9, x3, x7
-; CHECK-NEXT: cmp x8, x2
-; CHECK-NEXT: cset w10, lo
-; CHECK-NEXT: cmp x9, x3
-; CHECK-NEXT: cset w11, lo
-; CHECK-NEXT: csel w10, w10, w11, eq
-; CHECK-NEXT: cmp w10, #0 // =0
-; CHECK-NEXT: csinv x3, x9, xzr, eq
-; CHECK-NEXT: csinv x2, x8, xzr, eq
 ; CHECK-NEXT: adds x8, x0, x4
 ; CHECK-NEXT: adcs x9, x1, x5
 ; CHECK-NEXT: cmp x8, x0
@@ -389,11 +379,18 @@
 ; CHECK-NEXT: cset w11, lo
 ; CHECK-NEXT: csel w10, w10, w11, eq
 ; CHECK-NEXT: cmp w10, #0 // =0
-; CHECK-NEXT: csinv x8, x8, xzr, eq
 ; CHECK-NEXT: csinv x1, x9, xzr, eq
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csinv x0, x8, xzr, eq
+; CHECK-NEXT: adds x8, x2, x6
+; CHECK-NEXT: adcs x9, x3, x7
+; CHECK-NEXT: cmp x8, x2
+; CHECK-NEXT: cset w10, lo
+; CHECK-NEXT: cmp x9, x3
+; CHECK-NEXT: cset w11, lo
+; CHECK-NEXT: csel w10, w10, w11, eq
+; CHECK-NEXT: cmp w10, #0 // =0
+; CHECK-NEXT: csinv x2, x8, xzr, eq
+; CHECK-NEXT: csinv x3, x9, xzr, eq
 ; CHECK-NEXT: ret
   %z = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -372,16 +372,6 @@
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: subs x8, x2, x6
-; CHECK-NEXT: sbcs x9, x3, x7
-; CHECK-NEXT: cmp x8, x2
-; CHECK-NEXT: cset w10, hi
-; CHECK-NEXT: cmp x9, x3
-; CHECK-NEXT: cset w11, hi
-; CHECK-NEXT: csel w10, w10, w11, eq
-; CHECK-NEXT: cmp w10, #0 // =0
-; CHECK-NEXT: csel x3, xzr, x9, ne
-; CHECK-NEXT: csel x2, xzr, x8, ne
 ; CHECK-NEXT: subs x8, x0, x4
 ; CHECK-NEXT: sbcs x9, x1, x5
 ; CHECK-NEXT: cmp x8, x0
@@ -390,11 +380,18 @@
 ; CHECK-NEXT: cset w11, hi
 ; CHECK-NEXT: csel w10, w10, w11, eq
 ; CHECK-NEXT: cmp w10, #0 // =0
-; CHECK-NEXT: csel x8, xzr, x8, ne
 ; CHECK-NEXT: csel x1, xzr, x9, ne
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csel x0, xzr, x8, ne
+; CHECK-NEXT: subs x8, x2, x6
+; CHECK-NEXT: sbcs x9, x3, x7
+; CHECK-NEXT: cmp x8, x2
+; CHECK-NEXT: cset w10, hi
+; CHECK-NEXT: cmp x9, x3
+; CHECK-NEXT: cset w11, hi
+; CHECK-NEXT: csel w10, w10, w11, eq
+; CHECK-NEXT: cmp w10, #0 // =0
+; CHECK-NEXT: csel x2, xzr, x8, ne
+; CHECK-NEXT: csel x3, xzr, x9, ne
 ; CHECK-NEXT: ret
   %z = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
   ret <2 x i128> %z
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
--- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
@@ -96,24 +96,18 @@
 define i8 @test_v9i8(<9 x i8> %a) nounwind {
 ; CHECK-LABEL: test_v9i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-1
-; CHECK-NEXT: mov v0.b[9], w8
-; CHECK-NEXT: mov v0.b[10], w8
-; CHECK-NEXT: mov v0.b[11], w8
-; CHECK-NEXT: mov v0.b[12], w8
-; CHECK-NEXT: mov v0.b[13], w8
 ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT: and v1.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v1.b[1]
+; CHECK-NEXT: umov w8, v0.b[1]
 ; CHECK-NEXT: umov w9, v1.b[0]
 ; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: umov w9, v1.b[2]
+; CHECK-NEXT: umov w9, v0.b[2]
 ; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v1.b[3]
+; CHECK-NEXT: umov w9, v0.b[3]
 ; CHECK-NEXT: and w8, w8, w9
 ; CHECK-NEXT: umov w9, v0.b[4]
 ; CHECK-NEXT: and w8, w8, w9
-; CHECK-NEXT: umov w9, v1.b[5]
+; CHECK-NEXT: umov w9, v0.b[5]
 ; CHECK-NEXT: and w8, w8, w9
 ; CHECK-NEXT: umov w9, v0.b[6]
 ; CHECK-NEXT: and w8, w8, w9
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -537,17 +537,16 @@
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
 ; VI-NEXT: s_mov_b32 s8, s6
 ; VI-NEXT: s_mov_b32 s9, s7
-; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
-; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1
-; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
+; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
 ; VI-NEXT: s_mov_b32 s0, s4
 ; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
+; VI-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1
 ; VI-NEXT: s_endpgm
   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -108,7 +108,6 @@
 
 ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64
 ; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]]
-; GCN: v_mov_b32_e32 v[[R_I64_1_High:[0-9]+]], 0
 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
@@ -119,6 +118,7 @@
 ; VI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]]
 ; VI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]]
 ; GCN: v_mov_b32_e32 v[[R_I64_0_High:[0-9]+]], 0
+; GCN: v_mov_b32_e32 v[[R_I64_1_High:[0-9]+]], v[[R_I64_0_High]]
 ; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -236,35 +236,35 @@
 ; GFX6-LABEL: v_saddsat_v4i16:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v8, v1, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v6, 0, 16
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; GFX6-NEXT: v_bfe_i32 v6, v7, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT: s_movk_i32 s4, 0x7fff
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v8, v5
 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
+; GFX6-NEXT: v_min_i32_e32 v3, s4, v3
 ; GFX6-NEXT: s_movk_i32 s5, 0x8000
+; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
+; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0
+; GFX6-NEXT: v_max_i32_e32 v3, s5, v3
+; GFX6-NEXT: v_max_i32_e32 v2, s5, v2
 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
 ; GFX6-NEXT: s_mov_b32 s6, 0xffff
-; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT: v_and_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7
-; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
-; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT: v_max_i32_e32 v2, s5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_and_b32_e32 v2, s6, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -499,7 +499,7 @@
 ; GCN-LABEL: s_test_sdiv24_64:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -529,7 +529,7 @@
 ; GCN-IR-LABEL: s_test_sdiv24_64:
 ; GCN-IR: ; %bb.0:
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT: s_mov_b32 s2, -1
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -671,7 +671,7 @@
 ; GCN-LABEL: s_test_sdiv31_64:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -701,7 +701,7 @@
 ; GCN-IR-LABEL: s_test_sdiv31_64:
 ; GCN-IR: ; %bb.0:
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT: s_mov_b32 s2, -1
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -738,7 +738,7 @@
 ; GCN-LABEL: s_test_sdiv23_64:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -768,7 +768,7 @@
 ; GCN-IR-LABEL: s_test_sdiv23_64:
 ; GCN-IR: ; %bb.0:
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT: s_mov_b32 s2, -1
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -805,7 +805,7 @@
 ; GCN-LABEL: s_test_sdiv25_64:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -835,7 +835,7 @@
 ; GCN-IR-LABEL: s_test_sdiv25_64:
 ; GCN-IR: ; %bb.0:
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT: s_mov_b32 s2, -1
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -125,18 +125,18 @@
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, 64, v0
-; GCN-NEXT: v_lshr_b64 v[2:3], 17, v1
-; GCN-NEXT: v_subrev_i32_e32 v1, vcc, 64, v0
-; GCN-NEXT: v_lshl_b64 v[4:5], 17, v1
+; GCN-NEXT: v_lshr_b64 v[1:2], 17, v1
+; GCN-NEXT: v_subrev_i32_e32 v2, vcc, 64, v0
+; GCN-NEXT: v_lshl_b64 v[2:3], 17, v2
 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
-; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
+; GCN-NEXT: v_lshl_b64 v[4:5], 17, v0
 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v1, s[4:5]
-; GCN-NEXT: v_lshl_b64 v[0:1], 17, v0
-; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GCN-NEXT: s_setpc_b64 s[30:31]
   %shl = shl i128 17, %rhs
   ret i128 %shl
@@ -146,16 +146,15 @@
 ; GCN-LABEL: v_lshr_i128_kv:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_movk_i32 s4, 0x41
 ; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_movk_i32 s4, 0x41
 ; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0
 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
-; GCN-NEXT: v_mov_b32_e32 v3, s4
-; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, 0
 ; GCN-NEXT: v_mov_b32_e32 v2, 0
 ; GCN-NEXT: v_mov_b32_e32 v3, 0
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -169,11 +168,10 @@
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_lshr_b64 v[1:2], 33, v0
 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
-; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v1, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, 33, v1, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, 0
 ; GCN-NEXT: v_mov_b32_e32 v2, 0
 ; GCN-NEXT: v_mov_b32_e32 v3, 0
 ; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -480,7 +480,7 @@
 ; GCN-LABEL: s_test_srem23_64:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -512,7 +512,7 @@
 ; GCN-IR-LABEL: s_test_srem23_64:
 ; GCN-IR: ; %bb.0:
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT: s_mov_b32 s2, -1
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -551,7 +551,7 @@
 ; GCN-LABEL: s_test_srem24_64:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -583,7 +583,7 @@
 ; GCN-IR-LABEL: s_test_srem24_64:
 ; GCN-IR: ; %bb.0:
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT: s_mov_b32 s2, -1
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -676,7 +676,7 @@
 ; GCN-LABEL: s_test_srem25_64:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -708,7 +708,7 @@
 ; GCN-IR-LABEL: s_test_srem25_64:
 ; GCN-IR: ; %bb.0:
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT: s_mov_b32 s2, -1
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
@@ -747,7 +747,7 @@
 ; GCN-LABEL: s_test_srem31_64:
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NEXT: s_mov_b32 s2, -1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -779,7 +779,7 @@
 ; GCN-IR-LABEL: s_test_srem31_64:
 ; GCN-IR: ; %bb.0:
 ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT: s_mov_b32 s2, -1
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -237,35 +237,35 @@
 ; GFX6-LABEL: v_ssubsat_v4i16:
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v8, v1, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v6, 0, 16
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; GFX6-NEXT: v_bfe_i32 v6, v7, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT: s_movk_i32 s4, 0x7fff
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v8, v5
 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
+; GFX6-NEXT: v_min_i32_e32 v3, s4, v3
 ; GFX6-NEXT: s_movk_i32 s5, 0x8000
+; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
+; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0
+; GFX6-NEXT: v_max_i32_e32 v3, s5, v3
+; GFX6-NEXT: v_max_i32_e32 v2, s5, v2
 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
 ; GFX6-NEXT: s_mov_b32 s6, 0xffff
-; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v1, s6, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT: v_and_b32_e32 v0, s6, v0
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v7
-; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
-; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT: v_max_i32_e32 v2, s5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_and_b32_e32 v2, s6, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -442,9 +442,9 @@
 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
 ; GFX9-NEXT: s_mov_b32 s6, -1
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_sub_i16 v2, v2, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_pk_sub_i16 v0, v2, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX9-NEXT: s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -167,26 +167,26 @@
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_and_b32_e32 v5, s4, v5
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; GFX6-NEXT: v_and_b32_e32 v4, s4, v4
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT: v_min_u32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v7, s4, v7
+; GFX6-NEXT: v_and_b32_e32 v8, s4, v1
+; GFX6-NEXT: v_and_b32_e32 v1, s4, v7
 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v3
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT: v_and_b32_e32 v6, s4, v6
 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT: v_min_u32_e32 v0, s4, v0
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; GFX6-NEXT: v_min_u32_e32 v1, s4, v1
+; GFX6-NEXT: v_and_b32_e32 v5, s4, v5
+; GFX6-NEXT: v_min_u32_e32 v2, s4, v2
 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v6
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v7
+; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v8, v5
+; GFX6-NEXT: v_and_b32_e32 v4, s4, v4
+; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT: v_min_u32_e32 v2, s4, v2
-; GFX6-NEXT: v_min_u32_e32 v1, s4, v1
+; GFX6-NEXT: v_min_u32_e32 v0, s4, v0
 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_uaddsat_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -168,28 +168,28 @@
 ; GFX6: ; %bb.0:
 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT: s_mov_b32 s4, 0xffff
-; GFX6-NEXT: v_and_b32_e32 v10, s4, v4
+; GFX6-NEXT: v_and_b32_e32 v10, s4, v1
+; GFX6-NEXT: v_and_b32_e32 v8, s4, v4
 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_and_b32_e32 v11, s4, v5
-; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX6-NEXT: v_max_u32_e32 v1, v1, v11
-; GFX6-NEXT: v_max_u32_e32 v0, v0, v10
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT: v_and_b32_e32 v8, s4, v6
+; GFX6-NEXT: v_and_b32_e32 v9, s4, v5
+; GFX6-NEXT: v_and_b32_e32 v1, s4, v6
 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
-; GFX6-NEXT: v_and_b32_e32 v9, s4, v7
+; GFX6-NEXT: v_and_b32_e32 v11, s4, v7
 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
-; GFX6-NEXT: v_max_u32_e32 v2, v2, v8
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_max_u32_e32 v1, v3, v9
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_max_u32_e32 v1, v2, v1
+; GFX6-NEXT: v_max_u32_e32 v3, v3, v11
+; GFX6-NEXT: v_max_u32_e32 v2, v10, v9
+; GFX6-NEXT: v_max_u32_e32 v0, v0, v8
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_usubsat_v4i16:
diff --git a/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll b/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
--- a/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
+++ b/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll
@@ -43,7 +43,18 @@
 ; CHECK: @ %bb.0:
 ; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32]
 ; CHECK-NEXT: vmovl.u8 q8, d16
-; CHECK-NEXT: vmovl.u16 q8, d16
+; CHECK-NEXT: vmov.u16 r0, d16[0]
+; CHECK-NEXT: vmov.u16 r1, d16[1]
+; CHECK-NEXT: vmov.u16 r2, d16[2]
+; CHECK-NEXT: vmov.u16 r3, d16[3]
+; CHECK-NEXT: uxtb r0, r0
+; CHECK-NEXT: vmov.32 d16[0], r0
+; CHECK-NEXT: uxtb r0, r1
+; CHECK-NEXT: vmov.32 d16[1], r0
+; CHECK-NEXT: uxtb r0, r2
+; CHECK-NEXT: vmov.32 d17[0], r0
+; CHECK-NEXT: uxtb r0, r3
+; CHECK-NEXT: vmov.32 d17[1], r0
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: vmov r2, r3, d17
 ; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
--- a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
+++ b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
@@ -101,12 +101,10 @@
 ;
 ; CHECKSOFT-LABEL: test_vset_laneq_f16_1:
 ; CHECKSOFT: @ %bb.0: @ %entry
-; CHECKSOFT-NEXT: vmov d1, r2, r3
-; CHECKSOFT-NEXT: vldr s4, [sp]
-; CHECKSOFT-NEXT: vmov d0, r0, r1
-; CHECKSOFT-NEXT: vcvtt.f16.f32 s0, s4
-; CHECKSOFT-NEXT: vmov r2, r3, d1
-; CHECKSOFT-NEXT: vmov r0, r1, d0
+; CHECKSOFT-NEXT: vldr s0, [sp]
+; CHECKSOFT-NEXT: vmov d2, r0, r1
+; CHECKSOFT-NEXT: vcvtt.f16.f32 s4, s0
+; CHECKSOFT-NEXT: vmov r0, r1, d2
 ; CHECKSOFT-NEXT: bx lr
 entry:
   %b = fptrunc float %fb to half
@@ -126,7 +124,6 @@
 ; CHECKSOFT-NEXT: vldr s4, [sp]
 ; CHECKSOFT-NEXT: vmov d0, r0, r1
 ; CHECKSOFT-NEXT: vcvtt.f16.f32 s3, s4
-; CHECKSOFT-NEXT: vmov r0, r1, d0
 ; CHECKSOFT-NEXT: vmov r2, r3, d1
 ; CHECKSOFT-NEXT: bx lr
 entry:
diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll
--- a/llvm/test/CodeGen/ARM/vdup.ll
+++ b/llvm/test/CodeGen/ARM/vdup.ll
@@ -56,7 +56,16 @@
 define <16 x i8> @v_dupQ8(i8 %A) nounwind {
 ; CHECK-LABEL: v_dupQ8:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vdup.8 q8, r0
+; CHECK-NEXT: vmov.i32 d17, #0x0
+; CHECK-NEXT: vdup.8 d16, r0
+; CHECK-NEXT: vmov.8 d17[0], r0
+; CHECK-NEXT: vmov.8 d17[1], r0
+; CHECK-NEXT: vmov.8 d17[2], r0
+; CHECK-NEXT: vmov.8 d17[3], r0
+; CHECK-NEXT: vmov.8 d17[4], r0
+; CHECK-NEXT: vmov.8 d17[5], r0
+; CHECK-NEXT: vmov.8 d17[6], r0
+; CHECK-NEXT: vmov.8 d17[7], r0
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: vmov r2, r3, d17
 ; CHECK-NEXT: mov pc, lr
@@ -82,7 +91,12 @@
 define <8 x i16> @v_dupQ16(i16 %A) nounwind {
 ; CHECK-LABEL: v_dupQ16:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vdup.16 q8, r0
+; CHECK-NEXT: vmov.i32 d17, #0x0
+; CHECK-NEXT: vdup.16 d16, r0
+; CHECK-NEXT: vmov.16 d17[0], r0
+; CHECK-NEXT: vmov.16 d17[1], r0
+; CHECK-NEXT: vmov.16 d17[2], r0
+; CHECK-NEXT: vmov.16 d17[3], r0
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: vmov r2, r3, d17
 ; CHECK-NEXT: mov pc, lr
@@ -100,7 +114,9 @@
 define <4 x i32> @v_dupQ32(i32 %A) nounwind {
 ; CHECK-LABEL: v_dupQ32:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vdup.32 q8, r0
+; CHECK-NEXT: vdup.32 d16, r0
+; CHECK-NEXT: vmov.32 d17[0], r0
+; CHECK-NEXT: vmov.32 d17[1], r0
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: vmov r2, r3, d17
 ; CHECK-NEXT: mov pc, lr
@@ -114,9 +130,12 @@
 define <4 x float> @v_dupQfloat(float %A) nounwind {
 ; CHECK-LABEL: v_dupQfloat:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vdup.32 q8, r0
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vmov s0, r0
+; CHECK-NEXT: vmov.f32 s1, s0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov.f32 s2, s0
+; CHECK-NEXT: vmov.f32 s3, s0
+; CHECK-NEXT: vmov r2, r3, d1
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
   %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
@@ -379,7 +398,8 @@
 define <4 x i32> @tdupi(i32 %x, i32 %y) {
 ; CHECK-LABEL: tdupi:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vdup.32 q8, r0
+; CHECK-NEXT: vdup.32 d16, r0
+; CHECK-NEXT: vmov.32 d17[0], r0
 ; CHECK-NEXT: vmov.32 d17[1], r1
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: vmov r2, r3, d17
@@ -394,9 +414,11 @@
 define <4 x float> @tdupf(float %x, float %y) {
 ; CHECK-LABEL: tdupf:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: vmov s0, r0
 ; CHECK-NEXT: vmov s3, r1
+; CHECK-NEXT: vmov.f32 s1, s0
 ; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov.f32 s2, s0
 ; CHECK-NEXT: vmov r2, r3, d1
 ; CHECK-NEXT: mov pc, lr
   %1 = insertelement <4 x float> undef, float %x, i32 0
@@ -412,8 +434,11 @@
 ; CHECK-LABEL: tduplane:
 ; CHECK: @ %bb.0:
 ; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: mov r0, #255
+; CHECK-NEXT: vmov.32 r0, d16[1]
+; CHECK-NEXT: vmov d17, r2, r3
 ; CHECK-NEXT: vdup.32 q8, d16[1]
+; CHECK-NEXT: vmov.32 d17[0], r0
+; CHECK-NEXT: mov r0, #255
 ; CHECK-NEXT: vmov.32 d17[1], r0
 ; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: vmov r2, r3, d17
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
--- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll
@@ -131,12 +131,12 @@
 define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-LABEL: test_v16f32:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d3, r2, r3
-; CHECK-NEXT: vldr s0, .LCPI6_0
 ; CHECK-NEXT: vmov d2, r0, r1
+; CHECK-NEXT: vldr s0, .LCPI6_0
 ; CHECK-NEXT: mov r0, sp
 ; CHECK-NEXT: vadd.f32 s0, s4, s0
 ; CHECK-NEXT: vadd.f32 s0, s0, s5
+; CHECK-NEXT: vmov d3, r2, r3
 ; CHECK-NEXT: vadd.f32 s0, s0, s6
 ; CHECK-NEXT: vadd.f32 s0, s0, s7
 ; CHECK-NEXT: vld1.64 {d2, d3}, [r0]
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll
--- a/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmul-legalization-strict.ll
@@ -131,12 +131,12 @@
 define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-LABEL: test_v16f32:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d3, r2, r3
-; CHECK-NEXT: vldr s0, .LCPI6_0
 ; CHECK-NEXT: vmov d2, r0, r1
+; CHECK-NEXT: vldr s0, .LCPI6_0
 ; CHECK-NEXT: mov r0, sp
 ; CHECK-NEXT: vmul.f32 s0, s4, s0
 ; CHECK-NEXT: vmul.f32 s0, s0, s5
+; CHECK-NEXT: vmov d3, r2, r3
 ; CHECK-NEXT: vmul.f32 s0, s0, s6
 ; CHECK-NEXT: vmul.f32 s0, s0, s7
 ; CHECK-NEXT: vld1.64 {d2, d3}, [r0]
diff --git a/llvm/test/CodeGen/ARM/vldlane.ll b/llvm/test/CodeGen/ARM/vldlane.ll
--- a/llvm/test/CodeGen/ARM/vldlane.ll
+++ b/llvm/test/CodeGen/ARM/vldlane.ll
@@ -54,7 +54,7 @@
 
 define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: vld1laneQi8:
-;CHECK: vld1.8 {d17[1]}, [r0]
+;CHECK: vld1.8 {d{{[0-9]+}}[1]}, [r0]
   %tmp1 = load <16 x i8>, <16 x i8>* %B
   %tmp2 = load i8, i8* %A, align 8
   %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
@@ -63,7 +63,7 @@
 
 define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: vld1laneQi16:
-;CHECK: vld1.16 {d17[1]}, [r0:16]
+;CHECK: vld1.16 {d{{[0-9]+}}[1]}, [r0:16]
   %tmp1 = load <8 x i16>, <8 x i16>* %B
   %tmp2 = load i16, i16* %A, align 8
   %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
@@ -72,7 +72,7 @@
 
 define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: vld1laneQi32:
-;CHECK: vld1.32 {d17[1]}, [r0:32]
+;CHECK: vld1.32 {d{{[0-9]+}}[1]}, [r0:32]
   %tmp1 = load <4 x i32>, <4 x i32>* %B
   %tmp2 = load i32, i32* %A, align 8
   %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
@@ -81,7 +81,7 @@
 
 define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind {
 ;CHECK-LABEL: vld1laneQf:
-;CHECK: vld1.32 {d16[0]}, [r0:32]
+;CHECK: vld1.32 {d{{[0-9]+}}[0]}, [r0:32]
   %tmp1 = load <4 x float>, <4 x float>* %B
   %tmp2 = load float, float* %A
   %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
diff --git a/llvm/test/CodeGen/ARM/vzip.ll b/llvm/test/CodeGen/ARM/vzip.ll
--- a/llvm/test/CodeGen/ARM/vzip.ll
+++ b/llvm/test/CodeGen/ARM/vzip.ll
@@ -291,7 +291,7 @@
 ; CHECK-NEXT: vldr d16, [r1]
 ; CHECK-NEXT: vldr d19, [r0]
 ; CHECK-NEXT: vtrn.16 d19, d16
-; CHECK-NEXT: vmov r0, r1, d18
+; CHECK-NEXT: vmov r0, r1, d16
 ; CHECK-NEXT: vmov r2, r3, d19
 ; CHECK-NEXT: mov pc, lr
 entry:
diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll
--- a/llvm/test/CodeGen/Mips/cconv/vector.ll
+++ b/llvm/test/CodeGen/Mips/cconv/vector.ll
@@ -960,84 +960,68 @@
 ; MIPS64-NEXT: jr $ra
 ; MIPS64-NEXT: nop
 ;
-; MIPS32R5EB-LABEL: i8_8:
-; MIPS32R5EB: # %bb.0:
-; MIPS32R5EB-NEXT: addiu $sp, $sp, -48
-; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48
-; MIPS32R5EB-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT: .cfi_offset 31, -4
-; MIPS32R5EB-NEXT: .cfi_offset 30, -8
-; MIPS32R5EB-NEXT: move $fp, $sp
-; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30
-; MIPS32R5EB-NEXT: addiu $1, $zero, -16
-; MIPS32R5EB-NEXT: and $sp, $sp, $1
-; MIPS32R5EB-NEXT: sw $6, 24($sp)
-; MIPS32R5EB-NEXT: lbu $1, 25($sp)
-; MIPS32R5EB-NEXT: lbu $2, 24($sp)
-; MIPS32R5EB-NEXT: sw $7, 28($sp)
-; MIPS32R5EB-NEXT: insert.h $w0[0], $2
-; MIPS32R5EB-NEXT: insert.h $w0[1], $1
-; MIPS32R5EB-NEXT: lbu $1, 26($sp)
-; MIPS32R5EB-NEXT: sw $4, 32($sp)
-; MIPS32R5EB-NEXT: insert.h $w0[2], $1
-; MIPS32R5EB-NEXT: lbu $1, 27($sp)
-; MIPS32R5EB-NEXT: insert.h $w0[3], $1
-; MIPS32R5EB-NEXT: lbu $1, 28($sp)
-; MIPS32R5EB-NEXT: sw $5, 36($sp)
-; MIPS32R5EB-NEXT: insert.h $w0[4], $1
-; MIPS32R5EB-NEXT: lbu $1, 33($sp)
-; MIPS32R5EB-NEXT: lbu $2, 32($sp)
-; MIPS32R5EB-NEXT: insert.h $w1[0], $2
-; MIPS32R5EB-NEXT: insert.h $w1[1], $1
-; MIPS32R5EB-NEXT: lbu $1, 29($sp)
-; MIPS32R5EB-NEXT: lbu $2, 34($sp)
-; MIPS32R5EB-NEXT: insert.h $w1[2], $2
-; MIPS32R5EB-NEXT: insert.h $w0[5], $1
-; MIPS32R5EB-NEXT: lbu $1, 35($sp)
-; MIPS32R5EB-NEXT: lbu $2, 31($sp)
-; MIPS32R5EB-NEXT: lbu $3, 30($sp)
-; MIPS32R5EB-NEXT: lbu $4, 39($sp)
-; MIPS32R5EB-NEXT: insert.h $w0[6], $3
-; MIPS32R5EB-NEXT: insert.h $w0[7], $2
-; MIPS32R5EB-NEXT: insert.h $w1[3], $1
-; MIPS32R5EB-NEXT: lbu $1, 36($sp)
-; MIPS32R5EB-NEXT: insert.h $w1[4], $1
-; MIPS32R5EB-NEXT: lbu $1, 37($sp)
-; MIPS32R5EB-NEXT: insert.h $w1[5], $1
-; MIPS32R5EB-NEXT: lbu $1, 38($sp)
-; MIPS32R5EB-NEXT: insert.h $w1[6], $1
-; MIPS32R5EB-NEXT: insert.h $w1[7], $4
-; MIPS32R5EB-NEXT: addv.h $w0, $w1, $w0
-; MIPS32R5EB-NEXT: copy_s.h $1, $w0[0]
-; MIPS32R5EB-NEXT: copy_s.h $2, $w0[1]
-; MIPS32R5EB-NEXT: copy_s.h $3, $w0[2]
-; MIPS32R5EB-NEXT: copy_s.h $4, $w0[3]
-; MIPS32R5EB-NEXT: copy_s.h $5, $w0[4]
-; MIPS32R5EB-NEXT: copy_s.h $6, $w0[5]
-; MIPS32R5EB-NEXT: copy_s.h $7, $w0[6]
-; MIPS32R5EB-NEXT: copy_s.h $8, $w0[7]
-; MIPS32R5EB-NEXT: sb $8, 23($sp)
-; MIPS32R5EB-NEXT: sb $7, 22($sp)
-; MIPS32R5EB-NEXT: sb $6, 21($sp)
-; MIPS32R5EB-NEXT: sb $5, 20($sp)
-; MIPS32R5EB-NEXT: sb $4, 19($sp)
-; MIPS32R5EB-NEXT: sb $3, 18($sp)
-; MIPS32R5EB-NEXT: sb $2, 17($sp)
-; MIPS32R5EB-NEXT: sb $1, 16($sp)
-; MIPS32R5EB-NEXT: lw $1, 20($sp)
-; MIPS32R5EB-NEXT: sw $1, 12($sp)
-; MIPS32R5EB-NEXT: lw $1, 16($sp)
-; MIPS32R5EB-NEXT: sw $1, 4($sp)
-; MIPS32R5EB-NEXT: ld.w $w0, 0($sp)
-; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3]
-; MIPS32R5EB-NEXT: move $sp, $fp
-; MIPS32R5EB-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: addiu $sp, $sp, 48
-; MIPS32R5EB-NEXT: jr $ra
-; MIPS32R5EB-NEXT: nop
+; MIPS32R5-LABEL: i8_8:
+; MIPS32R5: # %bb.0:
+; MIPS32R5-NEXT: addiu $sp, $sp, -24
+; MIPS32R5-NEXT: .cfi_def_cfa_offset 24
+; MIPS32R5-NEXT: sw $6, 8($sp)
+; MIPS32R5-NEXT: lbu $1, 9($sp)
+; MIPS32R5-NEXT: lbu $2, 8($sp)
+; MIPS32R5-NEXT: sw $7, 12($sp)
+; MIPS32R5-NEXT: insert.h $w0[0], $2
+; MIPS32R5-NEXT: insert.h $w0[1], $1
+; MIPS32R5-NEXT: lbu $1, 10($sp)
+; MIPS32R5-NEXT: sw $4, 16($sp)
+; MIPS32R5-NEXT: insert.h $w0[2], $1
+; MIPS32R5-NEXT: lbu $1, 11($sp)
+; MIPS32R5-NEXT: insert.h $w0[3], $1
+; MIPS32R5-NEXT: lbu $1, 12($sp)
+; MIPS32R5-NEXT: sw $5, 20($sp)
+; MIPS32R5-NEXT: insert.h $w0[4], $1
+; MIPS32R5-NEXT: lbu $1, 17($sp)
+; MIPS32R5-NEXT: lbu $2, 16($sp)
+; MIPS32R5-NEXT: insert.h $w1[0], $2
+; MIPS32R5-NEXT: insert.h $w1[1], $1
+; MIPS32R5-NEXT: lbu $1, 13($sp)
+; MIPS32R5-NEXT: lbu $2, 18($sp)
+; MIPS32R5-NEXT: insert.h $w1[2], $2
+; MIPS32R5-NEXT: insert.h $w0[5], $1
+; MIPS32R5-NEXT: lbu $1, 19($sp)
+; MIPS32R5-NEXT: lbu $2, 15($sp)
+; MIPS32R5-NEXT: lbu $3, 14($sp)
+; MIPS32R5-NEXT: lbu $4, 23($sp)
+; MIPS32R5-NEXT: insert.h $w0[6], $3
+; MIPS32R5-NEXT: insert.h $w0[7], $2
+; MIPS32R5-NEXT: insert.h $w1[3], $1
+; MIPS32R5-NEXT: lbu $1, 20($sp)
+; MIPS32R5-NEXT: insert.h $w1[4], $1
+; MIPS32R5-NEXT: lbu $1, 21($sp)
+; MIPS32R5-NEXT: insert.h $w1[5], $1
+; MIPS32R5-NEXT: lbu $1, 22($sp)
+; MIPS32R5-NEXT: insert.h $w1[6], $1
+; MIPS32R5-NEXT: insert.h $w1[7], $4
+; MIPS32R5-NEXT: addv.h $w0, $w1, $w0
+; MIPS32R5-NEXT: copy_s.h $1, $w0[4]
+; MIPS32R5-NEXT: copy_s.h $2, $w0[5]
+; MIPS32R5-NEXT: copy_s.h $3, $w0[6]
+; MIPS32R5-NEXT: copy_s.h $4, $w0[7]
+; MIPS32R5-NEXT: copy_s.h $5, $w0[0]
+; MIPS32R5-NEXT: copy_s.h $6, $w0[1]
+; MIPS32R5-NEXT: copy_s.h $7, $w0[2]
+; MIPS32R5-NEXT: copy_s.h $8, $w0[3]
+; MIPS32R5-NEXT: sb $8, 3($sp)
+; MIPS32R5-NEXT: sb $7, 2($sp)
+; MIPS32R5-NEXT: sb $6, 1($sp)
+; MIPS32R5-NEXT: sb $5, 0($sp)
+; MIPS32R5-NEXT: sb $4, 7($sp)
+; MIPS32R5-NEXT: sb $3, 6($sp)
+; MIPS32R5-NEXT: sb $2, 5($sp)
+; MIPS32R5-NEXT: sb $1, 4($sp)
+; MIPS32R5-NEXT: lw $2, 0($sp)
+; MIPS32R5-NEXT: lw $3, 4($sp)
+; MIPS32R5-NEXT: addiu $sp, $sp, 24
+; MIPS32R5-NEXT: jr $ra
+; MIPS32R5-NEXT: nop
 ;
 ; MIPS64R5-LABEL: i8_8:
 ; MIPS64R5: # %bb.0:
@@ -1098,85 +1082,6 @@
 ; MIPS64R5-NEXT: daddiu $sp, $sp, 32
 ; MIPS64R5-NEXT: jr $ra
 ; MIPS64R5-NEXT: nop
-;
-; MIPS32R5EL-LABEL: i8_8:
-; MIPS32R5EL: # %bb.0:
-; MIPS32R5EL-NEXT: addiu $sp, $sp, -48
-; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48
-; MIPS32R5EL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT: .cfi_offset 31, -4
-; MIPS32R5EL-NEXT: .cfi_offset 30, -8
-; MIPS32R5EL-NEXT: move $fp, $sp
-; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30
-; MIPS32R5EL-NEXT: addiu $1, $zero, -16
-; MIPS32R5EL-NEXT: and $sp, $sp, $1
-; MIPS32R5EL-NEXT: sw $6, 24($sp)
-; MIPS32R5EL-NEXT: lbu $1, 25($sp)
-; MIPS32R5EL-NEXT: lbu $2, 24($sp)
-; MIPS32R5EL-NEXT: sw $7, 28($sp)
-; MIPS32R5EL-NEXT: insert.h $w0[0], $2
-; MIPS32R5EL-NEXT: insert.h $w0[1], $1
-; MIPS32R5EL-NEXT: lbu $1, 26($sp)
-; MIPS32R5EL-NEXT: sw $4, 32($sp)
-; MIPS32R5EL-NEXT: insert.h $w0[2], $1
-; MIPS32R5EL-NEXT: lbu $1, 27($sp)
-; MIPS32R5EL-NEXT: insert.h $w0[3], $1
-; MIPS32R5EL-NEXT: lbu $1, 28($sp)
-; MIPS32R5EL-NEXT: sw $5, 36($sp)
-; MIPS32R5EL-NEXT: insert.h $w0[4], $1
-; MIPS32R5EL-NEXT: lbu $1, 33($sp)
-; MIPS32R5EL-NEXT: lbu $2, 32($sp)
-; MIPS32R5EL-NEXT: insert.h $w1[0], $2
-; MIPS32R5EL-NEXT: insert.h $w1[1], $1
-; MIPS32R5EL-NEXT: lbu $1, 29($sp)
-; MIPS32R5EL-NEXT: lbu $2, 34($sp)
-; MIPS32R5EL-NEXT: insert.h $w1[2], $2
-; MIPS32R5EL-NEXT: insert.h $w0[5], $1
-; MIPS32R5EL-NEXT: lbu $1, 35($sp)
-; MIPS32R5EL-NEXT: lbu $2, 31($sp)
-; MIPS32R5EL-NEXT: lbu $3, 30($sp)
-; MIPS32R5EL-NEXT: lbu $4, 39($sp)
-; MIPS32R5EL-NEXT: insert.h $w0[6], $3
-; MIPS32R5EL-NEXT: insert.h $w0[7], $2
-; MIPS32R5EL-NEXT: insert.h $w1[3], $1
-; MIPS32R5EL-NEXT: lbu $1, 36($sp)
-; MIPS32R5EL-NEXT: insert.h $w1[4], $1
-; MIPS32R5EL-NEXT: lbu $1, 37($sp)
-; MIPS32R5EL-NEXT: insert.h $w1[5], $1
-; MIPS32R5EL-NEXT: lbu $1, 38($sp)
-; MIPS32R5EL-NEXT: insert.h $w1[6], $1
-; MIPS32R5EL-NEXT: insert.h $w1[7], $4
-; MIPS32R5EL-NEXT: addv.h $w0, $w1, $w0
-; MIPS32R5EL-NEXT: copy_s.h $1, $w0[0]
-; MIPS32R5EL-NEXT: copy_s.h $2, $w0[1]
-; MIPS32R5EL-NEXT: copy_s.h $3, $w0[2]
-; MIPS32R5EL-NEXT: copy_s.h $4, $w0[3]
-; MIPS32R5EL-NEXT: copy_s.h $5, $w0[4]
-; MIPS32R5EL-NEXT: copy_s.h $6, $w0[5]
-; MIPS32R5EL-NEXT: copy_s.h $7, $w0[6]
-; MIPS32R5EL-NEXT: copy_s.h $8, $w0[7]
-; MIPS32R5EL-NEXT: sb $8, 23($sp)
-; MIPS32R5EL-NEXT: sb $7, 22($sp)
-; MIPS32R5EL-NEXT: sb $6, 21($sp)
-; MIPS32R5EL-NEXT: sb $5, 20($sp)
-; MIPS32R5EL-NEXT: sb $4, 19($sp)
-; MIPS32R5EL-NEXT: sb $3, 18($sp)
-; MIPS32R5EL-NEXT: sb $2, 17($sp)
-; MIPS32R5EL-NEXT: sb $1, 16($sp)
-; MIPS32R5EL-NEXT: lw $1, 20($sp)
-; MIPS32R5EL-NEXT: sw $1, 8($sp)
-; MIPS32R5EL-NEXT: lw $1, 16($sp)
-; MIPS32R5EL-NEXT: sw $1, 0($sp)
-; MIPS32R5EL-NEXT: ld.w $w0, 0($sp)
-; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0]
-; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2]
-; MIPS32R5EL-NEXT: move $sp, $fp
-; MIPS32R5EL-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: addiu $sp, $sp, 48
-; MIPS32R5EL-NEXT: jr $ra
-; MIPS32R5EL-NEXT: nop
   %1 = add <8 x i8> %a, %b
   ret <8 x i8> %1
 }
@@ -1642,60 +1547,44 @@
 ; MIPS64-NEXT: jr $ra
 ; MIPS64-NEXT: nop
 ;
-; MIPS32R5EB-LABEL: i16_4:
-; MIPS32R5EB: # %bb.0:
-; MIPS32R5EB-NEXT: addiu $sp, $sp, -48
-; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48
-; MIPS32R5EB-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT: .cfi_offset 31, -4
-; MIPS32R5EB-NEXT: .cfi_offset 30, -8
-; MIPS32R5EB-NEXT: move $fp, $sp
-; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30
-; MIPS32R5EB-NEXT: addiu $1, $zero, -16
-; MIPS32R5EB-NEXT: and $sp, $sp, $1
-; MIPS32R5EB-NEXT: sw $6, 24($sp)
-; MIPS32R5EB-NEXT: sw $7, 28($sp)
-; MIPS32R5EB-NEXT: lhu $1, 26($sp)
-; MIPS32R5EB-NEXT: lhu $2, 24($sp)
-; MIPS32R5EB-NEXT: sw $4, 32($sp)
-; MIPS32R5EB-NEXT: insert.w $w0[0], $2
-; MIPS32R5EB-NEXT: insert.w $w0[1], $1
-; MIPS32R5EB-NEXT: lhu $1, 28($sp)
-; MIPS32R5EB-NEXT: sw $5, 36($sp)
-; MIPS32R5EB-NEXT: insert.w $w0[2], $1
-; MIPS32R5EB-NEXT: lhu $1, 30($sp)
-; MIPS32R5EB-NEXT: insert.w $w0[3], $1
-; MIPS32R5EB-NEXT: lhu $1, 34($sp)
-; MIPS32R5EB-NEXT: lhu $2, 32($sp)
-; MIPS32R5EB-NEXT: insert.w $w1[0], $2
-; MIPS32R5EB-NEXT: insert.w $w1[1], $1
-; MIPS32R5EB-NEXT: lhu $1, 36($sp)
-; MIPS32R5EB-NEXT: insert.w $w1[2], $1
-; MIPS32R5EB-NEXT: lhu $1, 38($sp)
-; MIPS32R5EB-NEXT: insert.w $w1[3], $1
-; MIPS32R5EB-NEXT: addv.w $w0, $w1, $w0
-; MIPS32R5EB-NEXT: copy_s.w $1, $w0[0]
-; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT: copy_s.w $3, $w0[2]
-; MIPS32R5EB-NEXT: copy_s.w $4, $w0[3]
-; MIPS32R5EB-NEXT: sh $4, 22($sp)
-; MIPS32R5EB-NEXT: sh $3, 20($sp)
-; MIPS32R5EB-NEXT: sh $2, 18($sp)
-; MIPS32R5EB-NEXT: sh $1, 16($sp)
-; MIPS32R5EB-NEXT: lw $1, 20($sp)
-; MIPS32R5EB-NEXT: sw $1, 12($sp)
-; MIPS32R5EB-NEXT: lw $1, 16($sp)
-; MIPS32R5EB-NEXT: sw $1, 4($sp)
-; MIPS32R5EB-NEXT: ld.w $w0, 0($sp)
-; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3]
-; MIPS32R5EB-NEXT: move $sp, $fp
-; MIPS32R5EB-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: addiu $sp, $sp, 48
-; MIPS32R5EB-NEXT: jr $ra
-; MIPS32R5EB-NEXT: nop
+; MIPS32R5-LABEL: i16_4:
+; MIPS32R5: # %bb.0:
+; MIPS32R5-NEXT: addiu $sp, $sp, -24
+; MIPS32R5-NEXT: .cfi_def_cfa_offset 24
+; MIPS32R5-NEXT: sw $6, 8($sp)
+; MIPS32R5-NEXT: sw $7, 12($sp)
+; MIPS32R5-NEXT: lhu $1, 10($sp)
+; MIPS32R5-NEXT: lhu $2, 8($sp)
+; MIPS32R5-NEXT: sw $4, 16($sp)
+; MIPS32R5-NEXT: insert.w $w0[0], $2
+; MIPS32R5-NEXT: insert.w $w0[1], $1
+; MIPS32R5-NEXT: lhu $1, 12($sp)
+; MIPS32R5-NEXT: sw $5, 20($sp)
+; MIPS32R5-NEXT: insert.w $w0[2], $1
+; MIPS32R5-NEXT: lhu $1, 14($sp)
+; MIPS32R5-NEXT: insert.w $w0[3], $1
+; MIPS32R5-NEXT: lhu $1, 18($sp)
+; MIPS32R5-NEXT: lhu $2, 16($sp)
+; MIPS32R5-NEXT: insert.w $w1[0], $2
+; MIPS32R5-NEXT: insert.w $w1[1], $1
+; MIPS32R5-NEXT: lhu $1, 20($sp)
+; MIPS32R5-NEXT: insert.w $w1[2], $1
+; MIPS32R5-NEXT: lhu $1, 22($sp)
+; MIPS32R5-NEXT: insert.w $w1[3], $1
+; MIPS32R5-NEXT: addv.w $w0, $w1, $w0
+; MIPS32R5-NEXT: copy_s.w $1, $w0[2]
+; MIPS32R5-NEXT: copy_s.w $2, $w0[3]
+; MIPS32R5-NEXT: copy_s.w $3, $w0[0]
+; MIPS32R5-NEXT: copy_s.w $4, $w0[1]
+; MIPS32R5-NEXT: sh $4, 2($sp)
+; MIPS32R5-NEXT: sh $3, 0($sp)
+; MIPS32R5-NEXT: sh $2, 6($sp)
+; MIPS32R5-NEXT: sh $1, 4($sp)
+; MIPS32R5-NEXT: lw $2, 0($sp)
+; MIPS32R5-NEXT: lw $3, 4($sp)
+; MIPS32R5-NEXT: addiu $sp, $sp, 24
+; MIPS32R5-NEXT: jr $ra
+; MIPS32R5-NEXT: nop
 ;
 ; MIPS64R5-LABEL: i16_4:
 ; MIPS64R5: # %bb.0:
@@ -1732,61 +1621,6 @@
 ; MIPS64R5-NEXT: daddiu $sp, $sp, 32
 ; MIPS64R5-NEXT: jr $ra
 ; MIPS64R5-NEXT: nop
-;
-; MIPS32R5EL-LABEL: i16_4:
-; MIPS32R5EL: # %bb.0:
-; MIPS32R5EL-NEXT: addiu $sp, $sp, -48
-; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48
-; MIPS32R5EL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT: .cfi_offset 31, -4
-; MIPS32R5EL-NEXT: .cfi_offset 30, -8
-; MIPS32R5EL-NEXT: move $fp, $sp
-; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30
-; MIPS32R5EL-NEXT: addiu $1, $zero, -16
-; MIPS32R5EL-NEXT: and $sp, $sp, $1
-; MIPS32R5EL-NEXT: sw $6, 24($sp)
-; MIPS32R5EL-NEXT: sw $7, 28($sp)
-; MIPS32R5EL-NEXT: lhu $1, 26($sp)
-; MIPS32R5EL-NEXT: lhu $2, 24($sp)
-; MIPS32R5EL-NEXT: sw $4, 32($sp)
-; MIPS32R5EL-NEXT: insert.w $w0[0], $2
-; MIPS32R5EL-NEXT: insert.w $w0[1], $1
-; MIPS32R5EL-NEXT: lhu $1, 28($sp)
-; MIPS32R5EL-NEXT: sw $5, 36($sp)
-; MIPS32R5EL-NEXT: insert.w $w0[2], $1
-; MIPS32R5EL-NEXT: lhu $1, 30($sp)
-; MIPS32R5EL-NEXT: insert.w $w0[3], $1
-; MIPS32R5EL-NEXT: lhu $1, 34($sp)
-; MIPS32R5EL-NEXT: lhu $2, 32($sp)
-; MIPS32R5EL-NEXT: insert.w $w1[0], $2
-; MIPS32R5EL-NEXT: insert.w $w1[1], $1
-; MIPS32R5EL-NEXT: lhu $1, 36($sp)
-; MIPS32R5EL-NEXT: insert.w $w1[2], $1
-; MIPS32R5EL-NEXT: lhu $1, 38($sp)
-; MIPS32R5EL-NEXT: insert.w $w1[3], $1
-; MIPS32R5EL-NEXT: addv.w $w0, $w1, $w0
-; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0]
-; MIPS32R5EL-NEXT: copy_s.w $2, $w0[1]
-; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2]
-; MIPS32R5EL-NEXT: copy_s.w $4, $w0[3]
-; MIPS32R5EL-NEXT: sh $4, 22($sp)
-; MIPS32R5EL-NEXT: sh $3, 20($sp)
-; MIPS32R5EL-NEXT: sh $2, 18($sp)
-; MIPS32R5EL-NEXT: sh $1, 16($sp)
-; MIPS32R5EL-NEXT: lw $1, 20($sp)
-; MIPS32R5EL-NEXT: sw $1, 8($sp)
-; MIPS32R5EL-NEXT: lw $1, 16($sp)
-; MIPS32R5EL-NEXT: sw $1, 0($sp)
-; MIPS32R5EL-NEXT: ld.w $w0, 0($sp)
-; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0]
-; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2]
-; MIPS32R5EL-NEXT: move $sp, $fp
-; MIPS32R5EL-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: addiu $sp, $sp, 48
-; MIPS32R5EL-NEXT: jr $ra
-; MIPS32R5EL-NEXT: nop
   %1 = add <4 x i16> %a, %b
   ret <4 x i16> %1
 }
@@ -2829,33 +2663,14 @@
 ; MIPS64-NEXT: jr $ra
 ; MIPS64-NEXT: nop
 ;
-; MIPS32R5EB-LABEL: ret_8_i8:
-; MIPS32R5EB: # %bb.0:
-; MIPS32R5EB-NEXT: addiu $sp, $sp, -32
-; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT: .cfi_offset 31, -4
-; MIPS32R5EB-NEXT: .cfi_offset 30, -8
-; MIPS32R5EB-NEXT: move $fp, $sp
-; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30
-; MIPS32R5EB-NEXT: addiu $1, $zero, -16
-; MIPS32R5EB-NEXT: and $sp, $sp, $1
-; MIPS32R5EB-NEXT: lui $1, %hi(gv8i8)
-; MIPS32R5EB-NEXT: lw $2, %lo(gv8i8)($1)
-; MIPS32R5EB-NEXT: sw $2, 4($sp)
-; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv8i8)
-; MIPS32R5EB-NEXT: lw $1, 4($1)
-; MIPS32R5EB-NEXT: sw $1, 12($sp)
-; MIPS32R5EB-NEXT: ld.w $w0, 0($sp)
-; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3]
-; MIPS32R5EB-NEXT: move $sp, $fp
-; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: addiu $sp, $sp, 32
-; MIPS32R5EB-NEXT: jr $ra
-; MIPS32R5EB-NEXT: nop
+; MIPS32R5-LABEL: ret_8_i8:
+; MIPS32R5: # %bb.0:
+; MIPS32R5-NEXT: lui $1, %hi(gv8i8)
+; MIPS32R5-NEXT: lw $2, %lo(gv8i8)($1)
+; MIPS32R5-NEXT: addiu $1, $1, %lo(gv8i8)
+; MIPS32R5-NEXT: lw $3, 4($1)
+; MIPS32R5-NEXT: jr $ra
+; MIPS32R5-NEXT: nop
 ;
 ; MIPS64R5-LABEL: ret_8_i8:
 ; MIPS64R5: # %bb.0:
@@ -2866,34 +2681,6 @@
 ; MIPS64R5-NEXT: ld $2, 0($1)
 ; MIPS64R5-NEXT: jr $ra
 ; MIPS64R5-NEXT: nop
-;
-; MIPS32R5EL-LABEL: ret_8_i8:
-; MIPS32R5EL: # %bb.0:
-; MIPS32R5EL-NEXT: addiu $sp, $sp, -32
-; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT: .cfi_offset 31, -4
-; MIPS32R5EL-NEXT: .cfi_offset 30, -8
-; MIPS32R5EL-NEXT: move $fp, $sp
-; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30
-; MIPS32R5EL-NEXT: addiu $1, $zero, -16
-; MIPS32R5EL-NEXT: and $sp, $sp, $1
-; MIPS32R5EL-NEXT: lui $1, %hi(gv8i8)
-; MIPS32R5EL-NEXT: lw $2, %lo(gv8i8)($1)
-; MIPS32R5EL-NEXT: sw $2, 0($sp)
-; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv8i8)
-; MIPS32R5EL-NEXT: lw $1, 4($1)
-; MIPS32R5EL-NEXT: sw $1, 8($sp)
-; MIPS32R5EL-NEXT: ld.w $w0, 0($sp)
-; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0]
-; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2]
-; MIPS32R5EL-NEXT: move $sp, $fp
-; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: addiu $sp, $sp, 32
-; MIPS32R5EL-NEXT: jr $ra
-; MIPS32R5EL-NEXT: nop
   %1 = load <8 x i8>, <8 x i8> * @gv8i8
   ret <8 x i8> %1
 }
@@ -3006,33 +2793,14 @@
 ; MIPS64-NEXT: jr $ra
 ; MIPS64-NEXT: nop
 ;
-; MIPS32R5EB-LABEL: ret_4_i16:
-; MIPS32R5EB: # %bb.0:
-; MIPS32R5EB-NEXT: addiu $sp, $sp, -32
-; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT: .cfi_offset 31, -4
-; MIPS32R5EB-NEXT: .cfi_offset 30, -8
-; MIPS32R5EB-NEXT: move $fp, $sp
-; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30
-; MIPS32R5EB-NEXT: addiu $1, $zero, -16
-; MIPS32R5EB-NEXT: and $sp, $sp, $1
-; MIPS32R5EB-NEXT: lui $1, %hi(gv4i16)
-; MIPS32R5EB-NEXT: lw $2, %lo(gv4i16)($1)
-; MIPS32R5EB-NEXT: sw $2, 4($sp)
-; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv4i16)
-; MIPS32R5EB-NEXT: lw $1, 4($1)
-; MIPS32R5EB-NEXT: sw $1, 12($sp)
-; MIPS32R5EB-NEXT: ld.w $w0, 0($sp)
-; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3]
-; MIPS32R5EB-NEXT: move $sp, $fp
-; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: addiu $sp, $sp, 32
-; MIPS32R5EB-NEXT: jr $ra
-; MIPS32R5EB-NEXT: nop
+; MIPS32R5-LABEL: ret_4_i16:
+; MIPS32R5: # %bb.0:
+; MIPS32R5-NEXT: lui $1, %hi(gv4i16)
+; MIPS32R5-NEXT: lw $2, %lo(gv4i16)($1)
+; MIPS32R5-NEXT: addiu $1, $1, %lo(gv4i16)
+; MIPS32R5-NEXT: lw $3, 4($1)
+; MIPS32R5-NEXT: jr $ra
+; MIPS32R5-NEXT: nop
 ;
 ; MIPS64R5-LABEL: ret_4_i16:
 ; MIPS64R5: # %bb.0:
@@ -3043,34 +2811,6 @@
 ; MIPS64R5-NEXT: ld $2, 0($1)
 ; MIPS64R5-NEXT: jr $ra
 ; MIPS64R5-NEXT: nop
-;
-; MIPS32R5EL-LABEL: ret_4_i16:
-; MIPS32R5EL: # %bb.0:
-; MIPS32R5EL-NEXT: addiu $sp, $sp, -32
-; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT: .cfi_offset 31, -4
-; MIPS32R5EL-NEXT: .cfi_offset 30, -8
-; MIPS32R5EL-NEXT: move $fp, $sp
-; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30
-; MIPS32R5EL-NEXT: addiu $1, $zero, -16
-; MIPS32R5EL-NEXT: and $sp, $sp, $1
-; MIPS32R5EL-NEXT: lui $1, %hi(gv4i16)
-; MIPS32R5EL-NEXT: lw $2, %lo(gv4i16)($1)
-; MIPS32R5EL-NEXT: sw $2, 0($sp)
-; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv4i16)
-; MIPS32R5EL-NEXT: lw $1, 4($1)
-; MIPS32R5EL-NEXT: sw $1, 8($sp)
-; MIPS32R5EL-NEXT: ld.w $w0, 0($sp)
-; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0]
-; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2]
-; MIPS32R5EL-NEXT: move $sp, $fp
-; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: addiu $sp, $sp, 32
-; MIPS32R5EL-NEXT: jr $ra
-; MIPS32R5EL-NEXT: nop
   %1 = load <4 x i16>, <4 x i16> * @gv4i16
   ret <4 x i16> %1
 }
@@ -3145,33 +2885,14 @@
 ; MIPS64-NEXT: jr $ra
 ; MIPS64-NEXT: nop
 ;
-; MIPS32R5EB-LABEL: ret_2_i32:
-; MIPS32R5EB: # %bb.0:
-; MIPS32R5EB-NEXT: addiu $sp, $sp, -32
-; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EB-NEXT: .cfi_offset 31, -4
-; MIPS32R5EB-NEXT: .cfi_offset 30, -8
-; MIPS32R5EB-NEXT: move $fp, $sp
-; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30
-; MIPS32R5EB-NEXT: addiu $1, $zero, -16
-; MIPS32R5EB-NEXT: and $sp, $sp, $1
-; MIPS32R5EB-NEXT: lui $1, %hi(gv2i32)
-; MIPS32R5EB-NEXT: lw $2, %lo(gv2i32)($1)
-; MIPS32R5EB-NEXT: sw $2, 4($sp)
-; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv2i32)
-; MIPS32R5EB-NEXT: lw $1, 4($1)
-; MIPS32R5EB-NEXT: sw $1, 12($sp)
-; MIPS32R5EB-NEXT: ld.w $w0, 0($sp)
-; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3]
-; MIPS32R5EB-NEXT: move $sp, $fp
-; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EB-NEXT: addiu $sp, $sp, 32
-; MIPS32R5EB-NEXT: jr $ra
-; MIPS32R5EB-NEXT: nop
+; MIPS32R5-LABEL: ret_2_i32:
+; MIPS32R5: # %bb.0:
+; MIPS32R5-NEXT: lui $1, %hi(gv2i32)
+; MIPS32R5-NEXT: lw $2, %lo(gv2i32)($1)
+; MIPS32R5-NEXT: addiu $1, $1, %lo(gv2i32)
+; MIPS32R5-NEXT: lw $3, 4($1)
+; MIPS32R5-NEXT: jr $ra
+; MIPS32R5-NEXT: nop
 ;
 ; MIPS64R5-LABEL: ret_2_i32:
 ; MIPS64R5: # %bb.0:
@@ -3182,34 +2903,6 @@
 ; MIPS64R5-NEXT: ld $2, 0($1)
 ; MIPS64R5-NEXT: jr $ra
 ; MIPS64R5-NEXT: nop
-;
-; MIPS32R5EL-LABEL: ret_2_i32:
-; MIPS32R5EL: # %bb.0:
-; MIPS32R5EL-NEXT: addiu $sp, $sp, -32
-; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32
-; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill
-; MIPS32R5EL-NEXT: .cfi_offset 31, -4
-; MIPS32R5EL-NEXT: .cfi_offset 30, -8
-; MIPS32R5EL-NEXT: move $fp, $sp
-; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30
-; MIPS32R5EL-NEXT: addiu $1, $zero, -16
-; MIPS32R5EL-NEXT: and $sp, $sp, $1
-; MIPS32R5EL-NEXT: lui $1, %hi(gv2i32)
-; MIPS32R5EL-NEXT: lw $2, %lo(gv2i32)($1)
-; MIPS32R5EL-NEXT: sw $2, 0($sp)
-; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv2i32)
-; MIPS32R5EL-NEXT: lw $1, 4($1)
-; MIPS32R5EL-NEXT: sw $1, 8($sp)
-; MIPS32R5EL-NEXT: ld.w $w0, 0($sp)
-; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0]
-; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2]
-; MIPS32R5EL-NEXT: move $sp, $fp
-; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload
-; MIPS32R5EL-NEXT: addiu $sp, $sp, 32
-; MIPS32R5EL-NEXT: jr $ra
-; MIPS32R5EL-NEXT: nop
   %1 = load <2 x i32>, <2 x i32> * @gv2i32
   ret <2 x i32> %1
 }
@@ -4170,77 +3863,81 @@
 ; MIPS64EB-NEXT: jr $ra
 ; MIPS64EB-NEXT: nop
 ;
-; MIPS32R5-LABEL: calli8_16:
-; MIPS32R5: # %bb.0: # %entry
-; MIPS32R5-NEXT: addiu $sp, $sp, -40
-; MIPS32R5-NEXT: .cfi_def_cfa_offset 40
-; MIPS32R5-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
-; MIPS32R5-NEXT: .cfi_offset 31, -4
-; MIPS32R5-NEXT: lui $1, %hi($CPI30_0)
-; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI30_0)
-; MIPS32R5-NEXT: ld.w $w0, 0($1)
-; MIPS32R5-NEXT: copy_s.w $4, $w0[0]
-; MIPS32R5-NEXT: copy_s.w $5, $w0[1]
-; MIPS32R5-NEXT: copy_s.w $6, $w0[2]
-; MIPS32R5-NEXT: copy_s.w $7, $w0[3]
-; MIPS32R5-NEXT: lui $1, %hi($CPI30_1)
-; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI30_1)
-; MIPS32R5-NEXT: ld.w $w0, 0($1)
-; MIPS32R5-NEXT: copy_s.w $1, $w0[0]
-; MIPS32R5-NEXT: copy_s.w $2, $w0[1]
-; MIPS32R5-NEXT: copy_s.w $3, $w0[2]
-; MIPS32R5-NEXT: copy_s.w $8, $w0[3]
-; MIPS32R5-NEXT: sw $8, 28($sp)
-; MIPS32R5-NEXT: sw $3, 24($sp)
-; MIPS32R5-NEXT: sw $2, 20($sp)
-; MIPS32R5-NEXT: sw $1, 16($sp)
-; MIPS32R5-NEXT: jal i8_16
-; MIPS32R5-NEXT: nop
-; MIPS32R5-NEXT: lui $1, %hi(gv16i8)
-; MIPS32R5-NEXT: insert.w $w0[0], $2
-; MIPS32R5-NEXT: insert.w $w0[1], $3
-; MIPS32R5-NEXT: addiu $1, $1, %lo(gv16i8)
-; MIPS32R5-NEXT: insert.w $w0[2], $4
-; MIPS32R5-NEXT: insert.w $w0[3], $5
-; MIPS32R5-NEXT: st.w $w0, 0($1)
-; MIPS32R5-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
-; MIPS32R5-NEXT: addiu $sp, $sp, 40
-; MIPS32R5-NEXT: jr $ra
-; MIPS32R5-NEXT: nop
+; MIPS32R5EB-LABEL: calli8_16:
+; MIPS32R5EB: # %bb.0: # %entry
+; MIPS32R5EB-NEXT: addiu $sp, $sp, -40
+; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40
+; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT: .cfi_offset 31, -4
+; MIPS32R5EB-NEXT: lui $1, 3080
+; MIPS32R5EB-NEXT: ori $1, $1, 2314
+; MIPS32R5EB-NEXT: lui $2, 1801
+; MIPS32R5EB-NEXT: sw $1, 28($sp)
+; MIPS32R5EB-NEXT: ori $1, $2, 1801
+; MIPS32R5EB-NEXT: sw $1, 24($sp)
+; MIPS32R5EB-NEXT: sw $1, 20($sp)
+; MIPS32R5EB-NEXT: sw $1, 16($sp)
+; MIPS32R5EB-NEXT: lui $1, 1543
+; MIPS32R5EB-NEXT: ori $4, $1, 1543
+; MIPS32R5EB-NEXT: ori $7, $1, 2314
+; MIPS32R5EB-NEXT: move $5, $4
+; MIPS32R5EB-NEXT: move $6, $4
+; MIPS32R5EB-NEXT: jal i8_16
+; MIPS32R5EB-NEXT: nop
+; MIPS32R5EB-NEXT: insert.w $w0[0], $2
+; MIPS32R5EB-NEXT: insert.w $w0[1], $3
+; MIPS32R5EB-NEXT: insert.w $w0[2], $4
+; MIPS32R5EB-NEXT: lui $1, %hi(gv16i8)
+; MIPS32R5EB-NEXT: insert.w $w0[3], $5
+; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv16i8)
+; MIPS32R5EB-NEXT: st.w $w0, 0($1)
+; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT: addiu $sp, $sp, 40
+; MIPS32R5EB-NEXT: jr $ra
+; MIPS32R5EB-NEXT: nop
 ;
-; MIPS64R5-LABEL: calli8_16:
-; MIPS64R5: # %bb.0: # %entry
-; MIPS64R5-NEXT: daddiu $sp, $sp, -16
-; MIPS64R5-NEXT: .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
-; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
-; MIPS64R5-NEXT: .cfi_offset 31, -8
-; MIPS64R5-NEXT: .cfi_offset 28, -16
-; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16)))
-; MIPS64R5-NEXT: daddu $1, $1, $25
-; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16)))
-; MIPS64R5-NEXT: ld $1, %got_page(.LCPI30_0)($gp)
-; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI30_0)
-; MIPS64R5-NEXT: ld.d $w0, 0($1)
-; MIPS64R5-NEXT: copy_s.d $4, $w0[0]
-; MIPS64R5-NEXT: copy_s.d $5, $w0[1]
-; MIPS64R5-NEXT: ld $1, %got_page(.LCPI30_1)($gp)
-; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI30_1)
-; MIPS64R5-NEXT: ld.d $w0, 0($1)
-; MIPS64R5-NEXT: copy_s.d $6, $w0[0]
-; MIPS64R5-NEXT: copy_s.d $7, $w0[1]
-; MIPS64R5-NEXT: ld $25, %call16(i8_16)($gp)
-; MIPS64R5-NEXT: jalr $25
-; MIPS64R5-NEXT: nop
-; MIPS64R5-NEXT: insert.d $w0[0], $2
-; MIPS64R5-NEXT: insert.d $w0[1], $3
-; MIPS64R5-NEXT: ld $1, %got_disp(gv16i8)($gp)
-; MIPS64R5-NEXT: st.d $w0, 0($1)
-; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
-; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
-; MIPS64R5-NEXT: daddiu $sp, $sp, 16
-; MIPS64R5-NEXT: jr $ra
-; MIPS64R5-NEXT: nop
+; MIPS64R5EB-LABEL: calli8_16:
+; MIPS64R5EB: # %bb.0: # %entry
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: .cfi_offset 31, -8
+; MIPS64R5EB-NEXT: .cfi_offset 28, -16
+; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16)))
+; MIPS64R5EB-NEXT: daddu $1, $1, $25
+; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16)))
+; MIPS64R5EB-NEXT: lui $1, 1801
+; MIPS64R5EB-NEXT: daddiu $1, $1, 1801
+; MIPS64R5EB-NEXT: dsll $1, $1, 16
+; MIPS64R5EB-NEXT: daddiu $1, $1, 1801
+; MIPS64R5EB-NEXT: lui $2, 1543
+; MIPS64R5EB-NEXT: dsll $1, $1, 16
+; MIPS64R5EB-NEXT: daddiu $2, $2, 1543
+; MIPS64R5EB-NEXT: dsll $2, $2, 16
+; MIPS64R5EB-NEXT: daddiu $2, $2, 1543
+; MIPS64R5EB-NEXT: dsll $2, $2, 16
+; MIPS64R5EB-NEXT: daddiu $4, $2, 1543
+; MIPS64R5EB-NEXT: daddiu $5, $2, 2314
+; MIPS64R5EB-NEXT: daddiu $6, $1, 1801
+; MIPS64R5EB-NEXT: lui $1, 225
+; MIPS64R5EB-NEXT: daddiu $1, $1, 8417
+; MIPS64R5EB-NEXT: dsll $1, $1, 16
+; MIPS64R5EB-NEXT: daddiu $1, $1, 8577
+; MIPS64R5EB-NEXT: dsll $1, $1, 19
+; MIPS64R5EB-NEXT: daddiu $7, $1, 2314
+; MIPS64R5EB-NEXT: ld $25, %call16(i8_16)($gp)
+; MIPS64R5EB-NEXT: jalr $25
+; MIPS64R5EB-NEXT: nop
+; MIPS64R5EB-NEXT: ld $1, %got_disp(gv16i8)($gp)
+; MIPS64R5EB-NEXT: insert.d $w0[0], $2
+; MIPS64R5EB-NEXT: insert.d $w0[1], $3
+; MIPS64R5EB-NEXT: st.d $w0, 0($1)
+; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16
+; MIPS64R5EB-NEXT: jr $ra
+; MIPS64R5EB-NEXT: nop
 ;
 ; MIPS32EL-LABEL: calli8_16:
 ; MIPS32EL: # %bb.0: # %entry
@@ -4320,6 +4017,87 @@
 ; MIPS64EL-NEXT: daddiu $sp, $sp, 16
 ; MIPS64EL-NEXT: jr $ra
 ; MIPS64EL-NEXT: nop
+;
+; MIPS32R5EL-LABEL: calli8_16:
+; MIPS32R5EL: # %bb.0: # %entry
+; MIPS32R5EL-NEXT: addiu $sp, $sp, -40
+; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 40
+; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT: .cfi_offset 31, -4
+; MIPS32R5EL-NEXT: lui $1, 2569
+; MIPS32R5EL-NEXT: ori $2, $1, 2060
+; MIPS32R5EL-NEXT: lui $3, 2311
+; MIPS32R5EL-NEXT: sw $2, 28($sp)
+; MIPS32R5EL-NEXT: ori $2, $3, 2311
+; MIPS32R5EL-NEXT: sw $2, 24($sp)
+; MIPS32R5EL-NEXT: sw $2, 20($sp)
+; MIPS32R5EL-NEXT: sw $2, 16($sp)
+; MIPS32R5EL-NEXT: lui $2, 1798
+; MIPS32R5EL-NEXT: ori $4, $2, 1798
+; MIPS32R5EL-NEXT: ori $7, $1, 1798
+; MIPS32R5EL-NEXT: move $5, $4
+; MIPS32R5EL-NEXT:
move $6, $4 +; MIPS32R5EL-NEXT: jal i8_16 +; MIPS32R5EL-NEXT: nop +; MIPS32R5EL-NEXT: insert.w $w0[0], $2 +; MIPS32R5EL-NEXT: insert.w $w0[1], $3 +; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv16i8) +; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv16i8) +; MIPS32R5EL-NEXT: st.w $w0, 0($1) +; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop +; +; MIPS64R5EL-LABEL: calli8_16: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16))) +; MIPS64R5EL-NEXT: lui $1, 1285 +; MIPS64R5EL-NEXT: daddiu $1, $1, -31869 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 899 +; MIPS64R5EL-NEXT: lui $2, 2311 +; MIPS64R5EL-NEXT: daddiu $2, $2, 2311 +; MIPS64R5EL-NEXT: dsll $2, $2, 16 +; MIPS64R5EL-NEXT: daddiu $2, $2, 2311 +; MIPS64R5EL-NEXT: dsll $2, $2, 16 +; MIPS64R5EL-NEXT: dsll $1, $1, 17 +; MIPS64R5EL-NEXT: lui $3, 899 +; MIPS64R5EL-NEXT: daddiu $3, $3, 899 +; MIPS64R5EL-NEXT: dsll $3, $3, 16 +; MIPS64R5EL-NEXT: daddiu $3, $3, 899 +; MIPS64R5EL-NEXT: dsll $3, $3, 17 +; MIPS64R5EL-NEXT: daddiu $4, $3, 1798 +; MIPS64R5EL-NEXT: daddiu $5, $1, 1798 +; MIPS64R5EL-NEXT: daddiu $6, $2, 2311 +; MIPS64R5EL-NEXT: lui $1, 642 +; MIPS64R5EL-NEXT: daddiu $1, $1, 16899 +; MIPS64R5EL-NEXT: dsll $1, $1, 18 +; MIPS64R5EL-NEXT: daddiu $1, $1, 2311 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $7, $1, 2311 +; MIPS64R5EL-NEXT: ld $25, %call16(i8_16)($gp) +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv16i8)($gp) +; MIPS64R5EL-NEXT: insert.d $w0[0], $2 +; MIPS64R5EL-NEXT: insert.d $w0[1], $3 +; MIPS64R5EL-NEXT: st.d $w0, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <16 x i8> @i8_16(<16 x i8> , <16 x i8> ) store <16 x i8> %0, <16 x i8> * @gv16i8 @@ -4825,36 +4603,26 @@ ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40 ; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill ; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: lui $1, 9 +; MIPS32R5EB-NEXT: ori $5, $1, 10 +; MIPS32R5EB-NEXT: sw $5, 28($sp) +; MIPS32R5EB-NEXT: lui $1, 12 +; MIPS32R5EB-NEXT: ori $1, $1, 8 +; MIPS32R5EB-NEXT: sw $1, 24($sp) +; MIPS32R5EB-NEXT: sw $5, 20($sp) ; MIPS32R5EB-NEXT: lui $1, 6 -; MIPS32R5EB-NEXT: ori $1, $1, 7 -; MIPS32R5EB-NEXT: lui $2, 9 -; MIPS32R5EB-NEXT: ori $2, $2, 10 -; MIPS32R5EB-NEXT: fill.w $w0, $2 -; MIPS32R5EB-NEXT: insert.w $w0[1], $1 -; MIPS32R5EB-NEXT: splati.d $w0, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5EB-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5EB-NEXT: lui $1, %hi($CPI33_0) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo($CPI33_0) -; MIPS32R5EB-NEXT: ld.w $w0, 0($1) -; MIPS32R5EB-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EB-NEXT: copy_s.w $8, 
$w0[3] -; MIPS32R5EB-NEXT: sw $8, 28($sp) -; MIPS32R5EB-NEXT: sw $3, 24($sp) -; MIPS32R5EB-NEXT: sw $2, 20($sp) -; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: ori $4, $1, 7 +; MIPS32R5EB-NEXT: sw $4, 16($sp) +; MIPS32R5EB-NEXT: move $6, $4 +; MIPS32R5EB-NEXT: move $7, $5 ; MIPS32R5EB-NEXT: jal i16_8 ; MIPS32R5EB-NEXT: nop -; MIPS32R5EB-NEXT: lui $1, %hi(gv8i16) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EB-NEXT: insert.w $w0[0], $2 ; MIPS32R5EB-NEXT: insert.w $w0[1], $3 ; MIPS32R5EB-NEXT: insert.w $w0[2], $4 +; MIPS32R5EB-NEXT: lui $1, %hi(gv8i16) ; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EB-NEXT: st.w $w0, 0($1) ; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload ; MIPS32R5EB-NEXT: addiu $sp, $sp, 40 @@ -4872,20 +4640,21 @@ ; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_8))) ; MIPS64R5EB-NEXT: daddu $1, $1, $25 ; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli16_8))) -; MIPS64R5EB-NEXT: lui $1, 9 -; MIPS64R5EB-NEXT: ori $1, $1, 10 -; MIPS64R5EB-NEXT: lui $2, 6 -; MIPS64R5EB-NEXT: ori $2, $2, 7 -; MIPS64R5EB-NEXT: dinsu $1, $2, 32, 32 -; MIPS64R5EB-NEXT: fill.d $w0, $1 -; MIPS64R5EB-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5EB-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5EB-NEXT: ld $1, %got_page(.LCPI33_0)($gp) -; MIPS64R5EB-NEXT: daddiu $1, $1, %got_ofst(.LCPI33_0) -; MIPS64R5EB-NEXT: ld.d $w0, 0($1) -; MIPS64R5EB-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5EB-NEXT: copy_s.d $7, $w0[1] +; MIPS64R5EB-NEXT: lui $1, 6 +; MIPS64R5EB-NEXT: daddiu $1, $1, 7 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $1, $1, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $4, $1, 10 +; MIPS64R5EB-NEXT: lui $1, 2 +; MIPS64R5EB-NEXT: daddiu $1, $1, -32767 +; MIPS64R5EB-NEXT: dsll $1, $1, 19 +; MIPS64R5EB-NEXT: daddiu $1, $1, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $7, $1, 10 ; MIPS64R5EB-NEXT: ld $25, %call16(i16_8)($gp) +; MIPS64R5EB-NEXT: move $5, $4 +; MIPS64R5EB-NEXT: move $6, $4 ; MIPS64R5EB-NEXT: jalr $25 ; MIPS64R5EB-NEXT: nop ; MIPS64R5EB-NEXT: ld $1, %got_disp(gv8i16)($gp) @@ -4973,35 +4742,25 @@ ; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill ; MIPS32R5EL-NEXT: .cfi_offset 31, -4 ; MIPS32R5EL-NEXT: lui $1, 10 -; MIPS32R5EL-NEXT: ori $1, $1, 9 -; MIPS32R5EL-NEXT: lui $2, 7 -; MIPS32R5EL-NEXT: ori $2, $2, 6 -; MIPS32R5EL-NEXT: fill.w $w0, $2 -; MIPS32R5EL-NEXT: insert.w $w0[1], $1 -; MIPS32R5EL-NEXT: splati.d $w0, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5EL-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5EL-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5EL-NEXT: lui $1, %hi($CPI33_0) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo($CPI33_0) -; MIPS32R5EL-NEXT: ld.w $w0, 0($1) -; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5EL-NEXT: sw $8, 28($sp) -; MIPS32R5EL-NEXT: sw $3, 24($sp) -; MIPS32R5EL-NEXT: sw $2, 20($sp) -; MIPS32R5EL-NEXT: sw $1, 16($sp) +; MIPS32R5EL-NEXT: ori $5, $1, 9 +; MIPS32R5EL-NEXT: sw $5, 28($sp) +; MIPS32R5EL-NEXT: lui $1, 8 +; MIPS32R5EL-NEXT: ori $1, $1, 12 +; MIPS32R5EL-NEXT: sw $1, 24($sp) +; MIPS32R5EL-NEXT: sw $5, 20($sp) +; MIPS32R5EL-NEXT: lui $1, 7 +; MIPS32R5EL-NEXT: ori $4, $1, 6 +; MIPS32R5EL-NEXT: sw $4, 16($sp) +; MIPS32R5EL-NEXT: move $6, $4 +; MIPS32R5EL-NEXT: move $7, $5 ; MIPS32R5EL-NEXT: jal i16_8 ; MIPS32R5EL-NEXT: nop -; MIPS32R5EL-NEXT: lui $1, %hi(gv8i16) -; 
MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EL-NEXT: insert.w $w0[0], $2 ; MIPS32R5EL-NEXT: insert.w $w0[1], $3 ; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv8i16) ; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EL-NEXT: st.w $w0, 0($1) ; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload ; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 @@ -5019,20 +4778,21 @@ ; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_8))) ; MIPS64R5EL-NEXT: daddu $1, $1, $25 ; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli16_8))) -; MIPS64R5EL-NEXT: lui $1, 7 -; MIPS64R5EL-NEXT: ori $1, $1, 6 -; MIPS64R5EL-NEXT: lui $2, 10 -; MIPS64R5EL-NEXT: ori $2, $2, 9 -; MIPS64R5EL-NEXT: dinsu $1, $2, 32, 32 -; MIPS64R5EL-NEXT: fill.d $w0, $1 -; MIPS64R5EL-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5EL-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5EL-NEXT: ld $1, %got_page(.LCPI33_0)($gp) -; MIPS64R5EL-NEXT: daddiu $1, $1, %got_ofst(.LCPI33_0) -; MIPS64R5EL-NEXT: ld.d $w0, 0($1) -; MIPS64R5EL-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5EL-NEXT: copy_s.d $7, $w0[1] +; MIPS64R5EL-NEXT: lui $1, 10 +; MIPS64R5EL-NEXT: daddiu $1, $1, 9 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 7 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $4, $1, 6 +; MIPS64R5EL-NEXT: lui $1, 1 +; MIPS64R5EL-NEXT: daddiu $1, $1, 16385 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 8193 +; MIPS64R5EL-NEXT: dsll $1, $1, 19 +; MIPS64R5EL-NEXT: daddiu $7, $1, 12 ; MIPS64R5EL-NEXT: ld $25, %call16(i16_8)($gp) +; MIPS64R5EL-NEXT: move $5, $4 +; MIPS64R5EL-NEXT: move $6, $4 ; MIPS64R5EL-NEXT: jalr $25 ; MIPS64R5EL-NEXT: nop ; MIPS64R5EL-NEXT: ld $1, %got_disp(gv8i16)($gp) @@ -5304,39 +5064,38 @@ ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; MIPS64R5-LABEL: calli32_4: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_4))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI35_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI35_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI35_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI35_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] -; MIPS64R5-NEXT: ld $25, %call16(i32_4)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: insert.d $w0[0], $2 -; MIPS64R5-NEXT: insert.d $w0[1], $3 -; MIPS64R5-NEXT: ld $1, %got_disp(gv4i32)($gp) -; MIPS64R5-NEXT: st.d $w0, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: calli32_4: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui 
$1, %hi(%neg(%gp_rel(calli32_4))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) +; MIPS64R5EB-NEXT: daddiu $1, $zero, 3 +; MIPS64R5EB-NEXT: dsll $2, $1, 33 +; MIPS64R5EB-NEXT: daddiu $4, $2, 7 +; MIPS64R5EB-NEXT: dsll $1, $1, 34 +; MIPS64R5EB-NEXT: daddiu $6, $1, 8 +; MIPS64R5EB-NEXT: daddiu $1, $zero, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 32 +; MIPS64R5EB-NEXT: daddiu $5, $1, 10 +; MIPS64R5EB-NEXT: ld $25, %call16(i32_4)($gp) +; MIPS64R5EB-NEXT: move $7, $5 +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: insert.d $w0[0], $2 +; MIPS64R5EB-NEXT: insert.d $w0[1], $3 +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv4i32)($gp) +; MIPS64R5EB-NEXT: st.d $w0, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS64EL-LABEL: calli32_4: ; MIPS64EL: # %bb.0: # %entry @@ -5370,6 +5129,40 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS64R5EL-LABEL: calli32_4: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_4))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) +; MIPS64R5EL-NEXT: daddiu $1, $zero, 7 +; MIPS64R5EL-NEXT: dsll $1, $1, 32 +; MIPS64R5EL-NEXT: daddiu $4, $1, 6 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 1 +; MIPS64R5EL-NEXT: dsll $1, $1, 35 +; MIPS64R5EL-NEXT: daddiu $6, $1, 12 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 5 +; MIPS64R5EL-NEXT: dsll $1, $1, 33 +; MIPS64R5EL-NEXT: daddiu $5, $1, 9 +; MIPS64R5EL-NEXT: ld $25, %call16(i32_4)($gp) +; MIPS64R5EL-NEXT: move $7, $5 +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: insert.d $w0[0], $2 +; MIPS64R5EL-NEXT: insert.d $w0[1], $3 +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4i32)($gp) +; MIPS64R5EL-NEXT: st.d $w0, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <4 x i32> @i32_4(<4 x i32> , <4 x i32> ) store <4 x i32> %0, <4 x i32> * @gv4i32 @@ -5433,43 +5226,35 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5-LABEL: calli64_2: -; MIPS32R5: # %bb.0: # %entry -; MIPS32R5-NEXT: addiu $sp, $sp, -40 -; MIPS32R5-NEXT: .cfi_def_cfa_offset 40 -; MIPS32R5-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 31, -4 -; MIPS32R5-NEXT: lui $1, %hi($CPI36_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI36_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5-NEXT: lui $1, %hi($CPI36_1) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI36_1) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) -; MIPS32R5-NEXT: sw $1, 16($sp) -; MIPS32R5-NEXT: 
jal i64_2 -; MIPS32R5-NEXT: nop -; MIPS32R5-NEXT: lui $1, %hi(gv2i64) -; MIPS32R5-NEXT: insert.w $w0[0], $2 -; MIPS32R5-NEXT: insert.w $w0[1], $3 -; MIPS32R5-NEXT: addiu $1, $1, %lo(gv2i64) -; MIPS32R5-NEXT: insert.w $w0[2], $4 -; MIPS32R5-NEXT: insert.w $w0[3], $5 -; MIPS32R5-NEXT: st.w $w0, 0($1) -; MIPS32R5-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: addiu $sp, $sp, 40 -; MIPS32R5-NEXT: jr $ra -; MIPS32R5-NEXT: nop +; MIPS32R5EB-LABEL: calli64_2: +; MIPS32R5EB: # %bb.0: # %entry +; MIPS32R5EB-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: addiu $1, $zero, 8 +; MIPS32R5EB-NEXT: sw $1, 28($sp) +; MIPS32R5EB-NEXT: addiu $1, $zero, 12 +; MIPS32R5EB-NEXT: sw $1, 20($sp) +; MIPS32R5EB-NEXT: sw $zero, 24($sp) +; MIPS32R5EB-NEXT: sw $zero, 16($sp) +; MIPS32R5EB-NEXT: addiu $4, $zero, 0 +; MIPS32R5EB-NEXT: addiu $5, $zero, 6 +; MIPS32R5EB-NEXT: addiu $6, $zero, 0 +; MIPS32R5EB-NEXT: addiu $7, $zero, 7 +; MIPS32R5EB-NEXT: jal i64_2 +; MIPS32R5EB-NEXT: nop +; MIPS32R5EB-NEXT: insert.w $w0[0], $2 +; MIPS32R5EB-NEXT: insert.w $w0[1], $3 +; MIPS32R5EB-NEXT: insert.w $w0[2], $4 +; MIPS32R5EB-NEXT: lui $1, %hi(gv2i64) +; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv2i64) +; MIPS32R5EB-NEXT: st.w $w0, 0($1) +; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EB-NEXT: jr $ra +; MIPS32R5EB-NEXT: nop ; ; MIPS64R5-LABEL: calli64_2: ; MIPS64R5: # %bb.0: # %entry @@ -5527,6 +5312,36 @@ ; MIPS32EL-NEXT: addiu $sp, $sp, 40 ; MIPS32EL-NEXT: jr $ra ; MIPS32EL-NEXT: nop +; +; MIPS32R5EL-LABEL: calli64_2: +; MIPS32R5EL: # %bb.0: # %entry +; MIPS32R5EL-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: addiu $1, $zero, 8 +; MIPS32R5EL-NEXT: sw $1, 24($sp) +; MIPS32R5EL-NEXT: addiu $1, $zero, 12 +; MIPS32R5EL-NEXT: sw $1, 16($sp) +; MIPS32R5EL-NEXT: sw $zero, 28($sp) +; MIPS32R5EL-NEXT: sw $zero, 20($sp) +; MIPS32R5EL-NEXT: addiu $4, $zero, 6 +; MIPS32R5EL-NEXT: addiu $5, $zero, 0 +; MIPS32R5EL-NEXT: addiu $6, $zero, 7 +; MIPS32R5EL-NEXT: addiu $7, $zero, 0 +; MIPS32R5EL-NEXT: jal i64_2 +; MIPS32R5EL-NEXT: nop +; MIPS32R5EL-NEXT: insert.w $w0[0], $2 +; MIPS32R5EL-NEXT: insert.w $w0[1], $3 +; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv2i64) +; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv2i64) +; MIPS32R5EL-NEXT: st.w $w0, 0($1) +; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop entry: %0 = call <2 x i64> @i64_2(<2 x i64> , <2 x i64> ) store <2 x i64> %0, <2 x i64> * @gv2i64 @@ -5618,35 +5433,33 @@ ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; MIPS64R5-LABEL: callfloat_2: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI37_0)($gp) -; 
MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI37_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI37_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI37_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $5, $w0[0] -; MIPS64R5-NEXT: ld $25, %call16(float2_extern)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: ld $1, %got_disp(gv2f32)($gp) -; MIPS64R5-NEXT: sd $2, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: callfloat_2: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EB-NEXT: daddiu $1, $zero, 383 +; MIPS64R5EB-NEXT: dsll $4, $1, 23 +; MIPS64R5EB-NEXT: daddiu $1, $zero, 261 +; MIPS64R5EB-NEXT: dsll $1, $1, 33 +; MIPS64R5EB-NEXT: daddiu $1, $1, 523 +; MIPS64R5EB-NEXT: dsll $5, $1, 21 +; MIPS64R5EB-NEXT: ld $25, %call16(float2_extern)($gp) +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv2f32)($gp) +; MIPS64R5EB-NEXT: sd $2, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS64EL-LABEL: callfloat_2: ; MIPS64EL: # %bb.0: # %entry @@ -5675,6 +5488,34 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS64R5EL-LABEL: callfloat_2: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EL-NEXT: daddiu $1, $zero, 383 +; MIPS64R5EL-NEXT: dsll $4, $1, 55 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 523 +; MIPS64R5EL-NEXT: dsll $1, $1, 31 +; MIPS64R5EL-NEXT: daddiu $1, $1, 261 +; MIPS64R5EL-NEXT: dsll $5, $1, 22 +; MIPS64R5EL-NEXT: ld $25, %call16(float2_extern)($gp) +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv2f32)($gp) +; MIPS64R5EL-NEXT: sd $2, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <2 x float> @float2_extern(<2 x float> , <2 x float> ) store <2 x float> %0, <2 x float> * @gv2f32 @@ -5777,27 +5618,21 @@ ; MIPS32R5-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5-NEXT: addiu $1, $zero, -16 ; MIPS32R5-NEXT: and $sp, $sp, $1 -; MIPS32R5-NEXT: lui $1, %hi($CPI38_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI38_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $6, $w0[0] -; 
MIPS32R5-NEXT: copy_s.w $7, $w0[1] -; MIPS32R5-NEXT: copy_s.w $1, $w0[2] -; MIPS32R5-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5-NEXT: lui $3, %hi($CPI38_1) -; MIPS32R5-NEXT: addiu $3, $3, %lo($CPI38_1) -; MIPS32R5-NEXT: ld.w $w0, 0($3) -; MIPS32R5-NEXT: copy_s.w $3, $w0[0] -; MIPS32R5-NEXT: copy_s.w $4, $w0[1] -; MIPS32R5-NEXT: copy_s.w $5, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 36($sp) -; MIPS32R5-NEXT: sw $5, 32($sp) -; MIPS32R5-NEXT: sw $4, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) +; MIPS32R5-NEXT: lui $1, 16704 +; MIPS32R5-NEXT: lui $2, 16736 +; MIPS32R5-NEXT: lui $3, 16752 +; MIPS32R5-NEXT: lui $4, 16768 +; MIPS32R5-NEXT: sw $4, 36($sp) +; MIPS32R5-NEXT: sw $3, 32($sp) +; MIPS32R5-NEXT: sw $2, 28($sp) +; MIPS32R5-NEXT: sw $1, 24($sp) +; MIPS32R5-NEXT: lui $1, 16512 +; MIPS32R5-NEXT: sw $1, 20($sp) +; MIPS32R5-NEXT: lui $1, 16384 ; MIPS32R5-NEXT: sw $1, 16($sp) ; MIPS32R5-NEXT: addiu $4, $sp, 48 +; MIPS32R5-NEXT: addiu $6, $zero, 0 +; MIPS32R5-NEXT: lui $7, 49024 ; MIPS32R5-NEXT: jal float4_extern ; MIPS32R5-NEXT: nop ; MIPS32R5-NEXT: lui $1, %hi(gv4f32) @@ -5811,39 +5646,43 @@ ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; MIPS64R5-LABEL: callfloat_4: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI38_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI38_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI38_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI38_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] -; MIPS64R5-NEXT: ld $25, %call16(float4_extern)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: insert.d $w0[0], $2 -; MIPS64R5-NEXT: insert.d $w0[1], $3 -; MIPS64R5-NEXT: ld $1, %got_disp(gv4f32)($gp) -; MIPS64R5-NEXT: st.d $w0, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: callfloat_4: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EB-NEXT: daddiu $1, $zero, 1 +; MIPS64R5EB-NEXT: dsll $1, $1, 39 +; MIPS64R5EB-NEXT: daddiu $1, $1, 129 +; MIPS64R5EB-NEXT: daddiu $2, $zero, 261 +; MIPS64R5EB-NEXT: dsll $2, $2, 33 +; MIPS64R5EB-NEXT: daddiu $3, $zero, 383 +; MIPS64R5EB-NEXT: dsll $4, $3, 23 +; MIPS64R5EB-NEXT: dsll $5, $1, 23 +; MIPS64R5EB-NEXT: daddiu $1, $2, 523 +; MIPS64R5EB-NEXT: dsll $6, $1, 21 +; MIPS64R5EB-NEXT: daddiu $1, $zero, 1047 +; MIPS64R5EB-NEXT: dsll $1, $1, 29 +; 
MIPS64R5EB-NEXT: daddiu $1, $1, 131 +; MIPS64R5EB-NEXT: dsll $7, $1, 23 +; MIPS64R5EB-NEXT: ld $25, %call16(float4_extern)($gp) +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: insert.d $w0[0], $2 +; MIPS64R5EB-NEXT: insert.d $w0[1], $3 +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv4f32)($gp) +; MIPS64R5EB-NEXT: st.d $w0, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS64EL-LABEL: callfloat_4: ; MIPS64EL: # %bb.0: # %entry @@ -5881,6 +5720,44 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS64R5EL-LABEL: callfloat_4: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EL-NEXT: daddiu $1, $zero, 129 +; MIPS64R5EL-NEXT: dsll $1, $1, 25 +; MIPS64R5EL-NEXT: daddiu $1, $1, 1 +; MIPS64R5EL-NEXT: daddiu $2, $zero, 523 +; MIPS64R5EL-NEXT: dsll $2, $2, 31 +; MIPS64R5EL-NEXT: daddiu $3, $zero, 383 +; MIPS64R5EL-NEXT: dsll $4, $3, 55 +; MIPS64R5EL-NEXT: dsll $5, $1, 30 +; MIPS64R5EL-NEXT: daddiu $1, $2, 261 +; MIPS64R5EL-NEXT: dsll $6, $1, 22 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 131 +; MIPS64R5EL-NEXT: dsll $1, $1, 35 +; MIPS64R5EL-NEXT: daddiu $1, $1, 1047 +; MIPS64R5EL-NEXT: dsll $7, $1, 20 +; MIPS64R5EL-NEXT: ld $25, %call16(float4_extern)($gp) +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: insert.d $w0[0], $2 +; MIPS64R5EL-NEXT: insert.d $w0[1], $3 +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4f32)($gp) +; MIPS64R5EL-NEXT: st.d $w0, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <4 x float> @float4_extern(<4 x float> , <4 x float> ) store <4 x float> %0, <4 x float> * @gv4f32 @@ -5957,51 +5834,42 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5-LABEL: calldouble_2: -; MIPS32R5: # %bb.0: # %entry -; MIPS32R5-NEXT: addiu $sp, $sp, -80 -; MIPS32R5-NEXT: .cfi_def_cfa_offset 80 -; MIPS32R5-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 31, -4 -; MIPS32R5-NEXT: .cfi_offset 30, -8 -; MIPS32R5-NEXT: move $fp, $sp -; MIPS32R5-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5-NEXT: addiu $1, $zero, -16 -; MIPS32R5-NEXT: and $sp, $sp, $1 -; MIPS32R5-NEXT: lui $1, %hi($CPI39_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI39_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $6, $w0[0] -; MIPS32R5-NEXT: copy_s.w $7, $w0[1] -; MIPS32R5-NEXT: copy_s.w $1, $w0[2] -; MIPS32R5-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5-NEXT: lui $3, %hi($CPI39_1) -; MIPS32R5-NEXT: addiu $3, $3, %lo($CPI39_1) -; MIPS32R5-NEXT: ld.w $w0, 0($3) -; MIPS32R5-NEXT: copy_s.w $3, $w0[0] -; MIPS32R5-NEXT: copy_s.w $4, $w0[1] -; MIPS32R5-NEXT: copy_s.w $5, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 36($sp) -; MIPS32R5-NEXT: sw $5, 32($sp) -; MIPS32R5-NEXT: sw $4, 
28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) -; MIPS32R5-NEXT: sw $1, 16($sp) -; MIPS32R5-NEXT: addiu $4, $sp, 48 -; MIPS32R5-NEXT: jal double2_extern -; MIPS32R5-NEXT: nop -; MIPS32R5-NEXT: lui $1, %hi(gv2f64) -; MIPS32R5-NEXT: addiu $1, $1, %lo(gv2f64) -; MIPS32R5-NEXT: ld.d $w0, 48($sp) -; MIPS32R5-NEXT: st.d $w0, 0($1) -; MIPS32R5-NEXT: move $sp, $fp -; MIPS32R5-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: addiu $sp, $sp, 80 -; MIPS32R5-NEXT: jr $ra -; MIPS32R5-NEXT: nop +; MIPS32R5EB-LABEL: calldouble_2: +; MIPS32R5EB: # %bb.0: # %entry +; MIPS32R5EB-NEXT: addiu $sp, $sp, -80 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 80 +; MIPS32R5EB-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 +; MIPS32R5EB-NEXT: move $fp, $sp +; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 +; MIPS32R5EB-NEXT: addiu $1, $zero, -16 +; MIPS32R5EB-NEXT: and $sp, $sp, $1 +; MIPS32R5EB-NEXT: lui $1, 16424 +; MIPS32R5EB-NEXT: lui $2, 16428 +; MIPS32R5EB-NEXT: sw $2, 32($sp) +; MIPS32R5EB-NEXT: sw $1, 24($sp) +; MIPS32R5EB-NEXT: lui $1, 49136 +; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: sw $zero, 36($sp) +; MIPS32R5EB-NEXT: sw $zero, 28($sp) +; MIPS32R5EB-NEXT: sw $zero, 20($sp) +; MIPS32R5EB-NEXT: addiu $4, $sp, 48 +; MIPS32R5EB-NEXT: addiu $6, $zero, 0 +; MIPS32R5EB-NEXT: addiu $7, $zero, 0 +; MIPS32R5EB-NEXT: jal double2_extern +; MIPS32R5EB-NEXT: nop +; MIPS32R5EB-NEXT: lui $1, %hi(gv2f64) +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv2f64) +; MIPS32R5EB-NEXT: ld.d $w0, 48($sp) +; MIPS32R5EB-NEXT: st.d $w0, 0($1) +; MIPS32R5EB-NEXT: move $sp, $fp +; MIPS32R5EB-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 80 +; MIPS32R5EB-NEXT: jr $ra +; MIPS32R5EB-NEXT: nop ; ; MIPS64R5-LABEL: calldouble_2: ; MIPS64R5: # %bb.0: # %entry @@ -6014,17 +5882,14 @@ ; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calldouble_2))) ; MIPS64R5-NEXT: daddu $1, $1, $25 ; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calldouble_2))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI39_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI39_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI39_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI39_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] +; MIPS64R5-NEXT: daddiu $1, $zero, 3071 +; MIPS64R5-NEXT: dsll $5, $1, 52 +; MIPS64R5-NEXT: daddiu $1, $zero, 2053 +; MIPS64R5-NEXT: dsll $6, $1, 51 +; MIPS64R5-NEXT: daddiu $1, $zero, 4107 +; MIPS64R5-NEXT: dsll $7, $1, 50 ; MIPS64R5-NEXT: ld $25, %call16(double2_extern)($gp) +; MIPS64R5-NEXT: daddiu $4, $zero, 0 ; MIPS64R5-NEXT: jalr $25 ; MIPS64R5-NEXT: nop ; MIPS64R5-NEXT: insert.d $w0[0], $2 @@ -6075,6 +5940,43 @@ ; MIPS32EL-NEXT: addiu $sp, $sp, 80 ; MIPS32EL-NEXT: jr $ra ; MIPS32EL-NEXT: nop +; +; MIPS32R5EL-LABEL: calldouble_2: +; MIPS32R5EL: # %bb.0: # %entry +; MIPS32R5EL-NEXT: addiu $sp, $sp, -80 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 80 +; MIPS32R5EL-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 +; MIPS32R5EL-NEXT: move $fp, $sp +; 
MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 +; MIPS32R5EL-NEXT: addiu $1, $zero, -16 +; MIPS32R5EL-NEXT: and $sp, $sp, $1 +; MIPS32R5EL-NEXT: lui $1, 16424 +; MIPS32R5EL-NEXT: lui $2, 16428 +; MIPS32R5EL-NEXT: sw $2, 36($sp) +; MIPS32R5EL-NEXT: sw $1, 28($sp) +; MIPS32R5EL-NEXT: lui $1, 49136 +; MIPS32R5EL-NEXT: sw $1, 20($sp) +; MIPS32R5EL-NEXT: sw $zero, 32($sp) +; MIPS32R5EL-NEXT: sw $zero, 24($sp) +; MIPS32R5EL-NEXT: sw $zero, 16($sp) +; MIPS32R5EL-NEXT: addiu $4, $sp, 48 +; MIPS32R5EL-NEXT: addiu $6, $zero, 0 +; MIPS32R5EL-NEXT: addiu $7, $zero, 0 +; MIPS32R5EL-NEXT: jal double2_extern +; MIPS32R5EL-NEXT: nop +; MIPS32R5EL-NEXT: lui $1, %hi(gv2f64) +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv2f64) +; MIPS32R5EL-NEXT: ld.d $w0, 48($sp) +; MIPS32R5EL-NEXT: st.d $w0, 0($1) +; MIPS32R5EL-NEXT: move $sp, $fp +; MIPS32R5EL-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 80 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop entry: %0 = call <2 x double> @double2_extern(<2 x double> , <2 x double> ) store <2 x double> %0, <2 x double> * @gv2f64 diff --git a/llvm/test/CodeGen/PowerPC/pr45709.ll b/llvm/test/CodeGen/PowerPC/pr45709.ll --- a/llvm/test/CodeGen/PowerPC/pr45709.ll +++ b/llvm/test/CodeGen/PowerPC/pr45709.ll @@ -10,7 +10,7 @@ define dso_local void @_ZN1a1bEv(<4 x float> %in) local_unnamed_addr #0 align 2 { ; CHECK-LABEL: _ZN1a1bEv: ; CHECK: # %bb.0: -; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB0_6 +; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB0_4 ; CHECK-NEXT: b .LBB0_1 ; CHECK-NEXT: .LBB0_1: # %.preheader ; CHECK-NEXT: b .LBB0_2 @@ -21,26 +21,18 @@ ; CHECK-NEXT: addi r3, r3, .LCPI0_0@toc@l ; CHECK-NEXT: lvx v3, 0, r3 ; CHECK-NEXT: vperm v2, v2, v2, v3 -; CHECK-NEXT: vxor v3, v3, v3 -; CHECK-NEXT: addi r3, r1, -48 -; CHECK-NEXT: stvx v3, 0, r3 ; CHECK-NEXT: addi r3, r1, -32 ; CHECK-NEXT: stvx v2, 0, r3 -; CHECK-NEXT: lwz r3, -48(r1) -; CHECK-NEXT: lwz r4, -32(r1) -; CHECK-NEXT: cmpw r4, r3 -; CHECK-NEXT: bc 12, gt, .LBB0_4 -; CHECK-NEXT: b .LBB0_5 -; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: addi r3, r4, 0 -; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: lwz r3, -32(r1) +; CHECK-NEXT: srawi r4, r3, 31 +; CHECK-NEXT: andc r3, r3, r4 ; CHECK-NEXT: cmpw r3, r3 -; CHECK-NEXT: stw r3, -64(r1) -; CHECK-NEXT: addi r3, r1, -64 +; CHECK-NEXT: stw r3, -48(r1) +; CHECK-NEXT: addi r3, r1, -48 ; CHECK-NEXT: lvx v2, 0, r3 ; CHECK-NEXT: addi r3, r1, -16 ; CHECK-NEXT: stvx v2, 0, r3 -; CHECK-NEXT: .LBB0_6: +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: blr br i1 undef, label %7, label %1 diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -86,12 +86,12 @@ ; CHECK-NEXT: ldr r1, [sp, #24] ; CHECK-NEXT: vmov.32 q1[2], r1 ; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: strd r3, r2, [r0, #16] -; CHECK-NEXT: str r1, [r0, #24] +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: strd r1, r3, [r0, #16] +; CHECK-NEXT: str r2, [r0, #24] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -256,10 +256,9 @@ ; CHECK-NEXT: adr r7, .LCPI1_1 ; 
CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: vldrw.u32 q1, [r7] +; CHECK-NEXT: mov.w r9, #-1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: mvn r9, #-2147483648 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 @@ -277,7 +276,7 @@ ; CHECK-NEXT: vmov lr, s26 ; CHECK-NEXT: rsbs.w r5, r6, #-2147483648 ; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: sbcs.w r5, r3, r7 +; CHECK-NEXT: sbcs.w r5, r9, r7 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r5, #1 @@ -290,7 +289,7 @@ ; CHECK-NEXT: vmov.32 q6[0], r6 ; CHECK-NEXT: rsbs.w r4, lr, #-2147483648 ; CHECK-NEXT: vmov.32 q6[1], r7 -; CHECK-NEXT: sbcs.w r4, r3, r5 +; CHECK-NEXT: sbcs.w r4, r9, r5 ; CHECK-NEXT: vmov.32 q6[2], lr ; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: vmov.32 q6[3], r5 @@ -300,14 +299,16 @@ ; CHECK-NEXT: csetm r4, ne ; CHECK-NEXT: mov lr, r2 ; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: vmov.32 q4[3], r4 -; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov r4, s12 ; CHECK-NEXT: vbic q5, q0, q4 ; CHECK-NEXT: vand q4, q6, q4 ; CHECK-NEXT: vorr q4, q4, q5 ; CHECK-NEXT: vmov r6, s16 ; CHECK-NEXT: vmov r7, s17 -; CHECK-NEXT: subs.w r6, r6, r9 +; CHECK-NEXT: subs r6, r6, r2 +; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: sbcs r7, r7, #0 ; CHECK-NEXT: vmov r6, s18 ; CHECK-NEXT: mov.w r7, #0 @@ -318,8 +319,8 @@ ; CHECK-NEXT: vmov.32 q5[0], r7 ; CHECK-NEXT: vmov.32 q5[1], r7 ; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: subs.w r6, r6, r9 -; CHECK-NEXT: vmov r6, s12 +; CHECK-NEXT: subs r6, r6, r2 +; CHECK-NEXT: vmov r6, s14 ; CHECK-NEXT: sbcs r7, r7, #0 ; CHECK-NEXT: mov.w r7, #0 ; CHECK-NEXT: it lt @@ -327,45 +328,46 @@ ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: csetm r7, ne ; CHECK-NEXT: vmov.32 q5[2], r7 -; CHECK-NEXT: vmov r7, s8 +; CHECK-NEXT: vmov r7, s10 ; CHECK-NEXT: vbic q6, q1, q5 ; CHECK-NEXT: vand q4, q4, q5 ; CHECK-NEXT: vorr q4, q4, q6 ; CHECK-NEXT: smull r6, r7, r6, r7 ; CHECK-NEXT: asrl r6, r7, #31 ; CHECK-NEXT: rsbs.w r5, r6, #-2147483648 -; CHECK-NEXT: vmov.32 q3[0], r6 -; CHECK-NEXT: sbcs.w r5, r3, r7 -; CHECK-NEXT: vmov.32 q3[1], r7 +; CHECK-NEXT: sbcs.w r5, r9, r7 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r5, #1 ; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov.32 q5[1], r5 -; CHECK-NEXT: vmov r5, s10 +; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: smull r4, r5, r4, r5 ; CHECK-NEXT: asrl r4, r5, #31 -; CHECK-NEXT: rsbs.w r2, r4, #-2147483648 -; CHECK-NEXT: vmov.32 q3[2], r4 -; CHECK-NEXT: sbcs.w r2, r3, r5 -; CHECK-NEXT: vmov.32 q3[3], r5 -; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 +; CHECK-NEXT: vmov.32 q5[0], r4 +; CHECK-NEXT: sbcs.w r3, r9, r5 +; CHECK-NEXT: vmov.32 q5[1], r5 +; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: vmov.32 q5[2], r6 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r2 -; CHECK-NEXT: vbic q2, q0, q5 -; CHECK-NEXT: vand q3, q3, q5 -; CHECK-NEXT: vorr q2, q3, q2 -; CHECK-NEXT: vmov r7, s8 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q5[3], r7 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: mvn r4, #-2147483648 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.32 q2[3], r2 +; 
CHECK-NEXT: vbic q3, q0, q2 +; CHECK-NEXT: vand q2, q5, q2 +; CHECK-NEXT: vorr q2, q2, q3 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: subs.w r7, r7, r9 +; CHECK-NEXT: subs r3, r3, r4 ; CHECK-NEXT: sbcs r2, r2, #0 -; CHECK-NEXT: vmov r7, s10 +; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r2, #1 @@ -374,7 +376,7 @@ ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.32 q3[1], r2 ; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: subs.w r7, r7, r9 +; CHECK-NEXT: subs r3, r3, r4 ; CHECK-NEXT: sbcs r2, r2, #0 ; CHECK-NEXT: mov.w r2, #0 ; CHECK-NEXT: it lt @@ -561,7 +563,6 @@ ; CHECK-NEXT: vmov.f32 s26, s19 ; CHECK-NEXT: vmov.f32 s30, s23 ; CHECK-NEXT: vmullb.s32 q0, q7, q6 -; CHECK-NEXT: vmov.f32 s18, s17 ; CHECK-NEXT: vmov r5, s1 ; CHECK-NEXT: vmov r6, s0 ; CHECK-NEXT: asrl r6, r5, #31 @@ -571,7 +572,6 @@ ; CHECK-NEXT: sbcs.w r4, r12, r5 ; CHECK-NEXT: vmov.32 q7[1], r5 ; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: vmov.f32 s22, s21 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r4, #1 ; CHECK-NEXT: cmp r4, #0 @@ -609,6 +609,7 @@ ; CHECK-NEXT: vmov r3, s27 ; CHECK-NEXT: subs.w r4, r4, r8 ; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: vmov.f32 s22, s21 ; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt @@ -617,6 +618,7 @@ ; CHECK-NEXT: csetm r3, ne ; CHECK-NEXT: vmov.32 q0[2], r3 ; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov.f32 s18, s17 ; CHECK-NEXT: vbic q7, q3, q0 ; CHECK-NEXT: vand q0, q6, q0 ; CHECK-NEXT: vorr q6, q0, q7 diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll @@ -52,12 +52,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vpt.s32 lt, q0, zr ; CHECK-NEXT: vldrwt.u32 q5, [r0] -; CHECK-NEXT: vmov.f64 d8, d10 -; CHECK-NEXT: vmov.f32 s18, s21 +; CHECK-NEXT: vmov.f64 d8, d11 +; CHECK-NEXT: vmov.f32 s18, s23 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: bl __aeabi_l2d @@ -67,24 +67,23 @@ ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov.f64 d12, d11 -; CHECK-NEXT: vmov.f32 s26, s23 +; CHECK-NEXT: vmov.f32 s22, s21 ; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov r2, s22 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov d11, r0, r1 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d ; CHECK-NEXT: vmov d10, r0, r1 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} entry: %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -103,14 +103,18 @@ ; CHECK-NEXT: vmov.u16 r0, q1[6] ; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 
q0[4], r0 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.16 q2[4], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.16 q3[6], r0 ; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.f32 s3, s15 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> @@ -387,19 +391,23 @@ ; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vmov.16 q1[2], r1 ; CHECK-NEXT: vmov.16 q1[3], r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.16 q2[4], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q3[6], r1 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.f32 s7, s15 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll --- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll +++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll @@ -73,10 +73,11 @@ ; CHECK-FP-NEXT: vmov.32 q0[0], r1 ; CHECK-FP-NEXT: adcs r0, r2 ; CHECK-FP-NEXT: vmov.32 q0[1], r0 -; CHECK-FP-NEXT: vmov.32 q0[2], lr -; CHECK-FP-NEXT: vmov.32 q0[3], r12 +; CHECK-FP-NEXT: vmov q1, q0 ; CHECK-FP-NEXT: vmov r0, r1, d0 -; CHECK-FP-NEXT: vmov r2, r3, d1 +; CHECK-FP-NEXT: vmov.32 q1[2], lr +; CHECK-FP-NEXT: vmov.32 q1[3], r12 +; CHECK-FP-NEXT: vmov r2, r3, d3 ; CHECK-FP-NEXT: pop {r7, pc} entry: %sum = add <2 x i64> %lhs, %rhs diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll @@ -36,14 +36,11 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x) { ; CHECK-LABEL: add_v2i32_v2i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i32> %x to <2 x i64> @@ -55,11 +52,8 @@ ; CHECK-LABEL: add_v2i32_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr @@ -134,44 +128,41 @@ ; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[1] ; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: 
vmov r0, s10
; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov r2, s11
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: vmov.32 q3[0], r1
+; CHECK-NEXT: vmov.32 q2[0], r1
; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: vmov.32 q3[2], r1
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r1, s12
+; CHECK-NEXT: vmov.32 q2[2], r1
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: vmov r1, s10
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: vmov.u16 r1, q0[4]
-; CHECK-NEXT: vmov.32 q3[0], r1
+; CHECK-NEXT: vmov.32 q2[0], r1
; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vmov.32 q3[2], r1
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: vmov.32 q2[2], r1
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s15
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
; CHECK-NEXT: vmov.u16 r2, q0[6]
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vand q0, q2, q1
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i64>
@@ -183,53 +174,27 @@
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u16 r0, q0[0]
+; CHECK-NEXT: vmov.u16 r2, q0[1]
; CHECK-NEXT: sxth r0, r0
-; CHECK-NEXT: vmov.32 q1[0], r0
-; CHECK-NEXT: asrs r0, r0, #31
-; CHECK-NEXT: vmov.32 q1[1], r0
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: sxth r0, r0
-; CHECK-NEXT: vmov.32 q1[2], r0
+; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: asrs r1, r0, #31
-; CHECK-NEXT: vmov.32 q1[3], r1
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r12, r1, r0, asr #31
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: sxth r1, r1
-; CHECK-NEXT: vmov.32 q1[0], r1
-; CHECK-NEXT: asrs r1, r1, #31
-; CHECK-NEXT: vmov.32 q1[1], r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: sxth r1, r1
-; CHECK-NEXT: vmov.32 q1[2], r1
-; CHECK-NEXT: asrs r3, r1, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r3, s5
; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r2, r12, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds.w r12, r0, r3
-; CHECK-NEXT: adc.w r1, r2, r1, asr #31
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: sxth r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u16 r2, q0[3]
+; CHECK-NEXT: sxth r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: sxth r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[5]
; CHECK-NEXT: sxth r2, r2
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[6]
; CHECK-NEXT: sxth r2, r2
@@ -249,12 +214,12 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xffff
+; CHECK-NEXT: vmov.i32 q1, #0xffff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i16> %x to <2 x i64>
@@ -268,11 +233,8 @@
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: sxth r0, r0
-; CHECK-NEXT: vmov.32 q1[0], r0
; CHECK-NEXT: asrs r1, r0, #31
-; CHECK-NEXT: vmov.32 q1[1], r1
; CHECK-NEXT: sxth r2, r2
-; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
@@ -399,96 +361,85 @@
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[1]
; CHECK-NEXT: vmov.32 q2[2], r0
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov r2, s11
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: vmov.32 q3[0], r1
+; CHECK-NEXT: vmov.32 q2[0], r1
; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: vmov.32 q3[2], r1
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r1, s12
+; CHECK-NEXT: vmov.32 q2[2], r1
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: vmov r1, s10
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: vmov.32 q3[0], r1
+; CHECK-NEXT: vmov.32 q2[0], r1
; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: vmov.32 q3[2], r1
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: vmov.32 q2[2], r1
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s15
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
; CHECK-NEXT: vmov.u8 r2, q0[6]
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[7]
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov r2, s9
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s10
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s11
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[9]
; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov r2, s9
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s10
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s11
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q0[10]
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[11]
; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov r2, s9
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s10
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s11
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q0[12]
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[13]
; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov r2, s9
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s10
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s11
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q0[14]
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[15]
; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vand q0, q2, q1
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i64>
@@ -500,121 +451,59 @@
; CHECK-LABEL: add_v16i8_v16i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u8 r0, q0[0]
+; CHECK-NEXT: vmov.u8 r2, q0[1]
; CHECK-NEXT: sxtb r0, r0
-; CHECK-NEXT: vmov.32 q1[0], r0
-; CHECK-NEXT: asrs r0, r0, #31
-; CHECK-NEXT: vmov.32 q1[1], r0
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: sxtb r0, r0
-; CHECK-NEXT: vmov.32 q1[2], r0
+; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: asrs r1, r0, #31
-; CHECK-NEXT: vmov.32 q1[3], r1
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r12, r1, r0, asr #31
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: sxtb r1, r1
-; CHECK-NEXT: vmov.32 q1[0], r1
-; CHECK-NEXT: asrs r1, r1, #31
-; CHECK-NEXT: vmov.32 q1[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: sxtb r1, r1
-; CHECK-NEXT: vmov.32 q1[2], r1
-; CHECK-NEXT: asrs r3, r1, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r3, s5
; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r2, r12, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds.w r12, r0, r3
-; CHECK-NEXT: adc.w r1, r2, r1, asr #31
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u8 r2, q0[2]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u8 r2, q0[3]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u8 r2, q0[4]
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u8 r2, q0[5]
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds.w r12, r0, r3
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u8 r2, q0[6]
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u8 r2, q0[7]
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds.w r12, r0, r3
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u8 r2, q0[9]
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds.w r12, r0, r3
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u8 r2, q0[10]
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u8 r2, q0[11]
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds.w r12, r0, r3
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u8 r2, q0[12]
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u8 r2, q0[13]
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u8 r2, q0[14]
; CHECK-NEXT: sxtb r2, r2
@@ -634,12 +523,12 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xff
+; CHECK-NEXT: vmov.i32 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i8> %x to <2 x i64>
@@ -653,11 +542,8 @@
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: sxtb r0, r0
-; CHECK-NEXT: vmov.32 q1[0], r0
; CHECK-NEXT: asrs r1, r0, #31
-; CHECK-NEXT: vmov.32 q1[1], r1
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
@@ -720,19 +606,14 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov.i64 q1, #0xffffffff
-; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r12, s3
-; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, lr, r12
+; CHECK-NEXT: adc r3, r12, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -744,15 +625,12 @@
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov.32 q1[0], r2
; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: vmov r12, s4
-; CHECK-NEXT: adds.w r12, r12, r3
-; CHECK-NEXT: adc.w r2, r2, r3, asr #31
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: asr.w r12, r2, #31
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc.w r3, r12, r3, asr #31
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i32> %x to <2 x i64>
@@ -826,54 +704,51 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.u16 r2, q0[0]
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u16 r2, q0[1]
; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: vmov.32 q3[0], r3
+; CHECK-NEXT: vmov.32 q2[0], r3
; CHECK-NEXT: vmov.u16 r3, q0[3]
-; CHECK-NEXT: vmov.32 q3[2], r3
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r3, s12
+; CHECK-NEXT: vmov.32 q2[2], r3
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: vmov.u16 r3, q0[4]
-; CHECK-NEXT: vmov.32 q3[0], r3
+; CHECK-NEXT: vmov.32 q2[0], r3
; CHECK-NEXT: vmov.u16 r3, q0[5]
-; CHECK-NEXT: vmov.32 q3[2], r3
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov lr, s15
-; CHECK-NEXT: add.w r12, r2, r3
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: vmov r2, s11
-; CHECK-NEXT: adds.w r4, r12, r3
-; CHECK-NEXT: adc.w r12, r2, lr
+; CHECK-NEXT: vmov.32 q2[2], r3
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: add r2, r3
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[6]
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: adc r3, r12, #0
; CHECK-NEXT: vand q0, q2, q1
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r4, s3
-; CHECK-NEXT: adc.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds.w r12, lr, r2
; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, r12, r4
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: adds.w r2, r2, r12
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -884,68 +759,42 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: vmov.u16 r3, q0[1]
; CHECK-NEXT: sxth r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: vmov.u16 r2, q0[1]
-; CHECK-NEXT: sxth r2, r2
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov lr, s6
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r12, s5
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: adc.w r12, r12, r2, asr #31
+; CHECK-NEXT: sxth r3, r3
+; CHECK-NEXT: asr.w r12, r2, #31
+; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: sxth r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: sxth r2, r2
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov r4, s4
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: adds.w r4, r4, lr
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds.w lr, r4, r3
-; CHECK-NEXT: vmov.u16 r4, q0[4]
-; CHECK-NEXT: sxth r4, r4
-; CHECK-NEXT: adc.w r12, r12, r2, asr #31
-; CHECK-NEXT: vmov.32 q1[0], r4
-; CHECK-NEXT: asrs r4, r4, #31
-; CHECK-NEXT: vmov.32 q1[1], r4
-; CHECK-NEXT: vmov.u16 r4, q0[5]
-; CHECK-NEXT: sxth r4, r4
-; CHECK-NEXT: vmov.32 q1[2], r4
-; CHECK-NEXT: asrs r2, r4, #31
-; CHECK-NEXT: vmov.32 q1[3], r2
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, r12, r4, asr #31
-; CHECK-NEXT: vmov.u16 r4, q0[6]
-; CHECK-NEXT: sxth r4, r4
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: adc.w r3, r3, r4, asr #31
-; CHECK-NEXT: vmov.u16 r4, q0[7]
-; CHECK-NEXT: sxth r4, r4
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: adc.w r3, r3, r4, asr #31
+; CHECK-NEXT: adds.w r12, lr, r2
+; CHECK-NEXT: adc.w r2, r3, r2, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[3]
+; CHECK-NEXT: sxth r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: sxth r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[5]
+; CHECK-NEXT: sxth r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[6]
+; CHECK-NEXT: sxth r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w lr, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[7]
+; CHECK-NEXT: sxth r3, r3
+; CHECK-NEXT: adds.w r2, r12, r3
+; CHECK-NEXT: adc.w r3, lr, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -956,14 +805,13 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xffff
+; CHECK-NEXT: vmov.i32 q1, #0xffff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i16> %x to <2 x i64>
@@ -978,15 +826,12 @@
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxth r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
+; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: sxth r3, r3
-; CHECK-NEXT: vmov r12, s4
-; CHECK-NEXT: adds.w r12, r12, r3
-; CHECK-NEXT: adc.w r2, r2, r3, asr #31
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc.w r3, r12, r3, asr #31
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i16> %x to <2 x i64>
@@ -1116,106 +961,95 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.u8 r2, q0[0]
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[1]
; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: vmov.u8 r3, q0[2]
-; CHECK-NEXT: vmov.32 q3[0], r3
+; CHECK-NEXT: vmov.32 q2[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[3]
-; CHECK-NEXT: vmov.32 q3[2], r3
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r3, s12
+; CHECK-NEXT: vmov.32 q2[2], r3
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: vmov.u8 r3, q0[4]
-; CHECK-NEXT: vmov.32 q3[0], r3
+; CHECK-NEXT: vmov.32 q2[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[5]
-; CHECK-NEXT: vmov.32 q3[2], r3
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov lr, s15
-; CHECK-NEXT: add.w r12, r2, r3
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: vmov r2, s11
-; CHECK-NEXT: adds.w r4, r12, r3
-; CHECK-NEXT: adc.w r12, r2, lr
+; CHECK-NEXT: vmov.32 q2[2], r3
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: add r2, r3
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.u8 r2, q0[6]
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[7]
; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: adc r3, r12, #0
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov r2, s9
-; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r4, s11
-; CHECK-NEXT: adc.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds.w r12, lr, r2
; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: adds.w lr, r3, r2
-; CHECK-NEXT: adc.w r3, r12, r4
-; CHECK-NEXT: vmov.u8 r4, q0[8]
-; CHECK-NEXT: vmov.32 q2[0], r4
-; CHECK-NEXT: vmov.u8 r4, q0[9]
-; CHECK-NEXT: vmov.32 q2[2], r4
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov.u8 r2, q0[8]
+; CHECK-NEXT: vmov.32 q2[0], r2
+; CHECK-NEXT: vmov.u8 r2, q0[9]
+; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmov r4, s9
-; CHECK-NEXT: adds.w r12, lr, r2
+; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: adcs r3, r4
-; CHECK-NEXT: vmov r4, s11
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adcs r3, r4
-; CHECK-NEXT: vmov.u8 r4, q0[10]
-; CHECK-NEXT: vmov.32 q2[0], r4
-; CHECK-NEXT: vmov.u8 r4, q0[11]
-; CHECK-NEXT: vmov.32 q2[2], r4
+; CHECK-NEXT: vmov.u8 r2, q0[10]
+; CHECK-NEXT: vmov.32 q2[0], r2
+; CHECK-NEXT: vmov.u8 r2, q0[11]
+; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmov r4, s9
; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: adcs r3, r4
-; CHECK-NEXT: vmov r4, s11
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adcs r3, r4
-; CHECK-NEXT: vmov.u8 r4, q0[12]
-; CHECK-NEXT: vmov.32 q2[0], r4
-; CHECK-NEXT: vmov.u8 r4, q0[13]
-; CHECK-NEXT: vmov.32 q2[2], r4
+; CHECK-NEXT: vmov.u8 r2, q0[12]
+; CHECK-NEXT: vmov.32 q2[0], r2
+; CHECK-NEXT: vmov.u8 r2, q0[13]
+; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmov r4, s9
; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: adcs r3, r4
-; CHECK-NEXT: vmov r4, s11
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adcs r3, r4
-; CHECK-NEXT: vmov.u8 r4, q0[14]
-; CHECK-NEXT: vmov.32 q2[0], r4
-; CHECK-NEXT: vmov.u8 r4, q0[15]
-; CHECK-NEXT: vmov.32 q2[2], r4
+; CHECK-NEXT: vmov.u8 r2, q0[14]
+; CHECK-NEXT: vmov.32 q2[0], r2
+; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: vand q0, q2, q1
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r4, s1
; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: adcs r3, r4
-; CHECK-NEXT: vmov r4, s3
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds.w r2, r2, r12
-; CHECK-NEXT: adcs r3, r4
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1226,136 +1060,74 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: vmov.u8 r3, q0[1]
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: vmov.u8 r2, q0[1]
-; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov lr, s6
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r12, s5
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: adc.w r12, r12, r2, asr #31
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: asr.w r12, r2, #31
+; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.u8 r2, q0[2]
+; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: asrs r3, r2, #31
-; CHECK-NEXT: vmov.32 q1[3], r3
-; CHECK-NEXT: vmov r4, s4
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: adds.w r4, r4, lr
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds.w lr, r4, r3
-; CHECK-NEXT: vmov.u8 r4, q0[4]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: adc.w r12, r12, r2, asr #31
-; CHECK-NEXT: vmov.32 q1[0], r4
-; CHECK-NEXT: asrs r4, r4, #31
-; CHECK-NEXT: vmov.32 q1[1], r4
-; CHECK-NEXT: vmov.u8 r4, q0[5]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: vmov.32 q1[2], r4
-; CHECK-NEXT: asrs r2, r4, #31
-; CHECK-NEXT: vmov.32 q1[3], r2
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds.w lr, r3, r2
-; CHECK-NEXT: adc.w r12, r12, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[6]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: vmov.32 q1[0], r4
-; CHECK-NEXT: asrs r4, r4, #31
-; CHECK-NEXT: vmov.32 q1[1], r4
-; CHECK-NEXT: vmov.u8 r4, q0[7]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: vmov.32 q1[2], r4
-; CHECK-NEXT: asrs r2, r4, #31
-; CHECK-NEXT: vmov.32 q1[3], r2
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds.w lr, r3, r2
-; CHECK-NEXT: adc.w r12, r12, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[8]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: vmov.32 q1[0], r4
-; CHECK-NEXT: asrs r4, r4, #31
-; CHECK-NEXT: vmov.32 q1[1], r4
-; CHECK-NEXT: vmov.u8 r4, q0[9]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: vmov.32 q1[2], r4
-; CHECK-NEXT: asrs r2, r4, #31
-; CHECK-NEXT: vmov.32 q1[3], r2
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds.w lr, r3, r2
-; CHECK-NEXT: adc.w r12, r12, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[10]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: vmov.32 q1[0], r4
-; CHECK-NEXT: asrs r4, r4, #31
-; CHECK-NEXT: vmov.32 q1[1], r4
-; CHECK-NEXT: vmov.u8 r4, q0[11]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: vmov.32 q1[2], r4
-; CHECK-NEXT: asrs r2, r4, #31
-; CHECK-NEXT: vmov.32 q1[3], r2
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds.w lr, r3, r2
-; CHECK-NEXT: adc.w r12, r12, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[12]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: vmov.32 q1[0], r4
-; CHECK-NEXT: asrs r4, r4, #31
-; CHECK-NEXT: vmov.32 q1[1], r4
-; CHECK-NEXT: vmov.u8 r4, q0[13]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: vmov.32 q1[2], r4
-; CHECK-NEXT: asrs r2, r4, #31
-; CHECK-NEXT: vmov.32 q1[3], r2
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, r12, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[14]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: adc.w r3, r3, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[15]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: adc.w r3, r3, r4, asr #31
+; CHECK-NEXT: adds.w r12, lr, r2
+; CHECK-NEXT: adc.w r2, r3, r2, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[3]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[4]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[5]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[6]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[7]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[8]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[9]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[10]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[11]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[12]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[13]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[14]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w lr, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[15]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r2, r12, r3
+; CHECK-NEXT: adc.w r3, lr, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1366,14 +1138,13 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xff
+; CHECK-NEXT: vmov.i32 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i8> %x to <2 x i64>
@@ -1388,15 +1159,12 @@
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
+; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: sxtb r3, r3
-; CHECK-NEXT: vmov r12, s4
-; CHECK-NEXT: adds.w r12, r12, r3
-; CHECK-NEXT: adc.w r2, r2, r3, asr #31
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc.w r3, r12, r3, asr #31
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i8> %x to <2 x i64>
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -46,28 +46,23 @@
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.i64 q2, #0xffffffff
-; CHECK-NEXT: vand q0, q0, q2
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q2[0], r0
-; CHECK-NEXT: vmov.32 q2[1], r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q2[2], r0
-; CHECK-NEXT: vmov.32 q2[3], r0
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i32> %b, zeroinitializer
@@ -215,44 +210,36 @@
; CHECK-NEXT: and r1, r0, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov.32 q3[0], r1
-; CHECK-NEXT: vmov.32 q3[1], r1
; CHECK-NEXT: ubfx r1, r0, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov.32 q3[2], r1
-; CHECK-NEXT: vmov.32 q3[3], r1
; CHECK-NEXT: vmov.u16 r1, q0[0]
; CHECK-NEXT: vmov.32 q4[0], r1
; CHECK-NEXT: vmov.u16 r1, q0[1]
; CHECK-NEXT: vmov.32 q4[2], r1
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r1, s15
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: ubfx r3, r0, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: add r1, r2
+; CHECK-NEXT: ubfx r2, r0, #8, #1
; CHECK-NEXT: ubfx r0, r0, #12, #1
-; CHECK-NEXT: vmov.32 q3[0], r3
+; CHECK-NEXT: rsbs r2, r2, #0
+; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: vmov.32 q3[1], r3
; CHECK-NEXT: vmov.32 q3[2], r0
-; CHECK-NEXT: vmov.32 q3[3], r0
; CHECK-NEXT: vmov.u16 r0, q0[2]
; CHECK-NEXT: vmov.32 q4[0], r0
; CHECK-NEXT: vmov.u16 r0, q0[3]
; CHECK-NEXT: vmov.32 q4[2], r0
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov r0, s13
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov r3, s15
-; CHECK-NEXT: adcs r1, r0
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: adds.w r12, r2, r0
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u16 r2, q2[4]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u16 r2, q2[5]
@@ -261,53 +248,45 @@
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.u16 r2, q2[7]
; CHECK-NEXT: vmov.32 q3[3], r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vcmp.i32 ne, q3, zr
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q2[0], r3
-; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q2[2], r3
-; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: vmov.32 q3[0], r3
; CHECK-NEXT: vmov.u16 r3, q0[5]
; CHECK-NEXT: vmov.32 q3[2], r3
; CHECK-NEXT: vand q3, q3, q1
; CHECK-NEXT: vand q2, q3, q2
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov r3, s9
-; CHECK-NEXT: adds.w r12, r12, r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r3, s11
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: vmov.32 q2[0], r3
-; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: vmov.32 q2[1], r3
+; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov.32 q2[2], r2
-; CHECK-NEXT: vmov.32 q2[3], r2
; CHECK-NEXT: vmov.u16 r2, q0[6]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: vmov.32 q3[2], r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vand q0, q3, q1
; CHECK-NEXT: vand q0, q0, q2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
@@ -472,21 +451,17 @@
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q3[0], r0
-; CHECK-NEXT: vmov.32 q3[1], r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q3[2], r0
-; CHECK-NEXT: vmov.32 q3[3], r0
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i16> %b, zeroinitializer
@@ -723,44 +698,36 @@
; CHECK-NEXT: and r1, r0, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov.32 q6[0], r1
-; CHECK-NEXT: vmov.32 q6[1], r1
; CHECK-NEXT: ubfx r1, r0, #4, #1
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov.32 q6[2], r1
-; CHECK-NEXT: vmov.32 q6[3], r1
; CHECK-NEXT: vmov.u8 r1, q0[0]
; CHECK-NEXT: vmov.32 q7[0], r1
; CHECK-NEXT: vmov.u8 r1, q0[1]
; CHECK-NEXT: vmov.32 q7[2], r1
; CHECK-NEXT: vand q7, q7, q1
; CHECK-NEXT: vand q6, q7, q6
-; CHECK-NEXT: vmov r1, s27
-; CHECK-NEXT: vmov r2, s25
-; CHECK-NEXT: vmov r3, s24
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov r2, s26
-; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: ubfx r3, r0, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: vmov r1, s26
+; CHECK-NEXT: vmov r2, s24
+; CHECK-NEXT: add r1, r2
+; CHECK-NEXT: ubfx r2, r0, #8, #1
; CHECK-NEXT: ubfx r0, r0, #12, #1
-; CHECK-NEXT: vmov.32 q6[0], r3
+; CHECK-NEXT: rsbs r2, r2, #0
+; CHECK-NEXT: vmov.32 q6[0], r2
; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: vmov.32 q6[1], r3
; CHECK-NEXT: vmov.32 q6[2], r0
-; CHECK-NEXT: vmov.32 q6[3], r0
; CHECK-NEXT: vmov.u8 r0, q0[2]
; CHECK-NEXT: vmov.32 q7[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[3]
; CHECK-NEXT: vmov.32 q7[2], r0
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vand q7, q7, q1
; CHECK-NEXT: vand q6, q7, q6
-; CHECK-NEXT: vmov r3, s24
-; CHECK-NEXT: vmov r0, s25
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov r3, s27
-; CHECK-NEXT: adcs r1, r0
-; CHECK-NEXT: vmov r0, s26
-; CHECK-NEXT: adds.w r12, r2, r0
+; CHECK-NEXT: vmov r0, s24
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: vmov r2, s26
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u16 r2, q5[4]
; CHECK-NEXT: vmov.32 q6[0], r2
; CHECK-NEXT: vmov.u16 r2, q5[5]
@@ -769,53 +736,44 @@
; CHECK-NEXT: vmov.32 q6[2], r2
; CHECK-NEXT: vmov.u16 r2, q5[7]
; CHECK-NEXT: vmov.32 q6[3], r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vcmp.i32 ne, q6, zr
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q5[0], r3
-; CHECK-NEXT: vmov.32 q5[1], r3
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q5[2], r3
-; CHECK-NEXT: vmov.32 q5[3], r3
; CHECK-NEXT: vmov.u8 r3, q0[4]
; CHECK-NEXT: vmov.32 q6[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[5]
; CHECK-NEXT: vmov.32 q6[2], r3
; CHECK-NEXT: vand q6, q6, q1
; CHECK-NEXT: vand q5, q6, q5
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vmov r3, s21
-; CHECK-NEXT: adds.w r12, r12, r0
-; CHECK-NEXT: vmov r0, s22
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r3, s23
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: vmov r3, s20
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: vmov r3, s22
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: vmov.32 q5[0], r3
-; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: vmov.32 q5[1], r3
+; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov.32 q5[2], r2
-; CHECK-NEXT: vmov.32 q5[3], r2
; CHECK-NEXT: vmov.u8 r2, q0[6]
; CHECK-NEXT: vmov.32 q6[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[7]
; CHECK-NEXT: vmov.32 q6[2], r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vand q6, q6, q1
; CHECK-NEXT: vand q5, q6, q5
-; CHECK-NEXT: vmov r3, s20
-; CHECK-NEXT: vmov r2, s21
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s22
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s23
-; CHECK-NEXT: adds.w r12, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s20
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s22
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q4[8]
; CHECK-NEXT: vmov.16 q5[0], r2
; CHECK-NEXT: vmov.u8 r2, q4[9]
@@ -832,6 +790,7 @@
; CHECK-NEXT: vmov.16 q5[6], r2
; CHECK-NEXT: vmov.u8 r2, q4[15]
; CHECK-NEXT: vmov.16 q5[7], r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vpsel q2, q3, q2
; CHECK-NEXT: vmov.u16 r2, q2[0]
@@ -847,47 +806,38 @@
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q3[0], r3
-; CHECK-NEXT: vmov.32 q3[1], r3
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q3[2], r3
-; CHECK-NEXT: vmov.32 q3[3], r3
; CHECK-NEXT: vmov.u8 r3, q0[8]
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[9]
; CHECK-NEXT: vmov.32 q4[2], r3
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmov r3, s13
-; CHECK-NEXT: adds.w r12, r12, r0
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r3, s15
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: vmov r3, s12
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: vmov.32 q3[0], r3
-; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: vmov.32 q3[1], r3
+; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov.32 q3[2], r2
-; CHECK-NEXT: vmov.32 q3[3], r2
; CHECK-NEXT: vmov.u8 r2, q0[10]
; CHECK-NEXT: vmov.32 q4[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[11]
; CHECK-NEXT: vmov.32 q4[2], r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s15
-; CHECK-NEXT: adds.w r12, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u16 r2, q2[4]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u16 r2, q2[5]
@@ -896,52 +846,45 @@
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.u16 r2, q2[7]
; CHECK-NEXT: vmov.32 q3[3], r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vcmp.i32 ne, q3, zr
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q2[0], r3
-; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q2[2], r3
-; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov.u8 r3, q0[12]
; CHECK-NEXT: vmov.32 q3[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[13]
; CHECK-NEXT: vmov.32 q3[2], r3
; CHECK-NEXT: vand q3, q3, q1
; CHECK-NEXT: vand q2, q3, q2
-; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov r3, s9
-; CHECK-NEXT: adds.w r12, r12, r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r3, s11
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: vmov.32 q2[0], r3
-; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: vmov.32 q2[1], r3
+; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov.32 q2[2], r2
-; CHECK-NEXT: vmov.32 q2[3], r2
; CHECK-NEXT: vmov.u8 r2, q0[14]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[15]
; CHECK-NEXT: vmov.32 q3[2], r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vand q0, q3, q1
; CHECK-NEXT: vand q0, q0, q2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
@@ -1273,21 +1216,17 @@
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q3[0], r0
-; CHECK-NEXT: vmov.32 q3[1], r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: vmov.32 q3[2], r0
-; CHECK-NEXT: vmov.32 q3[3], r0
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i8> %b, zeroinitializer
@@ -1423,34 +1362,27 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %b, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.i64 q2, #0xffffffff
-; CHECK-NEXT: vand q0, q0, q2
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q2[0], r2
-; CHECK-NEXT: vmov.32 q2[1], r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q2[2], r2
-; CHECK-NEXT: vmov.32 q2[3], r2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r12, s3
-; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, lr, r12
+; CHECK-NEXT: adc r3, r12, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i32> %b, zeroinitializer
%xx = zext <2 x i32> %x to <2 x i64>
@@ -1610,45 +1542,36 @@
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q3[0], r3
-; CHECK-NEXT: vmov.32 q3[1], r3
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q3[2], r3
-; CHECK-NEXT: vmov.32 q3[3], r3
; CHECK-NEXT: vmov.u16 r3, q0[0]
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.u16 r3, q0[1]
; CHECK-NEXT: vmov.32 q4[2], r3
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r12, s15
-; CHECK-NEXT: vmov r3, s13
-; CHECK-NEXT: vmov lr, s14
-; CHECK-NEXT: orr.w r12, r12, r3
+; CHECK-NEXT: vmov r12, s14
; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: add lr, r3
+; CHECK-NEXT: add r12, r3
; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q3[0], r3
; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: vmov.32 q3[1], r3
; CHECK-NEXT: vmov.32 q3[2], r2
-; CHECK-NEXT: vmov.32 q3[3], r2
; CHECK-NEXT: vmov.u16 r2, q0[2]
; CHECK-NEXT: vmov.32 q4[0], r2
; CHECK-NEXT: vmov.u16 r2, q0[3]
; CHECK-NEXT: vmov.32 q4[2], r2
+; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: adds.w r4, lr, r3
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: adc.w lr, r12, r2
-; CHECK-NEXT: vmov r2, s15
-; CHECK-NEXT: adds.w r12, r4, r3
-; CHECK-NEXT: adc.w lr, lr, r2
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov.u16 r2, q2[4]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u16 r2, q2[5]
@@ -1657,52 +1580,45 @@
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.u16 r2, q2[7]
; CHECK-NEXT: vmov.32 q3[3], r2
+; CHECK-NEXT: adc lr, r3, #0
; CHECK-NEXT: vcmp.i32 ne, q3, zr
; CHECK-NEXT: vmrs r2, p0
-; CHECK-NEXT: and r4, r2, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: vmov.32 q2[0], r4
-; CHECK-NEXT: vmov.32 q2[1], r4
-; CHECK-NEXT: ubfx r4, r2, #4, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: vmov.32 q2[2], r4
-; CHECK-NEXT: vmov.32 q2[3], r4
-; CHECK-NEXT: vmov.u16 r4, q0[4]
-; CHECK-NEXT: vmov.32 q3[0], r4
-; CHECK-NEXT: vmov.u16 r4, q0[5]
-; CHECK-NEXT: vmov.32 q3[2], r4
+; CHECK-NEXT: and r3, r2, #1
+; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: vmov.32 q2[0], r3
+; CHECK-NEXT: ubfx r3, r2, #4, #1
+; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: vmov.32 q2[2], r3
+; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: vmov.32 q3[0], r3
+; CHECK-NEXT: vmov.u16 r3, q0[5]
+; CHECK-NEXT: vmov.32 q3[2], r3
; CHECK-NEXT: vand q3, q3, q1
; CHECK-NEXT: vand q2, q3, q2
; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov r4, s9
-; CHECK-NEXT: adds.w r12, r12, r3
-; CHECK-NEXT: vmov r3, s11
-; CHECK-NEXT: adc.w lr, lr, r4
-; CHECK-NEXT: vmov r4, s10
-; CHECK-NEXT: adds.w r4, r4, r12
-; CHECK-NEXT: adc.w r12, lr, r3
+; CHECK-NEXT: adds.w r4, r12, r3
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: adc r12, lr, #0
+; CHECK-NEXT: adds.w lr, r4, r3
; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: vmov.32 q2[0], r3
-; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: vmov.32 q2[1], r3
+; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov.32 q2[2], r2
-; CHECK-NEXT: vmov.32 q2[3], r2
; CHECK-NEXT: vmov.u16 r2, q0[6]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: vmov.32 q3[2], r2
+; CHECK-NEXT: adc r4, r12, #0
; CHECK-NEXT: vand q0, q3, q1
; CHECK-NEXT: vand q0, q0, q2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r4, s3
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, r12, r4
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds.w r2, r2, lr
+; CHECK-NEXT: adc r3, r4, #0
+; CHECK-NEXT: vmov r4, s2
+; CHECK-NEXT: adds r2, r2, r4
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vpop {d8, d9}
@@ -1875,23 +1791,18 @@
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q3[0], r2
-; CHECK-NEXT: vmov.32 q3[1], r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q3[2], r2
-; CHECK-NEXT: vmov.32 q3[3], r2
; CHECK-NEXT: vand q0, q0, q3
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: orr.w r12, r3, r2
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r1, r1, r12
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i16> %b, zeroinitializer
@@ -2145,45 +2056,36 @@
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q6[0], r3
-; CHECK-NEXT: vmov.32 q6[1], r3
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q6[2], r3
-; CHECK-NEXT: vmov.32 q6[3], r3
; CHECK-NEXT: vmov.u8 r3, q0[0]
; CHECK-NEXT: vmov.32 q7[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[1]
; CHECK-NEXT: vmov.32 q7[2], r3
; CHECK-NEXT: vand q7, q7, q1
; CHECK-NEXT: vand q6, q7, q6
-; CHECK-NEXT: vmov r12, s27
-; CHECK-NEXT: vmov r3, s25
-; CHECK-NEXT: vmov lr, s26
-; CHECK-NEXT: orr.w r12, r12, r3
+; CHECK-NEXT: vmov r12, s26
; CHECK-NEXT: vmov r3, s24
-; CHECK-NEXT: add lr, r3
+; CHECK-NEXT: add r12, r3
; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov.32 q6[0], r3
; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: vmov.32 q6[1], r3
; CHECK-NEXT: vmov.32 q6[2], r2
-; CHECK-NEXT: vmov.32 q6[3], r2
; CHECK-NEXT: vmov.u8 r2, q0[2]
; CHECK-NEXT: vmov.32 q7[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[3]
; CHECK-NEXT: vmov.32 q7[2], r2
+; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vand q7, q7, q1
; CHECK-NEXT: vand q6, q7, q6
-; CHECK-NEXT: vmov r3, s24
-; CHECK-NEXT: vmov r2, s25
-; CHECK-NEXT: adds.w r4, lr, r3
-; CHECK-NEXT: vmov r3, s26
-; CHECK-NEXT: adc.w lr, r12, r2
-; CHECK-NEXT: vmov r2, s27
-; CHECK-NEXT: adds.w r12, r4, r3
-; CHECK-NEXT: adc.w lr, lr, r2
+; CHECK-NEXT: vmov r2, s24
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s26
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov.u16 r2, q5[4]
; CHECK-NEXT: vmov.32 q6[0], r2
; CHECK-NEXT: vmov.u16 r2, q5[5]
@@ -2192,178 +2094,155 @@
; CHECK-NEXT: vmov.32 q6[2], r2
; CHECK-NEXT: vmov.u16 r2, q5[7]
; CHECK-NEXT: vmov.32 q6[3], r2
+; CHECK-NEXT: adc lr, r3, #0
; CHECK-NEXT: vcmp.i32 ne, q6, zr
; CHECK-NEXT: vmrs r2, p0
-; CHECK-NEXT: and r4, r2, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: vmov.32 q5[0], r4
-; CHECK-NEXT: vmov.32 q5[1], r4
-; CHECK-NEXT: ubfx r4, r2, #4, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: vmov.32 q5[2], r4
-; CHECK-NEXT: vmov.32 q5[3], r4
-; CHECK-NEXT: vmov.u8 r4, q0[4]
-; CHECK-NEXT: vmov.32 q6[0], r4
-; CHECK-NEXT: vmov.u8 r4, q0[5]
-; CHECK-NEXT: vmov.32 q6[2], r4
+; CHECK-NEXT: and r3, r2, #1
+; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: vmov.32 q5[0], r3
+; CHECK-NEXT: ubfx r3, r2, #4, #1
+; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: vmov.32 q5[2], r3
+; CHECK-NEXT: vmov.u8 r3, q0[4]
+; CHECK-NEXT: vmov.32 q6[0], r3
+; CHECK-NEXT: vmov.u8 r3, q0[5]
+; CHECK-NEXT: vmov.32 q6[2], r3
; CHECK-NEXT: vand q6, q6, q1
; CHECK-NEXT: vand q5, q6, q5
; CHECK-NEXT: vmov r3, s20
-; CHECK-NEXT: vmov r4, s21
-; CHECK-NEXT: adds.w r12, r12, r3
-; CHECK-NEXT: vmov r3, s23
-; CHECK-NEXT: adc.w lr, lr, r4
-; CHECK-NEXT: vmov r4, s22
-; CHECK-NEXT: adds.w r4, r4, r12
-; CHECK-NEXT: adc.w r12, lr, r3
+; CHECK-NEXT: adds.w r4, r12, r3
+; CHECK-NEXT: vmov r3, s22
+; CHECK-NEXT: adc r12, lr, #0
+; CHECK-NEXT: adds.w lr, r4, r3
; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: ubfx r2, r2, #12, #1
+; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: vmov.32 q5[0], r3
-; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: vmov.32 q5[1], r3
+; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov.32 q5[2], r2
-; CHECK-NEXT: vmov.32 q5[3], r2
; CHECK-NEXT: vmov.u8 r2, q0[6]
; CHECK-NEXT: vmov.32 q6[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[7]
; CHECK-NEXT: vmov.32 q6[2], r2
+; CHECK-NEXT: adc r4, r12, #0
; CHECK-NEXT: vand q6, q6, q1
; CHECK-NEXT: vand q5, q6, q5
-; CHECK-NEXT: vmov r3, s20
-; CHECK-NEXT: vmov r2, s21
-; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r4, s23
-; CHECK-NEXT: adc.w lr, r12, r2
-; CHECK-NEXT: vmov r2, s22
-; CHECK-NEXT: adds.w r12, r3, r2
-; CHECK-NEXT: vmov.u8 r2, q4[8]
-; CHECK-NEXT: vmov.16 q5[0], r2
-; CHECK-NEXT: vmov.u8 r2, q4[9]
-; CHECK-NEXT: vmov.16 q5[1], r2
-; CHECK-NEXT: vmov.u8 r2, q4[10]
-; CHECK-NEXT: vmov.16 q5[2], r2
-; CHECK-NEXT: vmov.u8 r2, q4[11]
-; CHECK-NEXT: vmov.16 q5[3], r2
-; CHECK-NEXT: vmov.u8 r2, q4[12]
-; CHECK-NEXT: vmov.16 q5[4], r2
-; CHECK-NEXT: vmov.u8 r2, q4[13]
-; CHECK-NEXT: vmov.16 q5[5], r2
-; CHECK-NEXT: vmov.u8 r2, q4[14]
-; CHECK-NEXT: vmov.16 q5[6], r2
-; CHECK-NEXT: vmov.u8 r2, q4[15]
-; CHECK-NEXT: vmov.16 q5[7], r2
-; CHECK-NEXT: adc.w lr, lr, r4
+; CHECK-NEXT: vmov r2, s20
+; CHECK-NEXT: adds.w r2, r2, lr
+; CHECK-NEXT: adc r3, r4, #0
+; CHECK-NEXT: vmov r4, s22
+; CHECK-NEXT: adds.w r12, r2, r4
+; CHECK-NEXT: vmov.u8 r4, q4[8]
+; CHECK-NEXT: vmov.16 q5[0], r4
+; CHECK-NEXT: vmov.u8 r4, q4[9]
+; CHECK-NEXT: vmov.16 q5[1], r4
+; CHECK-NEXT: vmov.u8 r4, q4[10]
+; CHECK-NEXT: vmov.16 q5[2], r4
+; CHECK-NEXT: vmov.u8 r4, q4[11]
+; CHECK-NEXT: vmov.16 q5[3], r4
+; CHECK-NEXT: vmov.u8 r4, q4[12]
+; CHECK-NEXT: vmov.16 q5[4], r4
+; CHECK-NEXT: vmov.u8 r4, q4[13]
+; CHECK-NEXT: vmov.16 q5[5], r4
+; CHECK-NEXT: vmov.u8 r4, q4[14]
+; CHECK-NEXT: vmov.16 q5[6], r4
+; CHECK-NEXT: vmov.u8 r4, q4[15]
+; CHECK-NEXT: vmov.16 q5[7], r4
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vpsel q2, q3, q2
-; CHECK-NEXT: vmov.u16 r2, q2[0]
-; CHECK-NEXT: vmov.32 q3[0], r2
-; CHECK-NEXT: vmov.u16 r2, q2[1]
-; CHECK-NEXT: vmov.32 q3[1], r2
-; CHECK-NEXT: vmov.u16 r2, q2[2]
-; CHECK-NEXT: vmov.32 q3[2], r2
-; CHECK-NEXT: vmov.u16 r2, q2[3]
-; CHECK-NEXT: vmov.32 q3[3], r2
-; CHECK-NEXT: vcmp.i32 ne, q3, zr
-; CHECK-NEXT: vmrs r2, p0
-; CHECK-NEXT: and r4, r2, #1
-; CHECK-NEXT: rsbs r4, r4, #0
+; CHECK-NEXT: vmov.u16 r4, q2[0]
; CHECK-NEXT: vmov.32 q3[0], r4
+; CHECK-NEXT: vmov.u16 r4, q2[1]
; CHECK-NEXT: vmov.32 q3[1], r4
-; CHECK-NEXT: ubfx r4, r2, #4, #1
-; CHECK-NEXT: rsbs r4, r4, #0
+; CHECK-NEXT: vmov.u16 r4, q2[2]
; CHECK-NEXT: vmov.32 q3[2], r4
+; CHECK-NEXT: vmov.u16 r4, q2[3]
; CHECK-NEXT: vmov.32 q3[3], r4
-; CHECK-NEXT: vmov.u8 r4, q0[8]
-; CHECK-NEXT: vmov.32 q4[0], r4
-; CHECK-NEXT: vmov.u8 r4, q0[9]
-; CHECK-NEXT: vmov.32 q4[2], r4
+; CHECK-NEXT: vcmp.i32 ne, q3, zr
+; CHECK-NEXT: vmrs r4, p0
+; CHECK-NEXT: and r2, r4, #1
+; CHECK-NEXT: rsbs r2, r2, #0
+; CHECK-NEXT: vmov.32 q3[0], r2
+; CHECK-NEXT: ubfx r2, r4, #4, #1
+; CHECK-NEXT: rsbs r2, r2, #0
+; CHECK-NEXT: vmov.32 q3[2], r2
+; CHECK-NEXT: vmov.u8 r2, q0[8]
+; CHECK-NEXT: vmov.32 q4[0], r2
+; CHECK-NEXT: vmov.u8 r2, q0[9]
+; CHECK-NEXT: vmov.32 q4[2], r2
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov r4, s13
-; CHECK-NEXT: adds.w r12, r12, r3
-; CHECK-NEXT: vmov r3, s15
-; CHECK-NEXT: adc.w lr, lr, r4
-; CHECK-NEXT: vmov r4, s14
-; CHECK-NEXT: adds.w r4, r4, r12
-; CHECK-NEXT: adc.w r12, lr, r3
-; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: ubfx r2, r2, #12, #1
-; CHECK-NEXT: vmov.32 q3[0], r3
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: ubfx r2, r4, #8, #1
+; CHECK-NEXT: rsb.w r2, r2, #0
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: vmov.32 q3[0], r2
+; CHECK-NEXT: ubfx r2, r4, #12, #1
; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: vmov.32 q3[1], r3
; CHECK-NEXT: vmov.32 q3[2], r2
-; CHECK-NEXT: vmov.32 q3[3], r2
; CHECK-NEXT: vmov.u8 r2, q0[10]
; CHECK-NEXT: vmov.32 q4[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[11]
; CHECK-NEXT: vmov.32 q4[2], r2
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r4, s15
-; CHECK-NEXT: adc.w lr, r12, r2
-; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: adds.w r12, r3, r2
-; CHECK-NEXT: vmov.u16 r2, q2[4]
-; CHECK-NEXT: vmov.32 q3[0], r2
-; CHECK-NEXT: vmov.u16 r2, q2[5]
-; CHECK-NEXT: vmov.32 q3[1], r2
-; CHECK-NEXT: vmov.u16 r2, q2[6]
-; CHECK-NEXT: vmov.32 q3[2], r2
-; CHECK-NEXT: vmov.u16 r2, q2[7]
-; CHECK-NEXT: vmov.32 q3[3], r2
-; CHECK-NEXT: adc.w lr, lr, r4
-; CHECK-NEXT: vcmp.i32 ne, q3, zr
-; CHECK-NEXT: vmrs r2, p0
-; CHECK-NEXT: and r4, r2, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: vmov.32 q2[0], r4
-; CHECK-NEXT: vmov.32 q2[1], r4
-; CHECK-NEXT: ubfx r4, r2, #4, #1
-; CHECK-NEXT: rsbs r4, r4, #0
-; CHECK-NEXT: vmov.32 q2[2], r4
-; CHECK-NEXT: vmov.32 q2[3], r4
-; CHECK-NEXT: vmov.u8 r4, q0[12]
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: vmov r4, s14
+; CHECK-NEXT: adds.w r2, r2, r12
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: adds.w r12, r2, r4
+; CHECK-NEXT: vmov.u16 r4, q2[4]
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: vmov.32 q3[0], r4
-; CHECK-NEXT: vmov.u8 r4, q0[13]
+; CHECK-NEXT: vmov.u16 r4, q2[5]
+; CHECK-NEXT: vmov.32 q3[1], r4
+; CHECK-NEXT: vmov.u16 r4, q2[6]
; CHECK-NEXT: vmov.32 q3[2], r4
+; CHECK-NEXT: vmov.u16 r4, q2[7]
+; CHECK-NEXT: vmov.32 q3[3], r4
+; CHECK-NEXT: vcmp.i32 ne, q3, zr
+; CHECK-NEXT: vmrs r4, p0
+; CHECK-NEXT: and r2, r4, #1
+; CHECK-NEXT: rsbs r2, r2, #0
+; CHECK-NEXT: vmov.32 q2[0], r2
+; CHECK-NEXT: ubfx r2, r4, #4, #1
+; CHECK-NEXT: rsbs r2, r2, #0
+; CHECK-NEXT: vmov.32 q2[2], r2
+; CHECK-NEXT: vmov.u8 r2, q0[12]
+; CHECK-NEXT: vmov.32 q3[0], r2
+; CHECK-NEXT: vmov.u8 r2, q0[13]
+; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vand q3, q3, q1
; CHECK-NEXT: vand q2, q3, q2
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov r4, s9
-; CHECK-NEXT: adds.w r12, r12, r3
-; CHECK-NEXT: vmov r3, s11
-; CHECK-NEXT: adc.w lr, lr, r4
-; CHECK-NEXT: vmov r4, s10
-; CHECK-NEXT: adds.w r4, r4, r12
-; CHECK-NEXT: adc.w r12, lr, r3
-; CHECK-NEXT: ubfx r3, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: ubfx r2, r2, #12, #1
-; CHECK-NEXT: vmov.32 q2[0], r3
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: ubfx r2, r4, #8, #1
+; CHECK-NEXT: rsb.w r2, r2, #0
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: vmov.32 q2[0], r2
+; CHECK-NEXT: ubfx r2, r4, #12, #1
; CHECK-NEXT: rsbs r2, r2, #0
-; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.32 q2[2], r2
-; CHECK-NEXT: vmov.32 q2[3], r2
; CHECK-NEXT: vmov.u8 r2, q0[14]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q0[15]
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vand q0, q3, q1
; CHECK-NEXT: vand q0, q0, q2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r4, s3
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, r12, r4
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov r4, s2
+; CHECK-NEXT: adds.w r2, r2, r12
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: adds r2, r2, r4
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -2703,23 +2582,18 @@
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q3[0], r2
-; CHECK-NEXT: vmov.32 q3[1], r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: tst.w r2, #1
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: vmov.32 q3[2], r2
-; CHECK-NEXT: vmov.32 q3[3], r2
; CHECK-NEXT: vand q0, q0, q3
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: orr.w r12, r3, r2
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r1, r1, r12
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i8> %b, zeroinitializer
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -215,14 +215,17 @@
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q2, #0xffff
-; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: umull r0, r1, r1, r0
-; CHECK-NEXT: umlal r0, r1, r3, r2
+; CHECK-NEXT: vand q1, q1, q2
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: muls r1, r2, r1
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i16> %x to <2 x i64>
@@ -235,10 +238,10 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s4
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: sxth r1, r1
; CHECK-NEXT: smull r0, r1, r1, r0
@@ -429,10 +432,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i64_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vmov.u8 r0, q1[0]
; CHECK-NEXT: vmov.u8 r1, q0[0]
; CHECK-NEXT: vmov.32 q3[0], r0
@@ -444,194 +445,151 @@
; CHECK-NEXT: vmov.32 q4[2], r1
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: vmov r1, s18
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmov r3, s16
-; CHECK-NEXT: umull r12, r1, r1, r0
-; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.32 q4[0], r0
-; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.32 q4[2], r0
-; CHECK-NEXT: umull r2, r3, r3, r2
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: vmov r1, s16
+; CHECK-NEXT: vmov r2, s18
+; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: smlabb r0, r2, r1, r0
+; CHECK-NEXT: vmov.u8 r1, q1[2]
+; CHECK-NEXT: vmov.u8 r2, q0[2]
+; CHECK-NEXT: vmov.32 q3[0], r1
+; CHECK-NEXT: vmov.u8 r1, q1[3]
+; CHECK-NEXT: vmov.32 q4[0], r2
+; CHECK-NEXT: vmov.u8 r2, q0[3]
+; CHECK-NEXT: vmov.32 q3[2], r1
+; CHECK-NEXT: vmov.32 q4[2], r2
+; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: orr.w lr, r3, r1
-; CHECK-NEXT: vmov.u8 r3, q1[2]
-; CHECK-NEXT: vmov.32 q3[0], r3
-; CHECK-NEXT: vmov.u8 r3, q1[3]
-; CHECK-NEXT: vmov.32 q3[2], r3
-; CHECK-NEXT: add r2, r12
+; CHECK-NEXT: vmov r1, s12
+; CHECK-NEXT: vmov r2, s16
+; CHECK-NEXT: smlabb r0, r2, r1, r0
+; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: vmov r2, s18
+; CHECK-NEXT: smlabb r0, r2, r1, r0
+; CHECK-NEXT: vmov.u8 r1, q1[4]
+; CHECK-NEXT: vmov.u8 r2, q0[4]
+; CHECK-NEXT: vmov.32 q3[0], r1
+; CHECK-NEXT: vmov.u8 r1, q1[5]
+; CHECK-NEXT: vmov.32 q4[0], r2
+; CHECK-NEXT: vmov.u8 r2, q0[5]
+; CHECK-NEXT: vmov.32 q3[2], r1
+; CHECK-NEXT: vmov.32 q4[2], r2
; CHECK-NEXT: vand q3, q3, q2
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: umull r0, r3, r0, r3
-; CHECK-NEXT: vmov.32 q5[0], r0
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: vmov.32 q5[1], r3
+; CHECK-NEXT: vand q4, q4, q2
+; CHECK-NEXT: vmov r1, s12
+; CHECK-NEXT: vmov r2, s16
; CHECK-NEXT: vmov r3, s18
-; CHECK-NEXT: umull r0, r3, r3, r0
-; CHECK-NEXT: vmov.32 q5[2], r0
-; CHECK-NEXT: vmov.32 q5[3], r3
-; CHECK-NEXT: vmov r1, s20
-; CHECK-NEXT: vmov r0, s21
-; CHECK-NEXT: adds r1, r1, r2
-; CHECK-NEXT: adc.w r2, lr, r0
-; CHECK-NEXT: vmov r0, s22
-; CHECK-NEXT: adds.w r12, r1, r0
-; CHECK-NEXT: adc.w r1, r2, r3
-; CHECK-NEXT: vmov.u8 r2, q1[4]
-; CHECK-NEXT: vmov.u8 r3, q0[4]
-; CHECK-NEXT: vmov.32 q3[0], r2
-; CHECK-NEXT: vmov.u8 r2, q1[5]
+; CHECK-NEXT: muls r1, r2, r1
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: muls r2, r3, r2
+; CHECK-NEXT: vmov.u8 r3, q0[6]
; CHECK-NEXT: vmov.32 q4[0], r3
-; CHECK-NEXT: vmov.u8 r3, q0[5]
-; CHECK-NEXT: vmov.32 q3[2], r2
+; CHECK-NEXT: vmov.u8 r3, q0[7]
; CHECK-NEXT: vmov.32 q4[2], r3
-; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
-; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov r3, s16
-; CHECK-NEXT: umull r2, r3, r3, r2
-; CHECK-NEXT: vmov.32 q5[0], r2
-; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: vmov.32 q5[1], r3
-; CHECK-NEXT: vmov r3, s18
-; CHECK-NEXT: umull r2, r3, r3, r2
-; CHECK-NEXT: vmov.32 q5[2], r2
-; CHECK-NEXT: vmov.32 q5[3], r3
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vmov r2, s21
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s22
-; CHECK-NEXT: adds.w r12, r0, r2
+; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[6]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u8 r3, q0[6]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[7]
-; CHECK-NEXT: vmov.32 q4[0], r3
-; CHECK-NEXT: vmov.u8 r3, q0[7]
; CHECK-NEXT: vmov.32 q3[2], r2
-; CHECK-NEXT: vmov.32 q4[2], r3
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vand q3, q3, q2
-; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmov r3, s16
-; CHECK-NEXT: umull r2, r3, r3, r2
-; CHECK-NEXT: vmov.32 q5[0], r2
-; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: vmov.32 q5[1], r3
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: vmov r3, s18
-; CHECK-NEXT: umull r2, r3, r3, r2
-; CHECK-NEXT: vmov.32 q5[2], r2
-; CHECK-NEXT: vmov.32 q5[3], r3
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vmov r2, s21
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s22
-; CHECK-NEXT: adds.w r12, r0, r2
-; CHECK-NEXT: vmov.u8 r2, q1[8]
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: vmov.u8 r3, q0[8]
-; CHECK-NEXT: vmov.32 q3[0], r2
-; CHECK-NEXT: vmov.u8 r2, q1[9]
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[9]
-; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.32 q4[2], r3
-; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
-; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov r3, s16
-; CHECK-NEXT: umull r2, r3, r3, r2
-; CHECK-NEXT: vmov.32 q5[0], r2
-; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: vmov.32 q5[1], r3
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov.u8 r2, q1[8]
+; CHECK-NEXT: vmov.32 q3[0], r2
+; CHECK-NEXT: vmov.u8 r2, q1[9]
+; CHECK-NEXT: vmov.32 q3[2], r2
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: vand q3, q3, q2
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: vmov r3, s18
-; CHECK-NEXT: umull r2, r3, r3, r2
-; CHECK-NEXT: vmov.32 q5[2], r2
-; CHECK-NEXT: vmov.32 q5[3], r3
-; CHECK-NEXT: vmov r0, s20
-; CHECK-NEXT: vmov r2, s21
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s22
-; CHECK-NEXT: adds.w r12, r0, r2
-; CHECK-NEXT: vmov.u8 r2, q1[10]
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: muls r2, r3, r2
; CHECK-NEXT: vmov.u8 r3, q0[10]
-; CHECK-NEXT: vmov.32 q3[0], r2
-; CHECK-NEXT: vmov.u8 r2, q1[11]
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[11]
-; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.32 q4[2], r3
-; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
-; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov r3, s16
-; CHECK-NEXT: umull r2, r3, r3, r2
-; CHECK-NEXT: vmov.32 q5[0], r2
-; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: vmov.32 q5[1], r3
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov.u8 r2, q1[10] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[11] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r3 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: adds.w r12, r0, r2 -; CHECK-NEXT: vmov.u8 r2, q1[12] -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: vmov.32 q3[0], r2 -; CHECK-NEXT: vmov.u8 r2, q1[13] ; CHECK-NEXT: vmov.32 q4[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: vmov.32 q4[2], r3 -; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[0], r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov.u8 r2, q1[12] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[13] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r3 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[14] ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[14] ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q1[15] -; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.u8 r3, q0[14] +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q1, q3, q2 ; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vand q0, q3, q2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umlal r0, r1, r3, r2 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umlal r0, r1, r3, r2 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -643,152 +601,77 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_v16i8_v16i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.32 q2[1], r1 ; CHECK-NEXT: vmov.u8 r1, q0[1] ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: 
vmov.32 q2[3], r1 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov.u8 r2, q1[0] +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q1[2] ; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.u8 r1, q1[2] -; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r1, r3, r3, r1 -; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u8 r1, q1[3] -; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q1[3] ; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r1, r3, r3, r1 -; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adc.w r1, r1, r12 -; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[4] -; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[4] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[5] -; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov.u8 r3, q0[5] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[6] -; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[6] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov.u8 r3, q0[7] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[8] -; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[8] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[9] -; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov.u8 r3, q0[9] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[10] -; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[10] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[11] -; 
CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov.u8 r3, q0[11] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[12] -; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[12] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[13] -; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov.u8 r3, q0[13] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[14] -; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.u8 r3, q0[14] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 @@ -811,16 +694,15 @@ ; CHECK-LABEL: add_v2i8_v2i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q2, #0xff -; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: mla r0, r2, r1, r0 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> @@ -833,10 +715,10 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: smull r0, r1, r1, r0 @@ -857,25 +739,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r4, s5 ; CHECK-NEXT: umull r12, r3, r1, r0 ; CHECK-NEXT: mla r1, r1, r2, r3 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: mla lr, r2, r0, r1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: umull r3, r1, r2, r0 +; CHECK-NEXT: mla r1, r2, r4, r1 ; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov.32 q2[0], r12 ; CHECK-NEXT: mla r1, r2, r0, r1 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov r12, s8 -; CHECK-NEXT: umull lr, r0, r3, r2 -; CHECK-NEXT: mla r0, r3, r4, r0 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: mla r2, r3, r2, r0 -; CHECK-NEXT: adds.w r0, r12, lr -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: adds.w r0, r3, r12 +; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: pop {r4, pc} entry: %m = mul <2 x i64> %x, %y @@ -1119,20 +998,21 @@ define arm_aapcs_vfpcc i64 
@add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xffff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s6 -; CHECK-NEXT: umull r2, lr, r3, r2 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umlal r2, lr, r3, r12 +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> %yy = zext <2 x i16> %y to <2 x i64> @@ -1147,14 +1027,14 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxth r2, r2 ; CHECK-NEXT: sxth r3, r3 ; CHECK-NEXT: smull r2, r12, r3, r2 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: sxth.w lr, r3 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxth r3, r3 ; CHECK-NEXT: smlal r2, r12, r3, lr ; CHECK-NEXT: adds r0, r0, r2 @@ -1356,10 +1236,10 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.u8 r2, q1[0] ; CHECK-NEXT: vmov.u8 r3, q0[0] ; CHECK-NEXT: vmov.32 q3[0], r2 @@ -1371,196 +1251,153 @@ ; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov.u8 r4, q0[2] -; CHECK-NEXT: umull r12, lr, r3, r2 -; CHECK-NEXT: vmov r3, s16 ; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[3] -; CHECK-NEXT: vmov.32 q4[2], r4 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: orr.w lr, lr, r3 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: smlabb r12, r2, r3, r12 ; CHECK-NEXT: vmov.u8 r3, q1[2] +; CHECK-NEXT: vmov.u8 r2, q0[2] ; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: vmov.u8 r3, q1[3] +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[3] ; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: add r2, r12 +; CHECK-NEXT: vmov.32 q4[2], r2 ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov.32 q5[0], r3 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: smlabb r12, r2, r3, r12 ; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov.32 q5[2], r3 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: 
adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[4] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[4] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[5] -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[5] -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: smlabb r12, r2, r3, r12 +; CHECK-NEXT: vmov.u8 r3, q1[4] +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.u8 r3, q1[5] +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov.32 q3[2], r3 +; CHECK-NEXT: vmov.32 q4[2], r2 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[2], r5 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[6] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[6] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[7] -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[7] -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.32 q4[2], r4 -; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adc lr, r3, #0 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[6] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[7] +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[2], r5 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[8] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[9] -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[9] -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q1[6] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: adc lr, lr, #0 ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adc lr, lr, #0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[9] +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 -; 
CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[2], r5 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[10] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[11] -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[11] -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q1[8] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[9] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: adc lr, lr, #0 ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adc lr, lr, #0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[11] +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[2], r5 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[12] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[13] -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[13] -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q1[10] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[11] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: adc lr, lr, #0 ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adc lr, lr, #0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[13] +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[2], r5 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[14] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[15] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.u8 r4, q0[14] +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q1[12] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[13] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: 
adc lr, lr, #0 +; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adc lr, lr, #0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[14] +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q1[14] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[15] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: adc lr, lr, #0 ; CHECK-NEXT: vand q1, q3, q2 -; CHECK-NEXT: vmov.32 q3[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[15] -; CHECK-NEXT: vmov.32 q3[2], r4 -; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[15] +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vand q0, q3, q2 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: umlal r2, r3, r4, r5 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: umlal r2, r3, r4, r5 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adc lr, lr, #0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc r3, lr, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -1573,166 +1410,91 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov.u8 r2, q1[0] -; CHECK-NEXT: vmov.u8 r3, q0[0] -; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.u8 r2, q1[1] -; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov.u8 r3, q0[1] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vmov lr, s10 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r12, s9 -; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: smull r3, r12, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q1[0] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[2] -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u8 r3, q0[2] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[2] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[3] -; CHECK-NEXT: vmov.32 q2[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[3] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r4, r4, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w lr, r4, r2 -; CHECK-NEXT: vmov.u8 r4, q1[4] +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[4] +; CHECK-NEXT: sxtb.w lr, r2 ; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: 
sxtb r4, r4 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: smull r2, r4, r2, r4 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[5] -; CHECK-NEXT: vmov.32 q2[1], r4 -; CHECK-NEXT: vmov.u8 r4, q0[5] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[5] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smull r2, r4, r4, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[6] +; CHECK-NEXT: sxtb.w lr, r2 ; CHECK-NEXT: vmov.u8 r2, q0[6] -; CHECK-NEXT: adc.w r12, r12, r4 -; CHECK-NEXT: vmov.u8 r4, q1[6] -; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: smull r2, r4, r2, r4 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.32 q2[1], r4 -; CHECK-NEXT: vmov.u8 r4, q0[7] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smull r2, r4, r4, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[8] +; CHECK-NEXT: sxtb.w lr, r2 ; CHECK-NEXT: vmov.u8 r2, q0[8] -; CHECK-NEXT: adc.w r12, r12, r4 -; CHECK-NEXT: vmov.u8 r4, q1[8] -; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: smull r2, r4, r2, r4 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[9] -; CHECK-NEXT: vmov.32 q2[1], r4 -; CHECK-NEXT: vmov.u8 r4, q0[9] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[9] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smull r2, r4, r4, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[10] +; CHECK-NEXT: sxtb.w lr, r2 ; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: adc.w r12, r12, r4 -; CHECK-NEXT: vmov.u8 r4, q1[10] -; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: smull r2, r4, r2, r4 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[11] -; CHECK-NEXT: vmov.32 q2[1], r4 -; CHECK-NEXT: vmov.u8 r4, q0[11] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smull r2, r4, r4, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[12] +; CHECK-NEXT: sxtb.w lr, r2 ; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: adc.w r12, r12, r4 -; CHECK-NEXT: vmov.u8 r4, q1[12] -; CHECK-NEXT: sxtb r4, r4 ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: smull r2, r4, r2, r4 -; CHECK-NEXT: 
vmov.32 q2[0], r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[13] -; CHECK-NEXT: vmov.32 q2[1], r4 -; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smull r2, r4, r4, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r4 -; CHECK-NEXT: vmov.u8 r4, q1[14] -; CHECK-NEXT: sxtb.w r12, r4 -; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smlal r2, r3, r4, r12 -; CHECK-NEXT: vmov.u8 r4, q1[15] -; CHECK-NEXT: sxtb.w r12, r4 -; CHECK-NEXT: vmov.u8 r4, q0[15] -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smlal r2, r3, r4, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[14] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[15] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <16 x i8> %x to <16 x i64> %yy = sext <16 x i8> %y to <16 x i64> @@ -1745,22 +1507,18 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v2i8_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umull r12, lr, r3, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: orr.w r3, r3, lr +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: mla r2, r2, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> %yy = zext <2 x i8> %y to <2 x i64> @@ -1775,14 +1533,14 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: smull r2, r12, r3, r2 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: sxtb.w lr, r3 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: smlal r2, r12, r3, lr ; CHECK-NEXT: adds r0, r0, r2 @@ -1802,27 +1560,24 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vmov r6, s7 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: vmov r6, s5 ; CHECK-NEXT: umull r12, lr, r3, r2 ; CHECK-NEXT: mla r3, r3, r4, lr -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: vmov.32 q2[0], r12 -; CHECK-NEXT: mla r2, r4, r2, r3 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov.32 q2[1], 
r2 -; CHECK-NEXT: vmov r12, s8 -; CHECK-NEXT: umull lr, r5, r3, r4 -; CHECK-NEXT: mla r3, r3, r6, r5 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: adds.w r6, r12, lr -; CHECK-NEXT: mla r3, r5, r4, r3 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: adds r0, r0, r6 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: mla lr, r4, r2, r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: umull r2, r5, r4, r3 +; CHECK-NEXT: mla r4, r4, r6, r5 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: mla r3, r5, r3, r4 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %m = mul <2 x i64> %x, %y diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll @@ -66,34 +66,34 @@ define <4 x i64> *@vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) { ; CHECK-LABEL: vld2_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0], #32 -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: vmov.f64 d4, d0 +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov lr, s10 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r2, s7 ; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r12, s7 -; CHECK-NEXT: vmov lr, s3 -; CHECK-NEXT: adds r6, r3, r2 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adc.w r12, r12, lr -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov.32 q0[0], r5 -; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov r12, s11 +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: adcs r2, r4 ; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: vmov.32 q0[2], r6 +; CHECK-NEXT: vmov.32 q0[2], lr ; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %l1 = load <4 x i64>, <4 x i64>* %src, align 4 %s1 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -262,27 +262,27 @@ ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f64 d4, d0 +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r0, 
s6 +; CHECK-NEXT: vmov r3, s7 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r12, s7 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r12, s11 +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: adds.w lr, r2, r0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: adcs r0, r3 +; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.32 q0[2], lr ; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -301,59 +301,59 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vldrw.u32 q5, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f64 d8, d7 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f64 d6, d8 +; CHECK-NEXT: vmov.f32 s13, s17 ; CHECK-NEXT: vmov.f32 s14, s20 ; CHECK-NEXT: vmov.f32 s15, s21 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: vmov.f64 d12, d9 +; CHECK-NEXT: vmov r12, s15 +; CHECK-NEXT: vmov.f32 s25, s19 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov.f64 d4, d0 +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov.f64 d6, d1 +; CHECK-NEXT: adds.w lr, r2, r0 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s25 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adcs r0, r3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.32 q4[2], lr +; CHECK-NEXT: vmov.32 q4[3], r12 +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: adds.w lr, r2, r4 ; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov r12, s19 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov.32 q3[1], r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov.32 q3[2], lr -; CHECK-NEXT: vmov.32 q3[3], r12 -; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: adds.w lr, r4, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: adc.w r12, r2, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: adds r0, r0, r3 
-; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adc.w r12, r0, r3 +; CHECK-NEXT: vmov r3, s13 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: adcs r0, r3 +; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.32 q0[2], lr ; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 4 @@ -465,18 +465,19 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrd r2, r0, [r0] ; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmovx.f16 s4, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmovx.f16 s8, s5 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q2[0], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov.16 q2[1], r0 +; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vadd.f16 q0, q0, q1 +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -334,82 +334,88 @@ define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld3_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.16 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.f32 s22, s12 +; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.u16 r2, q1[1] ; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmovnb.i32 q6, q4 -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov.32 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov.16 q6[6], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.u16 r2, q1[4] +; CHECK-NEXT: vmov q5, q4 +; 
CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.f32 s13, s17 ; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.f32 s14, s22 ; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.u16 r0, q2[4] ; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.f32 s15, s27 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.u16 r0, q2[7] ; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vmov.u16 r0, q1[2] ; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov.32 q3[0], r0 ; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmovnb.i32 q2, q5 -; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vadd.i16 q1, q4, q3 -; CHECK-NEXT: vadd.i16 q0, q1, q0 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.16 q6[5], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.16 q6[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmovnb.i32 q7, q5 +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov r0, s27 +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vadd.i16 q3, q3, q4 +; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov q2, q4 +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov.f32 s10, s0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmovnb.i32 q0, q4 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.32 q1[3], r0 +; CHECK-NEXT: vadd.i16 q0, q3, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %l1 = load <24 x i16>, <24 x i16>* %src, align 4 @@ -427,152 +433,173 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; 
CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q1, [r0, #64] +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.16 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.f32 s22, s12 -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmovnb.i32 q6, q4 -; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vmov.32 q0[3], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[6] +; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov.u16 r2, q3[2] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q4[5], r2 ; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vmov.16 q4[3], r2 ; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmov.f32 s13, s17 +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov.16 q6[7], r2 ; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.16 q5[5], r2 +; CHECK-NEXT: vmov.u16 r2, q2[1] +; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[4] +; CHECK-NEXT: vmov.f32 s15, s27 +; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vmov.u16 r2, q1[2] ; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov.32 q4[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmov.16 q6[5], r2 +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vmov.u16 r2, q1[5] ; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmovnb.i32 q7, q5 +; CHECK-NEXT: vmov r2, s30 +; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: vmov r2, s27 +; CHECK-NEXT: vmov.32 q4[3], r2 +; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vadd.i16 q3, q3, q4 +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov q2, q4 +; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov.u16 r2, q1[3] +; CHECK-NEXT: vmov.16 q2[3], r2 +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov r2, s17 +; 
CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov.16 q2[6], r2 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vmov.f32 s10, s0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmovnb.i32 q0, q4 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmov.32 q1[3], r2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vadd.i16 q0, q3, q1 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vmov.u16 r2, q3[0] -; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov.16 q4[0], r2 ; CHECK-NEXT: vmov.u16 r2, q3[3] -; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.16 q4[1], r2 ; CHECK-NEXT: vmov.u16 r2, q3[6] -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov.32 q3[0], r2 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmovnb.i32 q2, q5 -; CHECK-NEXT: vmov.32 q3[1], r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov.32 q3[3], r2 -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vadd.i16 q1, q4, q3 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.u16 r0, q3[7] +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.u16 r2, q2[1] ; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.f32 s26, s12 -; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmov q7, q6 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovnb.i32 q7, q5 -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: vmov r0, s30 -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov r0, s27 -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov.16 q7[6], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.u16 r2, q2[4] +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.16 q7[7], r0 +; CHECK-NEXT: vmov.16 q6[4], r2 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.16 q6[5], r0 +; CHECK-NEXT: vmov.u16 r0, q3[1] +; CHECK-NEXT: vmov.f32 s17, s21 ; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.u16 r0, q3[4] +; CHECK-NEXT: vmov.f32 s18, s26 ; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q6[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q6[7], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.16 q6[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.u16 r0, q3[7] +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.f32 s19, s31 ; CHECK-NEXT: vmov.16 q6[2], r0 ; CHECK-NEXT: vmov.u16 r0, q2[2] ; CHECK-NEXT: vmov.16 q6[3], r0 ; 
CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmov q1, q6 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.16 q7[5], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.16 q7[6], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.16 q7[7], r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov q0, q7 +; CHECK-NEXT: vmovnb.i32 q0, q1 ; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov r0, s25 ; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q3[6] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmovnb.i32 q3, q6 -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov r0, s31 ; CHECK-NEXT: vmov.32 q1[3], r0 -; CHECK-NEXT: vadd.i16 q1, q5, q1 -; CHECK-NEXT: vadd.i16 q1, q1, q4 -; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.u16 r0, q3[2] +; CHECK-NEXT: vmov.16 q5[0], r0 +; CHECK-NEXT: vmov.u16 r0, q3[5] +; CHECK-NEXT: vmov.16 q5[1], r0 +; CHECK-NEXT: vmov.u16 r0, q2[0] +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vadd.i16 q4, q4, q1 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s21, s1 +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovnb.i32 q1, q5 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vadd.i16 q0, q4, q2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -756,16 +783,17 @@ ; CHECK-NEXT: vmov.u8 r2, q2[9] ; CHECK-NEXT: vmov.8 q1[3], r2 ; CHECK-NEXT: vmov.u8 r2, q2[12] -; CHECK-NEXT: vmov.8 q1[4], r2 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.8 q4[4], r2 ; CHECK-NEXT: vmov.u8 r2, q2[15] -; CHECK-NEXT: vmov.8 q1[5], r2 +; CHECK-NEXT: vmov.8 q4[5], r2 ; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: vmov.8 q1[6], r2 +; CHECK-NEXT: vmov.8 q4[6], r2 ; CHECK-NEXT: vmov.u8 r2, q0[5] -; CHECK-NEXT: vmov.8 q1[7], r2 +; CHECK-NEXT: vmov.8 q4[7], r2 ; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov.32 q3[0], r2 -; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov r2, s17 ; CHECK-NEXT: vmov.32 q3[1], r2 ; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] @@ -779,14 +807,15 @@ ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov.32 q3[2], r0 ; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.8 q5[0], r0 +; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q5[1], r0 +; CHECK-NEXT: vmov.8 q4[1], r0 ; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.8 q5[2], r0 +; CHECK-NEXT: vmov.8 q4[2], r0 ; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.8 q5[3], r0 +; CHECK-NEXT: vmov.8 q4[3], r0 ; CHECK-NEXT: vmov.u8 r0, q2[13] +; CHECK-NEXT: vmov q5, q4 ; 
CHECK-NEXT: vmov.8 q5[4], r0 ; CHECK-NEXT: vmov.u8 r0, q0[0] ; CHECK-NEXT: vmov.8 q5[5], r0 @@ -794,7 +823,7 @@ ; CHECK-NEXT: vmov.8 q5[6], r0 ; CHECK-NEXT: vmov.u8 r0, q0[6] ; CHECK-NEXT: vmov.8 q5[7], r0 -; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vmov.32 q4[0], r0 ; CHECK-NEXT: vmov r0, s21 ; CHECK-NEXT: vmov.32 q4[1], r0 @@ -838,16 +867,17 @@ ; CHECK-NEXT: vmov.u8 r0, q2[11] ; CHECK-NEXT: vmov.8 q4[3], r0 ; CHECK-NEXT: vmov.u8 r0, q2[14] -; CHECK-NEXT: vmov.8 q4[4], r0 +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmov.8 q5[4], r0 ; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.8 q4[5], r0 +; CHECK-NEXT: vmov.8 q5[5], r0 ; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.8 q4[6], r0 +; CHECK-NEXT: vmov.8 q5[6], r0 ; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.8 q4[7], r0 +; CHECK-NEXT: vmov.8 q5[7], r0 ; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov r0, s21 ; CHECK-NEXT: vmov.32 q2[1], r0 ; CHECK-NEXT: vmov.u8 r0, q0[10] ; CHECK-NEXT: vmov.8 q4[8], r0 @@ -891,47 +921,38 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d6, d3 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s14, s16 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov.f32 s15, s17 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r12, s11 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: adc.w r12, r12, r0 ; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov r12, s15 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, lr, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: adc.w r12, r2, r3 +; CHECK-NEXT: adds.w lr, r2, r0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: adcs r0, r3 ; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: adcs r0, r3 +; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.32 q0[2], lr ; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <6 x i64>, <6 x i64>* %src, align 4 @@ -951,87 +972,70 @@ ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, 
#24 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-NEXT: vldrw.u32 q6, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f64 d4, d0 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f32 s9, s1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vmov.f64 d14, d11 -; CHECK-NEXT: vmov.f32 s29, s23 -; CHECK-NEXT: vmov.f32 s30, s0 -; CHECK-NEXT: vmov.f32 s22, s26 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.f32 s31, s1 -; CHECK-NEXT: vmov r3, s30 +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] +; CHECK-NEXT: vldrw.u32 q7, [r0, #64] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov.f64 d12, d9 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov.f32 s25, s19 +; CHECK-NEXT: vmov.f32 s26, s20 +; CHECK-NEXT: vmov.f32 s27, s21 +; CHECK-NEXT: vmov r3, s26 +; CHECK-NEXT: vmov r2, s30 +; CHECK-NEXT: vmov r0, s31 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov r12, s27 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: adc.w r12, r12, r0 ; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.f64 d6, d3 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s10, s18 -; CHECK-NEXT: vmov.f32 s14, s16 -; CHECK-NEXT: vmov.f32 s11, s19 -; CHECK-NEXT: vmov.f32 s15, s17 -; CHECK-NEXT: vmov.f64 d8, d12 -; CHECK-NEXT: vmov.f32 s17, s25 -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vmov.f32 s19, s3 -; CHECK-NEXT: vmov r12, s31 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, lr, r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: adc.w r12, r2, r3 +; CHECK-NEXT: adds.w lr, r2, r0 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s25 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov r4, s28 +; CHECK-NEXT: adcs r0, r3 ; CHECK-NEXT: vmov r3, s29 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: vmov.32 q0[2], lr -; CHECK-NEXT: vmov.32 q0[3], r12 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adcs r0, r3 +; CHECK-NEXT: vmov r3, s15 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.32 q4[2], lr +; CHECK-NEXT: vmov.32 q4[3], r12 +; CHECK-NEXT: vstrw.32 q4, [r1, #16] +; CHECK-NEXT: adds r2, r2, r4 ; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adcs r0, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds.w lr, r3, r4 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: adc.w r12, r0, r2 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: adcs r2, r4 +; 
CHECK-NEXT: adcs r0, r3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: adds.w lr, r2, r4 ; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adc.w r12, r0, r3 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: adcs r0, r3 +; CHECK-NEXT: vmov r3, s13 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: adcs r0, r3 +; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.32 q0[2], lr ; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, pc} entry: @@ -1257,30 +1261,35 @@ define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) { ; CHECK-LABEL: vld3_v2f16: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: ldrd r2, r3, [r0] ; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmovx.f16 s0, s4 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmovx.f16 s16, s1 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.16 q1[0], r2 ; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vadd.f16 q1, q2, q1 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmovx.f16 s8, s10 +; CHECK-NEXT: vmov.16 q1[1], r0 ; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vadd.f16 q1, q1, q3 ; CHECK-NEXT: vmov.16 q0[1], r0 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <6 x half>, <6 x half>* %src, align 4 @@ -1296,48 +1305,49 @@ define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) { ; CHECK-LABEL: vld3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: .vsave {d8, d9, d10} +; CHECK-NEXT: vpush {d8, d9, d10} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmovx.f16 s4, s0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmovx.f16 s12, s1 +; CHECK-NEXT: vmovx.f16 s16, s1 ; CHECK-NEXT: vmov.16 q2[1], r2 ; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov.16 q2[2], r2 ; CHECK-NEXT: ldrd r2, r0, [r0, #16] -; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: vmovx.f16 s20, s12 ; CHECK-NEXT: vmov.32 q1[1], r0 ; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmovx.f16 s16, s4 +; CHECK-NEXT: vmovx.f16 s4, s5 ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov.16 q4[0], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov.16 q4[1], r0 ; CHECK-NEXT: vmov r0, s3 -; 
CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.16 q4[3], r0 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vadd.f16 q2, q4, q2 ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vadd.f16 q2, q3, q2 ; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vadd.f16 q0, q2, q0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: bx lr entry: %l1 = load <12 x half>, <12 x half>* %src, align 4 @@ -1355,94 +1365,101 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmov r3, s13 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q0[0], r3 ; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmovx.f16 s12, s19 -; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmovx.f16 s4, s9 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmovx.f16 s20, s16 +; CHECK-NEXT: vmov.16 q1[2], r3 ; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmovx.f16 s24, s15 +; CHECK-NEXT: vmov.f32 s1, s5 +; CHECK-NEXT: vmovx.f16 s4, s19 ; CHECK-NEXT: vmov.f32 s2, s11 -; CHECK-NEXT: vmovx.f16 s20, s16 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q3[6], r3 -; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmovx.f16 s28, s18 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q1[6], r3 +; CHECK-NEXT: vmov.16 q1[7], r0 ; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.f32 s14, s16 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov.f32 s6, s16 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov r2, s17 ; CHECK-NEXT: vmov.16 q5[5], r0 ; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov lr, s22 +; CHECK-NEXT: vmov r12, s22 ; CHECK-NEXT: vmovx.f16 s20, s17 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q5[6], r3 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.16 q5[6], r2 +; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r12, s23 -; CHECK-NEXT: vmovx.f16 s20, s10 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q5[4], r3 -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: vmovx.f16 s20, s18 +; CHECK-NEXT: vmov lr, s23 +; CHECK-NEXT: vmovx.f16 s20, s12 ; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmovx.f16 s20, s5 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov.16 
q5[3], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmovx.f16 s24, s11 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s4 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q6[0], r2 -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vmov.16 q6[1], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.32 q1[0], r4 -; CHECK-NEXT: vmov.16 q6[2], r0 ; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.16 q5[1], r3 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.16 q6[2], r3 ; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov r3, s25 +; CHECK-NEXT: vmovx.f16 s24, s10 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmov.16 q6[4], r0 +; CHECK-NEXT: vmov.16 q6[5], r2 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmovx.f16 s24, s13 ; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.32 q1[1], r4 -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.32 q1[2], lr -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov r4, s15 -; CHECK-NEXT: vmov.f32 s23, s19 -; CHECK-NEXT: vmov.32 q0[3], r12 -; CHECK-NEXT: vmov.32 q1[3], r4 -; CHECK-NEXT: vadd.f16 q0, q5, q0 -; CHECK-NEXT: vadd.f16 q0, q0, q1 +; CHECK-NEXT: vmov.16 q6[0], r4 +; CHECK-NEXT: vmov.16 q6[1], r2 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov q3, q6 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov r2, s28 +; CHECK-NEXT: vmov.16 q3[3], r4 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov.16 q4[6], r4 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov.16 q4[7], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov.f32 s25, s13 +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.16 q2[5], r4 +; CHECK-NEXT: vmov.f32 s26, s10 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov.f32 s27, s19 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov.32 q2[3], lr +; CHECK-NEXT: vmov.32 q3[2], r12 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vadd.f16 q2, q6, q2 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vadd.f16 q0, q2, q3 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <24 x half>, <24 x half>* %src, align 4 @@ -1458,182 +1475,203 @@ define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld3_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q3, [r0, #64] -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vmovx.f16 s4, s16 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmovx.f16 s20, s13 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r12, s4 -; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vldrw.u32 q4, [r0, 
#80] ; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q0[7], r3 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov.16 q1[0], r2 ; CHECK-NEXT: vmov.16 q1[1], r3 +; CHECK-NEXT: vmovx.f16 s0, s13 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[3], r3 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov.f32 s5, s1 +; CHECK-NEXT: vmovx.f16 s0, s19 ; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmovx.f16 s24, s12 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov r3, s16 ; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmovx.f16 s16, s18 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.16 q5[5], r12 -; CHECK-NEXT: vmov lr, s22 -; CHECK-NEXT: vmovx.f16 s20, s14 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r12, s6 +; CHECK-NEXT: vmov.16 q0[4], r12 +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[4], r2 ; CHECK-NEXT: vmov r2, s19 -; CHECK-NEXT: vmov.16 q5[5], r3 -; CHECK-NEXT: vmov r12, s22 -; CHECK-NEXT: vmovx.f16 s20, s17 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q5[6], r3 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov lr, s2 +; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q0[6], r3 ; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vmov.16 q4[6], r3 -; CHECK-NEXT: vmov r2, s23 ; CHECK-NEXT: vmov.16 q4[7], r4 -; CHECK-NEXT: vmovx.f16 s20, s9 +; CHECK-NEXT: vmovx.f16 s0, s9 ; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov.16 q5[0], r4 ; CHECK-NEXT: vmov.16 q5[1], r3 +; CHECK-NEXT: vmovx.f16 s0, s12 ; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: vmov.16 q5[2], r3 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vmov.16 q5[3], r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmovx.f16 s24, s15 -; CHECK-NEXT: vmov.16 q5[4], r3 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vmov.16 q5[5], r3 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov.16 q6[0], r3 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmov.16 q6[1], r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov.32 q2[0], r5 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: vmov.16 q6[2], r3 +; CHECK-NEXT: vmov.16 q6[3], r4 +; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q7[4], r4 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov.16 q7[5], r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov.16 q0[1], r4 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: vmov.16 q6[3], r3 -; CHECK-NEXT: vmov r5, s5 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vmov.32 q2[1], r5 -; CHECK-NEXT: vmov r5, s3 +; 
CHECK-NEXT: vmov.16 q2[2], r4 +; CHECK-NEXT: vmov.f32 s21, s25 +; CHECK-NEXT: vmov.16 q2[3], r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r4, s9 ; CHECK-NEXT: vmov.32 q0[0], r3 -; CHECK-NEXT: vmov r4, s25 -; CHECK-NEXT: vmov.32 q2[2], lr ; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.f32 s22, s30 +; CHECK-NEXT: vmov.32 q0[2], lr ; CHECK-NEXT: vmov.f32 s23, s19 -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vmov.32 q0[3], r2 -; CHECK-NEXT: vmov.32 q2[3], r5 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.32 q2[1], r2 +; CHECK-NEXT: vmov.32 q2[2], r12 ; CHECK-NEXT: vadd.f16 q0, q5, q0 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.32 q2[3], r2 +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vadd.f16 q0, q0, q2 -; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmovx.f16 s12, s16 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmov.16 q1[0], r0 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s10 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.16 q3[1], r3 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s20, s5 -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovx.f16 s24, s4 -; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.f32 s5, s1 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r3 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov.f32 s14, s7 ; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmovx.f16 s16, s18 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vmovx.f16 s20, s6 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q5[5], r3 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r3 ; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: vmov r12, s22 -; CHECK-NEXT: vmovx.f16 s20, s17 -; CHECK-NEXT: vmov r5, s20 -; CHECK-NEXT: vmov.16 q5[6], r5 -; CHECK-NEXT: vmov r5, s17 -; CHECK-NEXT: vmov.16 q5[7], r3 -; CHECK-NEXT: vmov.16 q4[6], r5 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: vmov.16 q4[7], r4 -; CHECK-NEXT: vmovx.f16 s20, s9 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r5, s20 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov.16 q0[6], r4 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov.16 q0[7], r3 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: 
vmovx.f16 s0, s18 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q4[6], r4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmovx.f16 s0, s13 +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vmov.16 q5[1], r5 -; CHECK-NEXT: vmov r5, s11 -; CHECK-NEXT: vmov.16 q5[2], r5 -; CHECK-NEXT: vmov r5, s24 -; CHECK-NEXT: vmov.16 q5[3], r5 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmovx.f16 s24, s7 -; CHECK-NEXT: vmov.16 q5[4], r5 -; CHECK-NEXT: vmov r5, s24 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vmov.16 q5[5], r5 -; CHECK-NEXT: vmov r5, s24 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov.16 q5[1], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov.16 q6[2], r0 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmov.16 q6[3], r4 ; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov.16 q6[0], r5 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmov.16 q6[1], r4 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: vmov.16 q6[2], r5 -; CHECK-NEXT: vmov r5, s5 -; CHECK-NEXT: vmov.16 q6[3], r5 -; CHECK-NEXT: vmov.32 q1[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov r5, s24 -; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.32 q0[0], r5 -; CHECK-NEXT: vmov r4, s25 -; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q7[4], r4 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vmov.16 q7[5], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.16 q0[1], r4 +; CHECK-NEXT: vmovx.f16 s8, s15 +; CHECK-NEXT: vmov.f32 s21, s25 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.16 q2[2], r4 +; CHECK-NEXT: vmov.f32 s22, s30 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov r4, s9 +; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.32 q0[2], r2 ; CHECK-NEXT: vmov.f32 s23, s19 -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q1[3], r0 ; CHECK-NEXT: vmov.32 q0[3], r3 +; CHECK-NEXT: vmov.32 q2[2], r12 +; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vadd.f16 q0, q5, q0 -; CHECK-NEXT: vadd.f16 q0, q0, q1 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <48 x half>, <48 x half>* %src, align 4 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll @@ -106,58 +106,58 @@ ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q0, [r0], #64 -; CHECK-NEXT: vldrw.u32 q3, [r0, #-48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #-48] ; 
CHECK-NEXT: vldrw.u32 q5, [r0, #-16] -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov.f64 d8, d7 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.f32 s15, s21 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f64 d6, d0 +; CHECK-NEXT: vmov.f64 d8, d4 +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vmov.f32 s19, s21 +; CHECK-NEXT: vmov lr, s18 +; CHECK-NEXT: vmov r3, s22 +; CHECK-NEXT: vmov r2, s23 +; CHECK-NEXT: vmov.f32 s13, s1 +; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vmov.f32 s15, s5 ; CHECK-NEXT: vmov r12, s19 -; CHECK-NEXT: vmov lr, s15 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: vmov.f64 d12, d5 +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f64 d8, d1 +; CHECK-NEXT: vmov.f32 s17, s3 ; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: adds r6, r3, r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adc.w r12, r12, lr -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: adds.w lr, r5, r6 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r6, s1 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: vmov r5, s25 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r6, s17 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: adcs r6, r5 -; CHECK-NEXT: vmov r5, s5 -; CHECK-NEXT: adds r3, r3, r7 -; CHECK-NEXT: adcs r4, r5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r4, r6 -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: adcs r2, r5 +; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: adds r5, r5, r7 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q0[1], r2 ; CHECK-NEXT: vmov.32 q0[2], lr ; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -543,58 +543,58 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q5, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.f64 d8, d7 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s19, s23 -; 
CHECK-NEXT: vmov.f32 s15, s21 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov.f64 d8, d4 +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vmov.f32 s19, s21 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: vmov.f64 d6, d0 ; CHECK-NEXT: vmov r12, s19 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov.f32 s13, s1 +; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vmov.f32 s15, s5 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov.f64 d12, d5 +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f64 d8, d1 +; CHECK-NEXT: adds.w lr, r2, r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov.f32 s17, s3 ; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s15 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds.w lr, lr, r0 +; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s13 -; CHECK-NEXT: adcs r0, r3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s25 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: adcs r2, r3 ; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r4, r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: adds r0, r0, r5 +; CHECK-NEXT: adds r4, r4, r6 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: adds r0, r0, r4 ; CHECK-NEXT: adcs r2, r3 ; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: vmov.32 q0[1], r2 ; CHECK-NEXT: vmov.32 q0[2], lr ; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 4 @@ -616,117 +616,111 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #72 -; CHECK-NEXT: sub sp, #72 -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldrw.u32 q0, [r0, #96] -; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vldrw.u32 q7, [r0, #16] -; CHECK-NEXT: vmov.f64 d8, d3 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s17, s7 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vmov.f32 s19, s3 -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vmov.f64 d12, d11 +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #96] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vmov.f64 d10, d0 ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s25, s23 -; CHECK-NEXT: vmov.f32 s26, s2 -; 
CHECK-NEXT: vmov.f64 d6, d3 -; CHECK-NEXT: vmov.f32 s27, s3 -; CHECK-NEXT: vldrw.u32 q0, [r0, #32] -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s14, s2 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d4, d15 -; CHECK-NEXT: vmov.f32 s15, s3 -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vmov.f32 s9, s31 -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s30, s0 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s31, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov r0, s30 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov r4, s6 ; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: vmov.f32 s22, s0 -; CHECK-NEXT: vmov.f32 s23, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov r7, s16 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [r0, #112] +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vmov.f32 s21, s1 +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vmov.f32 s23, s5 +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vmov.f32 s1, s9 ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.f64 d14, d6 +; CHECK-NEXT: vmov r12, s3 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.f64 d2, d8 +; CHECK-NEXT: vmov.f32 s5, s17 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.f32 s29, s13 +; CHECK-NEXT: vmov.f32 s30, s24 +; CHECK-NEXT: vmov.f32 s31, s25 +; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s23 -; CHECK-NEXT: adcs r0, r2 -; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r5, s3 +; CHECK-NEXT: vmov.f64 d0, d7 ; CHECK-NEXT: adc.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vmov r3, s27 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r4, r3 -; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s20 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: adds r0, r0, r5 -; CHECK-NEXT: vmov r5, s24 -; CHECK-NEXT: adc.w r8, r3, r2 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: vmov r4, s21 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s1 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: adds r3, r3, r7 -; CHECK-NEXT: vmov r7, s28 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s1, s15 +; CHECK-NEXT: vmov r7, s5 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: vmov r4, s27 +; CHECK-NEXT: adcs r3, r2 +; CHECK-NEXT: adds.w lr, lr, r0 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s30 +; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov r0, s31 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r0, r4 +; CHECK-NEXT: vmov r4, s22 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: adds 
r4, r4, r6 +; CHECK-NEXT: vmov r6, s12 +; CHECK-NEXT: adcs r5, r0 +; CHECK-NEXT: adds r0, r4, r2 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: adc.w r8, r5, r3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: vmov.f64 d0, d3 +; CHECK-NEXT: vmov.f32 s1, s7 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: adds r4, r4, r6 +; CHECK-NEXT: vmov r6, s0 +; CHECK-NEXT: adcs r2, r5 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov.f64 d0, d5 +; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: vmov r6, s9 +; CHECK-NEXT: adcs r5, r7 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r2, r5 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r7, s8 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.f64 d0, d9 +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: vmov.32 q1[3], r8 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: vmov.f32 s1, s19 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: adds r5, r5, r7 ; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: vmov r6, s8 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: vmov.32 q0[0], r3 -; CHECK-NEXT: vmov r5, s29 -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: vmov.32 q0[3], r8 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: adds r6, r6, r7 -; CHECK-NEXT: adcs r4, r5 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: adds r0, r0, r5 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: adds r0, r0, r6 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: vmov r6, s16 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: adcs r0, r3 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r0, r4 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r0 ; CHECK-NEXT: vmov.32 q0[2], lr ; CHECK-NEXT: vmov.32 q0[3], r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #72 +; CHECK-NEXT: add sp, #40 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -8,15 +8,15 @@ ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #80 -; CHECK-NEXT: sub sp, #80 +; CHECK-NEXT: .pad #192 +; CHECK-NEXT: sub sp, #192 ; CHECK-NEXT: mul r12, r3, r2 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: cmp.w r2, r12, lsr #2 ; CHECK-NEXT: beq.w .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 -; CHECK-NEXT: ldr r5, [sp, #160] +; CHECK-NEXT: ldr r5, [sp, #272] ; CHECK-NEXT: and.w r3, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 @@ -24,213 +24,251 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q5, [r0, #32] -; CHECK-NEXT: vldrh.u16 q3, [r0, #48] -; CHECK-NEXT: vldrh.u16 q7, [r0], #64 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovx.f16 s8, s12 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: vmov.16 q0[5], r3 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vldrh.u16 q4, [r0], #64 +; CHECK-NEXT: vldrh.u16 q7, [r0, 
#-32] +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vldrh.u16 q6, [r0, #-48] -; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r3 ; CHECK-NEXT: vmov r3, s30 -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vldrh.u16 q6, [r0, #-16] +; CHECK-NEXT: vmov.16 q1[5], r3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vldrh.u16 q5, [r0, #-48] +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov.16 q2[6], r3 ; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmul.f16 q0, q1, r5 -; CHECK-NEXT: vmovx.f16 s4, s24 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s30 +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov r3, s22 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.16 q3[3], r3 +; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vmovx.f16 s12, s22 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmovx.f16 s4, s26 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vmovx.f16 s8, s18 +; CHECK-NEXT: vmul.f16 q0, q0, r5 +; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill ; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmovx.f16 s0, s30 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s28 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s22 -; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[4], r4 +; CHECK-NEXT: vmov.16 q0[5], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s20 +; CHECK-NEXT: vmovx.f16 s4, s24 ; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov.16 q1[4], r4 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s14 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s26 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.16 q1[6], r4 ; CHECK-NEXT: vmov.16 q1[7], r2 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmovx.f16 s8, s13 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmul.f16 q0, q0, r5 -; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: vmovx.f16 s8, s16 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov.16 q2[0], r4 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmovx.f16 s12, s20 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.16 q3[2], r4 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov.f32 s9, s13 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmul.f16 q0, q2, r5 +; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill ; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vstrw.32 q0, [sp, #160] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov r2, s29 +; CHECK-NEXT: vmov.16 q0[1], r3 ; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.16 q0[5], r3 
-; CHECK-NEXT: vmov r3, s29 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov r3, s31 +; CHECK-NEXT: vmov.16 q1[5], r3 ; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov r3, s27 +; CHECK-NEXT: vmov.16 q2[6], r2 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov.16 q2[7], r3 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.16 q3[3], r3 +; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vmovx.f16 s12, s23 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmovx.f16 s4, s27 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vmovx.f16 s8, s19 +; CHECK-NEXT: vmul.f16 q0, q0, r5 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmovx.f16 s0, s31 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s29 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmul.f16 q4, q1, r5 -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: vmovx.f16 s4, s25 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s23 -; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[4], r4 +; CHECK-NEXT: vmov.16 q0[5], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s21 +; CHECK-NEXT: vmovx.f16 s4, s25 +; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q1[4], r4 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s15 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s27 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.16 q1[6], r4 ; CHECK-NEXT: vmov.16 q1[7], r2 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov.16 q2[0], r4 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmovx.f16 s12, s21 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.16 q3[2], r4 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov.f32 s9, s13 +; CHECK-NEXT: vldrw.u32 q3, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmovx.f16 s0, s24 +; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmul.f16 q6, q0, r5 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmul.f16 q2, q2, r5 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov.16 q1[3], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s24 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.16 q1[7], r3 +; CHECK-NEXT: vmov 
r3, s12 +; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #176] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmov.16 q5[1], r3 -; CHECK-NEXT: vmov r3, s25 -; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.16 q7[0], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.16 q7[1], r3 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q0, q7 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vmov q1, q6 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s5 ; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov q1, q4 ; CHECK-NEXT: vmov.16 q3[3], r3 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s25 -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q7[0], r2 +; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov.16 q3[7], r3 +; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vmov.16 q4[0], r2 ; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov.16 q7[1], r3 +; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q4[1], r3 ; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmov.16 q7[4], r2 +; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q7[5], r2 -; CHECK-NEXT: vmov r3, s26 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmov.16 q1[3], r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[6], r2 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.16 q1[7], r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov.16 q6[0], r2 ; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q2[3], r3 +; CHECK-NEXT: vmov.16 q6[1], r3 +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s26 +; CHECK-NEXT: vmov q1, q6 +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov r2, s23 +; CHECK-NEXT: vmovx.f16 s0, s23 +; CHECK-NEXT: vldrw.u32 q5, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[5], r3 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vmov.f32 s29, s21 +; CHECK-NEXT: vldrw.u32 q5, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmov.f32 s30, s22 +; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q3[3], r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.f32 s31, s23 +; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 
16-byte Reload +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov.f32 s17, s21 +; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vmov.16 q2[6], r2 +; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vldrw.u32 q5, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmovx.f16 s4, s6 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s18 -; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vmov.16 q2[7], r3 +; CHECK-NEXT: vmov.f32 s25, s21 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #160] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s26, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov r2, s19 -; CHECK-NEXT: vmov r3, s27 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmovx.f16 s16, s19 -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmovx.f16 s16, s27 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload -; CHECK-NEXT: vmovx.f16 s16, s19 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s21, s25 -; CHECK-NEXT: vstrh.16 q0, [r1, #32] -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov.f32 s29, s13 -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vstrh.16 q2, [r1, #48] -; CHECK-NEXT: vstrh.16 q5, [r1], #64 -; CHECK-NEXT: vmov.f32 s31, s15 -; CHECK-NEXT: vstrh.16 q7, [r1, #-48] +; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #160] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov.f32 s5, s13 +; CHECK-NEXT: vmov.f32 s6, s2 +; CHECK-NEXT: vstrh.16 q6, [r1, #32] +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vstrh.16 q1, [r1, #48] +; CHECK-NEXT: vstrh.16 q7, [r1], #64 +; CHECK-NEXT: vstrh.16 q4, [r1, #-48] ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end -; CHECK-NEXT: add sp, #80 +; CHECK-NEXT: add sp, #192 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll @@ -49,19 +49,19 @@ ; CHECK-LABEL: vmulhs_v4i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s12, s0 -; CHECK-NEXT: vmov.f32 s14, s1 ; CHECK-NEXT: vmov.f32 s10, s5 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.f32 s12, s6 ; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: smmul r0, r1, r0 +; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vmullb.s32 q0, q1, q3 -; 
CHECK-NEXT: smmul r0, r1, r0 -; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: smmul r1, r2, r1 ; CHECK-NEXT: vmov.32 q2[0], r1 ; CHECK-NEXT: vmov.32 q2[1], r0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll @@ -6,48 +6,45 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r5, lr} ; CHECK-NEXT: push {r5, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: it lt -; CHECK-NEXT: poplt {r5, pc} +; CHECK-NEXT: blt .LBB0_2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: vldrw.u32 q3, [r1], #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vmullt.s32 q3, q2, q1 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: vmov r12, s12 +; CHECK-NEXT: vmullt.s32 q4, q3, q2 +; CHECK-NEXT: vmov r5, s17 +; CHECK-NEXT: vmov r12, s16 ; CHECK-NEXT: lsrl r12, r5, #31 ; CHECK-NEXT: vmov.32 q0[0], r12 -; CHECK-NEXT: vmov r12, s14 -; CHECK-NEXT: vmov.32 q0[1], r5 -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: lsrl r12, r5, #31 -; CHECK-NEXT: vmullb.s32 q3, q2, q1 -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov r12, s12 -; CHECK-NEXT: vmov.32 q0[3], r5 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: lsrl r12, r5, #31 -; CHECK-NEXT: vmov.32 q1[0], r12 -; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmov r12, s18 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov.32 q1[1], r5 -; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: vmov r5, s19 ; CHECK-NEXT: lsrl r12, r5, #31 +; CHECK-NEXT: vmullb.s32 q4, q3, q2 ; CHECK-NEXT: vmov.32 q1[2], r12 -; CHECK-NEXT: vmov.32 q1[3], r5 -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s7, s10 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vmov r5, s17 +; CHECK-NEXT: vmov r12, s16 +; CHECK-NEXT: lsrl r12, r5, #31 +; CHECK-NEXT: vmov.32 q2[0], r12 +; CHECK-NEXT: vmov r12, s18 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vmov.32 q3[1], r5 +; CHECK-NEXT: vmov r5, s19 +; CHECK-NEXT: lsrl r12, r5, #31 +; CHECK-NEXT: vmov.32 q3[2], r12 +; CHECK-NEXT: vmov.f32 s15, s6 +; CHECK-NEXT: vmov.f32 s10, s14 +; CHECK-NEXT: vmov.f32 s11, s15 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: bne .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: .LBB0_2: @ %for.cond.cleanup +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r5, pc} entry: %0 = and i32 %n, 3 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll @@ -191,14 +191,14 @@ ; CHECK-NEXT: vmov.32 q1[3], r1 ; CHECK-NEXT: adr r1, .LCPI12_0 ; CHECK-NEXT: vldrw.u32 q2, [r1] -; CHECK-NEXT: vand q0, q0, q1 +; CHECK-NEXT: vand q3, q0, q1 +; CHECK-NEXT: vmov r2, s13 ; CHECK-NEXT: vbic q2, q2, q1 -; CHECK-NEXT: vorr q0, q0, q2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r3, r1 -; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vorr q0, q3, q2 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r3, r2 +; CHECK-NEXT: vmov r2, 
s15 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 @@ -206,9 +206,9 @@ ; CHECK-NEXT: csetm r1, ne ; CHECK-NEXT: vmov.32 q1[0], r1 ; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r3, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r3, r2 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll @@ -215,14 +215,14 @@ ; CHECK-NEXT: vmov.32 q1[3], r1 ; CHECK-NEXT: adr r1, .LCPI12_0 ; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r1, s9 ; CHECK-NEXT: vbic q0, q0, q1 -; CHECK-NEXT: vand q1, q2, q1 -; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: sbcs.w r0, r2, r0 -; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vorr q0, q2, q0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r2, r1 +; CHECK-NEXT: vmov r1, s11 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 @@ -230,9 +230,9 @@ ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: sbcs.w r0, r2, r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r2, r1 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w r12, #1 ; CHECK-NEXT: cmp.w r12, #0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -9,18 +9,15 @@ ; CHECK-NEXT: ldrd r2, r12, [r0] ; CHECK-NEXT: ldrd r3, r0, [r0, #8] ; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s10 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.32 q1[2], r12 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f32 s7, s14 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: @@ -446,12 +443,12 @@ ; CHECK-NEXT: vmov.16 q2[1], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov.16 q2[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strd r0, r2, [r1] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: strd r2, r0, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 @@ -466,33 +463,33 @@ define void @vst2_v4f16(<4 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vst2_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldm.w r0, {r2, r3, r12} -; CHECK-NEXT: vmov.32 q0[0], r12 -; CHECK-NEXT: ldr r0, [r0, #12] +; CHECK-NEXT: ldrd r2, r12, [r0] +; CHECK-NEXT: ldrd r3, r0, [r0, #8] ; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.32 
q2[1], r3 -; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q1[0], r3 ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.32 q2[1], r12 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmovx.f16 s8, s9 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.16 q0[5], r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -8,24 +8,30 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr} ; CHECK-NEXT: ldrd r4, r0, [r0, #16] ; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: vmov q2, q1 ; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov.32 q1[2], r12 -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.32 q1[3], lr -; CHECK-NEXT: vmov.f32 s8, s7 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov.32 q3[2], r12 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vmov.f32 s5, s14 +; CHECK-NEXT: vmov.32 q5[3], lr +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s16, s23 +; CHECK-NEXT: vmov.f32 s18, s17 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: strd r2, r0, [r1, #16] -; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r4, pc} entry: %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0 @@ -301,22 +307,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrh r2, [r0, #6] -; CHECK-NEXT: ldrh r3, [r0, #4] +; CHECK-NEXT: ldrh r4, [r0, #6] +; CHECK-NEXT: ldrh.w lr, [r0, #4] ; CHECK-NEXT: ldrh.w r12, [r0, #8] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrh.w lr, [r0, #2] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: ldrh r4, [r0] -; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.16 q0[4], r4 +; CHECK-NEXT: ldrh r2, [r0, #2] +; CHECK-NEXT: vmov.32 q2[0], lr +; CHECK-NEXT: ldrh r3, [r0] ; CHECK-NEXT: ldrh r0, [r0, 
#10] ; CHECK-NEXT: vmov.16 q0[5], r0 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov.32 q0[2], lr -; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.32 q1[2], r2 +; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vdup.32 q1, r12 -; CHECK-NEXT: vmov.f32 s3, s2 ; CHECK-NEXT: vmov.f32 s2, s6 ; CHECK-NEXT: vstrh.32 q0, [r1] ; CHECK-NEXT: str r0, [r1, #8] @@ -388,79 +394,100 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vldrw.u32 q5, [r0, #16] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.16 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] +; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov q0, q3 ; CHECK-NEXT: vmov.16 q0[6], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] ; CHECK-NEXT: vmov.16 q0[7], r2 ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u16 r0, q5[0] +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q5[1] +; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vmov.f32 s17, s8 +; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vdup.32 q5, r2 -; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vmov.u16 r2, q5[2] -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.f32 s13, s25 -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.f32 s14, s26 -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vmov.16 q7[6], r0 +; CHECK-NEXT: vmov.u16 r0, q5[2] +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov.16 q7[7], r0 ; CHECK-NEXT: vdup.32 q6, r2 -; CHECK-NEXT: vmov.f32 s1, s5 +; CHECK-NEXT: vmov.f32 s19, s31 ; CHECK-NEXT: vmov.u16 r2, q6[2] -; CHECK-NEXT: vmov.f32 s22, s7 -; CHECK-NEXT: vrev32.16 q4, q4 -; CHECK-NEXT: vmov.16 q7[2], r2 -; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.u16 r2, q4[2] -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.16 q7[3], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, 
q4[4] +; CHECK-NEXT: vmov q7, q0 +; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vmov.16 q7[4], r0 ; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: vmov.16 q7[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.f32 s21, s29 -; CHECK-NEXT: vmov.f32 s1, s13 -; CHECK-NEXT: vmov.f32 s22, s30 -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vstrw.32 q5, [r1, #32] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.u16 r0, q5[5] +; CHECK-NEXT: vmov.16 q6[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.16 q6[1], r0 +; CHECK-NEXT: vmov.u16 r0, q5[6] +; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vmov.f32 s18, s30 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, q5[7] +; CHECK-NEXT: vmov q7, q0 +; CHECK-NEXT: vmov.f32 s25, s1 +; CHECK-NEXT: vmov.16 q7[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmov.16 q7[7], r0 +; CHECK-NEXT: vmov.f32 s26, s7 +; CHECK-NEXT: vdup.32 q1, r2 +; CHECK-NEXT: vmov.f32 s27, s31 +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.u16 r0, q6[3] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, q6[4] +; CHECK-NEXT: vmov q7, q0 +; CHECK-NEXT: vmov.f32 s25, s1 +; CHECK-NEXT: vmov.16 q7[4], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vrev32.16 q0, q5 +; CHECK-NEXT: vmov.16 q7[5], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.f32 s26, s30 +; CHECK-NEXT: vmov.f32 s13, s1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s14, s10 +; CHECK-NEXT: vstrw.32 q6, [r1, #32] +; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vmov.u16 r2, q3[3] +; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vmov.u16 r2, q3[4] +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s13, s17 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -482,185 +509,218 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #160 -; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: .pad #240 +; CHECK-NEXT: sub sp, #240 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q7, [r0, #64] ; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.16 q2[6], r2 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[0] +; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[1] +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s17, s12 +; CHECK-NEXT: vmov.16 q0[4], r2 ; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vstrw.32 q1, [sp, #208] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q0 +; 
CHECK-NEXT: vmov.f32 s18, s2 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.u16 r2, q2[2] ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vstrw.32 q0, [sp, #224] @ 16-byte Spill ; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s19, s7 ; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[2], r3 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] +; CHECK-NEXT: vmov.u16 r2, q4[3] +; CHECK-NEXT: vmov.16 q1[2], r3 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov.u16 r2, q4[4] ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vstrw.32 q1, [sp, #160] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[4], r2 ; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vmov.16 q2[5], r2 +; CHECK-NEXT: vmov.16 q1[5], r2 ; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: vmov.u16 r2, q7[5] +; CHECK-NEXT: vmov.16 q6[1], r2 ; CHECK-NEXT: vmov.u16 r2, q3[6] -; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[3], r2 ; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s22, s3 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s25, s1 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.u16 r2, q5[3] +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.u16 r2, q7[7] +; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov.f32 s26, s31 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill ; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s27, s7 ; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov.u16 r2, q6[3] +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov.u16 r2, q6[4] +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q6[0], r0 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vstrw.32 q2, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: vmov.16 q5[0], r0 ; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q6[1], r0 +; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q6[6], r0 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov q1, q7 +; CHECK-NEXT: vmov.u16 r0, q4[2] +; 
CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.f32 s21, s16 +; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov.u16 r0, q3[2] +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.f32 s22, s2 +; CHECK-NEXT: vmov.16 q2[7], r0 ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.f32 s23, s11 ; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.f32 s25, s8 +; CHECK-NEXT: vmov.u16 r0, q5[3] ; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u16 r0, q6[3] -; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.u16 r2, q5[4] ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q6[4] -; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vldrw.u32 q4, [sp, #192] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vldrw.u32 q7, [sp, #224] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q7[5] +; CHECK-NEXT: vmov.u16 r0, q4[5] ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.u16 r0, q7[5] ; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q7[6] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q7[7] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.f32 s2, s19 ; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov.u16 r0, q4[6] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.u16 r0, q4[7] +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.u16 r0, q7[7] +; CHECK-NEXT: vmov.f32 s2, s31 +; CHECK-NEXT: vldrw.u32 q7, [sp, #208] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov r2, s31 +; CHECK-NEXT: vdup.32 q2, r2 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vmov.u16 r2, q2[2] ; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vdup.32 q7, r2 -; CHECK-NEXT: vrev32.16 q3, q3 -; CHECK-NEXT: vmov.u16 r2, q7[2] -; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vldrw.u32 q2, [sp, #192] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q7[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov.16 q4[4], r2 +; CHECK-NEXT: vrev32.16 q2, q2 +; CHECK-NEXT: vmov.16 q4[5], r0 +; CHECK-NEXT: vstrw.32 q2, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: vrev32.16 q2, q3 +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #224] @ 16-byte Reload +; CHECK-NEXT: vmov.u16 r0, q2[2] ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.u16 r0, q7[3] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vrev32.16 q7, q7 -; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: 
vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.u16 r0, q7[5] +; CHECK-NEXT: vmov.16 q2[7], r0 ; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp, #176] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q7[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.16 q7[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov.f32 s25, s5 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.f32 s26, s6 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov q2, q7 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov.f32 s1, s5 ; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #80] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp, #128] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s21, s5 -; CHECK-NEXT: vmov.16 q7[7], r0 +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s25, s17 +; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s26, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp, #160] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [r1, #80] +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s5, s17 +; CHECK-NEXT: vldrw.u32 q4, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s6, s18 ; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [r1, #32] -; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q6, [r1, #32] +; CHECK-NEXT: vstrw.32 q5, [r1] ; CHECK-NEXT: vmov.f32 s29, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s30, s18 -; CHECK-NEXT: vstrw.32 q6, [r1] +; CHECK-NEXT: vldrw.u32 q1, [sp, #176] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s30, s6 +; CHECK-NEXT: vmov.f32 s31, s11 ; CHECK-NEXT: vmov.u16 r2, q7[3] -; CHECK-NEXT: vmov.f32 s13, s5 ; CHECK-NEXT: vmov.16 q0[3], r2 ; CHECK-NEXT: vmov.u16 r2, q7[4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov.f32 s29, s1 -; CHECK-NEXT: vmov.f32 s30, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s6 -; CHECK-NEXT: vstrw.32 q7, [r1, #16] +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.f32 s30, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #224] @ 16-byte Reload ; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #208] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q0[2], r2 
+; CHECK-NEXT: vstrw.32 q7, [r1, #16] +; CHECK-NEXT: vmov.f32 s14, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s15, s7 ; CHECK-NEXT: vmov.u16 r2, q3[3] ; CHECK-NEXT: vmov.16 q0[3], r2 ; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov.f32 s13, s1 -; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.f32 s14, s6 ; CHECK-NEXT: vstrw.32 q3, [r1, #64] -; CHECK-NEXT: add sp, #160 +; CHECK-NEXT: add sp, #240 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -682,39 +742,36 @@ define void @vst3_v2i8(<2 x i8> *%src, <6 x i8> *%dst) { ; CHECK-LABEL: vst3_v2i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: ldrb.w lr, [r0, #3] -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: ldrb r5, [r0, #5] -; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: ldrb r3, [r0] +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: ldrb r4, [r0, #2] +; CHECK-NEXT: mov r7, sp +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrb r2, [r0, #1] +; CHECK-NEXT: ldrb r5, [r0, #3] +; CHECK-NEXT: vmov.16 q0[1], r4 +; CHECK-NEXT: ldrb r6, [r0, #5] ; CHECK-NEXT: ldrb r0, [r0, #4] -; CHECK-NEXT: vmov.16 q0[1], r12 -; CHECK-NEXT: mov r2, sp ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: add r0, sp, #8 -; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: vmov.16 q0[4], lr -; CHECK-NEXT: vmov.16 q0[5], r5 -; CHECK-NEXT: vmov.16 q0[6], r6 -; CHECK-NEXT: vmov.16 q0[7], r6 -; CHECK-NEXT: vstrb.16 q0, [r2] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.16 q0[4], r5 +; CHECK-NEXT: vmov.16 q0[5], r6 +; CHECK-NEXT: vmov.16 q0[6], r12 +; CHECK-NEXT: vmov.16 q0[7], r12 +; CHECK-NEXT: vstrb.16 q0, [r7] ; CHECK-NEXT: vstrb.16 q0, [r0] ; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: ldr r2, [sp] ; CHECK-NEXT: str r2, [r1] ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: strh r0, [r1, #4] -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: add sp, #20 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0 %l1 = load <2 x i8>, <2 x i8>* %s1, align 4 @@ -783,32 +840,38 @@ define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) { ; CHECK-LABEL: vst3_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrb.u16 q1, [r0, #8] ; CHECK-NEXT: vldrb.u16 q2, [r0, #16] -; CHECK-NEXT: vldrb.u16 q3, [r0] ; CHECK-NEXT: vmov.u16 r2, q1[5] ; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] ; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov.16 q3[3], r2 ; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vldrb.u16 q3, [r0] +; CHECK-NEXT: vmov.16 q4[6], r2 ; CHECK-NEXT: vmov.u16 r2, q2[7] +; 
CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vmov.16 q4[7], r2 ; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov.f32 s3, s19 ; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s2, s11 ; CHECK-NEXT: vmov.u16 r0, q4[2] ; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] ; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.f32 s1, s21 +; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.16 q6[5], r0 ; CHECK-NEXT: vmov.u16 r0, q3[0] ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[0] @@ -838,14 +901,13 @@ ; CHECK-NEXT: vmov.u16 r0, q1[4] ; CHECK-NEXT: vmov.8 q4[13], r0 ; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.f32 s1, s21 ; CHECK-NEXT: vmov.8 q4[14], r0 ; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.f32 s2, s22 +; CHECK-NEXT: vmov.f32 s2, s26 ; CHECK-NEXT: vmov.8 q4[15], r0 ; CHECK-NEXT: vstrb.16 q0, [r1, #16] ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0 @@ -864,15 +926,28 @@ define void @vst3_v16i8(<16 x i8> *%src, <48 x i8> *%dst) { ; CHECK-LABEL: vst3_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.u8 r2, q1[0] ; CHECK-NEXT: vmov.u8 r3, q3[0] -; CHECK-NEXT: vmov.u8 r0, q2[0] +; CHECK-NEXT: vmov.8 q4[2], r2 +; CHECK-NEXT: vmov.u8 r2, q1[2] +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.8 q0[8], r2 +; CHECK-NEXT: vmov.u8 r2, q1[3] +; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: vmov.u8 r2, q1[4] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vmov.8 q2[14], r2 ; CHECK-NEXT: vmov.8 q5[0], r3 -; CHECK-NEXT: vmov.u8 r2, q1[0] +; CHECK-NEXT: vmov.f32 s19, s11 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.u8 r2, q4[2] +; CHECK-NEXT: vmov.u8 r0, q2[0] ; CHECK-NEXT: vmov.8 q5[1], r0 ; CHECK-NEXT: vmov.u8 r0, q3[1] ; CHECK-NEXT: vmov.8 q5[3], r0 @@ -887,26 +962,17 @@ ; CHECK-NEXT: vmov.u8 r0, q2[3] ; CHECK-NEXT: vmov.8 q5[10], r0 ; CHECK-NEXT: vmov.u8 r0, q3[4] -; CHECK-NEXT: vmov.8 q4[2], r2 -; CHECK-NEXT: vmov.u8 r2, q1[2] ; CHECK-NEXT: vmov.8 q5[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q4[8], r2 -; CHECK-NEXT: vmov.u8 r2, q1[3] ; CHECK-NEXT: vmov.8 q5[13], r0 ; CHECK-NEXT: vmov.u8 r0, q3[5] ; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vmov.8 q4[11], r2 -; CHECK-NEXT: vmov.u8 r2, q1[4] ; CHECK-NEXT: vmov.u8 r0, q5[0] -; CHECK-NEXT: vmov.8 q4[14], r2 ; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.u8 r0, q5[1] ; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r2, q4[2] -; CHECK-NEXT: vmov.8 q0[2], r2 ; CHECK-NEXT: vmov.u8 r0, q5[3] +; CHECK-NEXT: vmov.8 q0[2], r2 ; CHECK-NEXT: vmov.8 q0[3], r0 ; CHECK-NEXT: vmov.u8 r0, q5[4] ; CHECK-NEXT: vmov.8 q0[4], r0 @@ -962,10 +1028,12 @@ ; CHECK-NEXT: vmov.u8 r0, q3[7] ; CHECK-NEXT: vmov.8 q6[5], r0 ; CHECK-NEXT: vmov.u8 r0, q3[8] -; CHECK-NEXT: vmov.8 q6[8], r0 -; CHECK-NEXT: vmov.u8 
r0, q3[9] -; CHECK-NEXT: vmov.8 q6[11], r0 +; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vmov.f32 s24, s13 +; CHECK-NEXT: vmov.8 q7[8], r0 +; CHECK-NEXT: vmov.u8 r0, q3[9] +; CHECK-NEXT: vmov.8 q7[11], r0 +; CHECK-NEXT: vmov.f32 s26, s30 ; CHECK-NEXT: vmov.f32 s27, s14 ; CHECK-NEXT: vmov.u8 r0, q6[2] ; CHECK-NEXT: vmov.8 q4[2], r0 @@ -1061,7 +1129,7 @@ ; CHECK-NEXT: vmov.u8 r0, q5[15] ; CHECK-NEXT: vmov.8 q1[15], r0 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0 @@ -1430,19 +1498,20 @@ ; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmovx.f16 s4, s5 ; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmovx.f16 s4, s8 +; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: vmov.16 q0[4], r0 ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: stm r1!, {r0, r2, r3} +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: strd r2, r3, [r1] +; CHECK-NEXT: str r0, [r1, #8] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 @@ -1463,48 +1532,56 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr} -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: vmov.32 q0[2], r12 -; CHECK-NEXT: vmov.32 q0[3], lr -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: ldrd r3, r2, [r0] +; CHECK-NEXT: ldrd lr, r12, [r0, #8] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.16 q3[0], r3 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmovx.f16 s16, s16 +; CHECK-NEXT: vmov.32 q0[2], lr +; CHECK-NEXT: vmovx.f16 s4, s5 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: ldrd r2, r0, [r0, #16] -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s2 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: ldr r2, [r0, #16] +; CHECK-NEXT: ldr r0, [r0, #20] +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmovx.f16 s16, s2 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmovx.f16 s16, s8 +; CHECK-NEXT: vmov.16 q3[4], r2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.32 q2[1], r0 +; CHECK-NEXT: vmov.32 q0[3], r12 +; CHECK-NEXT: vmov.16 q3[5], r2 ; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: 
vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: vmov r2, s3 ; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q3[7], r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.16 q1[0], r0 +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vmovx.f16 s8, s9 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vstrw.32 q3, [r1] +; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: strd r2, r0, [r1, #16] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 @@ -1525,97 +1602,114 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q6, [r0, #32] +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q1[1], r2 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.16 q2[4], r3 +; CHECK-NEXT: vmov.f32 s5, s16 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov r12, s1 +; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.16 q3[7], r12 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vmovx.f16 s8, s5 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vdup.32 q2, r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmovx.f16 s8, s10 ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmovx.f16 s16, s18 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmovx.f16 s8, s26 +; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s22 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vmovx.f16 s24, s7 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s23 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vdup.32 q7, r2 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov r2, s29 -; CHECK-NEXT: vmov.f32 s18, s23 -; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmovx.f16 s28, s30 -; 
CHECK-NEXT: vmovx.f16 s4, s10 -; CHECK-NEXT: vmov.f32 s1, s13 -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmovx.f16 s28, s9 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov.16 q7[0], r0 -; CHECK-NEXT: vmov.16 q7[1], r2 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q7[7], r0 -; CHECK-NEXT: vmov.f32 s17, s25 -; CHECK-NEXT: vmov.f32 s29, s21 -; CHECK-NEXT: vmov.f32 s30, s10 -; CHECK-NEXT: vmovx.f16 s4, s29 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vrev32.16 q2, q1 -; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmov.16 q3[5], r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov.16 q5[1], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s27 +; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vmov.16 q2[3], r3 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.f32 s21, s9 +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov.f32 s22, s27 +; CHECK-NEXT: vmov.16 q3[7], r2 +; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vmov.f32 s23, s15 +; CHECK-NEXT: vmovx.f16 s8, s21 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vdup.32 q2, r2 ; CHECK-NEXT: vmov r2, s9 ; CHECK-NEXT: vmovx.f16 s8, s10 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s30 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.f32 s29, s5 -; CHECK-NEXT: vmov.f32 s30, s6 -; CHECK-NEXT: vstrw.32 q7, [r1, #16] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vmov.16 q7[2], r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q7[3], r0 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q7[4], r0 +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmov.16 q7[5], r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmovx.f16 s12, s18 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov.f32 s9, s25 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s10, s18 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vrev32.16 q3, q0 +; CHECK-NEXT: vmov r2, s13 +; CHECK-NEXT: vmovx.f16 s12, s14 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.f32 s5, s13 +; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q4[5], r2 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s21, s13 +; CHECK-NEXT: vmov.f32 s22, s30 +; CHECK-NEXT: vmov.f32 s10, s18 +; CHECK-NEXT: vstrw.32 q5, [r1, #32] +; 
CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1637,203 +1731,239 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #144 -; CHECK-NEXT: sub sp, #144 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s12 -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: .pad #240 +; CHECK-NEXT: sub sp, #240 +; CHECK-NEXT: vldrw.u32 q6, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q7, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmovx.f16 s0, s26 +; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q3[2], r3 -; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vmov.16 q3[5], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmovx.f16 s0, s27 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.16 q1[6], r12 +; CHECK-NEXT: vmov.f32 s10, s27 +; CHECK-NEXT: vmov.16 q1[7], r3 +; CHECK-NEXT: vmov r3, s31 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmovx.f16 s0, s9 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q2[2], r3 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.16 q7[0], r3 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vstrw.32 q1, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q2[5], r2 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; 
CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vmov.16 q1[2], r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vstrw.32 q1, [sp, #208] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov.16 q1[5], r3 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vstrw.32 q1, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vstrw.32 q2, [sp, #224] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov.16 q2[1], r3 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov.16 q7[6], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vmov.16 q7[7], r2 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmov r12, s13 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.16 q1[6], r12 +; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vmov.16 q1[7], r3 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vstrw.32 q2, [sp, #176] @ 16-byte Spill ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vdup.32 q0, r0 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.f32 s29, s20 +; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vstrw.32 q1, [sp, #160] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmovx.f16 s0, s14 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q5 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q5[0], r0 ; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.16 q5[1], r2 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s15 -; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s21, s1 +; CHECK-NEXT: vmov.16 q1[6], r3 +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmov.f32 s23, s7 +; CHECK-NEXT: vmovx.f16 s0, s21 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vdup.32 q0, r2 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: 
vmovx.f16 s0, s12
+; CHECK-NEXT: vmov.16 q2[1], r2
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT: vmov.16 q0[4], r3
+; CHECK-NEXT: vmov r0, s29
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmov.f32 s9, s28
+; CHECK-NEXT: vmov r2, s13
+; CHECK-NEXT: vmov.16 q1[6], r0
+; CHECK-NEXT: vmov.16 q1[7], r2
+; CHECK-NEXT: vmov.f32 s10, s2
+; CHECK-NEXT: vmov.f32 s11, s7
 ; CHECK-NEXT: vmovx.f16 s0, s9
+; CHECK-NEXT: vmov r2, s24
+; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT: vmov r0, s0
 ; CHECK-NEXT: vdup.32 q0, r2
 ; CHECK-NEXT: vmov r2, s1
 ; CHECK-NEXT: vmovx.f16 s0, s2
-; CHECK-NEXT: vmov.16 q6[2], r2
-; CHECK-NEXT: vmov.16 q6[3], r0
+; CHECK-NEXT: vmov.16 q1[2], r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov.16 q1[3], r0
 ; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vmov.16 q6[4], r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov.16 q6[5], r0
-; CHECK-NEXT: vmov r0, s13
-; CHECK-NEXT: vmovx.f16 s0, s21
+; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vmov.16 q1[4], r0
+; CHECK-NEXT: vmov r0, s25
+; CHECK-NEXT: vmovx.f16 s0, s29
+; CHECK-NEXT: vmov.16 q1[5], r2
 ; CHECK-NEXT: vmov.16 q4[0], r0
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s0, s22
+; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT: vmov.16 q4[1], r2
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: vmov.16 q4[6], r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov.16 q4[7], r0
-; CHECK-NEXT: vmov.f32 s9, s25
-; CHECK-NEXT: vmov.f32 s17, s13
-; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s18, s22
+; CHECK-NEXT: vmov r0, s26
+; CHECK-NEXT: vmov q0, q4
+; CHECK-NEXT: vmovx.f16 s4, s30
+; CHECK-NEXT: vmov.f32 s17, s25
+; CHECK-NEXT: vmov.16 q0[6], r0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s18, s30
+; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: vmov.f32 s19, s3
 ; CHECK-NEXT: vmovx.f16 s0, s17
 ; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vrev32.16 q0, q1
+; CHECK-NEXT: vrev32.16 q0, q3
 ; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: vmovx.f16 s0, s2
+; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT: vmov.16 q1[2], r2
-; CHECK-NEXT: vmov.f32 s10, s26
+; CHECK-NEXT: vmovx.f16 s0, s2
 ; CHECK-NEXT: vmov.16 q1[3], r0
-; CHECK-NEXT: vldrw.u32 q6, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q2, [r1, #80]
 ; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: vmov.16 q1[4], r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov.16 q1[5], r0
-; CHECK-NEXT: vmovx.f16 s0, s13
-; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vmov r0, s29
+; CHECK-NEXT: vmovx.f16 s0, s25
+; CHECK-NEXT: vmov.16 q1[5], r2
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s0, s14
-; CHECK-NEXT: vmov r0, s5
-; CHECK-NEXT: vmov.16 q5[0], r0
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov.16 q5[1], r2
-; CHECK-NEXT: vmov.16 q5[6], r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov.16 q5[7], r0
-; CHECK-NEXT: vmov.f32 s21, s5
-; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s22, s14
-; CHECK-NEXT: vmovx.f16 s0, s21
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vrev32.16 q3, q0
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: vmovx.f16 s12, s14
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov r0, s22
-; CHECK-NEXT: vmov.16 q0[4], r0
+; CHECK-NEXT: vmov.16 q0[0], r0
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmov.f32 s1, s29
+; CHECK-NEXT: vmovx.f16 s12, s26
+; CHECK-NEXT: vmov.f32 s2, s26
+; CHECK-NEXT: vldrw.u32 q6, [sp, #208] @ 16-byte Reload
+; CHECK-NEXT: vmov r0, s30
+; CHECK-NEXT: vldrw.u32 q7, [sp, #224] @ 16-byte Reload
+; CHECK-NEXT: vmov.16 q1[6], r0
 ; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vldrw.u32 q3, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT: vmov.16 q0[5], r0
-; CHECK-NEXT: vmov.f32 s13, s25
-; CHECK-NEXT: vmov.f32 s14, s26
-; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s25, s5
-; CHECK-NEXT: vstrw.32 q3, [r1]
-; CHECK-NEXT: vmov.f32 s21, s1
-; CHECK-NEXT: vmov.f32 s26, s6
-; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s22, s2
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s29, s5
-; CHECK-NEXT: vstrw.32 q6, [r1, #32]
+; CHECK-NEXT: vmov.f32 s29, s25
+; CHECK-NEXT: vldrw.u32 q6, [sp, #192] @ 16-byte Reload
+; CHECK-NEXT: vmov.16 q1[7], r0
+; CHECK-NEXT: vmov.f32 s3, s7
+; CHECK-NEXT: vmovx.f16 s4, s1
+; CHECK-NEXT: vmov.f32 s30, s26
+; CHECK-NEXT: vldrw.u32 q6, [sp, #176] @ 16-byte Reload
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q7, [sp, #224] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q7, [sp, #160] @ 16-byte Reload
+; CHECK-NEXT: vrev32.16 q1, q1
+; CHECK-NEXT: vmov.f32 s25, s29
+; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT: vmov r2, s5
+; CHECK-NEXT: vmovx.f16 s4, s6
+; CHECK-NEXT: vmov.16 q3[2], r2
+; CHECK-NEXT: vmov.f32 s26, s30
+; CHECK-NEXT: vldrw.u32 q7, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT: vmov.16 q3[3], r0
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vstrw.32 q6, [r1]
+; CHECK-NEXT: vmov r2, s4
+; CHECK-NEXT: vmov q1, q3
+; CHECK-NEXT: vmov.f32 s21, s29
+; CHECK-NEXT: vldrw.u32 q7, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vmov.16 q1[4], r0
+; CHECK-NEXT: vmov.f32 s1, s13
+; CHECK-NEXT: vmov.16 q1[5], r2
+; CHECK-NEXT: vmov.f32 s22, s30
+; CHECK-NEXT: vldrw.u32 q7, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s9, s29
+; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT: vmov.f32 s17, s1
-; CHECK-NEXT: vstrw.32 q5, [r1, #16]
-; CHECK-NEXT: vmov.f32 s30, s6
+; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s10, s30
+; CHECK-NEXT: vstrw.32 q5, [r1, #32]
+; CHECK-NEXT: vstrw.32 q2, [r1, #48]
+; CHECK-NEXT: vldrw.u32 q2, [sp, #224] @ 16-byte Reload
 ; CHECK-NEXT: vmov.f32 s18, s2
-; CHECK-NEXT: vstrw.32 q7, [r1, #48]
+; CHECK-NEXT: vstrw.32 q2, [r1, #80]
 ; CHECK-NEXT: vstrw.32 q4, [r1, #64]
-; CHECK-NEXT: add sp, #144
+; CHECK-NEXT: add sp, #240
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -8,28 +8,30 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, lr}
 ; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr}
+; CHECK-NEXT: ldrd r3, r2, [r0]
+; CHECK-NEXT: ldrd lr, r12, [r0, #8]
 ; CHECK-NEXT: ldrd r4, r0, [r0, #16]
-; CHECK-NEXT: vmov.32 q1[0], r4
+; CHECK-NEXT: vmov.32 q0[0], r4
+; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vmov.32 q1[2], r0
-; CHECK-NEXT: vmov.f64 d0, d2
 ; CHECK-NEXT: vmov.f32 s1, s6
-; CHECK-NEXT: vmov.f32 s2, s4
+; CHECK-NEXT: vmov.f32 s2, s0
 ; CHECK-NEXT: vmov.f32 s3, s6
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: vmov.32 q1[1], r3
-; CHECK-NEXT: vmov.32 q1[2], r12
-; CHECK-NEXT: vmov.32 q1[3], lr
-; CHECK-NEXT: vmov.f64 d4, d2
-; CHECK-NEXT: vmov.f32 s9, s6
-; CHECK-NEXT: vmov.f32 s10, s0
-; CHECK-NEXT: vmov.f32 s11, s2
-; CHECK-NEXT: vstrw.32 q2, [r1]
-; CHECK-NEXT: vmov.f32 s8, s5
-; CHECK-NEXT: vmov.f32 s9, s7
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov.f32 s11, s3
-; CHECK-NEXT: vstrw.32 q2, [r1, #16]
+; CHECK-NEXT: vmov.32 q1[0], r3
+; CHECK-NEXT: vmov q2, q1
+; CHECK-NEXT: vmov.32 q2[1], r2
+; CHECK-NEXT: vmov q3, q2
+; CHECK-NEXT: vmov.32 q3[2], lr
+; CHECK-NEXT: vmov.f32 s5, s14
+; CHECK-NEXT: vmov.32 q3[3], r12
+; CHECK-NEXT: vmov.f32 s6, s0
+; CHECK-NEXT: vmov.f32 s7, s2
+; CHECK-NEXT: vstrw.32 q1, [r1]
+; CHECK-NEXT: vmov.f32 s4, s9
+; CHECK-NEXT: vmov.f32 s5, s15
+; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s3
+; CHECK-NEXT: vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT: pop {r4, pc}
 entry:
 %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
@@ -210,22 +212,19 @@
 ; CHECK-NEXT: .save {r4, lr}
 ; CHECK-NEXT: push {r4, lr}
 ; CHECK-NEXT: ldrh r4, [r0]
-; CHECK-NEXT: ldrh.w lr, [r0, #4]
-; CHECK-NEXT: ldrh r3, [r0, #8]
-; CHECK-NEXT: vmov.32 q0[0], r4
-; CHECK-NEXT: ldrh.w r12, [r0, #6]
-; CHECK-NEXT: ldrh r2, [r0, #10]
-; CHECK-NEXT: ldrh r0, [r0, #2]
-; CHECK-NEXT: vmov.32 q0[2], r0
-; CHECK-NEXT: vmov r4, s0
+; CHECK-NEXT: ldrh r2, [r0, #4]
 ; CHECK-NEXT: vmov.16 q0[0], r4
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: vmov.16 q0[2], r3
-; CHECK-NEXT: vmov.16 q0[3], r3
-; CHECK-NEXT: vmov.16 q0[4], r0
+; CHECK-NEXT: ldrh.w lr, [r0, #2]
+; CHECK-NEXT: ldrh.w r12, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrh r3, [r0, #10]
+; CHECK-NEXT: ldrh r0, [r0, #8]
+; CHECK-NEXT: vmov.16 q0[2], r0
+; CHECK-NEXT: vmov.16 q0[3], r0
+; CHECK-NEXT: vmov.16 q0[4], lr
 ; CHECK-NEXT: vmov.16 q0[5], r12
-; CHECK-NEXT: vmov.16 q0[6], r2
-; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vmov.16 q0[6], r3
+; CHECK-NEXT: vmov.16 q0[7], r3
 ; CHECK-NEXT: vstrw.32 q0, [r1]
 ; CHECK-NEXT: pop {r4, pc}
 entry:
@@ -375,19 +374,16 @@
 ; CHECK-NEXT: .save {r4, lr}
 ; CHECK-NEXT: push {r4, lr}
 ; CHECK-NEXT: ldrb r2, [r0]
-; CHECK-NEXT: ldrb r3, [r0, #1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb.w r12, [r0, #2]
-; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: ldrb r3, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #1]
 ; CHECK-NEXT: ldrb.w lr, [r0, #3]
-; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov.16 q0[1], r3
 ; CHECK-NEXT: ldrb r4, [r0, #5]
-; CHECK-NEXT: vmov.16 q0[0], r2
 ; CHECK-NEXT: ldrb r0, [r0, #4]
-; CHECK-NEXT: vmov.16 q0[1], r12
 ; CHECK-NEXT: vmov.16 q0[2], r0
 ; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov.16 q0[4], r3
+; CHECK-NEXT: vmov.16 q0[4], r12
 ; CHECK-NEXT: vmov.16 q0[5], lr
 ; CHECK-NEXT: vmov.16 q0[6], r4
 ; CHECK-NEXT: vmov.16 q0[7], r4
@@ -913,58 +909,71 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r7, lr}
 ; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
 ; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr}
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: vmov.32 q0[2], r12
-; CHECK-NEXT: vmov.32 q0[3], lr
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmovx.f16 s12, s1
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov.16 q2[0], r3
-; CHECK-NEXT: vmov.16 q2[1], r2
+; CHECK-NEXT: vmov.32 q6[0], r2
+; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.32 q6[1], r3
+; CHECK-NEXT: vmov q1, q6
+; CHECK-NEXT: vmov r3, s25
+; CHECK-NEXT: vmov.32 q1[2], r12
+; CHECK-NEXT: vmov.16 q4[0], r3
+; CHECK-NEXT: vmov q5, q1
+; CHECK-NEXT: vmovx.f16 s24, s25
+; CHECK-NEXT: vmov.32 q5[3], lr
+; CHECK-NEXT: vmovx.f16 s4, s6
+; CHECK-NEXT: vmov r2, s23
+; CHECK-NEXT: vmovx.f16 s20, s23
+; CHECK-NEXT: vmov.16 q4[1], r2
 ; CHECK-NEXT: ldrd r2, r0, [r0, #16]
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: vmov.32 q1[1], r0
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov.32 q1[3], r0
-; CHECK-NEXT: vmov r0, s5
-; CHECK-NEXT: vmov.16 q2[2], r0
-; CHECK-NEXT: vmov r0, s7
-; CHECK-NEXT: vmov.16 q2[3], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmovx.f16 s12, s3
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmovx.f16 s12, s5
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmovx.f16 s12, s7
-; CHECK-NEXT: vmov.16 q2[6], r0
+; CHECK-NEXT: vmov.32 q2[0], r2
+; CHECK-NEXT: vmov q7, q2
+; CHECK-NEXT: vmov.32 q7[1], r0
+; CHECK-NEXT: vmov q0, q7
+; CHECK-NEXT: vmov r3, s29
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: vmov.16 q4[2], r3
+; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: vmovx.f16 s0, s2
+; CHECK-NEXT: vmov.32 q3[3], r0
+; CHECK-NEXT: vmov r0, s15
+; CHECK-NEXT: vmovx.f16 s12, s15
+; CHECK-NEXT: vmov.16 q4[3], r0
+; CHECK-NEXT: vmov r0, s24
+; CHECK-NEXT: vmov.16 q4[4], r0
+; CHECK-NEXT: vmov r0, s20
+; CHECK-NEXT: vmovx.f16 s20, s29
+; CHECK-NEXT: vmov.16 q4[5], r0
+; CHECK-NEXT: vmov r0, s20
+; CHECK-NEXT: vmov.16 q4[6], r0
 ; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmovx.f16 s12, s0
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vstrw.32 q2, [r1, #16]
-; CHECK-NEXT: vmov.16 q2[0], r2
-; CHECK-NEXT: vmov.16 q2[1], r0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.16 q2[2], r0
+; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.16 q4[7], r0
+; CHECK-NEXT: vstrw.32 q4, [r1, #16]
 ; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov.16 q2[3], r0
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: vmovx.f16 s12, s12
+; CHECK-NEXT: vmov.16 q4[0], r2
+; CHECK-NEXT: vmov.16 q4[1], r0
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov.16 q4[2], r0
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov.16 q4[3], r0
 ; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmovx.f16 s0, s2
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmovx.f16 s0, s4
-; CHECK-NEXT: vmov.16 q2[5], r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmovx.f16 s0, s6
-; CHECK-NEXT: vmov.16 q2[6], r0
+; CHECK-NEXT: vmov.16 q4[4], r0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmovx.f16 s4, s8
+; CHECK-NEXT: vmov.16 q4[5], r0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov.16 q4[6], r0
 ; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov.16 q2[7], r0
-; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: vmov.16 q4[7], r0
+; CHECK-NEXT: vstrw.32 q4, [r1]
+; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: pop {r7, pc}
 entry:
 %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
--- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
@@ -50,48 +50,39 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pSrc, i32 %blockSize, <4 x i32> %a) {
 ; CHECK-LABEL: foo_v4i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
 ; CHECK-NEXT: vpt.s32 lt, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q4, [r0]
-; CHECK-NEXT: vmov.f64 d0, d8
-; CHECK-NEXT: vmov.i64 q5, #0xffffffff
-; CHECK-NEXT: vmov.f32 s2, s17
-; CHECK-NEXT: vand q6, q0, q5
-; CHECK-NEXT: vmov r0, s24
-; CHECK-NEXT: vmov r1, s25
+; CHECK-NEXT: vmov.f64 d10, d9
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vmov.f32 s22, s19
+; CHECK-NEXT: vmov r0, s20
 ; CHECK-NEXT: bl __aeabi_ul2d
 ; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: vmov r0, s22
 ; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: vmov r0, s26
-; CHECK-NEXT: vmov r1, s27
+; CHECK-NEXT: movs r1, #0
 ; CHECK-NEXT: bl __aeabi_ul2d
-; CHECK-NEXT: vmov.f64 d0, d9
-; CHECK-NEXT: vmov.f32 s2, s19
-; CHECK-NEXT: vand q0, q0, q5
+; CHECK-NEXT: vmov.f32 s18, s17
+; CHECK-NEXT: vmov r6, s16
+; CHECK-NEXT: vmov r2, s18
 ; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: vmov r3, s3
-; CHECK-NEXT: vmov r6, s0
-; CHECK-NEXT: vmov r7, s1
+; CHECK-NEXT: movs r1, #0
 ; CHECK-NEXT: vmov d8, r4, r5
 ; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
 ; CHECK-NEXT: bl __aeabi_ul2d
 ; CHECK-NEXT: vmov d11, r0, r1
 ; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r1, r7
+; CHECK-NEXT: movs r1, #0
 ; CHECK-NEXT: bl __aeabi_ul2d
 ; CHECK-NEXT: vmov d10, r0, r1
-; CHECK-NEXT: vmov q0, q4
-; CHECK-NEXT: vmov q1, q5
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: vmov q1, q4
+; CHECK-NEXT: vmov q0, q5
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
 entry:
 %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer
 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %pSrc, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -513,7 +513,6 @@
 ; SSE2-SSSE3-NEXT: shll $16, %edx
 ; SSE2-SSSE3-NEXT: orl %eax, %edx
 ; SSE2-SSSE3-NEXT: shlq $32, %rdx
-; SSE2-SSSE3-NEXT: orq %rcx, %rdx
 ; SSE2-SSSE3-NEXT: movq %rdx, %xmm0
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; SSE2-SSSE3-NEXT: movd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -773,6 +773,7 @@ define <4 x i32> @ossfuzz5688(i32 %a0) {
 ; CHECK-LABEL: ossfuzz5688:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: movl $0, (%rax)
 ; CHECK-NEXT: retq
 %1 = insertelement <4 x i32> zeroinitializer, i32 -2147483648, i32 %a0
 %2 = extractelement <4 x i32> %1, i32 %a0
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -2054,22 +2054,22 @@
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,7,42,32]
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [32768,4294934528,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm6
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[0,2]
-; SSE2-NEXT: pmuludq %xmm3, %xmm0
-; SSE2-NEXT: pmuludq %xmm5, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
-; SSE2-NEXT: paddd %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pmuludq %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2]
+; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,4294967295]
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; AVX1-LABEL: pmaddwd_negative2:
diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll
--- a/llvm/test/CodeGen/X86/promote-cmp.ll
+++ b/llvm/test/CodeGen/X86/promote-cmp.ll
@@ -14,9 +14,10 @@
 ; SSE2-NEXT: pxor %xmm4, %xmm6
 ; SSE2-NEXT: movdqa %xmm6, %xmm7
 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
 ; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
 ; SSE2-NEXT: por %xmm5, %xmm6
 ; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -31,12 +32,9 @@
 ; SSE2-NEXT: por %xmm4, %xmm5
 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
 ; SSE2-NEXT: pxor {{.*}}(%rip), %xmm4
-; SSE2-NEXT: psllq $63, %xmm6
-; SSE2-NEXT: psrad $31, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,1,3]
 ; SSE2-NEXT: psllq $63, %xmm3
 ; SSE2-NEXT: psrad $31, %xmm3
@@ -56,7 +54,6 @@
 ; SSE4-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
 ; SSE4-NEXT: pcmpeqd %xmm6, %xmm6
 ; SSE4-NEXT: pxor %xmm5, %xmm6
-; SSE4-NEXT: psllq $63, %xmm0
 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
 ; SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero
 ; SSE4-NEXT: psllq $63, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_setcc.ll b/llvm/test/CodeGen/X86/vec_setcc.ll
--- a/llvm/test/CodeGen/X86/vec_setcc.ll
+++ b/llvm/test/CodeGen/X86/vec_setcc.ll
@@ -206,11 +206,12 @@
 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm1
+; SSE2-NEXT: movd %xmm1, %eax
 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl
 ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: test_setcc_v3i1_v3i16:
@@ -218,9 +219,10 @@
 ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE41-NEXT: pxor %xmm1, %xmm1
 ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: pextrb $2, %xmm1, %edx
-; SSE41-NEXT: pextrb $4, %xmm1, %ecx
+; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
+; SSE41-NEXT: movd %xmm1, %eax
+; SSE41-NEXT: pextrb $4, %xmm0, %edx
+; SSE41-NEXT: pextrb $8, %xmm0, %ecx
 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
 ; SSE41-NEXT: # kill: def $dl killed $dl killed $edx
 ; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx
@@ -231,9 +233,10 @@
 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: vpextrb $2, %xmm0, %edx
-; AVX-NEXT: vpextrb $4, %xmm0, %ecx
+; AVX-NEXT: vpmovsxwd %xmm0, %xmm1
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: vpextrb $4, %xmm1, %edx
+; AVX-NEXT: vpextrb $8, %xmm1, %ecx
 ; AVX-NEXT: # kill: def $al killed $al killed $eax
 ; AVX-NEXT: # kill: def $dl killed $dl killed $edx
 ; AVX-NEXT: # kill: def $cl killed $cl killed $ecx
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -185,13 +185,14 @@
 ; X32-SSE-NEXT: psrlq $1, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pand %xmm3, %xmm2
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
 ; X32-SSE-NEXT: psllq %xmm2, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
@@ -1240,13 +1241,14 @@
 ; X32-SSE-NEXT: psrlq $1, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pand %xmm3, %xmm2
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
 ; X32-SSE-NEXT: psllq %xmm2, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
@@ -2505,17 +2507,15 @@
 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = <4,u,14,u>
 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
 ; X32-SSE-NEXT: pandn %xmm2, %xmm4
-; X32-SSE-NEXT: psrlq $1, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: psrlq $1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm1
+; X32-SSE-NEXT: psrlq $50, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT: psllq %xmm3, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm3, %xmm0
+; X32-SSE-NEXT: psllq $14, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
@@ -3032,10 +3032,8 @@
 ; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
 ; X32-SSE: # %bb.0:
 ; X32-SSE-NEXT: psrlq $50, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
 ; X32-SSE-NEXT: psllq $14, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
-; X32-SSE-NEXT: orpd %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> )
 ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -136,17 +136,18 @@
 ; X32-SSE-NEXT: pand %xmm2, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psllq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psllq %xmm1, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm6
+; X32-SSE-NEXT: psllq %xmm1, %xmm6
+; X32-SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psrlq %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-SSE-NEXT: psrlq %xmm3, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm6, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %amt)
 ret <2 x i64> %res
@@ -745,17 +746,18 @@
 ; X32-SSE-NEXT: pand %xmm2, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psllq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psllq %xmm1, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm6
+; X32-SSE-NEXT: psllq %xmm1, %xmm6
+; X32-SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psrlq %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-SSE-NEXT: psrlq %xmm3, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm6, %xmm0
 ; X32-SSE-NEXT: retl
 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat)
@@ -1263,17 +1265,15 @@
 ; X32-SSE-NEXT: pand %xmm1, %xmm2
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psllq %xmm2, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psllq %xmm2, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: psllq $14, %xmm2
+; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
 ; X32-SSE-NEXT: pand %xmm1, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psrlq %xmm2, %xmm0
+; X32-SSE-NEXT: psrlq $50, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm2, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> )
 ret <2 x i64> %res
@@ -1665,10 +1665,8 @@
 ; X32-SSE: # %bb.0:
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq $50, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
 ; X32-SSE-NEXT: psllq $14, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
-; X32-SSE-NEXT: orpd %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> )
 ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -185,14 +185,15 @@
 ; X32-SSE-NEXT: pand %xmm3, %xmm4
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pandn %xmm3, %xmm2
 ; X32-SSE-NEXT: psllq $1, %xmm0
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
 ; X32-SSE-NEXT: psllq %xmm2, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
@@ -1225,14 +1226,15 @@
 ; X32-SSE-NEXT: pand %xmm3, %xmm4
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pandn %xmm3, %xmm2
 ; X32-SSE-NEXT: psllq $1, %xmm0
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
 ; X32-SSE-NEXT: psllq %xmm2, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
@@ -2124,15 +2126,13 @@
 ; X32-SSE-NEXT: pand %xmm2, %xmm4
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm1
+; X32-SSE-NEXT: psrlq $14, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pandn %xmm2, %xmm3
-; X32-SSE-NEXT: psllq $1, %xmm0
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: psllq $1, %xmm2
 ; X32-SSE-NEXT: psllq %xmm3, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm3, %xmm0
+; X32-SSE-NEXT: psllq $50, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
@@ -2662,10 +2662,8 @@
 ; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
 ; X32-SSE: # %bb.0:
 ; X32-SSE-NEXT: psrlq $14, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
 ; X32-SSE-NEXT: psllq $50, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
-; X32-SSE-NEXT: orpd %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> )
 ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -138,17 +138,18 @@
 ; X32-SSE-NEXT: pand %xmm2, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psrlq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psrlq %xmm1, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm6
+; X32-SSE-NEXT: psrlq %xmm1, %xmm6
+; X32-SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psllq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-SSE-NEXT: psllq %xmm3, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm6, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %amt)
 ret <2 x i64> %res
@@ -789,17 +790,18 @@
 ; X32-SSE-NEXT: pand %xmm2, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psrlq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psrlq %xmm1, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm6
+; X32-SSE-NEXT: psrlq %xmm1, %xmm6
+; X32-SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psllq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-SSE-NEXT: psllq %xmm3, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm6, %xmm0
 ; X32-SSE-NEXT: retl
 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat)
@@ -1343,17 +1345,15 @@
 ; X32-SSE-NEXT: pand %xmm1, %xmm2
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psrlq %xmm2, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psrlq %xmm2, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: psrlq $14, %xmm2
+; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
 ; X32-SSE-NEXT: pand %xmm1, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psllq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm2, %xmm0
+; X32-SSE-NEXT: psllq $50, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm2, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> )
 ret <2 x i64> %res
@@ -1745,10 +1745,8 @@
 ; X32-SSE: # %bb.0:
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psllq $50, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
 ; X32-SSE-NEXT: psrlq $14, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
-; X32-SSE-NEXT: orpd %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> )
 ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -1928,20 +1928,17 @@
 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpsrld $8, %xmm0, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
@@ -2006,20 +2003,17 @@
 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsrlq $32, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpsrld $8, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT: vmovd %xmm0, %eax
 ; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
@@ -2032,20 +2026,17 @@
 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
 ; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpsrlq $32, %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpsrld $8, %xmm0, %xmm1
+; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax
 ; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
@@ -2168,19 +2159,16 @@
 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpmullw %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vpmullw %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpmullw %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $8, %xmm0, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
@@ -2204,20 +2192,17 @@
 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm3, %zmm1
 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsrld $8, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vmovd %xmm0, %eax
 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
@@ -2250,8 +2235,8 @@
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpsrld $8, %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax
 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
@@ -2504,19 +2489,16 @@
 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm4, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpmullw %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpmullw %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT: vpsrld $8, %xmm0, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovd %xmm0, %eax
 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
@@ -2543,20 +2525,17 @@
 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512BW-NEXT: vpmullw %zmm0, %zmm3, %zmm0
 ; AVX512BW-NEXT: vpmullw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrld $8, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vmovd %xmm0, %eax
 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
@@ -2592,8 +2571,8 @@
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[12],zero,xmm0[14],zero,zero,zero,zero,zero
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpsrld $8, %xmm0, %xmm1
+; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax
 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -2313,11 +2313,9 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -569,8 +569,7 @@
 ; SSE2-NEXT: movd %edi, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
 ; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: pandn %xmm1, %xmm0
 ; SSE2-NEXT: por %xmm2, %xmm0
@@ -583,23 +582,32 @@
 ; SSE41-NEXT: movd %edi, %xmm1
 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
 ; SSE41-NEXT: por %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
-; SSE41-NEXT: pinsrd $1, %edi, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT: movaps %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: simplify_select:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX-NEXT: vmovd %edi, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
-; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; AVX-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: simplify_select:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
+; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: simplify_select:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1]
+; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: retq
 %a = insertelement <2 x i32> , i32 %x, i32 1
 %b = insertelement <2 x i32> , i32 %x, i32 0
 %y = or <2 x i32> %a, %b