diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17881,6 +17881,21 @@ unsigned NumElts = VecVT.getVectorNumElements(); unsigned VecEltBitWidth = VecVT.getScalarSizeInBits(); + // If all bits of the extracted element are known, return it as a constant. + if (IndexC && ScalarVT == VecVT.getVectorElementType()) { + APInt DemandedElts = APInt::getOneBitSet(NumElts, IndexC->getZExtValue()); + KnownBits Known = DAG.computeKnownBits(VecOp, DemandedElts); + if (Known.isConstant()) { + APInt KnownVal = Known.getConstant(); + if (ScalarVT.isInteger()) + return DAG.getConstant(KnownVal, DL, ScalarVT); + if (ScalarVT.isFloatingPoint()) + return DAG.getConstantFP( + APFloat(DAG.EVTToAPFloatSemantics(ScalarVT), KnownVal), DL, + ScalarVT); + } + } + // TODO: These transforms should not require the 'hasOneUse' restriction, but // there are regressions on multiple targets without it. We can end up with a // mess of scalar and vector code if we reduce only part of the DAG to scalar. diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll --- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll @@ -57,8 +57,8 @@ ; CHECK-LABEL: widen_f16_build_vector: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #13294 -; CHECK-NEXT: dup.4h v0, w8 -; CHECK-NEXT: str s0, [x0] +; CHECK-NEXT: movk w8, #13294, lsl #16 +; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret %1 = bitcast half* %addr to <2 x half>* store <2 x half> , <2 x half>* %1, align 2 diff --git a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll --- a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll +++ b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll @@ -30,10 +30,10 @@ define [1 x <4 x float>] @test2() { ; CHECK-LABEL: .p2align 4 ; -- Begin function test2 ; CHECK-NEXT: lCPI1_0: -; CHECK-NEXT: .long 0x00000000 ; float 0 -; CHECK-NEXT: .long 0x00000000 ; float 0 -; CHECK-NEXT: .long 0x00000000 ; float 0 -; CHECK-NEXT: .long 0x3f800000 ; float 1 +; CHECK-NEXT: .long 0x80000000 ; float -0 +; CHECK-NEXT: .long 0x80000000 ; float -0 +; CHECK-NEXT: .long 0x80000000 ; float -0 +; CHECK-NEXT: .long 0xbf800000 ; float -1 ; CHECK-NEXT: .section __TEXT,__text,regular,pure_instructions ; CHECK-NEXT: .globl _test2 ; CHECK-NEXT: .p2align 2 @@ -43,17 +43,7 @@ ; CHECK-NEXT: Lloh2: ; CHECK-NEXT: adrp x8, lCPI1_0@PAGE ; CHECK-NEXT: Lloh3: -; CHECK-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] -; CHECK-NEXT: mov s2, v1[1] -; CHECK-NEXT: fneg s0, s1 -; CHECK-NEXT: mov s3, v1[2] -; CHECK-NEXT: fneg s2, s2 -; CHECK-NEXT: mov s1, v1[3] -; CHECK-NEXT: fneg s3, s3 -; CHECK-NEXT: mov.s v0[1], v2[0] -; CHECK-NEXT: mov.s v0[2], v3[0] -; CHECK-NEXT: fneg s1, s1 -; CHECK-NEXT: mov.s v0[3], v1[0] +; CHECK-NEXT: ldr q0, [x8, lCPI1_0@PAGEOFF] ; CHECK-NEXT: ret ; ret [1 x <4 x float>] [<4 x float> diff --git a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll --- a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll +++ b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll @@ -8,19 +8,14 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI0_0 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] -; CHECK-NEXT: adrp x8, .LCPI0_1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ldr d2, 
[x8, :lo12:.LCPI0_1] +; CHECK-NEXT: movi v2.4h, #1 ; CHECK-NEXT: dup v0.4h, v0.h[0] +; CHECK-NEXT: mov w1, wzr ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: movi v1.4h, #1 -; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h -; CHECK-NEXT: and v0.8b, v0.8b, v2.8b -; CHECK-NEXT: shl v0.4h, v0.4h, #15 -; CHECK-NEXT: sshr v0.4h, v0.4h, #15 +; CHECK-NEXT: cmgt v0.4h, v2.4h, v0.4h ; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: umov w3, v0.h[3] -; CHECK-NEXT: mov w1, wzr ; CHECK-NEXT: mov w2, wzr ; CHECK-NEXT: b foo %tmp3 = shufflevector <4 x i16> %a1, <4 x i16> undef, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll --- a/llvm/test/CodeGen/AArch64/popcount.ll +++ b/llvm/test/CodeGen/AArch64/popcount.ll @@ -74,8 +74,7 @@ ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: fmov w0, s1 ; CHECK-NEXT: // kill: def $x0 killed $w0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: mov x1, v0.d[1] +; CHECK-NEXT: mov x1, xzr ; CHECK-NEXT: ret Entry: %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0) diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -108,7 +108,6 @@ ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64 ; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]] -; GCN: v_mov_b32_e32 v[[R_I64_1_High:[0-9]+]], 0 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] @@ -119,6 +118,7 @@ ; VI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] ; VI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]] ; GCN: v_mov_b32_e32 v[[R_I64_0_High:[0-9]+]], 0 +; GCN: v_mov_b32_e32 v[[R_I64_1_High:[0-9]+]], v[[R_I64_0_High]] ; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}} ; GCN: s_endpgm define amdgpu_kernel void @fptoui_v2f16_to_v2i64( diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -125,18 +125,18 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v1, vcc, 64, v0 -; GCN-NEXT: v_lshr_b64 v[2:3], 17, v1 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, 64, v0 -; GCN-NEXT: v_lshl_b64 v[4:5], 17, v1 +; GCN-NEXT: v_lshr_b64 v[1:2], 17, v1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, 64, v0 +; GCN-NEXT: v_lshl_b64 v[2:3], 17, v2 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN-NEXT: v_lshl_b64 v[4:5], 17, v0 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v1, s[4:5] -; GCN-NEXT: v_lshl_b64 v[0:1], 17, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = shl i128 17, %rhs ret i128 %shl @@ -146,16 +146,15 @@ ; GCN-LABEL: v_lshr_i128_kv: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_movk_i32 s4, 0x41 ; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_movk_i32 s4, 0x41 ; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 
-; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -169,11 +168,10 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshr_b64 v[1:2], 33, v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 33, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -442,9 +442,9 @@ ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_pk_sub_i16 v0, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/ARM/func-argpassing-endian.ll b/llvm/test/CodeGen/ARM/func-argpassing-endian.ll --- a/llvm/test/CodeGen/ARM/func-argpassing-endian.ll +++ b/llvm/test/CodeGen/ARM/func-argpassing-endian.ll @@ -102,31 +102,33 @@ define <4 x i32> @return_v4i32() { ; CHECK-LE-LABEL: return_v4i32: ; CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: adr r0, .LCPI6_0 -; CHECK-LE-NEXT: vld1.64 {d16, d17}, [r0:128] +; CHECK-LE-NEXT: vldr d16, .LCPI6_0 +; CHECK-LE-NEXT: vldr d17, .LCPI6_1 ; CHECK-LE-NEXT: vmov r0, r1, d16 ; CHECK-LE-NEXT: vmov r2, r3, d17 ; CHECK-LE-NEXT: bx lr -; CHECK-LE-NEXT: .p2align 4 +; CHECK-LE-NEXT: .p2align 3 ; CHECK-LE-NEXT: @ %bb.1: ; CHECK-LE-NEXT: .LCPI6_0: ; CHECK-LE-NEXT: .long 42 @ double 9.1245819032257467E-313 ; CHECK-LE-NEXT: .long 43 +; CHECK-LE-NEXT: .LCPI6_1: ; CHECK-LE-NEXT: .long 44 @ double 9.5489810615176143E-313 ; CHECK-LE-NEXT: .long 45 ; ; CHECK-BE-LABEL: return_v4i32: ; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: adr r0, .LCPI6_0 -; CHECK-BE-NEXT: vld1.64 {d16, d17}, [r0:128] +; CHECK-BE-NEXT: vldr d16, .LCPI6_0 +; CHECK-BE-NEXT: vldr d17, .LCPI6_1 ; CHECK-BE-NEXT: vmov r1, r0, d16 ; CHECK-BE-NEXT: vmov r3, r2, d17 ; CHECK-BE-NEXT: bx lr -; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .p2align 3 ; CHECK-BE-NEXT: @ %bb.1: ; CHECK-BE-NEXT: .LCPI6_0: ; CHECK-BE-NEXT: .long 42 @ double 8.912382324178626E-313 ; CHECK-BE-NEXT: .long 43 +; CHECK-BE-NEXT: .LCPI6_1: ; CHECK-BE-NEXT: .long 44 @ double 9.3367814824704935E-313 ; CHECK-BE-NEXT: .long 45 ret < 4 x i32> < i32 42, i32 43, i32 44, i32 45 > diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll --- a/llvm/test/CodeGen/Mips/cconv/vector.ll +++ b/llvm/test/CodeGen/Mips/cconv/vector.ll @@ -4170,77 +4170,81 @@ ; MIPS64EB-NEXT: jr $ra ; MIPS64EB-NEXT: nop ; -; MIPS32R5-LABEL: 
calli8_16: -; MIPS32R5: # %bb.0: # %entry -; MIPS32R5-NEXT: addiu $sp, $sp, -40 -; MIPS32R5-NEXT: .cfi_def_cfa_offset 40 -; MIPS32R5-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 31, -4 -; MIPS32R5-NEXT: lui $1, %hi($CPI30_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI30_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5-NEXT: lui $1, %hi($CPI30_1) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI30_1) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) -; MIPS32R5-NEXT: sw $1, 16($sp) -; MIPS32R5-NEXT: jal i8_16 -; MIPS32R5-NEXT: nop -; MIPS32R5-NEXT: lui $1, %hi(gv16i8) -; MIPS32R5-NEXT: insert.w $w0[0], $2 -; MIPS32R5-NEXT: insert.w $w0[1], $3 -; MIPS32R5-NEXT: addiu $1, $1, %lo(gv16i8) -; MIPS32R5-NEXT: insert.w $w0[2], $4 -; MIPS32R5-NEXT: insert.w $w0[3], $5 -; MIPS32R5-NEXT: st.w $w0, 0($1) -; MIPS32R5-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: addiu $sp, $sp, 40 -; MIPS32R5-NEXT: jr $ra -; MIPS32R5-NEXT: nop +; MIPS32R5EB-LABEL: calli8_16: +; MIPS32R5EB: # %bb.0: # %entry +; MIPS32R5EB-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: lui $1, 3080 +; MIPS32R5EB-NEXT: ori $1, $1, 2314 +; MIPS32R5EB-NEXT: lui $2, 1801 +; MIPS32R5EB-NEXT: sw $1, 28($sp) +; MIPS32R5EB-NEXT: ori $1, $2, 1801 +; MIPS32R5EB-NEXT: sw $1, 24($sp) +; MIPS32R5EB-NEXT: sw $1, 20($sp) +; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: lui $1, 1543 +; MIPS32R5EB-NEXT: ori $4, $1, 1543 +; MIPS32R5EB-NEXT: ori $7, $1, 2314 +; MIPS32R5EB-NEXT: move $5, $4 +; MIPS32R5EB-NEXT: move $6, $4 +; MIPS32R5EB-NEXT: jal i8_16 +; MIPS32R5EB-NEXT: nop +; MIPS32R5EB-NEXT: insert.w $w0[0], $2 +; MIPS32R5EB-NEXT: insert.w $w0[1], $3 +; MIPS32R5EB-NEXT: insert.w $w0[2], $4 +; MIPS32R5EB-NEXT: lui $1, %hi(gv16i8) +; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv16i8) +; MIPS32R5EB-NEXT: st.w $w0, 0($1) +; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EB-NEXT: jr $ra +; MIPS32R5EB-NEXT: nop ; -; MIPS64R5-LABEL: calli8_16: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI30_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI30_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI30_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI30_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] -; MIPS64R5-NEXT: ld $25, %call16(i8_16)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; 
MIPS64R5-NEXT: insert.d $w0[0], $2 -; MIPS64R5-NEXT: insert.d $w0[1], $3 -; MIPS64R5-NEXT: ld $1, %got_disp(gv16i8)($gp) -; MIPS64R5-NEXT: st.d $w0, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: calli8_16: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16))) +; MIPS64R5EB-NEXT: lui $1, 1801 +; MIPS64R5EB-NEXT: daddiu $1, $1, 1801 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $1, $1, 1801 +; MIPS64R5EB-NEXT: lui $2, 1543 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $2, $2, 1543 +; MIPS64R5EB-NEXT: dsll $2, $2, 16 +; MIPS64R5EB-NEXT: daddiu $2, $2, 1543 +; MIPS64R5EB-NEXT: dsll $2, $2, 16 +; MIPS64R5EB-NEXT: daddiu $4, $2, 1543 +; MIPS64R5EB-NEXT: daddiu $5, $2, 2314 +; MIPS64R5EB-NEXT: daddiu $6, $1, 1801 +; MIPS64R5EB-NEXT: lui $1, 225 +; MIPS64R5EB-NEXT: daddiu $1, $1, 8417 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $1, $1, 8577 +; MIPS64R5EB-NEXT: dsll $1, $1, 19 +; MIPS64R5EB-NEXT: daddiu $7, $1, 2314 +; MIPS64R5EB-NEXT: ld $25, %call16(i8_16)($gp) +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv16i8)($gp) +; MIPS64R5EB-NEXT: insert.d $w0[0], $2 +; MIPS64R5EB-NEXT: insert.d $w0[1], $3 +; MIPS64R5EB-NEXT: st.d $w0, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS32EL-LABEL: calli8_16: ; MIPS32EL: # %bb.0: # %entry @@ -4320,6 +4324,87 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS32R5EL-LABEL: calli8_16: +; MIPS32R5EL: # %bb.0: # %entry +; MIPS32R5EL-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: lui $1, 2569 +; MIPS32R5EL-NEXT: ori $2, $1, 2060 +; MIPS32R5EL-NEXT: lui $3, 2311 +; MIPS32R5EL-NEXT: sw $2, 28($sp) +; MIPS32R5EL-NEXT: ori $2, $3, 2311 +; MIPS32R5EL-NEXT: sw $2, 24($sp) +; MIPS32R5EL-NEXT: sw $2, 20($sp) +; MIPS32R5EL-NEXT: sw $2, 16($sp) +; MIPS32R5EL-NEXT: lui $2, 1798 +; MIPS32R5EL-NEXT: ori $4, $2, 1798 +; MIPS32R5EL-NEXT: ori $7, $1, 1798 +; MIPS32R5EL-NEXT: move $5, $4 +; MIPS32R5EL-NEXT: move $6, $4 +; MIPS32R5EL-NEXT: jal i8_16 +; MIPS32R5EL-NEXT: nop +; MIPS32R5EL-NEXT: insert.w $w0[0], $2 +; MIPS32R5EL-NEXT: insert.w $w0[1], $3 +; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv16i8) +; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv16i8) +; MIPS32R5EL-NEXT: st.w $w0, 0($1) +; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop +; +; MIPS64R5EL-LABEL: calli8_16: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; 
MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16))) +; MIPS64R5EL-NEXT: lui $1, 1285 +; MIPS64R5EL-NEXT: daddiu $1, $1, -31869 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 899 +; MIPS64R5EL-NEXT: lui $2, 2311 +; MIPS64R5EL-NEXT: daddiu $2, $2, 2311 +; MIPS64R5EL-NEXT: dsll $2, $2, 16 +; MIPS64R5EL-NEXT: daddiu $2, $2, 2311 +; MIPS64R5EL-NEXT: dsll $2, $2, 16 +; MIPS64R5EL-NEXT: dsll $1, $1, 17 +; MIPS64R5EL-NEXT: lui $3, 899 +; MIPS64R5EL-NEXT: daddiu $3, $3, 899 +; MIPS64R5EL-NEXT: dsll $3, $3, 16 +; MIPS64R5EL-NEXT: daddiu $3, $3, 899 +; MIPS64R5EL-NEXT: dsll $3, $3, 17 +; MIPS64R5EL-NEXT: daddiu $4, $3, 1798 +; MIPS64R5EL-NEXT: daddiu $5, $1, 1798 +; MIPS64R5EL-NEXT: daddiu $6, $2, 2311 +; MIPS64R5EL-NEXT: lui $1, 642 +; MIPS64R5EL-NEXT: daddiu $1, $1, 16899 +; MIPS64R5EL-NEXT: dsll $1, $1, 18 +; MIPS64R5EL-NEXT: daddiu $1, $1, 2311 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $7, $1, 2311 +; MIPS64R5EL-NEXT: ld $25, %call16(i8_16)($gp) +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv16i8)($gp) +; MIPS64R5EL-NEXT: insert.d $w0[0], $2 +; MIPS64R5EL-NEXT: insert.d $w0[1], $3 +; MIPS64R5EL-NEXT: st.d $w0, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <16 x i8> @i8_16(<16 x i8> , <16 x i8> ) store <16 x i8> %0, <16 x i8> * @gv16i8 @@ -4825,36 +4910,26 @@ ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40 ; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill ; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: lui $1, 9 +; MIPS32R5EB-NEXT: ori $5, $1, 10 +; MIPS32R5EB-NEXT: sw $5, 28($sp) +; MIPS32R5EB-NEXT: lui $1, 12 +; MIPS32R5EB-NEXT: ori $1, $1, 8 +; MIPS32R5EB-NEXT: sw $1, 24($sp) +; MIPS32R5EB-NEXT: sw $5, 20($sp) ; MIPS32R5EB-NEXT: lui $1, 6 -; MIPS32R5EB-NEXT: ori $1, $1, 7 -; MIPS32R5EB-NEXT: lui $2, 9 -; MIPS32R5EB-NEXT: ori $2, $2, 10 -; MIPS32R5EB-NEXT: fill.w $w0, $2 -; MIPS32R5EB-NEXT: insert.w $w0[1], $1 -; MIPS32R5EB-NEXT: splati.d $w0, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5EB-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5EB-NEXT: lui $1, %hi($CPI33_0) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo($CPI33_0) -; MIPS32R5EB-NEXT: ld.w $w0, 0($1) -; MIPS32R5EB-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EB-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5EB-NEXT: sw $8, 28($sp) -; MIPS32R5EB-NEXT: sw $3, 24($sp) -; MIPS32R5EB-NEXT: sw $2, 20($sp) -; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: ori $4, $1, 7 +; MIPS32R5EB-NEXT: sw $4, 16($sp) +; MIPS32R5EB-NEXT: move $6, $4 +; MIPS32R5EB-NEXT: move $7, $5 ; MIPS32R5EB-NEXT: jal i16_8 ; MIPS32R5EB-NEXT: nop -; MIPS32R5EB-NEXT: lui $1, %hi(gv8i16) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EB-NEXT: insert.w $w0[0], $2 ; MIPS32R5EB-NEXT: insert.w $w0[1], $3 ; MIPS32R5EB-NEXT: insert.w $w0[2], $4 +; MIPS32R5EB-NEXT: lui $1, %hi(gv8i16) ; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: addiu $1, $1, 
%lo(gv8i16) ; MIPS32R5EB-NEXT: st.w $w0, 0($1) ; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload ; MIPS32R5EB-NEXT: addiu $sp, $sp, 40 @@ -4872,20 +4947,21 @@ ; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_8))) ; MIPS64R5EB-NEXT: daddu $1, $1, $25 ; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli16_8))) -; MIPS64R5EB-NEXT: lui $1, 9 -; MIPS64R5EB-NEXT: ori $1, $1, 10 -; MIPS64R5EB-NEXT: lui $2, 6 -; MIPS64R5EB-NEXT: ori $2, $2, 7 -; MIPS64R5EB-NEXT: dinsu $1, $2, 32, 32 -; MIPS64R5EB-NEXT: fill.d $w0, $1 -; MIPS64R5EB-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5EB-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5EB-NEXT: ld $1, %got_page(.LCPI33_0)($gp) -; MIPS64R5EB-NEXT: daddiu $1, $1, %got_ofst(.LCPI33_0) -; MIPS64R5EB-NEXT: ld.d $w0, 0($1) -; MIPS64R5EB-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5EB-NEXT: copy_s.d $7, $w0[1] +; MIPS64R5EB-NEXT: lui $1, 6 +; MIPS64R5EB-NEXT: daddiu $1, $1, 7 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $1, $1, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $4, $1, 10 +; MIPS64R5EB-NEXT: lui $1, 2 +; MIPS64R5EB-NEXT: daddiu $1, $1, -32767 +; MIPS64R5EB-NEXT: dsll $1, $1, 19 +; MIPS64R5EB-NEXT: daddiu $1, $1, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $7, $1, 10 ; MIPS64R5EB-NEXT: ld $25, %call16(i16_8)($gp) +; MIPS64R5EB-NEXT: move $5, $4 +; MIPS64R5EB-NEXT: move $6, $4 ; MIPS64R5EB-NEXT: jalr $25 ; MIPS64R5EB-NEXT: nop ; MIPS64R5EB-NEXT: ld $1, %got_disp(gv8i16)($gp) @@ -4973,35 +5049,25 @@ ; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill ; MIPS32R5EL-NEXT: .cfi_offset 31, -4 ; MIPS32R5EL-NEXT: lui $1, 10 -; MIPS32R5EL-NEXT: ori $1, $1, 9 -; MIPS32R5EL-NEXT: lui $2, 7 -; MIPS32R5EL-NEXT: ori $2, $2, 6 -; MIPS32R5EL-NEXT: fill.w $w0, $2 -; MIPS32R5EL-NEXT: insert.w $w0[1], $1 -; MIPS32R5EL-NEXT: splati.d $w0, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5EL-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5EL-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5EL-NEXT: lui $1, %hi($CPI33_0) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo($CPI33_0) -; MIPS32R5EL-NEXT: ld.w $w0, 0($1) -; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5EL-NEXT: sw $8, 28($sp) -; MIPS32R5EL-NEXT: sw $3, 24($sp) -; MIPS32R5EL-NEXT: sw $2, 20($sp) -; MIPS32R5EL-NEXT: sw $1, 16($sp) +; MIPS32R5EL-NEXT: ori $5, $1, 9 +; MIPS32R5EL-NEXT: sw $5, 28($sp) +; MIPS32R5EL-NEXT: lui $1, 8 +; MIPS32R5EL-NEXT: ori $1, $1, 12 +; MIPS32R5EL-NEXT: sw $1, 24($sp) +; MIPS32R5EL-NEXT: sw $5, 20($sp) +; MIPS32R5EL-NEXT: lui $1, 7 +; MIPS32R5EL-NEXT: ori $4, $1, 6 +; MIPS32R5EL-NEXT: sw $4, 16($sp) +; MIPS32R5EL-NEXT: move $6, $4 +; MIPS32R5EL-NEXT: move $7, $5 ; MIPS32R5EL-NEXT: jal i16_8 ; MIPS32R5EL-NEXT: nop -; MIPS32R5EL-NEXT: lui $1, %hi(gv8i16) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EL-NEXT: insert.w $w0[0], $2 ; MIPS32R5EL-NEXT: insert.w $w0[1], $3 ; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv8i16) ; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EL-NEXT: st.w $w0, 0($1) ; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload ; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 @@ -5019,20 +5085,21 @@ ; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_8))) ; MIPS64R5EL-NEXT: daddu $1, $1, $25 ; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli16_8))) -; MIPS64R5EL-NEXT: lui $1, 7 -; MIPS64R5EL-NEXT: ori $1, $1, 6 
-; MIPS64R5EL-NEXT: lui $2, 10 -; MIPS64R5EL-NEXT: ori $2, $2, 9 -; MIPS64R5EL-NEXT: dinsu $1, $2, 32, 32 -; MIPS64R5EL-NEXT: fill.d $w0, $1 -; MIPS64R5EL-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5EL-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5EL-NEXT: ld $1, %got_page(.LCPI33_0)($gp) -; MIPS64R5EL-NEXT: daddiu $1, $1, %got_ofst(.LCPI33_0) -; MIPS64R5EL-NEXT: ld.d $w0, 0($1) -; MIPS64R5EL-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5EL-NEXT: copy_s.d $7, $w0[1] +; MIPS64R5EL-NEXT: lui $1, 10 +; MIPS64R5EL-NEXT: daddiu $1, $1, 9 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 7 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $4, $1, 6 +; MIPS64R5EL-NEXT: lui $1, 1 +; MIPS64R5EL-NEXT: daddiu $1, $1, 16385 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 8193 +; MIPS64R5EL-NEXT: dsll $1, $1, 19 +; MIPS64R5EL-NEXT: daddiu $7, $1, 12 ; MIPS64R5EL-NEXT: ld $25, %call16(i16_8)($gp) +; MIPS64R5EL-NEXT: move $5, $4 +; MIPS64R5EL-NEXT: move $6, $4 ; MIPS64R5EL-NEXT: jalr $25 ; MIPS64R5EL-NEXT: nop ; MIPS64R5EL-NEXT: ld $1, %got_disp(gv8i16)($gp) @@ -5304,39 +5371,38 @@ ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; MIPS64R5-LABEL: calli32_4: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_4))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI35_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI35_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI35_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI35_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] -; MIPS64R5-NEXT: ld $25, %call16(i32_4)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: insert.d $w0[0], $2 -; MIPS64R5-NEXT: insert.d $w0[1], $3 -; MIPS64R5-NEXT: ld $1, %got_disp(gv4i32)($gp) -; MIPS64R5-NEXT: st.d $w0, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: calli32_4: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_4))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) +; MIPS64R5EB-NEXT: daddiu $1, $zero, 3 +; MIPS64R5EB-NEXT: dsll $2, $1, 33 +; MIPS64R5EB-NEXT: daddiu $4, $2, 7 +; MIPS64R5EB-NEXT: dsll $1, $1, 34 +; MIPS64R5EB-NEXT: daddiu $6, $1, 8 +; MIPS64R5EB-NEXT: daddiu $1, $zero, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 32 +; MIPS64R5EB-NEXT: daddiu $5, $1, 10 +; MIPS64R5EB-NEXT: ld $25, %call16(i32_4)($gp) +; MIPS64R5EB-NEXT: move $7, $5 +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: insert.d $w0[0], $2 +; MIPS64R5EB-NEXT: insert.d $w0[1], $3 +; 
MIPS64R5EB-NEXT: ld $1, %got_disp(gv4i32)($gp) +; MIPS64R5EB-NEXT: st.d $w0, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS64EL-LABEL: calli32_4: ; MIPS64EL: # %bb.0: # %entry @@ -5370,6 +5436,40 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS64R5EL-LABEL: calli32_4: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_4))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) +; MIPS64R5EL-NEXT: daddiu $1, $zero, 7 +; MIPS64R5EL-NEXT: dsll $1, $1, 32 +; MIPS64R5EL-NEXT: daddiu $4, $1, 6 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 1 +; MIPS64R5EL-NEXT: dsll $1, $1, 35 +; MIPS64R5EL-NEXT: daddiu $6, $1, 12 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 5 +; MIPS64R5EL-NEXT: dsll $1, $1, 33 +; MIPS64R5EL-NEXT: daddiu $5, $1, 9 +; MIPS64R5EL-NEXT: ld $25, %call16(i32_4)($gp) +; MIPS64R5EL-NEXT: move $7, $5 +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: insert.d $w0[0], $2 +; MIPS64R5EL-NEXT: insert.d $w0[1], $3 +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4i32)($gp) +; MIPS64R5EL-NEXT: st.d $w0, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <4 x i32> @i32_4(<4 x i32> , <4 x i32> ) store <4 x i32> %0, <4 x i32> * @gv4i32 @@ -5433,43 +5533,35 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5-LABEL: calli64_2: -; MIPS32R5: # %bb.0: # %entry -; MIPS32R5-NEXT: addiu $sp, $sp, -40 -; MIPS32R5-NEXT: .cfi_def_cfa_offset 40 -; MIPS32R5-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 31, -4 -; MIPS32R5-NEXT: lui $1, %hi($CPI36_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI36_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5-NEXT: lui $1, %hi($CPI36_1) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI36_1) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) -; MIPS32R5-NEXT: sw $1, 16($sp) -; MIPS32R5-NEXT: jal i64_2 -; MIPS32R5-NEXT: nop -; MIPS32R5-NEXT: lui $1, %hi(gv2i64) -; MIPS32R5-NEXT: insert.w $w0[0], $2 -; MIPS32R5-NEXT: insert.w $w0[1], $3 -; MIPS32R5-NEXT: addiu $1, $1, %lo(gv2i64) -; MIPS32R5-NEXT: insert.w $w0[2], $4 -; MIPS32R5-NEXT: insert.w $w0[3], $5 -; MIPS32R5-NEXT: st.w $w0, 0($1) -; MIPS32R5-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: addiu $sp, $sp, 40 -; MIPS32R5-NEXT: jr $ra -; MIPS32R5-NEXT: nop +; MIPS32R5EB-LABEL: calli64_2: +; MIPS32R5EB: # %bb.0: # %entry +; MIPS32R5EB-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; 
MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: addiu $1, $zero, 8 +; MIPS32R5EB-NEXT: sw $1, 28($sp) +; MIPS32R5EB-NEXT: addiu $1, $zero, 12 +; MIPS32R5EB-NEXT: sw $1, 20($sp) +; MIPS32R5EB-NEXT: sw $zero, 24($sp) +; MIPS32R5EB-NEXT: sw $zero, 16($sp) +; MIPS32R5EB-NEXT: addiu $4, $zero, 0 +; MIPS32R5EB-NEXT: addiu $5, $zero, 6 +; MIPS32R5EB-NEXT: addiu $6, $zero, 0 +; MIPS32R5EB-NEXT: addiu $7, $zero, 7 +; MIPS32R5EB-NEXT: jal i64_2 +; MIPS32R5EB-NEXT: nop +; MIPS32R5EB-NEXT: insert.w $w0[0], $2 +; MIPS32R5EB-NEXT: insert.w $w0[1], $3 +; MIPS32R5EB-NEXT: insert.w $w0[2], $4 +; MIPS32R5EB-NEXT: lui $1, %hi(gv2i64) +; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv2i64) +; MIPS32R5EB-NEXT: st.w $w0, 0($1) +; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EB-NEXT: jr $ra +; MIPS32R5EB-NEXT: nop ; ; MIPS64R5-LABEL: calli64_2: ; MIPS64R5: # %bb.0: # %entry @@ -5527,6 +5619,36 @@ ; MIPS32EL-NEXT: addiu $sp, $sp, 40 ; MIPS32EL-NEXT: jr $ra ; MIPS32EL-NEXT: nop +; +; MIPS32R5EL-LABEL: calli64_2: +; MIPS32R5EL: # %bb.0: # %entry +; MIPS32R5EL-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: addiu $1, $zero, 8 +; MIPS32R5EL-NEXT: sw $1, 24($sp) +; MIPS32R5EL-NEXT: addiu $1, $zero, 12 +; MIPS32R5EL-NEXT: sw $1, 16($sp) +; MIPS32R5EL-NEXT: sw $zero, 28($sp) +; MIPS32R5EL-NEXT: sw $zero, 20($sp) +; MIPS32R5EL-NEXT: addiu $4, $zero, 6 +; MIPS32R5EL-NEXT: addiu $5, $zero, 0 +; MIPS32R5EL-NEXT: addiu $6, $zero, 7 +; MIPS32R5EL-NEXT: addiu $7, $zero, 0 +; MIPS32R5EL-NEXT: jal i64_2 +; MIPS32R5EL-NEXT: nop +; MIPS32R5EL-NEXT: insert.w $w0[0], $2 +; MIPS32R5EL-NEXT: insert.w $w0[1], $3 +; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv2i64) +; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv2i64) +; MIPS32R5EL-NEXT: st.w $w0, 0($1) +; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop entry: %0 = call <2 x i64> @i64_2(<2 x i64> , <2 x i64> ) store <2 x i64> %0, <2 x i64> * @gv2i64 @@ -5618,35 +5740,33 @@ ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; MIPS64R5-LABEL: callfloat_2: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI37_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI37_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI37_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI37_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $5, $w0[0] -; MIPS64R5-NEXT: ld $25, %call16(float2_extern)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: ld $1, %got_disp(gv2f32)($gp) -; MIPS64R5-NEXT: sd $2, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; 
MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: callfloat_2: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EB-NEXT: daddiu $1, $zero, 383 +; MIPS64R5EB-NEXT: dsll $4, $1, 23 +; MIPS64R5EB-NEXT: daddiu $1, $zero, 261 +; MIPS64R5EB-NEXT: dsll $1, $1, 33 +; MIPS64R5EB-NEXT: daddiu $1, $1, 523 +; MIPS64R5EB-NEXT: dsll $5, $1, 21 +; MIPS64R5EB-NEXT: ld $25, %call16(float2_extern)($gp) +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv2f32)($gp) +; MIPS64R5EB-NEXT: sd $2, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS64EL-LABEL: callfloat_2: ; MIPS64EL: # %bb.0: # %entry @@ -5675,6 +5795,34 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS64R5EL-LABEL: callfloat_2: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EL-NEXT: daddiu $1, $zero, 383 +; MIPS64R5EL-NEXT: dsll $4, $1, 55 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 523 +; MIPS64R5EL-NEXT: dsll $1, $1, 31 +; MIPS64R5EL-NEXT: daddiu $1, $1, 261 +; MIPS64R5EL-NEXT: dsll $5, $1, 22 +; MIPS64R5EL-NEXT: ld $25, %call16(float2_extern)($gp) +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv2f32)($gp) +; MIPS64R5EL-NEXT: sd $2, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <2 x float> @float2_extern(<2 x float> , <2 x float> ) store <2 x float> %0, <2 x float> * @gv2f32 @@ -5777,27 +5925,21 @@ ; MIPS32R5-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5-NEXT: addiu $1, $zero, -16 ; MIPS32R5-NEXT: and $sp, $sp, $1 -; MIPS32R5-NEXT: lui $1, %hi($CPI38_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI38_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $6, $w0[0] -; MIPS32R5-NEXT: copy_s.w $7, $w0[1] -; MIPS32R5-NEXT: copy_s.w $1, $w0[2] -; MIPS32R5-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5-NEXT: lui $3, %hi($CPI38_1) -; MIPS32R5-NEXT: addiu $3, $3, %lo($CPI38_1) -; MIPS32R5-NEXT: ld.w $w0, 0($3) -; MIPS32R5-NEXT: copy_s.w $3, $w0[0] -; MIPS32R5-NEXT: copy_s.w $4, $w0[1] -; MIPS32R5-NEXT: copy_s.w $5, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 36($sp) -; MIPS32R5-NEXT: sw $5, 32($sp) -; MIPS32R5-NEXT: sw $4, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) +; MIPS32R5-NEXT: lui $1, 16704 +; MIPS32R5-NEXT: lui $2, 16736 +; MIPS32R5-NEXT: lui $3, 16752 +; MIPS32R5-NEXT: lui $4, 
16768 +; MIPS32R5-NEXT: sw $4, 36($sp) +; MIPS32R5-NEXT: sw $3, 32($sp) +; MIPS32R5-NEXT: sw $2, 28($sp) +; MIPS32R5-NEXT: sw $1, 24($sp) +; MIPS32R5-NEXT: lui $1, 16512 +; MIPS32R5-NEXT: sw $1, 20($sp) +; MIPS32R5-NEXT: lui $1, 16384 ; MIPS32R5-NEXT: sw $1, 16($sp) ; MIPS32R5-NEXT: addiu $4, $sp, 48 +; MIPS32R5-NEXT: addiu $6, $zero, 0 +; MIPS32R5-NEXT: lui $7, 49024 ; MIPS32R5-NEXT: jal float4_extern ; MIPS32R5-NEXT: nop ; MIPS32R5-NEXT: lui $1, %hi(gv4f32) @@ -5811,39 +5953,43 @@ ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; MIPS64R5-LABEL: callfloat_4: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI38_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI38_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI38_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI38_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] -; MIPS64R5-NEXT: ld $25, %call16(float4_extern)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: insert.d $w0[0], $2 -; MIPS64R5-NEXT: insert.d $w0[1], $3 -; MIPS64R5-NEXT: ld $1, %got_disp(gv4f32)($gp) -; MIPS64R5-NEXT: st.d $w0, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: callfloat_4: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EB-NEXT: daddiu $1, $zero, 1 +; MIPS64R5EB-NEXT: dsll $1, $1, 39 +; MIPS64R5EB-NEXT: daddiu $1, $1, 129 +; MIPS64R5EB-NEXT: daddiu $2, $zero, 261 +; MIPS64R5EB-NEXT: dsll $2, $2, 33 +; MIPS64R5EB-NEXT: daddiu $3, $zero, 383 +; MIPS64R5EB-NEXT: dsll $4, $3, 23 +; MIPS64R5EB-NEXT: dsll $5, $1, 23 +; MIPS64R5EB-NEXT: daddiu $1, $2, 523 +; MIPS64R5EB-NEXT: dsll $6, $1, 21 +; MIPS64R5EB-NEXT: daddiu $1, $zero, 1047 +; MIPS64R5EB-NEXT: dsll $1, $1, 29 +; MIPS64R5EB-NEXT: daddiu $1, $1, 131 +; MIPS64R5EB-NEXT: dsll $7, $1, 23 +; MIPS64R5EB-NEXT: ld $25, %call16(float4_extern)($gp) +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: insert.d $w0[0], $2 +; MIPS64R5EB-NEXT: insert.d $w0[1], $3 +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv4f32)($gp) +; MIPS64R5EB-NEXT: st.d $w0, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS64EL-LABEL: callfloat_4: ; MIPS64EL: # %bb.0: # %entry @@ -5881,6 +6027,44 @@ ; MIPS64EL-NEXT: daddiu $sp, 
$sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS64R5EL-LABEL: callfloat_4: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EL-NEXT: daddiu $1, $zero, 129 +; MIPS64R5EL-NEXT: dsll $1, $1, 25 +; MIPS64R5EL-NEXT: daddiu $1, $1, 1 +; MIPS64R5EL-NEXT: daddiu $2, $zero, 523 +; MIPS64R5EL-NEXT: dsll $2, $2, 31 +; MIPS64R5EL-NEXT: daddiu $3, $zero, 383 +; MIPS64R5EL-NEXT: dsll $4, $3, 55 +; MIPS64R5EL-NEXT: dsll $5, $1, 30 +; MIPS64R5EL-NEXT: daddiu $1, $2, 261 +; MIPS64R5EL-NEXT: dsll $6, $1, 22 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 131 +; MIPS64R5EL-NEXT: dsll $1, $1, 35 +; MIPS64R5EL-NEXT: daddiu $1, $1, 1047 +; MIPS64R5EL-NEXT: dsll $7, $1, 20 +; MIPS64R5EL-NEXT: ld $25, %call16(float4_extern)($gp) +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: insert.d $w0[0], $2 +; MIPS64R5EL-NEXT: insert.d $w0[1], $3 +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4f32)($gp) +; MIPS64R5EL-NEXT: st.d $w0, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <4 x float> @float4_extern(<4 x float> , <4 x float> ) store <4 x float> %0, <4 x float> * @gv4f32 @@ -5957,51 +6141,42 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5-LABEL: calldouble_2: -; MIPS32R5: # %bb.0: # %entry -; MIPS32R5-NEXT: addiu $sp, $sp, -80 -; MIPS32R5-NEXT: .cfi_def_cfa_offset 80 -; MIPS32R5-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 31, -4 -; MIPS32R5-NEXT: .cfi_offset 30, -8 -; MIPS32R5-NEXT: move $fp, $sp -; MIPS32R5-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5-NEXT: addiu $1, $zero, -16 -; MIPS32R5-NEXT: and $sp, $sp, $1 -; MIPS32R5-NEXT: lui $1, %hi($CPI39_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI39_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $6, $w0[0] -; MIPS32R5-NEXT: copy_s.w $7, $w0[1] -; MIPS32R5-NEXT: copy_s.w $1, $w0[2] -; MIPS32R5-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5-NEXT: lui $3, %hi($CPI39_1) -; MIPS32R5-NEXT: addiu $3, $3, %lo($CPI39_1) -; MIPS32R5-NEXT: ld.w $w0, 0($3) -; MIPS32R5-NEXT: copy_s.w $3, $w0[0] -; MIPS32R5-NEXT: copy_s.w $4, $w0[1] -; MIPS32R5-NEXT: copy_s.w $5, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 36($sp) -; MIPS32R5-NEXT: sw $5, 32($sp) -; MIPS32R5-NEXT: sw $4, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) -; MIPS32R5-NEXT: sw $1, 16($sp) -; MIPS32R5-NEXT: addiu $4, $sp, 48 -; MIPS32R5-NEXT: jal double2_extern -; MIPS32R5-NEXT: nop -; MIPS32R5-NEXT: lui $1, %hi(gv2f64) -; MIPS32R5-NEXT: addiu $1, $1, %lo(gv2f64) -; MIPS32R5-NEXT: ld.d $w0, 48($sp) -; MIPS32R5-NEXT: st.d $w0, 0($1) -; MIPS32R5-NEXT: move $sp, $fp -; MIPS32R5-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: addiu $sp, $sp, 80 -; MIPS32R5-NEXT: jr $ra -; MIPS32R5-NEXT: nop +; MIPS32R5EB-LABEL: calldouble_2: +; MIPS32R5EB: # %bb.0: # %entry +; MIPS32R5EB-NEXT: 
addiu $sp, $sp, -80 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 80 +; MIPS32R5EB-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 +; MIPS32R5EB-NEXT: move $fp, $sp +; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 +; MIPS32R5EB-NEXT: addiu $1, $zero, -16 +; MIPS32R5EB-NEXT: and $sp, $sp, $1 +; MIPS32R5EB-NEXT: lui $1, 16424 +; MIPS32R5EB-NEXT: lui $2, 16428 +; MIPS32R5EB-NEXT: sw $2, 32($sp) +; MIPS32R5EB-NEXT: sw $1, 24($sp) +; MIPS32R5EB-NEXT: lui $1, 49136 +; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: sw $zero, 36($sp) +; MIPS32R5EB-NEXT: sw $zero, 28($sp) +; MIPS32R5EB-NEXT: sw $zero, 20($sp) +; MIPS32R5EB-NEXT: addiu $4, $sp, 48 +; MIPS32R5EB-NEXT: addiu $6, $zero, 0 +; MIPS32R5EB-NEXT: addiu $7, $zero, 0 +; MIPS32R5EB-NEXT: jal double2_extern +; MIPS32R5EB-NEXT: nop +; MIPS32R5EB-NEXT: lui $1, %hi(gv2f64) +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv2f64) +; MIPS32R5EB-NEXT: ld.d $w0, 48($sp) +; MIPS32R5EB-NEXT: st.d $w0, 0($1) +; MIPS32R5EB-NEXT: move $sp, $fp +; MIPS32R5EB-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 80 +; MIPS32R5EB-NEXT: jr $ra +; MIPS32R5EB-NEXT: nop ; ; MIPS64R5-LABEL: calldouble_2: ; MIPS64R5: # %bb.0: # %entry @@ -6014,17 +6189,14 @@ ; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calldouble_2))) ; MIPS64R5-NEXT: daddu $1, $1, $25 ; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calldouble_2))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI39_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI39_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI39_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI39_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] +; MIPS64R5-NEXT: daddiu $1, $zero, 3071 +; MIPS64R5-NEXT: dsll $5, $1, 52 +; MIPS64R5-NEXT: daddiu $1, $zero, 2053 +; MIPS64R5-NEXT: dsll $6, $1, 51 +; MIPS64R5-NEXT: daddiu $1, $zero, 4107 +; MIPS64R5-NEXT: dsll $7, $1, 50 ; MIPS64R5-NEXT: ld $25, %call16(double2_extern)($gp) +; MIPS64R5-NEXT: daddiu $4, $zero, 0 ; MIPS64R5-NEXT: jalr $25 ; MIPS64R5-NEXT: nop ; MIPS64R5-NEXT: insert.d $w0[0], $2 @@ -6075,6 +6247,43 @@ ; MIPS32EL-NEXT: addiu $sp, $sp, 80 ; MIPS32EL-NEXT: jr $ra ; MIPS32EL-NEXT: nop +; +; MIPS32R5EL-LABEL: calldouble_2: +; MIPS32R5EL: # %bb.0: # %entry +; MIPS32R5EL-NEXT: addiu $sp, $sp, -80 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 80 +; MIPS32R5EL-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 +; MIPS32R5EL-NEXT: move $fp, $sp +; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 +; MIPS32R5EL-NEXT: addiu $1, $zero, -16 +; MIPS32R5EL-NEXT: and $sp, $sp, $1 +; MIPS32R5EL-NEXT: lui $1, 16424 +; MIPS32R5EL-NEXT: lui $2, 16428 +; MIPS32R5EL-NEXT: sw $2, 36($sp) +; MIPS32R5EL-NEXT: sw $1, 28($sp) +; MIPS32R5EL-NEXT: lui $1, 49136 +; MIPS32R5EL-NEXT: sw $1, 20($sp) +; MIPS32R5EL-NEXT: sw $zero, 32($sp) +; MIPS32R5EL-NEXT: sw $zero, 24($sp) +; MIPS32R5EL-NEXT: sw $zero, 16($sp) +; MIPS32R5EL-NEXT: addiu $4, $sp, 48 +; MIPS32R5EL-NEXT: addiu $6, $zero, 0 +; MIPS32R5EL-NEXT: addiu $7, $zero, 0 +; MIPS32R5EL-NEXT: jal double2_extern +; MIPS32R5EL-NEXT: nop +; MIPS32R5EL-NEXT: lui $1, %hi(gv2f64) +; 
MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv2f64) +; MIPS32R5EL-NEXT: ld.d $w0, 48($sp) +; MIPS32R5EL-NEXT: st.d $w0, 0($1) +; MIPS32R5EL-NEXT: move $sp, $fp +; MIPS32R5EL-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 80 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop entry: %0 = call <2 x double> @double2_extern(<2 x double> , <2 x double> ) store <2 x double> %0, <2 x double> * @gv2f64 diff --git a/llvm/test/CodeGen/PowerPC/pr45709.ll b/llvm/test/CodeGen/PowerPC/pr45709.ll --- a/llvm/test/CodeGen/PowerPC/pr45709.ll +++ b/llvm/test/CodeGen/PowerPC/pr45709.ll @@ -10,7 +10,7 @@ define dso_local void @_ZN1a1bEv(<4 x float> %in) local_unnamed_addr #0 align 2 { ; CHECK-LABEL: _ZN1a1bEv: ; CHECK: # %bb.0: -; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB0_6 +; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB0_4 ; CHECK-NEXT: b .LBB0_1 ; CHECK-NEXT: .LBB0_1: # %.preheader ; CHECK-NEXT: b .LBB0_2 @@ -21,26 +21,18 @@ ; CHECK-NEXT: addi r3, r3, .LCPI0_0@toc@l ; CHECK-NEXT: lvx v3, 0, r3 ; CHECK-NEXT: vperm v2, v2, v2, v3 -; CHECK-NEXT: vxor v3, v3, v3 -; CHECK-NEXT: addi r3, r1, -48 -; CHECK-NEXT: stvx v3, 0, r3 ; CHECK-NEXT: addi r3, r1, -32 ; CHECK-NEXT: stvx v2, 0, r3 -; CHECK-NEXT: lwz r3, -48(r1) -; CHECK-NEXT: lwz r4, -32(r1) -; CHECK-NEXT: cmpw r4, r3 -; CHECK-NEXT: bc 12, gt, .LBB0_4 -; CHECK-NEXT: b .LBB0_5 -; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: addi r3, r4, 0 -; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: lwz r3, -32(r1) +; CHECK-NEXT: srawi r4, r3, 31 +; CHECK-NEXT: andc r3, r3, r4 ; CHECK-NEXT: cmpw r3, r3 -; CHECK-NEXT: stw r3, -64(r1) -; CHECK-NEXT: addi r3, r1, -64 +; CHECK-NEXT: stw r3, -48(r1) +; CHECK-NEXT: addi r3, r1, -48 ; CHECK-NEXT: lvx v2, 0, r3 ; CHECK-NEXT: addi r3, r1, -16 ; CHECK-NEXT: stvx v2, 0, r3 -; CHECK-NEXT: .LBB0_6: +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: blr br i1 undef, label %7, label %1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll @@ -36,14 +36,11 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x) { ; CHECK-LABEL: add_v2i32_v2i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i32> %x to <2 x i64> @@ -134,44 +131,41 @@ ; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[1] ; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.32 q2[0], r1 ; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov.32 q3[2], r1 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.32 q2[0], r1 ; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov.32 q3[2], r1 -; 
CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: vmov.u16 r2, q0[6] ; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] ; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: vand q0, q2, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i16> %x to <8 x i64> @@ -249,12 +243,12 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x) { ; CHECK-LABEL: add_v2i16_v2i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: vmov.i32 q1, #0xffff ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -399,96 +393,85 @@ ; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[1] ; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r2, s11 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.32 q2[0], r1 ; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: vmov.32 q3[2], r1 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.32 q2[0], r1 ; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: vmov.32 q3[2], r1 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: vmov.u8 r2, q0[6] ; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov.32 q2[2], r2 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[9] ; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; 
CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[11] ; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[12] ; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[13] ; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[14] ; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[15] ; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q0, q2, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i64> @@ -634,12 +617,12 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x) { ; CHECK-LABEL: add_v2i8_v2i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i64 q1, #0xff +; CHECK-NEXT: vmov.i32 q1, #0xff ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> @@ -720,19 +703,14 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) { ; CHECK-LABEL: add_v2i32_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 ; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adc r3, r12, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %xx = zext <2 x i32> %x to <2 x i64> %z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %xx) @@ -826,54 +804,51 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) { ; CHECK-LABEL: add_v8i16_v8i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.u16 r2, q0[0] ; CHECK-NEXT: vmov.i64 q1, #0xffff ; 
CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u16 r2, q0[1] ; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q2[0], r3 ; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q2[0], r3 ; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov lr, s15 -; CHECK-NEXT: add.w r12, r2, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds.w r4, r12, r3 -; CHECK-NEXT: adc.w r12, r2, lr +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: adds.w lr, r2, r3 ; CHECK-NEXT: vmov.u16 r2, q0[6] ; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] ; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: adc r3, r12, #0 ; CHECK-NEXT: vand q0, q2, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adds.w r12, lr, r2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r4 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <8 x i16> %x to <8 x i64> %z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %xx) @@ -956,14 +931,13 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: vmov.i32 q1, #0xffff ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -1116,106 +1090,95 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.u8 r2, q0[0] ; CHECK-NEXT: vmov.i64 q1, #0xff ; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[1] ; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s10 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q2[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r3, s10 
; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q2[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov lr, s15 -; CHECK-NEXT: add.w r12, r2, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds.w r4, r12, r3 -; CHECK-NEXT: adc.w r12, r2, lr +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: adds.w lr, r2, r3 ; CHECK-NEXT: vmov.u8 r2, q0[6] ; CHECK-NEXT: vmov.32 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: adc r3, r12, #0 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adds.w r12, lr, r2 ; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w lr, r3, r2 -; CHECK-NEXT: adc.w r3, r12, r4 -; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: vmov.32 q2[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[9] -; CHECK-NEXT: vmov.32 q2[2], r4 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: adds.w r12, lr, r2 +; CHECK-NEXT: adds.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, s11 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: vmov.32 q2[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[11] -; CHECK-NEXT: vmov.32 q2[2], r4 +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r4, s9 ; CHECK-NEXT: adds.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, s11 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: vmov.32 q2[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[13] -; CHECK-NEXT: vmov.32 q2[2], r4 +; CHECK-NEXT: vmov.u8 r2, q0[12] +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r4, s9 ; CHECK-NEXT: adds.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, s11 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: vmov.32 q2[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[15] -; CHECK-NEXT: vmov.32 q2[2], r4 +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: vand q0, q2, q1 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r4, s1 ; CHECK-NEXT: adds.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r4 +; 
CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r7, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %xx) @@ -1366,14 +1329,13 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) { ; CHECK-LABEL: add_v2i8_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i64 q1, #0xff +; CHECK-NEXT: vmov.i32 q1, #0xff ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: vmov r3, s3 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -46,28 +46,23 @@ ; CHECK-LABEL: add_v2i32_v2i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.32 q2[1], r0 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i32> %b, zeroinitializer @@ -215,44 +210,36 @@ ; CHECK-NEXT: and r1, r0, #1 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: vmov.32 q3[0], r1 -; CHECK-NEXT: vmov.32 q3[1], r1 ; CHECK-NEXT: ubfx r1, r0, #4, #1 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: vmov.32 q3[2], r1 -; CHECK-NEXT: vmov.32 q3[3], r1 ; CHECK-NEXT: vmov.u16 r1, q0[0] ; CHECK-NEXT: vmov.32 q4[0], r1 ; CHECK-NEXT: vmov.u16 r1, q0[1] ; CHECK-NEXT: vmov.32 q4[2], r1 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: ubfx r3, r0, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: ubfx r2, r0, #8, #1 ; CHECK-NEXT: ubfx r0, r0, #12, #1 -; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov.32 q3[1], r3 ; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov.32 q4[0], r0 ; CHECK-NEXT: vmov.u16 r0, q0[3] ; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 +; CHECK-NEXT: vmov r2, s14 +; 
CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q2[4] ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] @@ -261,53 +248,45 @@ ; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.32 q3[3], r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q2[0], r3 -; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q2[2], r3 -; CHECK-NEXT: vmov.32 q2[3], r3 ; CHECK-NEXT: vmov.u16 r3, q0[4] ; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: vmov.u16 r3, q0[5] ; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 ; CHECK-NEXT: vmov.32 q2[0], r3 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vmov.u16 r2, q0[6] ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] ; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q0, q3, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: @@ -472,21 +451,17 @@ ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.32 q3[1], r0 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -723,44 +698,36 @@ ; CHECK-NEXT: and r1, r0, #1 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: vmov.32 q6[0], r1 -; CHECK-NEXT: vmov.32 q6[1], r1 ; CHECK-NEXT: ubfx r1, r0, #4, #1 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: vmov.32 q6[2], r1 -; CHECK-NEXT: vmov.32 q6[3], r1 ; CHECK-NEXT: vmov.u8 r1, q0[0] ; CHECK-NEXT: vmov.32 q7[0], r1 ; CHECK-NEXT: vmov.u8 r1, q0[1] ; CHECK-NEXT: vmov.32 q7[2], r1 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r1, s27 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: orrs r1, r2 -; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: ubfx r3, r0, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov r1, s26 +; CHECK-NEXT: vmov r2, 
s24 +; CHECK-NEXT: add r1, r2 +; CHECK-NEXT: ubfx r2, r0, #8, #1 ; CHECK-NEXT: ubfx r0, r0, #12, #1 -; CHECK-NEXT: vmov.32 q6[0], r3 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q6[0], r2 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov.32 q6[1], r3 ; CHECK-NEXT: vmov.32 q6[2], r0 -; CHECK-NEXT: vmov.32 q6[3], r0 ; CHECK-NEXT: vmov.u8 r0, q0[2] ; CHECK-NEXT: vmov.32 q7[0], r0 ; CHECK-NEXT: vmov.u8 r0, q0[3] ; CHECK-NEXT: vmov.32 q7[2], r0 +; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s27 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 +; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q5[4] ; CHECK-NEXT: vmov.32 q6[0], r2 ; CHECK-NEXT: vmov.u16 r2, q5[5] @@ -769,53 +736,44 @@ ; CHECK-NEXT: vmov.32 q6[2], r2 ; CHECK-NEXT: vmov.u16 r2, q5[7] ; CHECK-NEXT: vmov.32 q6[3], r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vcmp.i32 ne, q6, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q5[0], r3 -; CHECK-NEXT: vmov.32 q5[1], r3 ; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q5[2], r3 -; CHECK-NEXT: vmov.32 q5[3], r3 ; CHECK-NEXT: vmov.u8 r3, q0[4] ; CHECK-NEXT: vmov.32 q6[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[5] ; CHECK-NEXT: vmov.32 q6[2], r3 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: vmov r3, s22 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 ; CHECK-NEXT: vmov.32 q5[0], r3 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[6] ; CHECK-NEXT: vmov.32 q6[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q4[8] ; CHECK-NEXT: vmov.16 q5[0], r2 ; CHECK-NEXT: vmov.u8 r2, q4[9] @@ -832,6 +790,7 @@ ; CHECK-NEXT: vmov.16 q5[6], r2 ; CHECK-NEXT: vmov.u8 r2, q4[15] ; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q2, q3, q2 ; CHECK-NEXT: vmov.u16 r2, q2[0] @@ -847,47 +806,38 @@ ; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q3[0], r3 -; CHECK-NEXT: vmov.32 q3[1], r3 ; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q3[2], r3 
-; CHECK-NEXT: vmov.32 q3[3], r3 ; CHECK-NEXT: vmov.u8 r3, q0[8] ; CHECK-NEXT: vmov.32 q4[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[9] ; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 ; CHECK-NEXT: vmov.32 q3[0], r3 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: vmov.32 q4[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[11] ; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: adds.w r12, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q2[4] ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] @@ -896,52 +846,45 @@ ; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q2[0], r3 -; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q2[2], r3 -; CHECK-NEXT: vmov.32 q2[3], r3 ; CHECK-NEXT: vmov.u8 r3, q0[12] ; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[13] ; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: adds.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 ; CHECK-NEXT: vmov.32 q2[0], r3 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[14] ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[15] ; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q0, q3, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r1, r1, #0 +; 
CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1273,21 +1216,17 @@ ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.32 q3[1], r0 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.32 q3[3], r0 ; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: orrs r1, r2 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -1423,34 +1362,27 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %b, i64 %a) { ; CHECK-LABEL: add_v2i32_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.32 q2[1], r2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 ; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adc r3, r12, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i32> %b, zeroinitializer %xx = zext <2 x i32> %x to <2 x i64> @@ -1610,45 +1542,36 @@ ; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q3[0], r3 -; CHECK-NEXT: vmov.32 q3[1], r3 ; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: vmov.32 q3[3], r3 ; CHECK-NEXT: vmov.u16 r3, q0[0] ; CHECK-NEXT: vmov.32 q4[0], r3 ; CHECK-NEXT: vmov.u16 r3, q0[1] ; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r12, s15 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: vmov lr, s14 -; CHECK-NEXT: orr.w r12, r12, r3 +; CHECK-NEXT: vmov r12, s14 ; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: add lr, r3 +; CHECK-NEXT: add r12, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q3[1], r3 ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vmov.32 q4[0], r2 ; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds.w r4, lr, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adc.w lr, r12, r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: adds.w r12, r4, r3 -; CHECK-NEXT: adc.w lr, lr, r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: adds.w r12, r12, r2 ; CHECK-NEXT: vmov.u16 r2, 
q2[4] ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] @@ -1657,52 +1580,45 @@ ; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.32 q3[3], r2 +; CHECK-NEXT: adc lr, r3, #0 ; CHECK-NEXT: vcmp.i32 ne, q3, zr ; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov.32 q2[0], r4 -; CHECK-NEXT: vmov.32 q2[1], r4 -; CHECK-NEXT: ubfx r4, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov.32 q2[2], r4 -; CHECK-NEXT: vmov.32 q2[3], r4 -; CHECK-NEXT: vmov.u16 r4, q0[4] -; CHECK-NEXT: vmov.32 q3[0], r4 -; CHECK-NEXT: vmov.u16 r4, q0[5] -; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q2[2], r3 +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.u16 r3, q0[5] +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: adds.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adc.w lr, lr, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, lr, r3 +; CHECK-NEXT: adds.w r4, r12, r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: adc r12, lr, #0 +; CHECK-NEXT: adds.w lr, r4, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 ; CHECK-NEXT: vmov.32 q2[0], r3 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vmov.u16 r2, q0[6] ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q0[7] ; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: adc r4, r12, #0 ; CHECK-NEXT: vand q0, q3, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vpop {d8, d9} @@ -1875,23 +1791,18 @@ ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: vmov.32 q3[0], r2 -; CHECK-NEXT: vmov.32 q3[1], r2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: orr.w r12, r3, r2 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -2145,45 +2056,36 @@ ; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q6[0], r3 -; CHECK-NEXT: vmov.32 q6[1], r3 ; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q6[2], r3 -; CHECK-NEXT: vmov.32 q6[3], r3 ; CHECK-NEXT: vmov.u8 r3, q0[0] ; CHECK-NEXT: 
vmov.32 q7[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[1] ; CHECK-NEXT: vmov.32 q7[2], r3 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r12, s27 -; CHECK-NEXT: vmov r3, s25 -; CHECK-NEXT: vmov lr, s26 -; CHECK-NEXT: orr.w r12, r12, r3 +; CHECK-NEXT: vmov r12, s26 ; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: add lr, r3 +; CHECK-NEXT: add r12, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q6[0], r3 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q6[1], r3 ; CHECK-NEXT: vmov.32 q6[2], r2 -; CHECK-NEXT: vmov.32 q6[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[2] ; CHECK-NEXT: vmov.32 q7[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[3] ; CHECK-NEXT: vmov.32 q7[2], r2 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q7, q7, q1 ; CHECK-NEXT: vand q6, q7, q6 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: adds.w r4, lr, r3 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: adc.w lr, r12, r2 -; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: adds.w r12, r4, r3 -; CHECK-NEXT: adc.w lr, lr, r2 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: adds.w r12, r12, r2 ; CHECK-NEXT: vmov.u16 r2, q5[4] ; CHECK-NEXT: vmov.32 q6[0], r2 ; CHECK-NEXT: vmov.u16 r2, q5[5] @@ -2192,178 +2094,155 @@ ; CHECK-NEXT: vmov.32 q6[2], r2 ; CHECK-NEXT: vmov.u16 r2, q5[7] ; CHECK-NEXT: vmov.32 q6[3], r2 +; CHECK-NEXT: adc lr, r3, #0 ; CHECK-NEXT: vcmp.i32 ne, q6, zr ; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov.32 q5[0], r4 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: ubfx r4, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov.32 q5[2], r4 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov.u8 r4, q0[4] -; CHECK-NEXT: vmov.32 q6[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[5] -; CHECK-NEXT: vmov.32 q6[2], r4 +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q5[0], r3 +; CHECK-NEXT: ubfx r3, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 +; CHECK-NEXT: vmov.32 q5[2], r3 +; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: vmov.32 q6[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: vmov.32 q6[2], r3 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 ; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r4, s21 -; CHECK-NEXT: adds.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: adc.w lr, lr, r4 -; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, lr, r3 +; CHECK-NEXT: adds.w r4, r12, r3 +; CHECK-NEXT: vmov r3, s22 +; CHECK-NEXT: adc r12, lr, #0 +; CHECK-NEXT: adds.w lr, r4, r3 ; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 ; CHECK-NEXT: vmov.32 q5[0], r3 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: rsb.w r2, r2, #0 ; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[6] ; CHECK-NEXT: vmov.32 q6[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: adc r4, r12, #0 ; CHECK-NEXT: vand q6, q6, q1 ; CHECK-NEXT: vand q5, q6, q5 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s23 -; CHECK-NEXT: adc.w lr, r12, r2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: adds.w r12, r3, r2 -; CHECK-NEXT: vmov.u8 r2, q4[8] -; CHECK-NEXT: vmov.16 q5[0], r2 
-; CHECK-NEXT: vmov.u8 r2, q4[9] -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u8 r2, q4[10] -; CHECK-NEXT: vmov.16 q5[2], r2 -; CHECK-NEXT: vmov.u8 r2, q4[11] -; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.u8 r2, q4[12] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u8 r2, q4[13] -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov.u8 r2, q4[14] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u8 r2, q4[15] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: adc.w lr, lr, r4 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: adds.w r2, r2, lr +; CHECK-NEXT: adc r3, r4, #0 +; CHECK-NEXT: vmov r4, s22 +; CHECK-NEXT: adds.w r12, r2, r4 +; CHECK-NEXT: vmov.u8 r4, q4[8] +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: vmov.u8 r4, q4[9] +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov.u8 r4, q4[10] +; CHECK-NEXT: vmov.16 q5[2], r4 +; CHECK-NEXT: vmov.u8 r4, q4[11] +; CHECK-NEXT: vmov.16 q5[3], r4 +; CHECK-NEXT: vmov.u8 r4, q4[12] +; CHECK-NEXT: vmov.16 q5[4], r4 +; CHECK-NEXT: vmov.u8 r4, q4[13] +; CHECK-NEXT: vmov.16 q5[5], r4 +; CHECK-NEXT: vmov.u8 r4, q4[14] +; CHECK-NEXT: vmov.16 q5[6], r4 +; CHECK-NEXT: vmov.u8 r4, q4[15] +; CHECK-NEXT: vmov.16 q5[7], r4 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: vcmp.i16 ne, q5, zr ; CHECK-NEXT: vpsel q2, q3, q2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.32 q3[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[1] -; CHECK-NEXT: vmov.32 q3[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.32 q3[3], r2 -; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: vmov.u16 r4, q2[0] ; CHECK-NEXT: vmov.32 q3[0], r4 +; CHECK-NEXT: vmov.u16 r4, q2[1] ; CHECK-NEXT: vmov.32 q3[1], r4 -; CHECK-NEXT: ubfx r4, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 +; CHECK-NEXT: vmov.u16 r4, q2[2] ; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: vmov.u16 r4, q2[3] ; CHECK-NEXT: vmov.32 q3[3], r4 -; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[9] -; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmrs r4, p0 +; CHECK-NEXT: and r2, r4, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: ubfx r2, r4, #4, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: vmov.32 q4[2], r2 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r4, s13 -; CHECK-NEXT: adds.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adc.w lr, lr, r4 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, lr, r3 -; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: ubfx r2, r4, #8, #1 +; CHECK-NEXT: rsb.w r2, r2, #0 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: ubfx r2, r4, #12, #1 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q3[1], r3 ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: vmov.32 q4[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[11] ; CHECK-NEXT: vmov.32 q4[2], r2 ; 
CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s15 -; CHECK-NEXT: adc.w lr, r12, r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adds.w r12, r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.32 q3[0], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.32 q3[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.32 q3[3], r2 -; CHECK-NEXT: adc.w lr, lr, r4 -; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r4, r2, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov.32 q2[0], r4 -; CHECK-NEXT: vmov.32 q2[1], r4 -; CHECK-NEXT: ubfx r4, r2, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov.32 q2[2], r4 -; CHECK-NEXT: vmov.32 q2[3], r4 -; CHECK-NEXT: vmov.u8 r4, q0[12] +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: adds.w r12, r2, r4 +; CHECK-NEXT: vmov.u16 r4, q2[4] +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: vmov.32 q3[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: vmov.u16 r4, q2[5] +; CHECK-NEXT: vmov.32 q3[1], r4 +; CHECK-NEXT: vmov.u16 r4, q2[6] ; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: vmov.u16 r4, q2[7] +; CHECK-NEXT: vmov.32 q3[3], r4 +; CHECK-NEXT: vcmp.i32 ne, q3, zr +; CHECK-NEXT: vmrs r4, p0 +; CHECK-NEXT: and r2, r4, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: ubfx r2, r4, #4, #1 +; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmov.32 q2[2], r2 +; CHECK-NEXT: vmov.u8 r2, q0[12] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: adds.w r12, r12, r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: adc.w lr, lr, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r12, lr, r3 -; CHECK-NEXT: ubfx r3, r2, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: ubfx r2, r4, #8, #1 +; CHECK-NEXT: rsb.w r2, r2, #0 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: ubfx r2, r4, #12, #1 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vmov.u8 r2, q0[14] ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q0[15] ; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: vand q0, q3, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r12, r4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} @@ -2703,23 +2582,18 @@ ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: vmov.32 q3[0], r2 -; CHECK-NEXT: vmov.32 q3[1], r2 ; 
CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q3[3], r2 ; CHECK-NEXT: vand q0, q0, q3 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: orr.w r12, r3, r2 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -215,14 +215,20 @@ ; CHECK-LABEL: add_v2i16_v2i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q2, #0xffff -; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umlal r0, r1, r3, r2 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -429,16 +435,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_v16i8_v16i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.u8 r1, q0[2] ; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.u8 r0, q1[3] ; CHECK-NEXT: vmov.32 q4[0], r1 -; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: vmov.u8 r1, q0[3] ; CHECK-NEXT: vmov.32 q3[2], r0 ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vmov.32 q4[2], r1 @@ -446,192 +450,174 @@ ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r0, s14 ; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r12, r1, r1, r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: umull r2, r3, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[1] +; CHECK-NEXT: vmov.32 q4[2], r3 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: orr.w lr, r3, r1 -; CHECK-NEXT: vmov.u8 r3, q1[2] -; CHECK-NEXT: vmov.32 q3[0], r3 -; CHECK-NEXT: vmov.u8 r3, q1[3] -; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: add r2, r12 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: vmov.u8 r2, q1[0] +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov r12, s12 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[1] +; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: 
vand q3, q3, q2 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: umull r0, r3, r0, r3 -; CHECK-NEXT: vmov.32 q5[0], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.32 q5[1], r3 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: umull r0, r3, r3, r0 -; CHECK-NEXT: vmov.32 q5[2], r0 -; CHECK-NEXT: vmov.32 q5[3], r3 -; CHECK-NEXT: vmov r1, s20 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, lr, r0 -; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: smlabb r1, r1, r3, r2 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: adds.w r1, r1, r12 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: adc r2, r2, #0 ; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: vmov.u8 r2, q1[4] -; CHECK-NEXT: vmov.u8 r3, q0[4] ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q1[5] -; CHECK-NEXT: vmov.32 q4[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[5] ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[0], r2 +; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: muls r0, r3, r0 +; CHECK-NEXT: vmov.u8 r3, q0[6] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[7] +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r3 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r2, s21 ; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[6] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] +; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.32 q4[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[7] ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[0], r2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: muls r0, r3, r0 +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[9] +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r3 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r2, s21 ; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[8] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vmov 
r0, s16 ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q1[9] -; CHECK-NEXT: vmov.32 q4[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[9] ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[0], r2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: muls r0, r3, r0 +; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[11] +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r3 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r2, s21 ; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[10] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q1[11] -; CHECK-NEXT: vmov.32 q4[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[11] ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[0], r2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.32 q5[1], r3 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: muls r0, r3, r0 +; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q0[13] +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r3 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r2, s21 ; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[12] -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: vmov r0, s16 ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q1[13] -; CHECK-NEXT: vmov.32 q4[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[13] ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q4[2], r3 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[0], r2 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.32 q5[1], r3 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r3 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: muls r0, r3, r0 +; CHECK-NEXT: vmov.u8 r3, q0[14] +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: adds.w r0, r0, r12 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: adds r0, 
r0, r2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q1[14] +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q1[15] -; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.u8 r3, q0[14] ; CHECK-NEXT: vand q1, q3, q2 ; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: vmov.u8 r3, q0[15] +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vand q0, q3, q2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umlal r0, r1, r3, r2 -; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umlal r0, r1, r3, r2 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: muls r0, r3, r0 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -811,16 +797,15 @@ ; CHECK-LABEL: add_v2i8_v2i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q2, #0xff -; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: mla r0, r2, r1, r0 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> @@ -1119,20 +1104,24 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xffff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s6 -; CHECK-NEXT: umull r2, lr, r3, r2 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umlal r2, lr, r3, r12 +; CHECK-NEXT: vmov r12, s4 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: mul r3, r3, r12 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc r3, r12, #0 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> %yy = zext <2 x i16> %y to <2 x i64> @@ -1356,16 +1345,16 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov.u8 r2, q1[0] -; CHECK-NEXT: vmov.u8 r3, q0[0] +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.u8 r2, q1[2] +; CHECK-NEXT: vmov.u8 r3, q0[2] ; CHECK-NEXT: vmov.32 q3[0], r2 -; CHECK-NEXT: vmov.u8 
r2, q1[1] +; CHECK-NEXT: vmov.u8 r2, q1[3] ; CHECK-NEXT: vmov.32 q4[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[1] +; CHECK-NEXT: vmov.u8 r3, q0[3] ; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vmov.32 q4[2], r3 @@ -1373,194 +1362,176 @@ ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov.u8 r4, q0[2] -; CHECK-NEXT: umull r12, lr, r3, r2 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[3] -; CHECK-NEXT: vmov.32 q4[2], r4 -; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: orr.w lr, lr, r3 -; CHECK-NEXT: vmov.u8 r3, q1[2] +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q1[0] +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: vmov.32 q3[2], r12 +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov lr, s12 ; CHECK-NEXT: vmov.32 q3[0], r3 -; CHECK-NEXT: vmov.u8 r3, q1[3] +; CHECK-NEXT: vmov.u8 r3, q1[1] +; CHECK-NEXT: vmov.u8 r2, q0[1] ; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: add r2, r12 +; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov.32 q5[0], r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov.32 q5[2], r3 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[4] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[4] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[5] -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[5] -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: smlabb r2, r4, r3, r2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vmov.u8 r4, q1[4] +; CHECK-NEXT: vmov.32 q3[0], r4 +; CHECK-NEXT: vmov.u8 r4, q1[5] +; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: adds.w r2, r2, lr ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: adc lr, r3, #0 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[2], r5 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[6] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[6] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[7] -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[7] -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov 
r3, s16 +; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: muls r3, r4, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc r4, lr, #0 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: adc lr, r4, #0 +; CHECK-NEXT: vmov.u8 r4, q1[6] +; CHECK-NEXT: vmov.u8 r2, q0[6] +; CHECK-NEXT: vmov.32 q3[0], r4 +; CHECK-NEXT: vmov.u8 r4, q1[7] +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: vmov.32 q4[2], r2 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[2], r5 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[8] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[9] -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[9] -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: muls r3, r4, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc r4, lr, #0 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: adc lr, r4, #0 +; CHECK-NEXT: vmov.u8 r4, q1[8] +; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: vmov.32 q3[0], r4 +; CHECK-NEXT: vmov.u8 r4, q1[9] +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: vmov.32 q4[2], r2 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[2], r5 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[10] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[11] -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[11] -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: muls r3, r4, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc r4, lr, #0 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: adc lr, r4, #0 +; CHECK-NEXT: vmov.u8 r4, q1[10] +; CHECK-NEXT: vmov.u8 r2, q0[10] +; CHECK-NEXT: vmov.32 q3[0], r4 +; CHECK-NEXT: vmov.u8 r4, q1[11] +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: 
vmov.32 q3[2], r4 +; CHECK-NEXT: vmov.32 q4[2], r2 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[2], r5 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[12] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[13] -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[13] -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.32 q4[2], r4 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: muls r3, r4, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc r4, lr, #0 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: adc lr, r4, #0 +; CHECK-NEXT: vmov.u8 r4, q1[12] +; CHECK-NEXT: vmov.u8 r2, q0[12] +; CHECK-NEXT: vmov.32 q3[0], r4 +; CHECK-NEXT: vmov.u8 r4, q1[13] +; CHECK-NEXT: vmov.32 q4[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: vmov.32 q4[2], r2 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.32 q5[1], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: umull r5, r4, r4, r5 -; CHECK-NEXT: vmov.32 q5[2], r5 -; CHECK-NEXT: vmov.32 q5[3], r4 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s22 -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[14] -; CHECK-NEXT: vmov.32 q3[0], r5 -; CHECK-NEXT: vmov.u8 r5, q1[15] -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.32 q3[2], r5 -; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: vand q1, q3, q2 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: muls r3, r4, r3 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc r4, lr, #0 +; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: adc lr, r4, #0 +; CHECK-NEXT: vmov.u8 r4, q1[14] ; CHECK-NEXT: vmov.32 q3[0], r4 -; CHECK-NEXT: vmov.u8 r4, q0[15] +; CHECK-NEXT: vmov.u8 r4, q1[15] ; CHECK-NEXT: vmov.32 q3[2], r4 -; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: vand q1, q3, q2 +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov.32 q3[2], r2 ; CHECK-NEXT: vand q0, q3, q2 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: umlal r2, r3, r4, r5 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: umlal r2, r3, r4, r5 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: muls r3, r4, r3 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov r3, s0 +; 
CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc r4, lr, #0 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc r3, r4, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -1745,22 +1716,18 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v2i8_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umull r12, lr, r3, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: orr.w r3, r3, lr +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: mla r2, r2, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> %yy = zext <2 x i8> %y to <2 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -289,44 +289,36 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) { ; CHECK-LABEL: add_v2i16_v2i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i64 q3, #0xffff +; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vand q4, q0, q3 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vand q1, q2, q3 -; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov.32 q0[0], r1 ; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.32 q2[1], r0 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i16> %b, zeroinitializer @@ -609,18 +601,18 @@ ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #88 -; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: .pad #56 +; CHECK-NEXT: sub sp, #56 +; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: 
vstrw.32 q0, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q1, #0xff ; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill ; CHECK-NEXT: vpsel q5, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov.u8 r0, q5[0] -; CHECK-NEXT: vmov.i64 q4, #0xff +; CHECK-NEXT: vmov.u8 r2, q4[0] ; CHECK-NEXT: vmov.16 q2[0], r0 ; CHECK-NEXT: vmov.u8 r0, q5[1] ; CHECK-NEXT: vmov.16 q2[1], r0 @@ -647,81 +639,68 @@ ; CHECK-NEXT: vmov.u16 r0, q6[3] ; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vcmp.i32 ne, q2, zr +; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.u8 r2, q1[0] ; CHECK-NEXT: and r1, r0, #1 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: vmov.32 q7[0], r1 -; CHECK-NEXT: vmov.32 q7[1], r1 ; CHECK-NEXT: ubfx r1, r0, #4, #1 ; CHECK-NEXT: rsbs r1, r1, #0 ; CHECK-NEXT: vmov.32 q7[2], r1 -; CHECK-NEXT: vmov.32 q7[3], r1 ; CHECK-NEXT: vmov.u8 r1, q3[0] ; CHECK-NEXT: vmov.32 q0[0], r1 ; CHECK-NEXT: vmov.u8 r1, q3[1] ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vand q2, q0, q4 +; CHECK-NEXT: vand q1, q0, q2 ; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.u8 r2, q1[1] -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov.u8 r2, q4[1] +; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: vand q1, q0, q4 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: umull r1, r2, r2, r1 -; CHECK-NEXT: vmov.32 q0[0], r1 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: umull r1, r2, r2, r1 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vand q0, q0, q7 ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r3, r12 +; CHECK-NEXT: adds.w r12, r2, r1 +; CHECK-NEXT: adc r2, r3, #0 ; CHECK-NEXT: ubfx r3, r0, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: ubfx r0, r0, #12, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q7[0], r3 ; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov.32 q7[1], r3 -; CHECK-NEXT: vmov.u8 r3, q2[2] ; CHECK-NEXT: vmov.32 q7[2], r0 -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.32 q7[3], r0 ; CHECK-NEXT: vmov.u8 r0, q3[2] +; CHECK-NEXT: vmov.u8 r3, q4[2] ; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: vmov.u8 r0, q3[3] -; CHECK-NEXT: vmov.u8 r3, q2[3] +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.u8 r3, q4[3] ; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: umull r0, r3, r3, r0 -; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: umull r0, r3, r3, r0 -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vand q0, q2, q7 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: muls r0, r3, r0 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: 
adds r1, r1, r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adcs r2, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds.w r12, r1, r0 -; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: muls r1, r3, r1 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vand q0, q0, q7 +; CHECK-NEXT: vmov q7, q4 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, r2, #0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q6[4] ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q6[5] @@ -730,89 +709,76 @@ ; CHECK-NEXT: vmov.32 q0[2], r2 ; CHECK-NEXT: vmov.u16 r2, q6[7] ; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vmov.u8 r0, q4[4] ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill -; CHECK-NEXT: and r3, lr, #1 +; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: vmov.u8 r0, q4[5] +; CHECK-NEXT: vmov.32 q1[2], r0 +; CHECK-NEXT: adc lr, r1, #0 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q6[0], r3 -; CHECK-NEXT: vmov.32 q6[1], r3 -; CHECK-NEXT: ubfx r3, lr, #4, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q6[2], r3 -; CHECK-NEXT: vmov.32 q6[3], r3 ; CHECK-NEXT: vmov.u8 r3, q3[4] ; CHECK-NEXT: vmov.32 q0[0], r3 ; CHECK-NEXT: vmov.u8 r3, q3[5] -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov.u8 r0, q3[4] +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: muls r0, r3, r0 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.32 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q3[5] -; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vmov q7, q3 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: umull r0, r3, r0, r3 -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: muls r1, r3, r1 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vand q0, q0, q6 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, lr, #0 +; CHECK-NEXT: adds.w r12, r0, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 +; CHECK-NEXT: vmov.32 q6[0], r3 +; CHECK-NEXT: vmov.u8 r3, q4[6] +; CHECK-NEXT: vmov.32 q6[2], r2 +; CHECK-NEXT: vmov.u8 r2, q1[6] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.u8 r3, q4[7] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q1[2], r3 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: umull r0, r3, r3, r0 -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vand q0, q2, q6 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs 
r1, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: adds r3, r3, r2 -; CHECK-NEXT: vmov.u8 r2, q3[6] -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: ubfx r0, lr, #8, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov.32 q6[0], r0 -; CHECK-NEXT: vmov.32 q6[1], r0 -; CHECK-NEXT: ubfx r0, lr, #12, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov.32 q6[2], r0 -; CHECK-NEXT: vmov.32 q6[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: muls r0, r3, r0 ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.u8 r2, q3[7] -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmov.32 q1[2], r2 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vand q1, q1, q4 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vand q0, q0, q6 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: umull r0, r2, r2, r0 -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.32 q2[1], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: umull r0, r2, r2, r0 -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.32 q2[3], r2 -; CHECK-NEXT: vand q0, q2, q6 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q5[8] +; CHECK-NEXT: vmov.u8 r0, q7[8] ; CHECK-NEXT: vmov.16 q6[0], r2 ; CHECK-NEXT: vmov.u8 r2, q5[9] ; CHECK-NEXT: vmov.16 q6[1], r2 @@ -828,9 +794,8 @@ ; CHECK-NEXT: vmov.16 q6[6], r2 ; CHECK-NEXT: vmov.u8 r2, q5[15] ; CHECK-NEXT: vmov.16 q6[7], r2 -; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: adc lr, r1, #0 ; CHECK-NEXT: vcmp.i16 ne, q6, zr -; CHECK-NEXT: vmov.u8 r0, q7[8] ; CHECK-NEXT: vpsel q3, q1, q0 ; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov.u16 r2, q3[0] @@ -844,79 +809,67 @@ ; CHECK-NEXT: vmov.32 q0[3], r2 ; CHECK-NEXT: vmov.32 q1[2], r0 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vand q1, q1, q5 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: and r3, lr, #1 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q4[0], r3 -; CHECK-NEXT: vmov.32 q4[1], r3 -; CHECK-NEXT: ubfx r3, lr, #4, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q4[2], r3 -; CHECK-NEXT: vmov.32 q4[3], r3 -; CHECK-NEXT: vmov.u8 r3, q6[8] +; CHECK-NEXT: vmov.u8 r3, q5[8] ; CHECK-NEXT: vmov.32 q0[0], r3 -; CHECK-NEXT: vmov.u8 r3, q6[9] +; CHECK-NEXT: vmov.u8 r3, q5[9] ; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vand q0, q0, q5 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: muls r0, r3, r0 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r0, r3, r0, r3 -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: muls r1, r3, r1 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: 
vmov r3, s2 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, lr, #0 +; CHECK-NEXT: adds.w r12, r0, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 +; CHECK-NEXT: vmov.32 q4[0], r3 +; CHECK-NEXT: vmov.u8 r3, q7[10] +; CHECK-NEXT: vmov.32 q4[2], r2 +; CHECK-NEXT: vmov.u8 r2, q5[10] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.u8 r2, q5[11] +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.u8 r3, q7[11] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q1[2], r3 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: umull r0, r3, r3, r0 -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vand q0, q2, q4 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: adds r3, r3, r2 -; CHECK-NEXT: vmov.u8 r2, q7[10] -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.u8 r2, q7[11] -; CHECK-NEXT: vmov.32 q1[2], r2 -; CHECK-NEXT: vand q1, q1, q5 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: ubfx r0, lr, #8, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov.32 q4[0], r0 -; CHECK-NEXT: vmov.32 q4[1], r0 -; CHECK-NEXT: ubfx r0, lr, #12, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov.32 q4[2], r0 -; CHECK-NEXT: vmov.32 q4[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[10] +; CHECK-NEXT: muls r0, r3, r0 ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q6[11] -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vand q0, q0, q5 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vand q0, q0, q4 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: umull r0, r2, r2, r0 -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.32 q2[1], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: umull r0, r2, r2, r0 -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.32 q2[3], r2 -; CHECK-NEXT: vand q0, q2, q4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: adds.w r12, r2, r0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds.w r12, r0, r2 ; CHECK-NEXT: vmov.u16 r2, q3[4] +; CHECK-NEXT: vmov.u8 r0, q7[12] ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q3[5] ; CHECK-NEXT: vmov.32 q0[1], r2 @@ -924,84 +877,71 @@ ; CHECK-NEXT: vmov.32 q0[2], r2 ; CHECK-NEXT: vmov.u16 r2, q3[7] ; CHECK-NEXT: vmov.32 q0[3], r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmov.u8 r0, q7[12] -; CHECK-NEXT: vmrs lr, p0 ; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmov.u8 r0, q7[13] +; CHECK-NEXT: vmrs r2, p0 ; CHECK-NEXT: vmov.32 q1[2], r0 -; CHECK-NEXT: vand q1, q1, q5 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: and r3, lr, #1 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: adc lr, r1, #0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q3[0], r3 -; CHECK-NEXT: vmov.32 q3[1], r3 -; CHECK-NEXT: ubfx r3, lr, #4, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: vmov.32 q3[3], r3 
-; CHECK-NEXT: vmov.u8 r3, q6[12] +; CHECK-NEXT: vmov.u8 r3, q5[12] ; CHECK-NEXT: vmov.32 q0[0], r3 -; CHECK-NEXT: vmov.u8 r3, q6[13] +; CHECK-NEXT: vmov.u8 r3, q5[13] ; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vand q0, q0, q5 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: muls r0, r3, r0 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r0, r3, r0, r3 -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: muls r1, r3, r1 +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, lr, #0 +; CHECK-NEXT: adds.w r12, r0, r3 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.u8 r3, q7[14] +; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: vmov.u8 r2, q5[14] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.u8 r2, q5[15] +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.u8 r3, q7[15] +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vmov.32 q1[2], r3 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: umull r0, r3, r3, r0 -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vand q0, q2, q3 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adds.w r3, r3, r12 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: adds r3, r3, r2 -; CHECK-NEXT: vmov.u8 r2, q7[14] -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.u8 r2, q7[15] -; CHECK-NEXT: vmov.32 q1[2], r2 -; CHECK-NEXT: vand q1, q1, q5 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: ubfx r0, lr, #8, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov.32 q3[0], r0 -; CHECK-NEXT: vmov.32 q3[1], r0 -; CHECK-NEXT: ubfx r0, lr, #12, #1 -; CHECK-NEXT: rsbs r0, r0, #0 -; CHECK-NEXT: vmov.32 q3[2], r0 -; CHECK-NEXT: vmov.32 q3[3], r0 -; CHECK-NEXT: vmov.u8 r0, q6[14] +; CHECK-NEXT: muls r0, r3, r0 ; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q6[15] -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vand q0, q0, q5 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: umull r0, r2, r2, r0 -; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.32 q2[1], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: umull r0, r2, r2, r0 -; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.32 q2[3], r2 -; CHECK-NEXT: vand q0, q2, q3 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adcs r1, r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: add sp, #56 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: @@ -1360,44 +1300,36 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) { ; CHECK-LABEL: add_v2i8_v2i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; 
CHECK-NEXT: vmov.i64 q3, #0xff +; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vand q4, q0, q3 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.32 q0[1], r1 -; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vand q1, q2, q3 -; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov.32 q0[0], r1 ; CHECK-NEXT: vmov.32 q0[2], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.32 q0[3], r1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov.32 q2[0], r0 -; CHECK-NEXT: vmov.32 q2[1], r0 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: cset r0, eq ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne ; CHECK-NEXT: vmov.32 q2[2], r0 -; CHECK-NEXT: vmov.32 q2[3], r0 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer @@ -1824,49 +1756,39 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i64 q3, #0xffff ; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vand q4, q0, q3 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vand q1, q2, q3 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.32 q0[3], r3 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.32 q2[1], r2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 ; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adc r3, r12, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i16> %b, zeroinitializer %xx = zext <2 x i16> %x to <2 x i64> @@ -2162,22 +2084,21 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; 
CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #80 -; CHECK-NEXT: sub sp, #80 +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q1, #0xff ; CHECK-NEXT: vcmp.i8 eq, q2, zr -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill ; CHECK-NEXT: vpsel q5, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov.u8 r2, q5[0] -; CHECK-NEXT: vmov.i64 q4, #0xff ; CHECK-NEXT: vmov.16 q2[0], r2 ; CHECK-NEXT: vmov.u8 r2, q5[1] ; CHECK-NEXT: vmov.16 q2[1], r2 @@ -2204,80 +2125,75 @@ ; CHECK-NEXT: vmov.u16 r2, q6[3] ; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmrs lr, p0 -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: and r3, lr, #1 +; CHECK-NEXT: vmov.i64 q2, #0xff +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r3, r2, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q7[0], r3 -; CHECK-NEXT: vmov.32 q7[1], r3 -; CHECK-NEXT: ubfx r3, lr, #4, #1 +; CHECK-NEXT: ubfx r3, r2, #4, #1 ; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: vmov.32 q7[2], r3 -; CHECK-NEXT: vmov.32 q7[3], r3 ; CHECK-NEXT: vmov.u8 r3, q3[0] ; CHECK-NEXT: vmov.32 q0[0], r3 ; CHECK-NEXT: vmov.u8 r3, q3[1] ; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vmov.u8 r3, q1[0] -; CHECK-NEXT: vand q2, q0, q4 +; CHECK-NEXT: vmov.u8 r3, q4[0] +; CHECK-NEXT: vand q1, q0, q2 ; CHECK-NEXT: vmov.32 q0[0], r3 -; CHECK-NEXT: vmov.u8 r3, q1[1] -; CHECK-NEXT: vmov r12, s8 +; CHECK-NEXT: vmov.u8 r3, q4[1] +; CHECK-NEXT: vmov r12, s6 ; CHECK-NEXT: vmov.32 q0[2], r3 -; CHECK-NEXT: vand q1, q0, q4 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: umull r3, r2, r3, r12 +; CHECK-NEXT: vmov lr, s4 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: mul r12, r3, r12 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: mul r3, r3, lr +; CHECK-NEXT: mov.w lr, #0 ; CHECK-NEXT: vmov.32 q0[0], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov.32 q0[1], r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: vmov.32 q0[3], r3 +; CHECK-NEXT: vmov.32 q0[2], r12 ; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds r4, r4, r2 -; CHECK-NEXT: ubfx r2, lr, #8, #1 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: adds.w r5, r3, r12 +; CHECK-NEXT: ubfx r3, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: vmov.32 q7[0], r3 ; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: vmov.32 q7[0], r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov.32 q7[1], r2 -; CHECK-NEXT: ubfx r2, lr, #12, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r3, q2[2] ; CHECK-NEXT: vmov.32 q7[2], r2 -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.32 q7[3], r2 ; CHECK-NEXT: vmov.u8 r2, q3[2] +; CHECK-NEXT: vmov.u8 r3, q4[2] ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u8 r2, q3[3] -; CHECK-NEXT: vmov.u8 r3, q2[3] +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.u8 
r3, q4[3] ; CHECK-NEXT: vmov.32 q0[2], r2 ; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vand q0, q2, q7 +; CHECK-NEXT: adc r12, lr, #0 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adc.w lr, r12, r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: adds.w r12, r3, r2 +; CHECK-NEXT: muls r3, r4, r3 +; CHECK-NEXT: vmov.u8 r4, q4[4] +; CHECK-NEXT: vmov.32 q1[0], r4 +; CHECK-NEXT: vmov.u8 r4, q4[5] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q1[2], r4 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q7 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov q7, q4 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: adc r3, r12, #0 +; CHECK-NEXT: adds.w r12, r2, r5 ; CHECK-NEXT: vmov.u16 r2, q6[4] ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q6[5] @@ -2286,92 +2202,71 @@ ; CHECK-NEXT: vmov.32 q0[2], r2 ; CHECK-NEXT: vmov.u16 r2, q6[7] ; CHECK-NEXT: vmov.32 q0[3], r2 -; CHECK-NEXT: adc.w lr, lr, r4 +; CHECK-NEXT: adc lr, r3, #0 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill -; CHECK-NEXT: and r4, r6, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov.32 q6[0], r4 -; CHECK-NEXT: vmov.32 q6[1], r4 -; CHECK-NEXT: ubfx r4, r6, #4, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov.32 q6[2], r4 -; CHECK-NEXT: vmov.32 q6[3], r4 -; CHECK-NEXT: vmov.u8 r4, q3[4] -; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov.u8 r4, q3[5] -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.32 q0[2], r4 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vmov.u8 r3, q3[4] -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q3[5] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov q7, q3 -; CHECK-NEXT: vand q1, q1, q4 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: umull r3, r4, r3, r4 -; CHECK-NEXT: vmov.32 q2[0], r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov.32 q2[1], r4 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov.32 q2[2], r3 -; CHECK-NEXT: vmov.32 q2[3], r4 -; CHECK-NEXT: vand q0, q2, q6 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q6[0], r5 +; CHECK-NEXT: ubfx r5, r2, #4, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q6[2], r5 +; CHECK-NEXT: vmov.u8 r5, q3[4] +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov.u8 r5, q3[5] +; CHECK-NEXT: vmov.32 q0[2], r5 +; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds.w r4, r4, r12 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds r4, r4, r5 -; CHECK-NEXT: 
adc.w r12, r3, r2 -; CHECK-NEXT: ubfx r2, r6, #8, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.u8 r3, q3[6] -; CHECK-NEXT: vmov.32 q6[0], r2 -; CHECK-NEXT: vmov.32 q6[1], r2 -; CHECK-NEXT: ubfx r2, r6, #12, #1 +; CHECK-NEXT: muls r5, r4, r5 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: muls r3, r4, r3 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q0[2], r5 +; CHECK-NEXT: vand q0, q0, q6 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc r5, lr, #0 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adc r12, r5, #0 +; CHECK-NEXT: ubfx r5, r2, #8, #1 +; CHECK-NEXT: ubfx r2, r2, #12, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q6[0], r5 ; CHECK-NEXT: rsbs r2, r2, #0 ; CHECK-NEXT: vmov.32 q6[2], r2 -; CHECK-NEXT: vmov.32 q6[3], r2 ; CHECK-NEXT: vmov.u8 r2, q1[6] +; CHECK-NEXT: vmov.u8 r5, q4[6] ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q3[7] +; CHECK-NEXT: vmov.32 q1[0], r5 +; CHECK-NEXT: vmov.u8 r5, q4[7] ; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vand q0, q0, q4 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q1[2], r5 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.32 q2[1], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r3 -; CHECK-NEXT: vand q0, q2, q6 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: muls r2, r5, r2 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: muls r5, r4, r5 +; CHECK-NEXT: vmov.u8 r4, q7[8] +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vand q0, q0, q6 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: vmov.u8 r5, q7[8] -; CHECK-NEXT: adc.w r3, r2, r6 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc r3, r12, #0 +; CHECK-NEXT: adds.w r12, r2, r5 ; CHECK-NEXT: vmov.u8 r2, q5[8] +; CHECK-NEXT: adc lr, r3, #0 ; CHECK-NEXT: vmov.16 q6[0], r2 ; CHECK-NEXT: vmov.u8 r2, q5[9] ; CHECK-NEXT: vmov.16 q6[1], r2 @@ -2389,9 +2284,9 @@ ; CHECK-NEXT: vmov.16 q6[7], r2 ; CHECK-NEXT: vcmp.i16 ne, q6, zr ; CHECK-NEXT: vpsel q3, q1, q0 -; CHECK-NEXT: vmov.32 q1[0], r5 +; CHECK-NEXT: vmov.32 q1[0], r4 ; CHECK-NEXT: vmov.u16 r2, q3[0] -; CHECK-NEXT: vmov.u8 r5, q7[9] +; CHECK-NEXT: vmov.u8 r4, q7[9] ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q3[1] ; CHECK-NEXT: vmov.32 q0[1], r2 @@ -2399,88 +2294,75 @@ ; CHECK-NEXT: vmov.32 q0[2], r2 ; CHECK-NEXT: vmov.u16 r2, q3[3] ; CHECK-NEXT: vmov.32 q0[3], r2 -; CHECK-NEXT: vmov.32 q1[2], r5 +; CHECK-NEXT: vmov.32 q1[2], r4 ; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vand q1, q1, q5 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: and r6, r2, #1 -; CHECK-NEXT: 
rsbs r6, r6, #0 -; CHECK-NEXT: vmov.32 q4[0], r6 -; CHECK-NEXT: vmov.32 q4[1], r6 -; CHECK-NEXT: ubfx r6, r2, #4, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: vmov.32 q4[2], r6 -; CHECK-NEXT: vmov.32 q4[3], r6 -; CHECK-NEXT: vmov.u8 r6, q6[8] -; CHECK-NEXT: vmov.32 q0[0], r6 -; CHECK-NEXT: vmov.u8 r6, q6[9] -; CHECK-NEXT: vmov.32 q0[2], r6 -; CHECK-NEXT: vand q0, q0, q5 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: vmov.32 q2[0], r6 -; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: vmov.32 q2[1], r5 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: vmov.32 q2[2], r6 -; CHECK-NEXT: vmov.32 q2[3], r5 -; CHECK-NEXT: vand q0, q2, q4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r6, s1 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adds.w r5, r5, r12 -; CHECK-NEXT: adcs r6, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adc.w r12, r6, r4 -; CHECK-NEXT: ubfx r6, r2, #8, #1 -; CHECK-NEXT: rsbs r6, r6, #0 +; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q4[0], r5 +; CHECK-NEXT: ubfx r5, r2, #4, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q4[2], r5 +; CHECK-NEXT: vmov.u8 r5, q5[8] +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov.u8 r5, q5[9] +; CHECK-NEXT: vmov.32 q0[2], r5 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: muls r5, r4, r5 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: muls r3, r4, r3 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q0[2], r5 +; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc r5, lr, #0 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adc r12, r5, #0 +; CHECK-NEXT: ubfx r5, r2, #8, #1 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: vmov.32 q4[0], r6 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q4[0], r5 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q4[1], r6 -; CHECK-NEXT: vmov.u8 r6, q7[10] ; CHECK-NEXT: vmov.32 q4[2], r2 -; CHECK-NEXT: vmov.32 q1[0], r6 -; CHECK-NEXT: vmov.32 q4[3], r2 -; CHECK-NEXT: vmov.u8 r2, q6[10] +; CHECK-NEXT: vmov.u8 r2, q5[10] +; CHECK-NEXT: vmov.u8 r5, q7[10] ; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.u8 r2, q6[11] -; CHECK-NEXT: vmov.u8 r6, q7[11] +; CHECK-NEXT: vmov.u8 r2, q5[11] +; CHECK-NEXT: vmov.32 q1[0], r5 +; CHECK-NEXT: vmov.u8 r5, q7[11] ; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: vmov.32 q1[2], r6 -; CHECK-NEXT: vand q0, q0, q5 -; CHECK-NEXT: vand q1, q1, q5 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: umull r2, r6, r6, r2 -; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q1[2], r5 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.32 q2[1], r6 -; CHECK-NEXT: vmov r6, s6 -; CHECK-NEXT: umull r2, r6, r6, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r6 -; CHECK-NEXT: vand q0, q2, q4 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: muls r2, r5, r2 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: muls r5, r4, r5 +; CHECK-NEXT: vmov.u8 r4, q7[12] +; CHECK-NEXT: vmov.32 q1[0], r4 +; CHECK-NEXT: vmov.u8 r4, q7[13] +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov.32 q1[2], r4 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vand q0, q0, q4 +; CHECK-NEXT: 
vmov r4, s6 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w r12, r3, r5 -; CHECK-NEXT: vmov.u8 r5, q7[12] -; CHECK-NEXT: vmov.32 q1[0], r5 -; CHECK-NEXT: vmov.u8 r5, q7[13] -; CHECK-NEXT: vmov.32 q1[2], r5 -; CHECK-NEXT: vand q1, q1, q5 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: adc.w r3, r2, r6 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc r3, r12, #0 +; CHECK-NEXT: adds.w r12, r2, r5 ; CHECK-NEXT: vmov.u16 r2, q3[4] +; CHECK-NEXT: adc lr, r3, #0 ; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q3[5] ; CHECK-NEXT: vmov.32 q0[1], r2 @@ -2488,81 +2370,69 @@ ; CHECK-NEXT: vmov.32 q0[2], r2 ; CHECK-NEXT: vmov.u16 r2, q3[7] ; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vmrs r2, p0 -; CHECK-NEXT: and r6, r2, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: vmov.32 q3[0], r6 -; CHECK-NEXT: vmov.32 q3[1], r6 -; CHECK-NEXT: ubfx r6, r2, #4, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: vmov.32 q3[2], r6 -; CHECK-NEXT: vmov.32 q3[3], r6 -; CHECK-NEXT: vmov.u8 r6, q6[12] -; CHECK-NEXT: vmov.32 q0[0], r6 -; CHECK-NEXT: vmov.u8 r6, q6[13] -; CHECK-NEXT: vmov.32 q0[2], r6 -; CHECK-NEXT: vand q0, q0, q5 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: vmov.32 q2[0], r6 -; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: vmov.32 q2[1], r5 -; CHECK-NEXT: vmov r5, s6 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: vmov.32 q2[2], r6 -; CHECK-NEXT: vmov.32 q2[3], r5 -; CHECK-NEXT: vand q0, q2, q3 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r6, s1 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: adds.w r5, r5, r12 -; CHECK-NEXT: adcs r6, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adc.w r12, r6, r4 -; CHECK-NEXT: ubfx r6, r2, #8, #1 -; CHECK-NEXT: rsbs r6, r6, #0 +; CHECK-NEXT: and r5, r2, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q3[0], r5 +; CHECK-NEXT: ubfx r5, r2, #4, #1 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q3[2], r5 +; CHECK-NEXT: vmov.u8 r5, q5[12] +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov.u8 r5, q5[13] +; CHECK-NEXT: vmov.32 q0[2], r5 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: muls r5, r4, r5 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: muls r3, r4, r3 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q0[2], r5 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: adds.w r3, r3, r12 +; CHECK-NEXT: adc r5, lr, #0 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adc r12, r5, #0 +; CHECK-NEXT: ubfx r5, r2, #8, #1 ; CHECK-NEXT: ubfx r2, r2, #12, #1 -; CHECK-NEXT: vmov.32 q3[0], r6 +; CHECK-NEXT: rsbs r5, r5, #0 +; CHECK-NEXT: vmov.32 q3[0], r5 ; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: vmov.32 q3[1], r6 -; CHECK-NEXT: vmov.u8 r6, q7[14] ; CHECK-NEXT: vmov.32 q3[2], r2 -; CHECK-NEXT: vmov.32 q1[0], r6 -; CHECK-NEXT: vmov.32 q3[3], r2 -; CHECK-NEXT: vmov.u8 r2, q6[14] +; CHECK-NEXT: vmov.u8 r2, q5[14] +; CHECK-NEXT: vmov.u8 r5, q7[14] ; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.u8 r2, q6[15] -; CHECK-NEXT: vmov.u8 r6, q7[15] +; CHECK-NEXT: vmov.u8 r2, q5[15] +; CHECK-NEXT: vmov.32 q1[0], r5 +; CHECK-NEXT: vmov.u8 r5, q7[15] ; CHECK-NEXT: vmov.32 q0[2], r2 -; CHECK-NEXT: vmov.32 q1[2], r6 -; CHECK-NEXT: vand q0, q0, q5 -; CHECK-NEXT: vand q1, q1, q5 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: umull r2, r6, r6, r2 -; 
CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.32 q1[2], r5 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.32 q2[1], r6 -; CHECK-NEXT: vmov r6, s6 -; CHECK-NEXT: umull r2, r6, r6, r2 -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r6 -; CHECK-NEXT: vand q0, q2, q3 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: muls r2, r5, r2 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: muls r5, r4, r5 +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r2, r6 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: add sp, #80 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc r3, r12, #0 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> @@ -2925,49 +2795,39 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v2i8_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.i64 q3, #0xff ; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: vand q4, q0, q3 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vand q1, q2, q3 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: vmov.32 q0[2], r2 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov.32 q0[0], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.32 q0[3], r3 +; CHECK-NEXT: vmov.32 q0[2], r12 +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.32 q2[1], r2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: cset r2, eq ; CHECK-NEXT: tst.w r2, #1 ; CHECK-NEXT: csetm r2, ne ; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.32 q2[3], r2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s3 -; CHECK-NEXT: vmov lr, s1 ; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, lr, r12 +; CHECK-NEXT: adc r3, r12, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer %xx = zext <2 x i8> %x to <2 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll @@ -50,48 +50,40 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pSrc, i32 %blockSize, 
<4 x i32> %a) { ; CHECK-LABEL: foo_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vpt.s32 lt, q0, zr ; CHECK-NEXT: vldrwt.u32 q4, [r0] -; CHECK-NEXT: vmov.f64 d0, d8 -; CHECK-NEXT: vmov.i64 q5, #0xffffffff -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vand q6, q0, q5 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov r1, s25 +; CHECK-NEXT: vmov.f64 d10, d8 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vmov.f32 s22, s17 +; CHECK-NEXT: vmov r0, s20 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vmov r0, s22 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov r1, s27 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: vmov.f64 d0, d9 ; CHECK-NEXT: vmov.f32 s2, s19 -; CHECK-NEXT: vand q0, q0, q5 ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r7, s1 ; CHECK-NEXT: vmov d8, r4, r5 +; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: vmov d11, r0, r1 ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: vmov d10, r0, r1 ; CHECK-NEXT: vmov q0, q4 ; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %pSrc, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) diff --git a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll --- a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -49,14 +49,12 @@ define void @zero_test() { ; X32-LABEL: zero_test: ; X32: # %bb.0: # %entry -; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: movlps %xmm0, (%eax) +; X32-NEXT: movl $0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: zero_test: ; X64: # %bb.0: # %entry -; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: movlps %xmm0, (%rax) +; X64-NEXT: movq $0, (%rax) ; X64-NEXT: retq entry: %0 = select <2 x i1> undef, <2 x float> undef, <2 x float> zeroinitializer diff --git a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll --- a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll +++ b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll @@ -29,8 +29,8 @@ ; X86-LABEL: store_64: ; X86: # %bb.0: # %BB ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorps %xmm0, %xmm0 -; X86-NEXT: movlps %xmm0, (%eax) +; X86-NEXT: movl $0, 4(%eax) +; X86-NEXT: movl $0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: store_64: diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll --- a/llvm/test/CodeGen/X86/fold-load-vec.ll +++ b/llvm/test/CodeGen/X86/fold-load-vec.ll @@ -10,8 +10,8 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp) +; CHECK-NEXT: 
movq $0, (%rsp) ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movlps %xmm0, (%rsp) ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movlps %xmm0, (%rsp) ; CHECK-NEXT: movlps %xmm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -2054,22 +2054,22 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,7,42,32] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [32768,4294934528,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm6 -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[0,2] -; SSE2-NEXT: pmuludq %xmm3, %xmm0 -; SSE2-NEXT: pmuludq %xmm5, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pmuludq %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE2-NEXT: pmuludq %xmm3, %xmm2 +; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,4294967295] +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_negative2: diff --git a/llvm/test/CodeGen/X86/nontemporal-3.ll b/llvm/test/CodeGen/X86/nontemporal-3.ll --- a/llvm/test/CodeGen/X86/nontemporal-3.ll +++ b/llvm/test/CodeGen/X86/nontemporal-3.ll @@ -195,33 +195,14 @@ } define void @test_zero_v8f32_align1(<8 x float>* %dst) nounwind { -; SSE2-LABEL: test_zero_v8f32_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v8f32_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorl %eax, %eax -; SSE4A-NEXT: movntiq %rax, 8(%rdi) -; SSE4A-NEXT: movntiq %rax, 24(%rdi) -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v8f32_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: test_zero_v8f32_align1: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8f32_align1: ; AVX: # %bb.0: @@ -245,32 +226,14 @@ } define void @test_zero_v4i64_align1(<4 x i64>* %dst) nounwind { -; SSE2-LABEL: test_zero_v4i64_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, 
%eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v4i64_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v4i64_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v4i64_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v4i64_align1:
 ; AVX: # %bb.0:
@@ -294,32 +257,14 @@
 }
 define void @test_zero_v8i32_align1(<8 x i32>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v8i32_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v8i32_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v8i32_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v8i32_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v8i32_align1:
 ; AVX: # %bb.0:
@@ -343,32 +288,14 @@
 }
 define void @test_zero_v16i16_align1(<16 x i16>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v16i16_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v16i16_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v16i16_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v16i16_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v16i16_align1:
 ; AVX: # %bb.0:
@@ -392,32 +319,14 @@
 }
 define void @test_zero_v32i8_align1(<32 x i8>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v32i8_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v32i8_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v32i8_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v32i8_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v32i8_align1:
 ; AVX: # %bb.0:
@@ -636,45 +545,18 @@
 }
 define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v16f32_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 56(%rdi)
-; SSE2-NEXT: movntiq %rax, 48(%rdi)
-; SSE2-NEXT: movntiq %rax, 40(%rdi)
-; SSE2-NEXT: movntiq %rax, 32(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v16f32_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorl %eax, %eax
-; SSE4A-NEXT: movntiq %rax, 24(%rdi)
-; SSE4A-NEXT: movntiq %rax, 8(%rdi)
-; SSE4A-NEXT: movntiq %rax, 56(%rdi)
-; SSE4A-NEXT: movntiq %rax, 40(%rdi)
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v16f32_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 56(%rdi)
-; SSE41-NEXT: movntiq %rax, 48(%rdi)
-; SSE41-NEXT: movntiq %rax, 40(%rdi)
-; SSE41-NEXT: movntiq %rax, 32(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v16f32_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rax, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v16f32_align1:
 ; AVX: # %bb.0:
@@ -706,44 +588,18 @@
 }
 define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v8i64_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 56(%rdi)
-; SSE2-NEXT: movntiq %rax, 48(%rdi)
-; SSE2-NEXT: movntiq %rax, 40(%rdi)
-; SSE2-NEXT: movntiq %rax, 32(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v8i64_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v8i64_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 56(%rdi)
-; SSE41-NEXT: movntiq %rax, 48(%rdi)
-; SSE41-NEXT: movntiq %rax, 40(%rdi)
-; SSE41-NEXT: movntiq %rax, 32(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v8i64_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rax, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v8i64_align1:
 ; AVX: # %bb.0:
@@ -775,44 +631,18 @@
 }
 define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v16i32_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 56(%rdi)
-; SSE2-NEXT: movntiq %rax, 48(%rdi)
-; SSE2-NEXT: movntiq %rax, 40(%rdi)
-; SSE2-NEXT: movntiq %rax, 32(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v16i32_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v16i32_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 56(%rdi)
-; SSE41-NEXT: movntiq %rax, 48(%rdi)
-; SSE41-NEXT: movntiq %rax, 40(%rdi)
-; SSE41-NEXT: movntiq %rax, 32(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v16i32_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rax, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v16i32_align1:
 ; AVX: # %bb.0:
@@ -844,44 +674,18 @@
 }
 define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v32i16_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 56(%rdi)
-; SSE2-NEXT: movntiq %rax, 48(%rdi)
-; SSE2-NEXT: movntiq %rax, 40(%rdi)
-; SSE2-NEXT: movntiq %rax, 32(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v32i16_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v32i16_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 56(%rdi)
-; SSE41-NEXT: movntiq %rax, 48(%rdi)
-; SSE41-NEXT: movntiq %rax, 40(%rdi)
-; SSE41-NEXT: movntiq %rax, 32(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v32i16_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rax, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v32i16_align1:
 ; AVX: # %bb.0:
@@ -913,44 +717,18 @@
 }
 define void @test_zero_v64i8_align1(<64 x i8>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v64i8_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 56(%rdi)
-; SSE2-NEXT: movntiq %rax, 48(%rdi)
-; SSE2-NEXT: movntiq %rax, 40(%rdi)
-; SSE2-NEXT: movntiq %rax, 32(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v64i8_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v64i8_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 56(%rdi)
-; SSE41-NEXT: movntiq %rax, 48(%rdi)
-; SSE41-NEXT: movntiq %rax, 40(%rdi)
-; SSE41-NEXT: movntiq %rax, 32(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v64i8_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rax, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v64i8_align1:
 ; AVX: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll
--- a/llvm/test/CodeGen/X86/pr41619.ll
+++ b/llvm/test/CodeGen/X86/pr41619.ll
@@ -7,10 +7,9 @@
 ; CHECK: ## %bb.0: ## %bb
 ; CHECK-NEXT: vmovq %xmm0, %rax
 ; CHECK-NEXT: vmovd %eax, %xmm0
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT: vmovq %xmm0, %rax
 ; CHECK-NEXT: movl %eax, (%rax)
-; CHECK-NEXT: vmovlps %xmm1, (%rax)
+; CHECK-NEXT: movq $0, (%rax)
 ; CHECK-NEXT: retq
 bb:
 %tmp = bitcast double %arg to i64
diff --git a/llvm/test/CodeGen/X86/vec_zero_cse.ll b/llvm/test/CodeGen/X86/vec_zero_cse.ll
--- a/llvm/test/CodeGen/X86/vec_zero_cse.ll
+++ b/llvm/test/CodeGen/X86/vec_zero_cse.ll
@@ -15,8 +15,8 @@
 ; X32: # %bb.0:
 ; X32-NEXT: movl $0, M1+4
 ; X32-NEXT: movl $0, M1
-; X32-NEXT: xorps %xmm0, %xmm0
-; X32-NEXT: movlps %xmm0, M2
+; X32-NEXT: movl $0, M2+4
+; X32-NEXT: movl $0, M2
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test1:
@@ -34,8 +34,8 @@
 ; X32: # %bb.0:
 ; X32-NEXT: movl $-1, M1+4
 ; X32-NEXT: movl $-1, M1
-; X32-NEXT: pcmpeqd %xmm0, %xmm0
-; X32-NEXT: movq %xmm0, M2
+; X32-NEXT: movl $-1, M2+4
+; X32-NEXT: movl $-1, M2
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test2:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -185,13 +185,14 @@
 ; X32-SSE-NEXT: psrlq $1, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pand %xmm3, %xmm2
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
 ; X32-SSE-NEXT: psllq %xmm2, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
@@ -1240,13 +1241,14 @@
 ; X32-SSE-NEXT: psrlq $1, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pand %xmm3, %xmm2
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
 ; X32-SSE-NEXT: psllq %xmm2, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
@@ -2505,17 +2507,15 @@
 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = <4,u,14,u>
 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
 ; X32-SSE-NEXT: pandn %xmm2, %xmm4
-; X32-SSE-NEXT: psrlq $1, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
+; X32-SSE-NEXT: psrlq $1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm1
+; X32-SSE-NEXT: psrlq $50, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT: psllq %xmm3, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm3, %xmm0
+; X32-SSE-NEXT: psllq $14, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
@@ -3032,10 +3032,8 @@
 ; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
 ; X32-SSE: # %bb.0:
 ; X32-SSE-NEXT: psrlq $50, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
 ; X32-SSE-NEXT: psllq $14, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
-; X32-SSE-NEXT: orpd %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
 ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -136,17 +136,18 @@
 ; X32-SSE-NEXT: pand %xmm2, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psllq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psllq %xmm1, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm6
+; X32-SSE-NEXT: psllq %xmm1, %xmm6
+; X32-SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psrlq %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-SSE-NEXT: psrlq %xmm3, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm6, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %amt)
 ret <2 x i64> %res
@@ -745,17 +746,18 @@
 ; X32-SSE-NEXT: pand %xmm2, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psllq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psllq %xmm1, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm6
+; X32-SSE-NEXT: psllq %xmm1, %xmm6
+; X32-SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psrlq %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-SSE-NEXT: psrlq %xmm3, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm6, %xmm0
 ; X32-SSE-NEXT: retl
 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat)
@@ -1263,17 +1265,15 @@
 ; X32-SSE-NEXT: pand %xmm1, %xmm2
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psllq %xmm2, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psllq %xmm2, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: psllq $14, %xmm2
+; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
 ; X32-SSE-NEXT: pand %xmm1, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psrlq %xmm2, %xmm0
+; X32-SSE-NEXT: psrlq $50, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm2, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 4, i64 14>)
 ret <2 x i64> %res
@@ -1665,10 +1665,8 @@
 ; X32-SSE: # %bb.0:
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psrlq $50, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
 ; X32-SSE-NEXT: psllq $14, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
-; X32-SSE-NEXT: orpd %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>)
 ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -185,14 +185,15 @@
 ; X32-SSE-NEXT: pand %xmm3, %xmm4
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pandn %xmm3, %xmm2
 ; X32-SSE-NEXT: psllq $1, %xmm0
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
 ; X32-SSE-NEXT: psllq %xmm2, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
@@ -1225,14 +1226,15 @@
 ; X32-SSE-NEXT: pand %xmm3, %xmm4
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; X32-SSE-NEXT: pxor %xmm6, %xmm6
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pandn %xmm3, %xmm2
 ; X32-SSE-NEXT: psllq $1, %xmm0
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT: psllq %xmm2, %xmm3
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
 ; X32-SSE-NEXT: psllq %xmm2, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
@@ -2124,15 +2126,13 @@
 ; X32-SSE-NEXT: pand %xmm2, %xmm4
 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
 ; X32-SSE-NEXT: psrlq %xmm4, %xmm5
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; X32-SSE-NEXT: psrlq %xmm4, %xmm1
+; X32-SSE-NEXT: psrlq $14, %xmm1
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
 ; X32-SSE-NEXT: pandn %xmm2, %xmm3
-; X32-SSE-NEXT: psllq $1, %xmm0
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: psllq $1, %xmm2
 ; X32-SSE-NEXT: psllq %xmm3, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm3, %xmm0
+; X32-SSE-NEXT: psllq $50, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X32-SSE-NEXT: orpd %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
@@ -2662,10 +2662,8 @@
 ; X32-SSE-LABEL: splatconstant_funnnel_v2i64:
 ; X32-SSE: # %bb.0:
 ; X32-SSE-NEXT: psrlq $14, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
 ; X32-SSE-NEXT: psllq $50, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
-; X32-SSE-NEXT: orpd %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
 ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -138,17 +138,18 @@
 ; X32-SSE-NEXT: pand %xmm2, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psrlq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psrlq %xmm1, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm6
+; X32-SSE-NEXT: psrlq %xmm1, %xmm6
+; X32-SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psllq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-SSE-NEXT: psllq %xmm3, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm6, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %amt)
 ret <2 x i64> %res
@@ -789,17 +790,18 @@
 ; X32-SSE-NEXT: pand %xmm2, %xmm1
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psrlq %xmm1, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psrlq %xmm1, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: pxor %xmm5, %xmm5
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm6
+; X32-SSE-NEXT: psrlq %xmm1, %xmm6
+; X32-SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
 ; X32-SSE-NEXT: pand %xmm2, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psllq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm2, %xmm0
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; X32-SSE-NEXT: psllq %xmm3, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm6, %xmm0
 ; X32-SSE-NEXT: retl
 %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat)
@@ -1343,17 +1345,15 @@
 ; X32-SSE-NEXT: pand %xmm1, %xmm2
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: psrlq %xmm2, %xmm4
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm5
-; X32-SSE-NEXT: psrlq %xmm2, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: psrlq $14, %xmm2
+; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
 ; X32-SSE-NEXT: pand %xmm1, %xmm3
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psllq %xmm3, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; X32-SSE-NEXT: psllq %xmm2, %xmm0
+; X32-SSE-NEXT: psllq $50, %xmm0
 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X32-SSE-NEXT: orpd %xmm5, %xmm0
+; X32-SSE-NEXT: orpd %xmm2, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 4, i64 14>)
 ret <2 x i64> %res
@@ -1745,10 +1745,8 @@
 ; X32-SSE: # %bb.0:
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
 ; X32-SSE-NEXT: psllq $50, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1]
 ; X32-SSE-NEXT: psrlq $14, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1]
-; X32-SSE-NEXT: orpd %xmm1, %xmm0
+; X32-SSE-NEXT: por %xmm1, %xmm0
 ; X32-SSE-NEXT: retl
 %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> <i64 14, i64 14>)
 ret <2 x i64> %res
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3061,52 +3061,18 @@
 }
 define void @PR43024() {
-; SSE2-LABEL: PR43024:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
-; SSE2-NEXT: movaps %xmm0, (%rax)
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: addss %xmm0, %xmm1
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: addss %xmm0, %xmm1
-; SSE2-NEXT: addss %xmm0, %xmm1
-; SSE2-NEXT: movss %xmm1, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: PR43024:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
-; SSSE3-NEXT: movaps %xmm0, (%rax)
-; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT: addss %xmm0, %xmm1
-; SSSE3-NEXT: xorps %xmm0, %xmm0
-; SSSE3-NEXT: addss %xmm0, %xmm1
-; SSSE3-NEXT: addss %xmm0, %xmm1
-; SSSE3-NEXT: movss %xmm1, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: PR43024:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
-; SSE41-NEXT: movaps %xmm0, (%rax)
-; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: movss %xmm1, (%rax)
-; SSE41-NEXT: retq
+; SSE-LABEL: PR43024:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE-NEXT: movaps %xmm0, (%rax)
+; SSE-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: PR43024:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
 ; AVX-NEXT: vmovaps %xmm0, (%rax)
-; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovss %xmm0, (%rax)
+; AVX-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000
 ; AVX-NEXT: retq
 store <4 x float> , <4 x float>* undef, align 16
 %1 = load <4 x float>, <4 x float>* undef, align 16
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -2313,11 +2313,9 @@
 define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
 ; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
 ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
--- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
@@ -105,8 +105,8 @@
 ; X86-LABEL: shuf5:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movsd %xmm0, (%eax)
+; X86-NEXT: movl $555819297, 4(%eax) # imm = 0x21212121
+; X86-NEXT: movl $555819297, (%eax) # imm = 0x21212121
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: shuf5: