diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19690,6 +19690,11 @@
   unsigned NumElts = VecVT.getVectorNumElements();
   unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
 
+  // Try to simplify the whole operation to a constant, or simplify its
+  // operands.
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
   // TODO: These transforms should not require the 'hasOneUse' restriction, but
   // there are regressions on multiple targets without it. We can end up with a
   // mess of scalar and vector code if we reduce only part of the DAG to scalar.
diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
--- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -57,8 +57,8 @@
 ; CHECK-LABEL: widen_f16_build_vector:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #13294
-; CHECK-NEXT: dup.4h v0, w8
-; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: movk w8, #13294, lsl #16
+; CHECK-NEXT: str w8, [x0]
 ; CHECK-NEXT: ret
   %1 = bitcast half* %addr to <2 x half>*
   store <2 x half> , <2 x half>* %1, align 2
diff --git a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
--- a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -25,17 +25,8 @@
 define void @test2(float * %p1, i32 %v1) {
 ; CHECK-LABEL: test2:
 ; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: and x8, x1, #0x3
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: movi.16b v0, #63
-; CHECK-NEXT: bfi x9, x8, #2, #2
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: ldr s0, [x9]
-; CHECK-NEXT: str s0, [x0]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov w8, #1061109567
+; CHECK-NEXT: str w8, [x0]
 ; CHECK-NEXT: ret
 entry:
   %v2 = extractelement <3 x float> , i32 %v1
diff --git a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
--- a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
@@ -30,10 +30,10 @@
 define [1 x <4 x float>] @test2() {
 ; CHECK-LABEL: .p2align 4 ; -- Begin function test2
 ; CHECK-NEXT: lCPI1_0:
-; CHECK-NEXT: .long 0x00000000 ; float 0
-; CHECK-NEXT: .long 0x00000000 ; float 0
-; CHECK-NEXT: .long 0x00000000 ; float 0
-; CHECK-NEXT: .long 0x3f800000 ; float 1
+; CHECK-NEXT: .long 0x80000000 ; float -0
+; CHECK-NEXT: .long 0x80000000 ; float -0
+; CHECK-NEXT: .long 0x80000000 ; float -0
+; CHECK-NEXT: .long 0xbf800000 ; float -1
 ; CHECK-NEXT: .section __TEXT,__text,regular,pure_instructions
 ; CHECK-NEXT: .globl _test2
 ; CHECK-NEXT: .p2align 2
@@ -43,17 +43,7 @@
 ; CHECK-NEXT: Lloh2:
 ; CHECK-NEXT: adrp x8, lCPI1_0@PAGE
 ; CHECK-NEXT: Lloh3:
-; CHECK-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF]
-; CHECK-NEXT: mov s2, v1[1]
-; CHECK-NEXT: fneg s0, s1
-; CHECK-NEXT: mov s3, v1[2]
-; CHECK-NEXT: mov s1, v1[3]
-; CHECK-NEXT: fneg s2, s2
-; CHECK-NEXT: fneg s1, s1
-; CHECK-NEXT: mov.s v0[1], v2[0]
-; CHECK-NEXT: fneg s2, s3
-; CHECK-NEXT: mov.s v0[2], v2[0]
-; CHECK-NEXT: mov.s v0[3], v1[0]
+; CHECK-NEXT: ldr q0, [x8, lCPI1_0@PAGEOFF]
 ; CHECK-NEXT: ret
 ;
 ret [1 x <4 x float>] [<4 x float>
diff --git
a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll --- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll +++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll @@ -629,10 +629,8 @@ ; CHECKDAG-LABEL: sext_v1x64: ; CHECKDAG: // %bb.0: ; CHECKDAG-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECKDAG-NEXT: fmov x8, d0 -; CHECKDAG-NEXT: asr x1, x8, #63 -; CHECKDAG-NEXT: mov.d v0[1], x1 ; CHECKDAG-NEXT: fmov x0, d0 +; CHECKDAG-NEXT: asr x1, x0, #63 ; CHECKDAG-NEXT: ret ; ; FALLBACK-LABEL: sext_v1x64: diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -1749,30 +1749,27 @@ ; CHECK-LABEL: uabd_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov.d x8, v0[1] -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: mov.d x10, v1[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: mov.d x9, v1[1] ; CHECK-NEXT: fmov x11, d1 -; CHECK-NEXT: asr x12, x9, #63 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: asr x14, x8, #63 +; CHECK-NEXT: asr x15, x9, #63 +; CHECK-NEXT: subs x8, x8, x9 ; CHECK-NEXT: asr x13, x11, #63 -; CHECK-NEXT: subs x9, x9, x11 +; CHECK-NEXT: sbc x9, x14, x15 +; CHECK-NEXT: subs x10, x10, x11 ; CHECK-NEXT: sbc x11, x12, x13 -; CHECK-NEXT: asr x12, x8, #63 -; CHECK-NEXT: asr x13, x10, #63 -; CHECK-NEXT: subs x8, x8, x10 -; CHECK-NEXT: sbc x10, x12, x13 +; CHECK-NEXT: asr x13, x9, #63 ; CHECK-NEXT: asr x12, x11, #63 -; CHECK-NEXT: asr x13, x10, #63 -; CHECK-NEXT: eor x9, x9, x12 ; CHECK-NEXT: eor x8, x8, x13 -; CHECK-NEXT: eor x10, x10, x13 +; CHECK-NEXT: eor x10, x10, x12 +; CHECK-NEXT: eor x11, x11, x12 +; CHECK-NEXT: subs x0, x10, x12 +; CHECK-NEXT: eor x9, x9, x13 +; CHECK-NEXT: sbc x1, x11, x12 ; CHECK-NEXT: subs x2, x8, x13 -; CHECK-NEXT: sbc x3, x10, x13 -; CHECK-NEXT: subs x8, x9, x12 -; CHECK-NEXT: eor x9, x11, x12 -; CHECK-NEXT: sbc x1, x9, x12 -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov.d v0[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: sbc x3, x9, x13 ; CHECK-NEXT: ret %aext = sext <2 x i64> %a to <2 x i128> %bext = sext <2 x i64> %b to <2 x i128> @@ -1782,3 +1779,5 @@ %absel = select <2 x i1> %abcmp, <2 x i128> %ababs, <2 x i128> %abdiff ret <2 x i128> %absel } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; FALLBACK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll --- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll +++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll @@ -243,21 +243,18 @@ define <4 x i65> @sign_4xi65(<4 x i65> %a) { ; CHECK-LABEL: sign_4xi65: ; CHECK: // %bb.0: -; CHECK-NEXT: sbfx x8, x1, #0, #1 -; CHECK-NEXT: sbfx x10, x5, #0, #1 -; CHECK-NEXT: orr x9, x8, #0x1 -; CHECK-NEXT: lsr x1, x8, #63 ; CHECK-NEXT: sbfx x8, x7, #0, #1 -; CHECK-NEXT: orr x4, x10, #0x1 -; CHECK-NEXT: lsr x5, x10, #63 +; CHECK-NEXT: sbfx x9, x5, #0, #1 +; CHECK-NEXT: sbfx x10, x3, #0, #1 +; CHECK-NEXT: sbfx x11, x1, #0, #1 +; CHECK-NEXT: orr x0, x11, #0x1 +; CHECK-NEXT: lsr x1, x11, #63 +; CHECK-NEXT: orr x2, x10, #0x1 +; CHECK-NEXT: lsr x3, x10, #63 +; CHECK-NEXT: orr x4, x9, #0x1 +; CHECK-NEXT: lsr x5, x9, #63 ; CHECK-NEXT: orr x6, x8, #0x1 -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: sbfx x9, x3, #0, #1 -; CHECK-NEXT: orr x2, x9, #0x1 -; CHECK-NEXT: lsr x3, x9, #63 ; CHECK-NEXT: lsr x7, x8, #63 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret %c = icmp sgt <4 x i65> %a, %res = select <4 x i1> %c, <4 x i65> , <4 x i65 > diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -823,47 +823,43 @@ ; CHECK-NEXT: .cfi_offset b9, -56 ; CHECK-NEXT: .cfi_offset b10, -64 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov s8, v0.s[1] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fmov s0, s8 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: movi v9.2s, #241, lsl #24 ; CHECK-NEXT: mov w8, #1895825407 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov x21, #-34359738368 ; CHECK-NEXT: mov x22, #34359738367 -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fmov s10, w8 -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: fcmp s0, s9 +; CHECK-NEXT: mov s8, v0.s[1] ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x21, x1, lt -; CHECK-NEXT: fcmp s8, s10 +; CHECK-NEXT: fcmp s0, s10 ; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: fcmp s8, s8 +; CHECK-NEXT: fcmp s0, s0 +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: csel x19, xzr, x8, vs ; CHECK-NEXT: csel x20, xzr, x9, vs ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 +; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload -; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fcmp s0, s9 -; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-NEXT: csel x8, x21, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt -; CHECK-NEXT: fcmp s0, s10 +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, x21, x1, lt +; CHECK-NEXT: fcmp s8, s10 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 ; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x22, x8, gt -; CHECK-NEXT: fcmp s0, s0 +; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: ldp x22, x21, [sp, #48] // 
16-byte Folded Reload -; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: csel x1, xzr, x8, vs -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload +; CHECK-NEXT: csel x2, xzr, x8, vs +; CHECK-NEXT: csel x3, xzr, x9, vs ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret %x = call <2 x i100> @llvm.fptosi.sat.v2f32.v2i100(<2 x float> %f) @@ -889,47 +885,43 @@ ; CHECK-NEXT: .cfi_offset b9, -56 ; CHECK-NEXT: .cfi_offset b10, -64 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov s8, v0.s[1] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fmov s0, s8 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: movi v9.2s, #255, lsl #24 ; CHECK-NEXT: mov w8, #2130706431 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov x21, #-9223372036854775808 ; CHECK-NEXT: mov x22, #9223372036854775807 -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fmov s10, w8 -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: fcmp s0, s9 +; CHECK-NEXT: mov s8, v0.s[1] ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x21, x1, lt -; CHECK-NEXT: fcmp s8, s10 +; CHECK-NEXT: fcmp s0, s10 ; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: fcmp s8, s8 +; CHECK-NEXT: fcmp s0, s0 +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: csel x19, xzr, x8, vs ; CHECK-NEXT: csel x20, xzr, x9, vs ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 +; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload -; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fcmp s0, s9 -; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-NEXT: csel x8, x21, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt -; CHECK-NEXT: fcmp s0, s10 +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, x21, x1, lt +; CHECK-NEXT: fcmp s8, s10 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 ; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x22, x8, gt -; CHECK-NEXT: fcmp s0, s0 +; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: csel x1, xzr, x8, vs -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload +; CHECK-NEXT: csel x2, xzr, x8, vs +; CHECK-NEXT: csel x3, xzr, x9, vs ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret %x = call <2 x i128> @llvm.fptosi.sat.v2f32.v2i128(<2 x float> %f) @@ -1079,15 +1071,15 @@ define <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) { ; CHECK-LABEL: test_signed_v4f32_v4i100: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #128 -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: str d10, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #40] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #56] // 8-byte Folded Spill -; CHECK-NEXT: stp x26, x25, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: 
stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: .cfi_def_cfa_offset 112 +; CHECK-NEXT: str d10, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #24] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #40] // 8-byte Folded Spill +; CHECK-NEXT: stp x26, x25, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w21, -24 @@ -1100,28 +1092,40 @@ ; CHECK-NEXT: .cfi_offset b8, -80 ; CHECK-NEXT: .cfi_offset b9, -88 ; CHECK-NEXT: .cfi_offset b10, -96 -; CHECK-NEXT: mov s8, v0.s[1] -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: fmov s0, s8 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: movi v9.2s, #241, lsl #24 ; CHECK-NEXT: mov w8, #1895825407 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov x25, #-34359738368 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: mov x26, #34359738367 ; CHECK-NEXT: fmov s10, w8 +; CHECK-NEXT: fcmp s0, s9 +; CHECK-NEXT: mov s8, v0.s[1] +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, x25, x1, lt +; CHECK-NEXT: fcmp s0, s10 +; CHECK-NEXT: csel x9, x26, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: fcmp s0, s0 +; CHECK-NEXT: fmov s0, s8 +; CHECK-NEXT: csel x19, xzr, x8, vs +; CHECK-NEXT: csel x20, xzr, x9, vs +; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: csel x9, x26, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 -; CHECK-NEXT: csel x19, xzr, x8, vs -; CHECK-NEXT: csel x20, xzr, x9, vs +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: csel x21, xzr, x8, vs +; CHECK-NEXT: csel x22, xzr, x9, vs ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s0, s9 @@ -1133,48 +1137,32 @@ ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s0, s0 ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: csel x21, xzr, x8, vs -; CHECK-NEXT: csel x22, xzr, x9, vs +; CHECK-NEXT: csel x23, xzr, x8, vs +; CHECK-NEXT: csel x24, xzr, x9, vs ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: mov x2, x21 +; CHECK-NEXT: mov x3, x22 +; CHECK-NEXT: mov x4, x23 +; CHECK-NEXT: mov x5, x24 +; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 +; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload ; CHECK-NEXT: csel x9, x26, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 -; CHECK-NEXT: csel x23, xzr, x8, vs -; CHECK-NEXT: csel x24, 
xzr, x9, vs -; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 -; CHECK-NEXT: mov x4, x21 -; CHECK-NEXT: mov x5, x22 -; CHECK-NEXT: mov x6, x23 -; CHECK-NEXT: fcmp s0, s9 -; CHECK-NEXT: mov x7, x24 -; CHECK-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: csel x8, x25, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt -; CHECK-NEXT: fcmp s0, s10 -; CHECK-NEXT: ldr x30, [sp, #56] // 8-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x26, x8, gt -; CHECK-NEXT: fcmp s0, s0 -; CHECK-NEXT: ldr d10, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: csel x1, xzr, x8, vs -; CHECK-NEXT: ldp x26, x25, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: ldp d9, d8, [sp, #40] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: add sp, sp, #128 +; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: csel x6, xzr, x8, vs +; CHECK-NEXT: csel x7, xzr, x9, vs +; CHECK-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %x = call <4 x i100> @llvm.fptosi.sat.v4f32.v4i100(<4 x float> %f) ret <4 x i100> %x @@ -1183,15 +1171,15 @@ define <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) { ; CHECK-LABEL: test_signed_v4f32_v4i128: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #128 -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: str d10, [sp, #32] // 8-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #40] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #56] // 8-byte Folded Spill -; CHECK-NEXT: stp x26, x25, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x24, x23, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #112 +; CHECK-NEXT: .cfi_def_cfa_offset 112 +; CHECK-NEXT: str d10, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #24] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #40] // 8-byte Folded Spill +; CHECK-NEXT: stp x26, x25, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w21, -24 @@ -1204,28 +1192,40 @@ ; CHECK-NEXT: .cfi_offset b8, -80 ; CHECK-NEXT: .cfi_offset b9, -88 ; CHECK-NEXT: .cfi_offset b10, -96 -; CHECK-NEXT: mov s8, v0.s[1] -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: fmov s0, s8 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: movi v9.2s, #255, lsl #24 ; CHECK-NEXT: mov w8, #2130706431 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov x25, #-9223372036854775808 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: mov x26, #9223372036854775807 ; CHECK-NEXT: fmov s10, w8 +; CHECK-NEXT: fcmp s0, s9 +; CHECK-NEXT: mov s8, v0.s[1] 
+; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, x25, x1, lt +; CHECK-NEXT: fcmp s0, s10 +; CHECK-NEXT: csel x9, x26, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: fcmp s0, s0 +; CHECK-NEXT: fmov s0, s8 +; CHECK-NEXT: csel x19, xzr, x8, vs +; CHECK-NEXT: csel x20, xzr, x9, vs +; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: csel x9, x26, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 -; CHECK-NEXT: csel x19, xzr, x8, vs -; CHECK-NEXT: csel x20, xzr, x9, vs +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: csel x21, xzr, x8, vs +; CHECK-NEXT: csel x22, xzr, x9, vs ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s0, s9 @@ -1237,48 +1237,32 @@ ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s0, s0 ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: csel x21, xzr, x8, vs -; CHECK-NEXT: csel x22, xzr, x9, vs +; CHECK-NEXT: csel x23, xzr, x8, vs +; CHECK-NEXT: csel x24, xzr, x9, vs ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: mov x2, x21 +; CHECK-NEXT: mov x3, x22 +; CHECK-NEXT: mov x4, x23 +; CHECK-NEXT: mov x5, x24 +; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 +; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload ; CHECK-NEXT: csel x9, x26, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 -; CHECK-NEXT: csel x23, xzr, x8, vs -; CHECK-NEXT: csel x24, xzr, x9, vs -; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 -; CHECK-NEXT: mov x4, x21 -; CHECK-NEXT: mov x5, x22 -; CHECK-NEXT: mov x6, x23 -; CHECK-NEXT: fcmp s0, s9 -; CHECK-NEXT: mov x7, x24 -; CHECK-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: csel x8, x25, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt -; CHECK-NEXT: fcmp s0, s10 -; CHECK-NEXT: ldr x30, [sp, #56] // 8-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x26, x8, gt -; CHECK-NEXT: fcmp s0, s0 -; CHECK-NEXT: ldr d10, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: csel x1, xzr, x8, vs -; CHECK-NEXT: ldp x26, x25, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: ldp d9, d8, [sp, #40] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: add sp, sp, #128 +; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: csel x6, xzr, x8, vs +; CHECK-NEXT: csel x7, xzr, x9, vs +; CHECK-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded 
Reload +; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %x = call <4 x i128> @llvm.fptosi.sat.v4f32.v4i128(<4 x float> %f) ret <4 x i128> %x @@ -1480,48 +1464,44 @@ ; CHECK-NEXT: .cfi_offset b8, -48 ; CHECK-NEXT: .cfi_offset b9, -56 ; CHECK-NEXT: .cfi_offset b10, -64 -; CHECK-NEXT: mov d8, v0.d[1] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fmov d0, d8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bl __fixdfti ; CHECK-NEXT: mov x8, #-4170333254945079296 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov x21, #-34359738368 ; CHECK-NEXT: mov x22, #34359738367 -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: fmov d9, x8 ; CHECK-NEXT: mov x8, #5053038781909696511 -; CHECK-NEXT: fcmp d8, d9 +; CHECK-NEXT: mov d8, v0.d[1] +; CHECK-NEXT: fcmp d0, d9 ; CHECK-NEXT: fmov d10, x8 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x21, x1, lt -; CHECK-NEXT: fcmp d8, d10 +; CHECK-NEXT: fcmp d0, d10 ; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: fcmp d8, d8 +; CHECK-NEXT: fcmp d0, d0 +; CHECK-NEXT: fmov d0, d8 ; CHECK-NEXT: csel x19, xzr, x8, vs ; CHECK-NEXT: csel x20, xzr, x9, vs ; CHECK-NEXT: bl __fixdfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 +; CHECK-NEXT: fcmp d8, d9 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload -; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fcmp d0, d9 -; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-NEXT: csel x8, x21, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt -; CHECK-NEXT: fcmp d0, d10 +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, x21, x1, lt +; CHECK-NEXT: fcmp d8, d10 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 ; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x22, x8, gt -; CHECK-NEXT: fcmp d0, d0 +; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: fcmp d8, d8 ; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: csel x1, xzr, x8, vs -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload +; CHECK-NEXT: csel x2, xzr, x8, vs +; CHECK-NEXT: csel x3, xzr, x9, vs ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret %x = call <2 x i100> @llvm.fptosi.sat.v2f64.v2i100(<2 x double> %f) @@ -1546,48 +1526,44 @@ ; CHECK-NEXT: .cfi_offset b8, -48 ; CHECK-NEXT: .cfi_offset b9, -56 ; CHECK-NEXT: .cfi_offset b10, -64 -; CHECK-NEXT: mov d8, v0.d[1] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fmov d0, d8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bl __fixdfti ; CHECK-NEXT: mov x8, #-4044232465378705408 +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov x21, #-9223372036854775808 ; CHECK-NEXT: mov x22, #9223372036854775807 -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: fmov d9, x8 ; CHECK-NEXT: mov x8, #5179139571476070399 -; CHECK-NEXT: fcmp d8, d9 +; CHECK-NEXT: mov d8, v0.d[1] +; CHECK-NEXT: fcmp d0, d9 ; CHECK-NEXT: fmov d10, x8 ; CHECK-NEXT: csel 
x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x21, x1, lt -; CHECK-NEXT: fcmp d8, d10 +; CHECK-NEXT: fcmp d0, d10 ; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: fcmp d8, d8 +; CHECK-NEXT: fcmp d0, d0 +; CHECK-NEXT: fmov d0, d8 ; CHECK-NEXT: csel x19, xzr, x8, vs ; CHECK-NEXT: csel x20, xzr, x9, vs ; CHECK-NEXT: bl __fixdfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 +; CHECK-NEXT: fcmp d8, d9 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload -; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fcmp d0, d9 -; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-NEXT: csel x8, x21, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt -; CHECK-NEXT: fcmp d0, d10 +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, x21, x1, lt +; CHECK-NEXT: fcmp d8, d10 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 ; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x22, x8, gt -; CHECK-NEXT: fcmp d0, d0 +; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: csel x9, x22, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: fcmp d8, d8 ; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: csel x1, xzr, x8, vs -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload +; CHECK-NEXT: csel x2, xzr, x8, vs +; CHECK-NEXT: csel x3, xzr, x9, vs ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret %x = call <2 x i128> @llvm.fptosi.sat.v2f64.v2i128(<2 x double> %f) @@ -1853,9 +1829,8 @@ ; CHECK-NEXT: .cfi_offset b9, -88 ; CHECK-NEXT: .cfi_offset b10, -96 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov h1, v0.h[1] +; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fcvt s8, h1 ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: movi v9.2s, #241, lsl #24 @@ -1865,7 +1840,7 @@ ; CHECK-NEXT: mov x26, #34359738367 ; CHECK-NEXT: fmov s10, w8 ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: mov h0, v0.h[1] ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 @@ -1879,7 +1854,7 @@ ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: mov h0, v0.h[2] ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 @@ -1891,8 +1866,9 @@ ; CHECK-NEXT: csel x22, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 @@ -1905,30 +1881,27 @@ ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 -; CHECK-NEXT: mov x4, x21 -; CHECK-NEXT: mov x5, x22 -; CHECK-NEXT: mov x6, x23 -; CHECK-NEXT: csel x8, x25, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: mov x2, x21 +; CHECK-NEXT: mov x3, x22 +; CHECK-NEXT: mov x4, x23 +; CHECK-NEXT: mov x5, x24 +; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, 
x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: mov x7, x24 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 +; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x26, x8, gt +; CHECK-NEXT: csel x9, x26, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 -; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: csel x1, xzr, x8, vs ; CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: csel x6, xzr, x8, vs +; CHECK-NEXT: csel x7, xzr, x9, vs ; CHECK-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %x = call <4 x i100> @llvm.fptosi.sat.v4f16.v4i100(<4 x half> %f) @@ -1960,9 +1933,8 @@ ; CHECK-NEXT: .cfi_offset b9, -88 ; CHECK-NEXT: .cfi_offset b10, -96 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov h1, v0.h[1] +; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fcvt s8, h1 ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: movi v9.2s, #255, lsl #24 @@ -1972,7 +1944,7 @@ ; CHECK-NEXT: mov x26, #9223372036854775807 ; CHECK-NEXT: fmov s10, w8 ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: mov h0, v0.h[1] ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 @@ -1986,7 +1958,7 @@ ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: mov h0, v0.h[2] ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 @@ -1998,8 +1970,9 @@ ; CHECK-NEXT: csel x22, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 @@ -2012,30 +1985,27 @@ ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 -; CHECK-NEXT: mov x4, x21 -; CHECK-NEXT: mov x5, x22 -; CHECK-NEXT: mov x6, x23 -; CHECK-NEXT: csel x8, x25, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: mov x2, x21 +; CHECK-NEXT: mov x3, x22 +; CHECK-NEXT: mov x4, x23 +; CHECK-NEXT: mov x5, x24 +; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: mov x7, x24 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 +; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x26, x8, gt +; CHECK-NEXT: csel x9, x26, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 -; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: csel x9, xzr, x9, vs -; CHECK-NEXT: csel x1, xzr, x8, vs ; 
CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: csel x6, xzr, x8, vs +; CHECK-NEXT: csel x7, xzr, x9, vs ; CHECK-NEXT: ldp x26, x25, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload -; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: add sp, sp, #112 ; CHECK-NEXT: ret %x = call <4 x i128> @llvm.fptosi.sat.v4f16.v4i128(<4 x half> %f) @@ -2616,36 +2586,36 @@ ; CHECK-NEXT: mov w8, #1895825407 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: mov x25, #-34359738368 -; CHECK-NEXT: mov x23, #34359738367 +; CHECK-NEXT: mov x22, #34359738367 ; CHECK-NEXT: fmov s9, w8 ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x25, x1, lt +; CHECK-NEXT: csel x8, x25, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x9, x23, x9, gt -; CHECK-NEXT: csinv x8, x8, xzr, le +; CHECK-NEXT: csinv x9, x9, xzr, le +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csel x8, xzr, x8, vs ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill -; CHECK-NEXT: csel x8, xzr, x9, vs ; CHECK-NEXT: str x8, [sp, #72] // 8-byte Folded Spill +; CHECK-NEXT: csel x8, xzr, x9, vs +; CHECK-NEXT: str x8, [sp, #24] // 8-byte Folded Spill ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x8, xzr, x8, vs -; CHECK-NEXT: csel x22, xzr, x9, vs +; CHECK-NEXT: csel x10, xzr, x8, vs +; CHECK-NEXT: csel x8, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x8, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x8, x10, [sp, #8] // 16-byte Folded Spill ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, s10 @@ -2654,10 +2624,10 @@ ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x24, xzr, x8, vs +; CHECK-NEXT: csel x26, xzr, x8, vs ; CHECK-NEXT: csel x8, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: str x8, [sp, #32] // 8-byte Folded Spill @@ -2669,40 +2639,39 @@ ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x27, xzr, x8, vs +; CHECK-NEXT: csel x28, xzr, x8, vs ; CHECK-NEXT: csel x8, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, x25, x1, lt +; CHECK-NEXT: csel x8, x25, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x9, x23, x9, gt -; CHECK-NEXT: csinv x8, 
x8, xzr, le +; CHECK-NEXT: csinv x9, x9, xzr, le +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x8, xzr, x8, vs -; CHECK-NEXT: csel x29, xzr, x9, vs +; CHECK-NEXT: csel x27, xzr, x8, vs +; CHECK-NEXT: csel x20, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: fcmp s8, s10 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, x25, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csel x9, x22, x9, gt ; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x20, xzr, x8, vs -; CHECK-NEXT: csel x28, xzr, x9, vs +; CHECK-NEXT: csel x29, xzr, x8, vs +; CHECK-NEXT: csel x21, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload @@ -2712,65 +2681,54 @@ ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csel x21, xzr, x8, vs -; CHECK-NEXT: csel x26, xzr, x9, vs +; CHECK-NEXT: csel x23, xzr, x8, vs +; CHECK-NEXT: csel x24, xzr, x9, vs ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixsfti -; CHECK-NEXT: fmov d0, x20 ; CHECK-NEXT: fcmp s8, s10 -; CHECK-NEXT: ldr x11, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: lsr x10, x28, #28 -; CHECK-NEXT: ldr d1, [sp] // 8-byte Folded Reload -; CHECK-NEXT: lsr x12, x29, #28 -; CHECK-NEXT: mov v0.d[1], x28 +; CHECK-NEXT: extr x9, x21, x29, #28 +; CHECK-NEXT: bfi x23, x20, #36, #28 +; CHECK-NEXT: extr x11, x27, x20, #28 +; CHECK-NEXT: str x24, [x19] ; CHECK-NEXT: csel x8, x25, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: csel x10, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: stur x11, [x19, #75] -; CHECK-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x8, x23, x8, gt +; CHECK-NEXT: stur x9, [x19, #41] +; CHECK-NEXT: stp x23, x11, [x19, #8] +; CHECK-NEXT: lsr x11, x27, #28 +; CHECK-NEXT: csinv x9, x10, xzr, le +; CHECK-NEXT: lsr x10, x21, #28 +; CHECK-NEXT: csel x8, x22, x8, gt ; CHECK-NEXT: fcmp s8, s8 -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: stur x13, [x19, #50] -; CHECK-NEXT: mov v1.d[1], x29 -; CHECK-NEXT: ldr d0, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: csel x9, xzr, x9, vs ; CHECK-NEXT: strb w10, [x19, #49] -; CHECK-NEXT: extr x10, x28, x11, #28 ; CHECK-NEXT: csel x8, xzr, x8, vs -; CHECK-NEXT: bfi x8, x11, #36, #28 -; CHECK-NEXT: strb w12, [x19, #24] +; CHECK-NEXT: ldr x10, [sp] // 8-byte Folded Reload +; CHECK-NEXT: csel x9, xzr, x9, vs +; CHECK-NEXT: bfi x8, x29, #36, #28 +; CHECK-NEXT: strb w11, [x19, #24] +; CHECK-NEXT: stur x10, [x19, #75] +; CHECK-NEXT: ldp x12, x11, [sp, #8] // 16-byte Folded Reload ; CHECK-NEXT: stur x9, [x19, #25] -; CHECK-NEXT: fmov x12, d1 -; CHECK-NEXT: stur x10, [x19, #41] -; CHECK-NEXT: lsr x9, x22, #28 -; CHECK-NEXT: ldr d1, [sp, #24] // 8-byte Folded Reload ; CHECK-NEXT: stur x8, [x19, #33] +; CHECK-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: extr x10, x12, x11, #28 +; CHECK-NEXT: bfi x28, x11, #36, #28 +; CHECK-NEXT: stur x8, [x19, #50] +; CHECK-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload ; CHECK-NEXT: ldr x11, [sp, #72] // 8-byte Folded Reload -; CHECK-NEXT: extr x18, x29, x12, #28 
-; CHECK-NEXT: mov v0.d[1], x22 -; CHECK-NEXT: bfi x21, x12, #36, #28 -; CHECK-NEXT: str x26, [x19] -; CHECK-NEXT: mov v1.d[1], x11 -; CHECK-NEXT: lsr x10, x11, #28 -; CHECK-NEXT: mov x13, x11 -; CHECK-NEXT: stp x21, x18, [x19, #8] -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: strb w9, [x19, #99] -; CHECK-NEXT: strb w10, [x19, #74] -; CHECK-NEXT: fmov x11, d1 -; CHECK-NEXT: extr x12, x22, x8, #28 -; CHECK-NEXT: bfi x27, x8, #36, #28 -; CHECK-NEXT: extr x8, x13, x11, #28 -; CHECK-NEXT: bfi x24, x11, #36, #28 -; CHECK-NEXT: stur x12, [x19, #91] -; CHECK-NEXT: stur x27, [x19, #83] +; CHECK-NEXT: stur x10, [x19, #91] +; CHECK-NEXT: stur x28, [x19, #83] +; CHECK-NEXT: extr x8, x11, x9, #28 +; CHECK-NEXT: bfi x26, x9, #36, #28 +; CHECK-NEXT: lsr x9, x12, #28 ; CHECK-NEXT: stur x8, [x19, #66] -; CHECK-NEXT: stur x24, [x19, #58] +; CHECK-NEXT: lsr x8, x11, #28 +; CHECK-NEXT: stur x26, [x19, #58] +; CHECK-NEXT: strb w9, [x19, #99] +; CHECK-NEXT: strb w8, [x19, #74] ; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload ; CHECK-NEXT: ldp x24, x23, [sp, #144] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -733,37 +733,33 @@ ; CHECK-NEXT: .cfi_offset b8, -40 ; CHECK-NEXT: .cfi_offset b9, -48 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov s8, v0.s[1] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fmov s0, s8 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov w8, #1904214015 -; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: mov x21, #68719476735 -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: mov s8, v0.s[1] +; CHECK-NEXT: fcmp s0, #0.0 ; CHECK-NEXT: fmov s9, w8 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: fcmp s0, s9 +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: csel x19, x21, x9, gt ; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x20 -; CHECK-NEXT: mov x3, x19 -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: fcmp s0, #0.0 +; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: fcmp s0, s9 -; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x1, x21, x9, gt +; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: mov x1, x19 +; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: csel x3, x21, x9, gt +; CHECK-NEXT: csinv x2, x8, xzr, le ; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret %x = call <2 x i100> @llvm.fptoui.sat.v2f32.v2i100(<2 x float> %f) @@ -784,36 +780,32 @@ ; CHECK-NEXT: .cfi_offset b8, -40 ; CHECK-NEXT: .cfi_offset b9, -48 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov s8, v0.s[1] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fmov 
s0, s8 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: mov w8, #2139095039 -; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: mov w8, #2139095039 +; CHECK-NEXT: mov s8, v0.s[1] +; CHECK-NEXT: fcmp s0, #0.0 ; CHECK-NEXT: fmov s9, w8 ; CHECK-NEXT: csel x8, xzr, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt -; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: fcmp s0, s9 +; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: csinv x19, x9, xzr, le ; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 +; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: csel x8, xzr, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: fcmp s0, #0.0 -; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: fcmp s0, s9 +; CHECK-NEXT: csinv x2, x9, xzr, le +; CHECK-NEXT: csinv x3, x8, xzr, le ; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csinv x1, x9, xzr, le -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret %x = call <2 x i128> @llvm.fptoui.sat.v2f32.v2i128(<2 x float> %f) @@ -946,13 +938,13 @@ define <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) { ; CHECK-LABEL: test_unsigned_v4f32_v4i100: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: .cfi_def_cfa_offset 112 -; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x30, x25, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x24, x23, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x30, x25, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w21, -24 @@ -963,23 +955,32 @@ ; CHECK-NEXT: .cfi_offset w30, -64 ; CHECK-NEXT: .cfi_offset b8, -72 ; CHECK-NEXT: .cfi_offset b9, -80 -; CHECK-NEXT: mov s8, v0.s[1] -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: fmov s0, s8 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov w8, #1904214015 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: mov x25, #68719476735 +; CHECK-NEXT: mov s8, v0.s[1] +; CHECK-NEXT: fcmp s0, #0.0 ; CHECK-NEXT: fmov s9, w8 +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt +; CHECK-NEXT: fcmp s0, s9 +; CHECK-NEXT: fmov s0, s8 +; CHECK-NEXT: csel x19, x25, x9, gt +; CHECK-NEXT: csinv x20, x8, xzr, le +; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded 
Reload +; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: csel x21, x25, x9, gt +; CHECK-NEXT: csinv x22, x8, xzr, le ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: csel x19, x25, x9, gt -; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov s8, v0.s[1] @@ -988,40 +989,27 @@ ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s0, s9 ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: csel x21, x25, x9, gt -; CHECK-NEXT: csinv x22, x8, xzr, le -; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: csel x23, x25, x9, gt ; CHECK-NEXT: csinv x24, x8, xzr, le ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x20 -; CHECK-NEXT: mov x3, x19 -; CHECK-NEXT: mov x4, x22 -; CHECK-NEXT: mov x5, x21 -; CHECK-NEXT: mov x6, x24 -; CHECK-NEXT: fcmp s0, #0.0 -; CHECK-NEXT: mov x7, x23 -; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: fcmp s8, #0.0 +; CHECK-NEXT: mov x2, x22 +; CHECK-NEXT: mov x3, x21 +; CHECK-NEXT: mov x4, x24 +; CHECK-NEXT: mov x5, x23 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: fcmp s0, s9 -; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x1, x25, x9, gt -; CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: ldp x30, x25, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: fcmp s8, s9 +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: mov x1, x19 +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: csel x7, x25, x9, gt +; CHECK-NEXT: csinv x6, x8, xzr, le +; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp x30, x25, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %x = call <4 x i100> @llvm.fptoui.sat.v4f32.v4i100(<4 x float> %f) ret <4 x i100> %x @@ -1030,13 +1018,13 @@ define <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) { ; CHECK-LABEL: test_unsigned_v4f32_v4i128: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: .cfi_def_cfa_offset 112 -; CHECK-NEXT: stp d9, d8, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-NEXT: stp x24, x23, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: 
stp x20, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 ; CHECK-NEXT: .cfi_offset w21, -24 @@ -1046,22 +1034,31 @@ ; CHECK-NEXT: .cfi_offset w30, -64 ; CHECK-NEXT: .cfi_offset b8, -72 ; CHECK-NEXT: .cfi_offset b9, -80 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: bl __fixunssfti +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov w8, #2139095039 ; CHECK-NEXT: mov s8, v0.s[1] -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: fcmp s0, #0.0 +; CHECK-NEXT: fmov s9, w8 +; CHECK-NEXT: csel x8, xzr, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: fcmp s0, s9 ; CHECK-NEXT: fmov s0, s8 +; CHECK-NEXT: csinv x19, x9, xzr, le +; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: mov w8, #2139095039 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: fmov s9, w8 ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: csel x8, xzr, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: csinv x21, x9, xzr, le +; CHECK-NEXT: csinv x22, x8, xzr, le ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: csinv x19, x9, xzr, le -; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov s8, v0.s[1] @@ -1070,40 +1067,27 @@ ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s0, s9 ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: csinv x21, x9, xzr, le -; CHECK-NEXT: csinv x22, x8, xzr, le +; CHECK-NEXT: csinv x23, x9, xzr, le +; CHECK-NEXT: csinv x24, x8, xzr, le ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: mov x2, x21 +; CHECK-NEXT: mov x3, x22 +; CHECK-NEXT: mov x4, x23 +; CHECK-NEXT: mov x5, x24 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: csinv x23, x9, xzr, le -; CHECK-NEXT: csinv x24, x8, xzr, le -; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 -; CHECK-NEXT: mov x4, x21 -; CHECK-NEXT: mov x5, x22 -; CHECK-NEXT: mov x6, x23 -; CHECK-NEXT: fcmp s0, #0.0 -; CHECK-NEXT: mov x7, x24 -; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: fcmp s0, s9 -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csinv x1, x9, xzr, le -; CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: add sp, sp, #112 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: csinv x6, x9, xzr, le +; CHECK-NEXT: csinv x7, x8, xzr, le +; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded 
Reload +; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %x = call <4 x i128> @llvm.fptoui.sat.v4f32.v4i128(<4 x float> %f) ret <4 x i128> %x @@ -1272,37 +1256,33 @@ ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_offset b8, -40 ; CHECK-NEXT: .cfi_offset b9, -48 -; CHECK-NEXT: mov d8, v0.d[1] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fmov d0, d8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bl __fixunsdfti +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov x8, #5057542381537067007 -; CHECK-NEXT: fcmp d8, #0.0 ; CHECK-NEXT: mov x21, #68719476735 -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov d8, v0.d[1] +; CHECK-NEXT: fcmp d0, #0.0 ; CHECK-NEXT: fmov d9, x8 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: fcmp d8, d9 +; CHECK-NEXT: fcmp d0, d9 +; CHECK-NEXT: fmov d0, d8 ; CHECK-NEXT: csel x19, x21, x9, gt ; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x20 -; CHECK-NEXT: mov x3, x19 -; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: fcmp d0, #0.0 +; CHECK-NEXT: fcmp d8, #0.0 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: fcmp d0, d9 -; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x1, x21, x9, gt +; CHECK-NEXT: fcmp d8, d9 +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: mov x1, x19 +; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: csel x3, x21, x9, gt +; CHECK-NEXT: csinv x2, x8, xzr, le ; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret %x = call <2 x i100> @llvm.fptoui.sat.v2f64.v2i100(<2 x double> %f) @@ -1322,36 +1302,32 @@ ; CHECK-NEXT: .cfi_offset w30, -32 ; CHECK-NEXT: .cfi_offset b8, -40 ; CHECK-NEXT: .cfi_offset b9, -48 -; CHECK-NEXT: mov d8, v0.d[1] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fmov d0, d8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: mov x8, #5183643171103440895 -; CHECK-NEXT: fcmp d8, #0.0 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov x8, #5183643171103440895 +; CHECK-NEXT: mov d8, v0.d[1] +; CHECK-NEXT: fcmp d0, #0.0 ; CHECK-NEXT: fmov d9, x8 ; CHECK-NEXT: csel x8, xzr, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt -; CHECK-NEXT: fcmp d8, d9 +; CHECK-NEXT: fcmp d0, d9 +; CHECK-NEXT: fmov d0, d8 ; CHECK-NEXT: csinv x19, x9, xzr, le ; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: bl __fixunsdfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 +; CHECK-NEXT: fcmp d8, #0.0 ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: csel x8, xzr, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: fcmp d8, d9 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: fcmp d0, #0.0 -; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, xzr, x1, lt -; CHECK-NEXT: fcmp d0, 
d9 +; CHECK-NEXT: csinv x2, x9, xzr, le +; CHECK-NEXT: csinv x3, x8, xzr, le ; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csinv x1, x9, xzr, le -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret %x = call <2 x i128> @llvm.fptoui.sat.v2f64.v2i128(<2 x double> %f) @@ -1581,7 +1557,7 @@ ; CHECK-NEXT: .cfi_offset b8, -72 ; CHECK-NEXT: .cfi_offset b9, -80 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov h1, v0.h[2] +; CHECK-NEXT: mov h1, v0.h[1] ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: fcvt s8, h1 ; CHECK-NEXT: fmov s0, s8 @@ -1590,7 +1566,7 @@ ; CHECK-NEXT: mov w8, #1904214015 ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: mov x25, #68719476735 -; CHECK-NEXT: mov h0, v0.h[1] +; CHECK-NEXT: mov h0, v0.h[2] ; CHECK-NEXT: fmov s9, w8 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt @@ -1600,9 +1576,8 @@ ; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 @@ -1611,8 +1586,9 @@ ; CHECK-NEXT: csinv x22, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: fcmp s8, #0.0 +; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 @@ -1622,25 +1598,22 @@ ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: mov x2, x22 -; CHECK-NEXT: mov x3, x21 -; CHECK-NEXT: mov x4, x20 -; CHECK-NEXT: mov x5, x19 -; CHECK-NEXT: mov x6, x24 +; CHECK-NEXT: mov x2, x20 +; CHECK-NEXT: mov x3, x19 +; CHECK-NEXT: mov x4, x22 +; CHECK-NEXT: mov x5, x21 ; CHECK-NEXT: csel x8, xzr, x0, lt ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: mov x7, x23 +; CHECK-NEXT: mov x0, x24 +; CHECK-NEXT: mov x1, x23 ; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x1, x25, x9, gt +; CHECK-NEXT: csel x7, x25, x9, gt +; CHECK-NEXT: csinv x6, x8, xzr, le ; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: ldp x30, x25, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %x = call <4 x i100> @llvm.fptoui.sat.v4f16.v4i100(<4 x half> %f) @@ -1667,15 +1640,14 @@ ; CHECK-NEXT: .cfi_offset b8, -72 ; CHECK-NEXT: .cfi_offset b9, -80 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov h1, v0.h[1] +; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: fcvt s8, h1 ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: mov w8, #2139095039 ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: mov h0, v0.h[2] +; CHECK-NEXT: mov h0, v0.h[1] ; CHECK-NEXT: fmov s9, w8 ; CHECK-NEXT: csel x8, xzr, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt @@ -1687,7 +1659,7 @@ ; CHECK-NEXT: bl 
__fixunssfti ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: mov h0, v0.h[3] +; CHECK-NEXT: mov h0, v0.h[2] ; CHECK-NEXT: csel x8, xzr, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 @@ -1696,8 +1668,9 @@ ; CHECK-NEXT: csinv x22, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: fcmp s8, #0.0 +; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: csel x8, xzr, x1, lt ; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 @@ -1707,25 +1680,22 @@ ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: mov x2, x19 -; CHECK-NEXT: mov x3, x20 -; CHECK-NEXT: mov x4, x21 -; CHECK-NEXT: mov x5, x22 -; CHECK-NEXT: mov x6, x23 -; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, xzr, x1, lt +; CHECK-NEXT: mov x2, x21 +; CHECK-NEXT: mov x3, x22 +; CHECK-NEXT: mov x4, x23 +; CHECK-NEXT: mov x5, x24 +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: csel x8, xzr, x1, lt +; CHECK-NEXT: csel x9, xzr, x0, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: mov x7, x24 +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: mov x1, x20 ; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csinv x1, x9, xzr, le +; CHECK-NEXT: csinv x6, x9, xzr, le +; CHECK-NEXT: csinv x7, x8, xzr, le ; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %x = call <4 x i128> @llvm.fptoui.sat.v4f16.v4i128(<4 x half> %f) @@ -2195,28 +2165,28 @@ ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: mov w8, #1904214015 ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: mov x21, #68719476735 +; CHECK-NEXT: mov x23, #68719476735 ; CHECK-NEXT: mov h0, v0.h[3] ; CHECK-NEXT: fmov s9, w8 -; CHECK-NEXT: csel x8, xzr, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x20, x21, x8, gt +; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csinv x8, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill +; CHECK-NEXT: stp x8, x9, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: csel x8, xzr, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x9, x9, xzr, le -; CHECK-NEXT: csel x23, x21, x8, gt +; CHECK-NEXT: csel x9, x23, x9, gt +; CHECK-NEXT: csinv x24, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 -; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 @@ -2226,7 +2196,7 @@ ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x24, x21, x9, gt +; CHECK-NEXT: csel 
x25, x23, x9, gt ; CHECK-NEXT: str x8, [sp, #32] // 8-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti @@ -2238,29 +2208,29 @@ ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x26, x21, x9, gt -; CHECK-NEXT: str x8, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: csel x27, x23, x9, gt +; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: csel x8, xzr, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x29, x9, xzr, le -; CHECK-NEXT: csel x28, x21, x8, gt +; CHECK-NEXT: csel x29, x23, x9, gt +; CHECK-NEXT: csinv x26, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: fcmp s8, #0.0 ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: csel x8, xzr, x1, lt -; CHECK-NEXT: csel x9, xzr, x0, lt +; CHECK-NEXT: csel x8, xzr, x0, lt +; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x27, x9, xzr, le -; CHECK-NEXT: csel x22, x21, x8, gt +; CHECK-NEXT: csel x28, x23, x9, gt +; CHECK-NEXT: csinv x20, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload @@ -2270,58 +2240,46 @@ ; CHECK-NEXT: csel x9, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 ; CHECK-NEXT: fcvt s8, h0 -; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: csel x25, x21, x9, gt -; CHECK-NEXT: str x8, [sp] // 8-byte Folded Spill +; CHECK-NEXT: csel x21, x23, x9, gt +; CHECK-NEXT: csinv x22, x8, xzr, le ; CHECK-NEXT: fmov s0, s8 ; CHECK-NEXT: bl __fixunssfti -; CHECK-NEXT: ldr x11, [sp, #8] // 8-byte Folded Reload -; CHECK-NEXT: fmov d0, x27 -; CHECK-NEXT: fmov d1, x29 ; CHECK-NEXT: fcmp s8, #0.0 -; CHECK-NEXT: lsr x10, x22, #28 -; CHECK-NEXT: stur x11, [x19, #75] -; CHECK-NEXT: lsr x11, x28, #28 -; CHECK-NEXT: mov v0.d[1], x22 -; CHECK-NEXT: ldr x12, [sp, #32] // 8-byte Folded Reload -; CHECK-NEXT: mov v1.d[1], x28 +; CHECK-NEXT: extr x8, x28, x20, #28 +; CHECK-NEXT: bfi x21, x26, #36, #28 +; CHECK-NEXT: extr x9, x29, x26, #28 +; CHECK-NEXT: lsr x11, x29, #28 +; CHECK-NEXT: str x22, [x19] +; CHECK-NEXT: stur x8, [x19, #41] ; CHECK-NEXT: csel x8, xzr, x0, lt -; CHECK-NEXT: csel x9, xzr, x1, lt +; CHECK-NEXT: csel x10, xzr, x1, lt ; CHECK-NEXT: fcmp s8, s9 -; CHECK-NEXT: stur x12, [x19, #50] -; CHECK-NEXT: fmov x12, d0 -; CHECK-NEXT: fmov x13, d1 +; CHECK-NEXT: stp x21, x9, [x19, #8] +; CHECK-NEXT: lsr x9, x28, #28 +; CHECK-NEXT: strb w11, [x19, #24] +; CHECK-NEXT: bfi x27, x24, #36, #28 +; CHECK-NEXT: csel x10, x23, x10, gt ; CHECK-NEXT: csinv x8, x8, xzr, le -; CHECK-NEXT: ldp d0, d1, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: csel x9, x21, x9, gt -; CHECK-NEXT: strb w10, [x19, #49] -; CHECK-NEXT: extr x10, x22, x12, #28 -; CHECK-NEXT: bfi x9, x12, #36, #28 +; CHECK-NEXT: bfi x10, x20, #36, #28 +; CHECK-NEXT: strb w9, [x19, #49] ; CHECK-NEXT: stur x8, [x19, #25] -; CHECK-NEXT: extr x8, x28, x13, #28 -; CHECK-NEXT: mov v0.d[1], x23 -; CHECK-NEXT: strb w11, [x19, #24] -; CHECK-NEXT: mov v1.d[1], x20 -; CHECK-NEXT: stur x10, [x19, #41] -; CHECK-NEXT: stur x9, [x19, #33] -; CHECK-NEXT: bfi x25, x13, #36, #28 -; CHECK-NEXT: str x8, [x19, #16] -; CHECK-NEXT: lsr x9, x23, #28 
-; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: ldr x12, [sp] // 8-byte Folded Reload -; CHECK-NEXT: fmov x11, d1 -; CHECK-NEXT: lsr x10, x20, #28 -; CHECK-NEXT: strb w9, [x19, #99] -; CHECK-NEXT: stp x12, x25, [x19] -; CHECK-NEXT: extr x12, x23, x8, #28 -; CHECK-NEXT: bfi x26, x8, #36, #28 -; CHECK-NEXT: extr x8, x20, x11, #28 -; CHECK-NEXT: bfi x24, x11, #36, #28 -; CHECK-NEXT: strb w10, [x19, #74] -; CHECK-NEXT: stur x12, [x19, #91] -; CHECK-NEXT: stur x26, [x19, #83] -; CHECK-NEXT: stur x8, [x19, #66] -; CHECK-NEXT: stur x24, [x19, #58] +; CHECK-NEXT: stur x10, [x19, #33] +; CHECK-NEXT: ldp x9, x12, [sp] // 16-byte Folded Reload +; CHECK-NEXT: stur x9, [x19, #75] +; CHECK-NEXT: extr x8, x12, x24, #28 +; CHECK-NEXT: ldr x9, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: stur x9, [x19, #50] +; CHECK-NEXT: ldp x11, x10, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: stur x8, [x19, #91] +; CHECK-NEXT: lsr x8, x12, #28 +; CHECK-NEXT: stur x27, [x19, #83] +; CHECK-NEXT: extr x9, x10, x11, #28 +; CHECK-NEXT: bfi x25, x11, #36, #28 +; CHECK-NEXT: strb w8, [x19, #99] +; CHECK-NEXT: stur x9, [x19, #66] +; CHECK-NEXT: lsr x9, x10, #28 +; CHECK-NEXT: stur x25, [x19, #58] +; CHECK-NEXT: strb w9, [x19, #74] ; CHECK-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload ; CHECK-NEXT: ldp x24, x23, [sp, #128] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -350,25 +350,22 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; CHECK-LABEL: v2i128: ; CHECK: // %bb.0: -; CHECK-NEXT: adds x8, x2, x6 -; CHECK-NEXT: adcs x9, x3, x7 -; CHECK-NEXT: cset w10, vs -; CHECK-NEXT: asr x11, x9, #63 -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: csel x2, x11, x8, ne -; CHECK-NEXT: eor x8, x11, #0x8000000000000000 -; CHECK-NEXT: csel x3, x8, x9, ne ; CHECK-NEXT: adds x8, x0, x4 ; CHECK-NEXT: adcs x9, x1, x5 ; CHECK-NEXT: cset w10, vs ; CHECK-NEXT: asr x11, x9, #63 ; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: eor x10, x11, #0x8000000000000000 -; CHECK-NEXT: csel x8, x11, x8, ne -; CHECK-NEXT: csel x1, x10, x9, ne -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: csel x0, x11, x8, ne +; CHECK-NEXT: eor x8, x11, #0x8000000000000000 +; CHECK-NEXT: csel x1, x8, x9, ne +; CHECK-NEXT: adds x8, x2, x6 +; CHECK-NEXT: adcs x9, x3, x7 +; CHECK-NEXT: asr x10, x9, #63 +; CHECK-NEXT: cset w11, vs +; CHECK-NEXT: cmp w11, #0 +; CHECK-NEXT: eor x11, x10, #0x8000000000000000 +; CHECK-NEXT: csel x2, x10, x8, ne +; CHECK-NEXT: csel x3, x11, x9, ne ; CHECK-NEXT: ret %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y) ret <2 x i128> %z diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -353,25 +353,22 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; CHECK-LABEL: v2i128: ; CHECK: // %bb.0: -; CHECK-NEXT: subs x8, x2, x6 -; CHECK-NEXT: sbcs x9, x3, x7 -; CHECK-NEXT: cset w10, vs -; CHECK-NEXT: asr x11, x9, #63 -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: csel x2, x11, x8, ne -; CHECK-NEXT: eor x8, x11, #0x8000000000000000 -; CHECK-NEXT: csel x3, x8, x9, ne ; CHECK-NEXT: subs x8, x0, x4 ; CHECK-NEXT: sbcs x9, x1, x5 ; CHECK-NEXT: cset w10, vs ; CHECK-NEXT: asr x11, 
x9, #63 ; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: eor x10, x11, #0x8000000000000000 -; CHECK-NEXT: csel x8, x11, x8, ne -; CHECK-NEXT: csel x1, x10, x9, ne -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: csel x0, x11, x8, ne +; CHECK-NEXT: eor x8, x11, #0x8000000000000000 +; CHECK-NEXT: csel x1, x8, x9, ne +; CHECK-NEXT: subs x8, x2, x6 +; CHECK-NEXT: sbcs x9, x3, x7 +; CHECK-NEXT: asr x10, x9, #63 +; CHECK-NEXT: cset w11, vs +; CHECK-NEXT: cmp w11, #0 +; CHECK-NEXT: eor x11, x10, #0x8000000000000000 +; CHECK-NEXT: csel x2, x10, x8, ne +; CHECK-NEXT: csel x3, x11, x9, ne ; CHECK-NEXT: ret %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y) ret <2 x i128> %z diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -348,21 +348,18 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; CHECK-LABEL: v2i128: ; CHECK: // %bb.0: +; CHECK-NEXT: adds x8, x0, x4 +; CHECK-NEXT: adcs x9, x1, x5 +; CHECK-NEXT: cset w10, hs +; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: csinv x0, x8, xzr, eq +; CHECK-NEXT: csinv x1, x9, xzr, eq ; CHECK-NEXT: adds x8, x2, x6 ; CHECK-NEXT: adcs x9, x3, x7 ; CHECK-NEXT: cset w10, hs ; CHECK-NEXT: cmp w10, #0 ; CHECK-NEXT: csinv x2, x8, xzr, eq ; CHECK-NEXT: csinv x3, x9, xzr, eq -; CHECK-NEXT: adds x8, x0, x4 -; CHECK-NEXT: adcs x9, x1, x5 -; CHECK-NEXT: cset w10, hs -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: csinv x8, x8, xzr, eq -; CHECK-NEXT: csinv x1, x9, xzr, eq -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret %z = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y) ret <2 x i128> %z diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -344,21 +344,18 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; CHECK-LABEL: v2i128: ; CHECK: // %bb.0: +; CHECK-NEXT: subs x8, x0, x4 +; CHECK-NEXT: sbcs x9, x1, x5 +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: csel x0, xzr, x8, ne +; CHECK-NEXT: csel x1, xzr, x9, ne ; CHECK-NEXT: subs x8, x2, x6 ; CHECK-NEXT: sbcs x9, x3, x7 ; CHECK-NEXT: cset w10, lo ; CHECK-NEXT: cmp w10, #0 ; CHECK-NEXT: csel x2, xzr, x8, ne ; CHECK-NEXT: csel x3, xzr, x9, ne -; CHECK-NEXT: subs x8, x0, x4 -; CHECK-NEXT: sbcs x9, x1, x5 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: cmp w10, #0 -; CHECK-NEXT: csel x8, xzr, x8, ne -; CHECK-NEXT: csel x1, xzr, x9, ne -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret %z = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %x, <2 x i128> %y) ret <2 x i128> %z diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -96,20 +96,14 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK-LABEL: test_v9i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: umov w9, v0.b[1] +; CHECK-NEXT: umov w10, v0.b[2] +; CHECK-NEXT: umov w11, v0.b[3] ; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: mov v1.b[9], w8 -; CHECK-NEXT: mov 
v1.b[10], w8 -; CHECK-NEXT: mov v1.b[11], w8 -; CHECK-NEXT: mov v1.b[13], w8 -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-NEXT: and v1.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v1.b[1] -; CHECK-NEXT: umov w9, v1.b[0] -; CHECK-NEXT: umov w10, v1.b[2] -; CHECK-NEXT: umov w11, v1.b[3] -; CHECK-NEXT: and w8, w9, w8 +; CHECK-NEXT: umov w8, v1.b[0] +; CHECK-NEXT: and w8, w8, w9 ; CHECK-NEXT: umov w9, v0.b[5] ; CHECK-NEXT: and w8, w8, w10 ; CHECK-NEXT: umov w10, v0.b[6] diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -106,7 +106,6 @@ ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64 ; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]] -; GCN: v_mov_b32_e32 v[[R_I64_1_High:[0-9]+]], 0 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] @@ -117,6 +116,7 @@ ; VI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]] ; VI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] ; GCN: v_mov_b32_e32 v[[R_I64_0_High:[0-9]+]], 0 +; GCN: v_mov_b32_e32 v[[R_I64_1_High:[0-9]+]], v[[R_I64_0_High]] ; GCN: buffer_store_dwordx4 v[[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]] ; GCN: s_endpgm define amdgpu_kernel void @fptoui_v2f16_to_v2i64( diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -4814,16 +4814,16 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s5, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -4835,20 +4835,20 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s4, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s5, s3, 16 ; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xffff ; GCN-HSA-NEXT: s_and_b32 s2, s3, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -4865,16 +4865,16 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; 
GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s3, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -5053,28 +5053,28 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s5, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s7, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -5086,13 +5086,13 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s9, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s9, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 16 ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s3, s7, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 @@ -5100,24 +5100,24 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -5132,27 +5132,27 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s7, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s5, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s4, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s4, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -5419,52 +5419,52 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 -; GCN-NOHSA-SI-NEXT: 
s_lshr_b32 s19, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s9, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -5476,21 +5476,21 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; 
GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s13, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s15, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s16, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s17, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s17, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s9, 16 ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s3, s9, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 @@ -5498,52 +5498,52 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -5558,47 +5558,47 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s11, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s10, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s9, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s9, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s8, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s7, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s6, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s5, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s4, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -6053,104 +6053,104 @@ ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s17, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s19, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s18, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s16, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s13, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s16, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s17, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s18, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s19, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff -; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff ; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -6162,141 +6162,141 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s3, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s21, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s23, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s25, s17, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s19, 16 -; GCN-HSA-NEXT: s_lshr_b32 s27, s18, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s16, 16 -; GCN-HSA-NEXT: s_lshr_b32 s29, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s31, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s34, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s16, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s17, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s18, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s19, 16 ; GCN-HSA-NEXT: s_and_b32 s35, s4, 0xffff -; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff -; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff -; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff -; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff -; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff -; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff -; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff ; GCN-HSA-NEXT: s_and_b32 s36, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff ; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s16, s16, 
0xffff ; GCN-HSA-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff ; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xa0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xa0 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x80 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6316,87 +6316,87 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s15, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s14, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s13, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s13, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s12, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s12, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s11, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s11, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s10, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s9, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s9, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s8, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s7, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s6, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s5, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s4, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26
+; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96
-; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s3, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s3, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80
-; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s2, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24
+; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64
-; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s1, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s1, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s1, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48
-; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s0, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s0, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32
 ; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
 ; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -5747,8 +5747,8 @@
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v8
 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v9
 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
@@ -5792,19 +5792,19 @@
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v1
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v9
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v9
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v8
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: global_zextload_v4i16_to_v4i64:
@@ -6051,73 +6051,73 @@
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v1
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v3
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:16
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v0
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xffff, v1
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 offset:48
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:32
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[4:7], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-HSA: ; %bb.0:
 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, v12
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, v4
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, v4
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
-; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v3
-; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14]
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, v12
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v1
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[3:6]
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
+; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v1
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xffff, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xffff, v1
+; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xffff, v2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[6:9]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[3:4], v[14:17]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64:
@@ -6131,29 +6131,29 @@
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v4
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v1
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v2
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, 0xffff, v0
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v1
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xffff, v1
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: global_zextload_v8i16_to_v8i64:
@@ -6498,133 +6498,131 @@
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[6:9], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v0
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v1
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, 0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xffff, v0
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xffff, v1
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v2
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v3
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v6
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v5
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v20
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v8
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v9
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v6
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xffff, v7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, 0
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
 ; GCN-HSA: ; %bb.0:
 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, v9
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 32
 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4
-; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, v9
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v1
-; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12]
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4
-; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70
-; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v5
-; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12]
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4
-; GCN-HSA-NEXT: s_add_u32 s4, s0, 32
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v7
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v7
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v1
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1
+; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v0
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50
+; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[12:15]
 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12]
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xffff, v2
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v5
+; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v5
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v4
-; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v3
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, v9
+; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v3
 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v6
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v6
-; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
+; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v4
+; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v7
+; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v6
+; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xffff, v7
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64:
@@ -6639,49 +6637,48 @@
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v28
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v28
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v28
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v28
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v28
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v3
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, 0xffff, v7
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v4
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v5
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v28
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xffff, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xffff, v4
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v6
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v5
 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v1
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v28
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: s_nop 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xffff, v1
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v2
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xffff, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v5
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
@@ -7272,7 +7269,7 @@
 ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, 0
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -7283,279 +7280,265 @@
 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 offset:32
 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v2
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v3
 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v4
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v21
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v23
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v4
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v7
+; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v6
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v9
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v8
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v11
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xffff, v10
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v13
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v12
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v12
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v15
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xffff, v14
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v17
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v17
+; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v57, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v51, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v27
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v27
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v3
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, 0xffff, v5
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, 0xffff, v6
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, 0xffff, v8
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, 0xffff, v7
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, 0xffff, v9
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, 0xffff, v10
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, 0xffff, v12
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, 0xffff, v11
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, 0xffff, v13
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, 0xffff, v14
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xffff, v16
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v15
-; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v17
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v57, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v51, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v23
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v1
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v1
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v27
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:208
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48
 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v1
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, 0
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, 0
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, 0
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5)
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT: s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-HSA: ; %bb.0:
 ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4
 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT: s_add_u32 s6, s2, 32
+; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
-; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, s4
-; GCN-HSA-NEXT: flat_load_dwordx4 v[9:12], v[9:10]
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48
 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3
-; GCN-HSA-NEXT: flat_load_dwordx4 v[13:16], v[13:14]
-; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
+; GCN-HSA-NEXT: flat_load_dwordx4 v[9:12], v[8:9]
+; GCN-HSA-NEXT: v_mov_b32_e32 v18, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v17, s4
 ; GCN-HSA-NEXT: flat_load_dwordx4 v[17:20], v[17:18]
-; GCN-HSA-NEXT: s_add_u32 s4, s0, 48
-; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s6, s0, 16
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: s_add_u32 s6, s0, 48
 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0
+; GCN-HSA-NEXT: s_add_u32 s8, s0, 16
 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xd0
+; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xe0
 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xb0
+; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xf0
 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x90
+; GCN-HSA-NEXT: s_add_u32 s14, s0, 0xc0
 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT: s_add_u32 s16, s0, 0x70
+; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xd0
 ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s17
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
+; GCN-HSA-NEXT: s_add_u32 s18, s0, 0xa0
+; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v22, s19
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v16, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v21, s18
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0
+; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v3
-; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8]
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v1
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2
-; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8]
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s13
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v2
+; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[13:16]
+; GCN-HSA-NEXT: s_nop 0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v3
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[13:16]
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6
 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v12
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v12
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s12
-; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8]
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s15
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v10
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v10
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s14
-; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8]
-; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7
-; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v14
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v14
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6
-; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[5:8]
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
-; GCN-HSA-NEXT: s_waitcnt vmcnt(5)
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v20
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v20
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s8
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[5:8]
-; GCN-HSA-NEXT: v_mov_b32_e32 v21, s11
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v18
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v18
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, s10
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[5:8]
-; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v16
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v15
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v7
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[13:16]
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v4
+; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[13:16]
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xffff, v5
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9
+; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[13:16]
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, s11
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, v8
+; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v11
+; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v11
+; GCN-HSA-NEXT: v_mov_b32_e32 v13, s10
+; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[2:5]
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x80
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v12
+; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v12
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s12
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s13
+; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[2:5]
+; GCN-HSA-NEXT: v_mov_b32_e32 v11, s14
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v9
+; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v9
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, s15
+; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[2:5]
+; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v10
+; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v10
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s16
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s17
+; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[2:5]
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s4
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90
+; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[2:5]
+; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60
+; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
-; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[5:8]
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v13
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8]
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v19
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v19
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8]
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v17
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v9
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v11
-; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xffff, v11
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
-; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[5:8]
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v0
-; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xffff, v0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xffff, v6
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xffff, v9
-; GCN-HSA-NEXT: v_mov_b32_e32 v20, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v18, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[7:10]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3
+; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2
+; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2
-; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xffff, v2
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v13, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
-; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GCN-HSA-NEXT: s_waitcnt vmcnt(12)
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v19
+; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v19
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v20
+; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xffff, v20
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3
+; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xffff, v17
+; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[11:14]
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2
 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v9, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[3:6]
+; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v18
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v18
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
+; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-HSA-NEXT: s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64:
@@ -7568,95 +7551,97 @@
 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[1:4], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, 0
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v24, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v20, 0
-; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v33
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xffff, v3
 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v36
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, 0xffff, v38
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v38
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v37
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, v37
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v35
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v5
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xffff, v5
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v4
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, 0xffff, v4
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xffff, v7
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, 0xffff, v6
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:48
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xffff, v1
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v0
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xffff, v0
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v4
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v4
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, 0xffff, v6
+; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v33
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v33
 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v35
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, 0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, 0
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v33
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, 0
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xffff, v7
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v31
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v31
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, 0xffff, v35
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v34
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xffff, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v34, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, v34
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v32
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xffff, v32
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, v34
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v34
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xffff, v7
 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v37
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xffff, v1
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v17, 0xffff, v3
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v25, 0xffff, v8
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v8
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xffff, v32
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v32
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v34
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v37
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, 0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v26, v37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v37
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xffff, v4
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v6
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v21, 0xffff, v5
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v5
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v22, v37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v37
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v37
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v34
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v34
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5
+; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v34
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, v34
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:96
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v34
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v34
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v34
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v34
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v34
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT: s_endpgm
 ;
 ; EG-LABEL: global_zextload_v32i16_to_v32i64:
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -265,29 +265,29 @@
 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
-; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
-; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
-; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
-; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX6-NEXT: v_bfe_i32 v8, v1, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v6, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX6-NEXT: v_bfe_i32 v6, v7, 0, 16
 ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v8, v5
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GFX6-NEXT: v_min_i32_e32 v3, 0x7fff, v3
 ; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1
 ; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2
+; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0
+; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v3
 ; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1
 ; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+;
GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -125,18 +125,18 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v1, vcc, 64, v0 -; GCN-NEXT: v_lshr_b64 v[2:3], 17, v1 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, 64, v0 -; GCN-NEXT: v_lshl_b64 v[4:5], 17, v1 +; GCN-NEXT: v_lshr_b64 v[1:2], 17, v1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, 64, v0 +; GCN-NEXT: v_lshl_b64 v[2:3], 17, v2 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN-NEXT: v_lshl_b64 v[4:5], 17, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v1, s[4:5] -; GCN-NEXT: v_lshl_b64 v[0:1], 17, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = shl i128 17, %rhs ret i128 %shl @@ -149,12 +149,11 @@ ; GCN-NEXT: s_mov_b64 s[4:5], 0x41 ; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v3, 0x41 -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -168,11 +167,10 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshr_b64 v[1:2], 33, v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 33, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -265,29 +265,29 @@ ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 +; GFX6-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 +; GFX6-NEXT: v_bfe_i32 v6, v7, 0, 16 ; GFX6-NEXT: 
v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v7 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v8, v5 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_min_i32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 ; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 ; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -198,26 +198,26 @@ ; GFX6-LABEL: v_uaddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v7 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_min_u32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v6 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v8, v5 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_min_u32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_min_u32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_min_u32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -301,26 +301,26 @@ ; GFX6-LABEL: v_usubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v5 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_u32_e32 v1, v1, v9 -; GFX6-NEXT: v_max_u32_e32 v0, v0, 
v4 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX6-NEXT: v_and_b32_e32 v9, 0xffff, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v7 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_max_u32_e32 v1, v3, v1 +; GFX6-NEXT: v_max_u32_e32 v2, v2, v6 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_max_u32_e32 v1, v2, v6 -; GFX6-NEXT: v_max_u32_e32 v2, v3, v8 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_max_u32_e32 v2, v9, v8 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v4 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v4i16: diff --git a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll --- a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll +++ b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll @@ -1356,54 +1356,73 @@ ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, lr} ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_2 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] -; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: vmov r12, lr, d16 ; CHECK-FIX-NOSCHED-NEXT: vmov r3, r4, d17 -; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d16 +; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r12, r12, #16 ; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 ; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s6 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr ; CHECK-FIX-NOSCHED-NEXT: vmov s4, r4 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s6 ; CHECK-FIX-NOSCHED-NEXT: lsr r4, r4, #16 ; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, r12 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r12, #16 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s6 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr -; CHECK-FIX-NOSCHED-NEXT: lsr lr, lr, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s14, r12 -; CHECK-FIX-NOSCHED-NEXT: vmov s7, lr +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r5 +; CHECK-FIX-NOSCHED-NEXT: lsr r5, lr, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s14, r5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s6 ; CHECK-FIX-NOSCHED-NEXT: vmov s6, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 +; CHECK-FIX-NOSCHED-NEXT: vmov s7, r12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: bne .LBB36_3 -; CHECK-FIX-NOSCHED-NEXT: b .LBB36_4 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB36_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB36_2: -; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #10] -; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #6] +; 
CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s3, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-FIX-NOSCHED-NEXT: vmov s13, r1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: b .LBB36_5 +; CHECK-FIX-NOSCHED-NEXT: .LBB36_3: +; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #10] +; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #6] ; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r2, #2] ; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2, #14] -; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4 +; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5 ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #12] -; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr -; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #8] +; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12 +; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #8] ; CHECK-FIX-NOSCHED-NEXT: vmov s5, r6 -; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #4] +; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #4] ; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7 ; CHECK-FIX-NOSCHED-NEXT: ldrh r8, [r2] ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s4 ; CHECK-FIX-NOSCHED-NEXT: vmov s4, r3 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8 -; CHECK-FIX-NOSCHED-NEXT: vmov s8, r12 +; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12 -; CHECK-FIX-NOSCHED-NEXT: vmov s12, r5 +; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s5 ; CHECK-FIX-NOSCHED-NEXT: vmov s5, r8 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 @@ -1411,44 +1430,46 @@ ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_4 -; CHECK-FIX-NOSCHED-NEXT: .LBB36_3: -; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d0[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: beq .LBB36_2 ; CHECK-FIX-NOSCHED-NEXT: .LBB36_4: -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 -; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16 +; CHECK-FIX-NOSCHED-NEXT: ldrh r1, [r1] +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r3, d0 +; CHECK-FIX-NOSCHED-NEXT: vmov r7, r6, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r1 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r1, d16[0] +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r7 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0 +; CHECK-FIX-NOSCHED-NEXT: vmov s0, r6 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s13, r1 -; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 +; CHECK-FIX-NOSCHED-NEXT: vmov s13, r3 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r6, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s3, r1 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 
s11, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 +; CHECK-FIX-NOSCHED-NEXT: .LBB36_5: +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5 +; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 ; CHECK-FIX-NOSCHED-NEXT: vmov r0, s5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s7 ; CHECK-FIX-NOSCHED-NEXT: vmov r1, s5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s3, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s9 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s2, s2 -; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov r1, s11 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 @@ -1494,125 +1515,150 @@ ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, lr} -; CHECK-CORTEX-FIX-NEXT: .vsave {d8} -; CHECK-CORTEX-FIX-NEXT: vpush {d8} +; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9, d10} +; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9, d10} ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 ; CHECK-CORTEX-FIX-NEXT: beq .LBB36_2 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] -; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-CORTEX-FIX-NEXT: vorr q9, q8, q8 +; CHECK-CORTEX-FIX-NEXT: vmov r3, lr, d16 ; CHECK-CORTEX-FIX-NEXT: vmov r5, r6, d17 +; CHECK-CORTEX-FIX-NEXT: vld1.16 {d18[0]}, [r1:16] +; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r4, lr, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r6, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s4, r6 +; CHECK-CORTEX-FIX-NEXT: vmov s8, lr ; CHECK-CORTEX-FIX-NEXT: vmov s6, r5 -; CHECK-CORTEX-FIX-NEXT: vmov s14, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s7, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r12, r3, d16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s4 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s7 -; CHECK-CORTEX-FIX-NEXT: lsr lr, r12, #16 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r12 +; CHECK-CORTEX-FIX-NEXT: vmov s12, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s5, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s14, r7 ; CHECK-CORTEX-FIX-NEXT: vmov s9, r8 -; CHECK-CORTEX-FIX-NEXT: vmov s11, lr -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11 -; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: bne .LBB36_3 -; CHECK-CORTEX-FIX-NEXT: b .LBB36_4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s4 +; 
CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s8 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r12, d18[0] +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-CORTEX-FIX-NEXT: vmov s11, r12 +; CHECK-CORTEX-FIX-NEXT: b .LBB36_3 ; CHECK-CORTEX-FIX-NEXT: .LBB36_2: -; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r2] -; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r2, #2] ; CHECK-CORTEX-FIX-NEXT: ldrh r8, [r2, #4] ; CHECK-CORTEX-FIX-NEXT: ldrh r5, [r2, #6] ; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r2, #8] ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #10] ; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r2, #12] ; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r2, #14] +; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r2] +; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r2, #2] +; CHECK-CORTEX-FIX-NEXT: vmov s5, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s7, r8 ; CHECK-CORTEX-FIX-NEXT: vmov s4, r6 ; CHECK-CORTEX-FIX-NEXT: vmov s6, r7 -; CHECK-CORTEX-FIX-NEXT: vmov s5, r5 ; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s9, r8 -; CHECK-CORTEX-FIX-NEXT: vmov s11, lr -; CHECK-CORTEX-FIX-NEXT: vmov s13, r12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s4 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s6 +; CHECK-CORTEX-FIX-NEXT: vmov s14, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s9, lr +; CHECK-CORTEX-FIX-NEXT: vmov s11, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s6 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s13 -; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB36_4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s7 ; CHECK-CORTEX-FIX-NEXT: .LBB36_3: -; CHECK-CORTEX-FIX-NEXT: vld1.16 {d0[0]}, [r1:16] -; CHECK-CORTEX-FIX-NEXT: .LBB36_4: +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s9 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s11 +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: beq .LBB36_5 +; CHECK-CORTEX-FIX-NEXT: @ %bb.4: +; CHECK-CORTEX-FIX-NEXT: ldrh r0, [r1] +; CHECK-CORTEX-FIX-NEXT: vmov r1, r3, d0 +; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1 +; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r0 +; CHECK-CORTEX-FIX-NEXT: lsr r1, r1, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r3, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r0, r5, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s2, r6 +; CHECK-CORTEX-FIX-NEXT: vmov s0, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s3, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 +; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 +; CHECK-CORTEX-FIX-NEXT: vmov s18, r1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s18 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r12, d16[0] +; CHECK-CORTEX-FIX-NEXT: vmov s20, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s20 +; CHECK-CORTEX-FIX-NEXT: b .LBB36_6 +; CHECK-CORTEX-FIX-NEXT: .LBB36_5: ; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1 ; 
CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d0 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s0, r5 ; CHECK-CORTEX-FIX-NEXT: lsr r7, r1, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s1, r1 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 -; CHECK-CORTEX-FIX-NEXT: vmov r0, s12 -; CHECK-CORTEX-FIX-NEXT: vmov r1, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s7 ; CHECK-CORTEX-FIX-NEXT: vmov s2, r6 -; CHECK-CORTEX-FIX-NEXT: vmov r5, s6 +; CHECK-CORTEX-FIX-NEXT: vmov s3, r1 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 ; CHECK-CORTEX-FIX-NEXT: vmov s13, r3 ; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 ; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s0 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0 ; CHECK-CORTEX-FIX-NEXT: vmov s0, r12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s1 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s9 -; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r1, s5 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s9 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s3 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11 -; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-CORTEX-FIX-NEXT: .LBB36_6: +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s3 +; CHECK-CORTEX-FIX-NEXT: vmov r0, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r1, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s5 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 +; CHECK-CORTEX-FIX-NEXT: vmov r5, s6 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s8 +; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r1, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s1 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s11 ; CHECK-CORTEX-FIX-NEXT: vmov r0, s9 +; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s12 ; CHECK-CORTEX-FIX-NEXT: vmov r1, s0 ; 
CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s5 ; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s4 ; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r5, s8 +; CHECK-CORTEX-FIX-NEXT: vmov r5, s14 ; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r4, s2 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r5 @@ -1620,7 +1666,7 @@ ; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr ; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12 ; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s1 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s3 ; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r4, r1, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r1 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0 @@ -1629,7 +1675,7 @@ ; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2] -; CHECK-CORTEX-FIX-NEXT: vpop {d8} +; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9, d10} ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, pc} br i1 %0, label %5, label %12 @@ -1680,56 +1726,67 @@ ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r11, lr} ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s0 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_2 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s9 ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-FIX-NOSCHED-NEXT: vmov r2, s0 -; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r2 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, lr, d17 -; CHECK-FIX-NOSCHED-NEXT: vmov r2, r12, d16 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov r4, s0 +; CHECK-FIX-NOSCHED-NEXT: vmov r12, lr, d16 +; CHECK-FIX-NOSCHED-NEXT: vmov r2, r3, d17 +; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r4 +; CHECK-FIX-NOSCHED-NEXT: lsr r12, r12, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r4, d16[0] ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s2 -; CHECK-FIX-NOSCHED-NEXT: lsr r4, lr, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s0, lr -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r12 -; CHECK-FIX-NOSCHED-NEXT: lsr r5, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, lr +; CHECK-FIX-NOSCHED-NEXT: vmov s0, r3 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s2 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2 -; CHECK-FIX-NOSCHED-NEXT: lsr r2, r12, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov s14, r2 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r4 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s10, r2 +; CHECK-FIX-NOSCHED-NEXT: vmov s3, r12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vmov s3, r5 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r4 +; CHECK-FIX-NOSCHED-NEXT: lsr r4, lr, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s14, r4 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: bne .LBB37_3 -; CHECK-FIX-NOSCHED-NEXT: b 
.LBB37_4 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB37_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB37_2: +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3 +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2 +; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-FIX-NOSCHED-NEXT: b .LBB37_5 +; CHECK-FIX-NOSCHED-NEXT: .LBB37_3: ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #10] -; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #6] -; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #2] +; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #6] +; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #2] ; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r1, #14] ; CHECK-FIX-NOSCHED-NEXT: vmov s8, r3 ; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #12] -; CHECK-FIX-NOSCHED-NEXT: vmov s12, r4 -; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #8] -; CHECK-FIX-NOSCHED-NEXT: vmov s1, lr -; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #4] +; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12 +; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #8] +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r5 +; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #4] ; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7 ; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r1] ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s0 ; CHECK-FIX-NOSCHED-NEXT: vmov s0, r2 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8 -; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5 +; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12 -; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12 +; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s1 ; CHECK-FIX-NOSCHED-NEXT: vmov s1, r6 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 @@ -1737,47 +1794,48 @@ ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_4 -; CHECK-FIX-NOSCHED-NEXT: .LBB37_3: -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s9, s9 -; CHECK-FIX-NOSCHED-NEXT: vmov r0, s9 -; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r0 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB37_2 ; CHECK-FIX-NOSCHED-NEXT: .LBB37_4: -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1 ; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s4, s9 +; CHECK-FIX-NOSCHED-NEXT: vmov r6, s4 +; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 +; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r6 ; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2 -; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5 -; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r6, d16[0] +; CHECK-FIX-NOSCHED-NEXT: vmov s5, r6 +; CHECK-FIX-NOSCHED-NEXT: .LBB37_5: ; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s13, r2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1 ; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5 +; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16 ; CHECK-FIX-NOSCHED-NEXT: vmov r0, s1 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s3 -; CHECK-FIX-NOSCHED-NEXT: vmov r2, s1 +; 
CHECK-FIX-NOSCHED-NEXT: vmov s13, r2 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 +; CHECK-FIX-NOSCHED-NEXT: vmov r2, s1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5 ; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s15 ; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s9 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s6, s6 -; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7 ; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 ; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s4, s4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov r2, s11 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 @@ -1822,8 +1880,8 @@ ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r11, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r11, lr} -; CHECK-CORTEX-FIX-NEXT: .vsave {d8} -; CHECK-CORTEX-FIX-NEXT: vpush {d8} +; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9, d10} +; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9, d10} ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s0 ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 ; CHECK-CORTEX-FIX-NEXT: beq .LBB37_2 @@ -1831,120 +1889,143 @@ ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s9 ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1] ; CHECK-CORTEX-FIX-NEXT: vmov r2, s0 +; CHECK-CORTEX-FIX-NEXT: vmov r12, r3, d16 ; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r2 ; CHECK-CORTEX-FIX-NEXT: vmov r4, r5, d17 +; CHECK-CORTEX-FIX-NEXT: lsr r2, r3, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 +; CHECK-CORTEX-FIX-NEXT: lsr r12, r12, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s1, r2 +; CHECK-CORTEX-FIX-NEXT: vmov s11, r12 ; CHECK-CORTEX-FIX-NEXT: lsr r6, r4, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s0, r5 ; CHECK-CORTEX-FIX-NEXT: vmov s2, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s14, r7 -; CHECK-CORTEX-FIX-NEXT: vmov s3, r6 -; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s0 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s8 +; CHECK-CORTEX-FIX-NEXT: vmov s12, r7 +; CHECK-CORTEX-FIX-NEXT: vmov s14, r6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-CORTEX-FIX-NEXT: vmov.32 lr, d16[0] +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s0 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s3 -; CHECK-CORTEX-FIX-NEXT: lsr r12, r2, #16 -; CHECK-CORTEX-FIX-NEXT: lsr lr, r3, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r2 -; CHECK-CORTEX-FIX-NEXT: vmov s11, lr -; CHECK-CORTEX-FIX-NEXT: vmov s13, r12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s11 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13 -; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: bne .LBB37_3 -; CHECK-CORTEX-FIX-NEXT: b .LBB37_4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s12 +; CHECK-CORTEX-FIX-NEXT: 
vcvtb.f32.f16 s2, s14 +; CHECK-CORTEX-FIX-NEXT: vmov s13, lr +; CHECK-CORTEX-FIX-NEXT: b .LBB37_3 ; CHECK-CORTEX-FIX-NEXT: .LBB37_2: -; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r1] -; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r1, #2] ; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #4] ; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r1, #6] ; CHECK-CORTEX-FIX-NEXT: ldrh r5, [r1, #8] ; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r1, #10] ; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12] ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #14] +; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r1] +; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r1, #2] +; CHECK-CORTEX-FIX-NEXT: vmov s1, r6 +; CHECK-CORTEX-FIX-NEXT: vmov s3, r7 ; CHECK-CORTEX-FIX-NEXT: vmov s0, r3 ; CHECK-CORTEX-FIX-NEXT: vmov s2, r2 -; CHECK-CORTEX-FIX-NEXT: vmov s1, r6 ; CHECK-CORTEX-FIX-NEXT: vmov s8, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r5 -; CHECK-CORTEX-FIX-NEXT: vmov s11, r7 -; CHECK-CORTEX-FIX-NEXT: vmov s13, lr -; CHECK-CORTEX-FIX-NEXT: vmov s15, r12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s0 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s2 +; CHECK-CORTEX-FIX-NEXT: vmov s14, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s11, lr +; CHECK-CORTEX-FIX-NEXT: vmov s13, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s0 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s2 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s1 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s11 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s15 -; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: beq .LBB37_4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s3 ; CHECK-CORTEX-FIX-NEXT: .LBB37_3: +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s11 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s13 +; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 +; CHECK-CORTEX-FIX-NEXT: beq .LBB37_5 +; CHECK-CORTEX-FIX-NEXT: @ %bb.4: ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9 +; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d2 +; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3 ; CHECK-CORTEX-FIX-NEXT: vmov r0, s9 -; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r0 -; CHECK-CORTEX-FIX-NEXT: .LBB37_4: +; CHECK-CORTEX-FIX-NEXT: lsr r2, r2, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r7, r3, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 +; CHECK-CORTEX-FIX-NEXT: vmov s6, r6 +; CHECK-CORTEX-FIX-NEXT: vmov s4, r5 +; CHECK-CORTEX-FIX-NEXT: vmov s7, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 +; CHECK-CORTEX-FIX-NEXT: vmov s18, r2 +; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r0 +; CHECK-CORTEX-FIX-NEXT: lsr r0, r5, #16 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s18 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r12, d16[0] +; CHECK-CORTEX-FIX-NEXT: vmov s20, r12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s20 +; CHECK-CORTEX-FIX-NEXT: b .LBB37_6 +; CHECK-CORTEX-FIX-NEXT: .LBB37_5: ; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3 ; CHECK-CORTEX-FIX-NEXT: vmov r0, r2, d2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0 -; 
CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 -; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s4, r5 ; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s5, r2 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 -; CHECK-CORTEX-FIX-NEXT: vmov r0, s12 -; CHECK-CORTEX-FIX-NEXT: vmov r2, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s3 ; CHECK-CORTEX-FIX-NEXT: vmov s6, r6 -; CHECK-CORTEX-FIX-NEXT: vmov r5, s2 +; CHECK-CORTEX-FIX-NEXT: vmov s7, r2 +; CHECK-CORTEX-FIX-NEXT: vmov s9, r0 ; CHECK-CORTEX-FIX-NEXT: vmov s13, r3 ; CHECK-CORTEX-FIX-NEXT: vmov s15, r4 ; CHECK-CORTEX-FIX-NEXT: vmov s16, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4 ; CHECK-CORTEX-FIX-NEXT: vmov s4, r12 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9 -; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r2, s1 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s7 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11 -; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s4 +; CHECK-CORTEX-FIX-NEXT: .LBB37_6: +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9 -; CHECK-CORTEX-FIX-NEXT: vmov r3, s14 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s3 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r0, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r2, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s1 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s5 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15 +; CHECK-CORTEX-FIX-NEXT: vmov r5, s2 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s8 +; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r2, s3 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s10 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s1 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s11 ; CHECK-CORTEX-FIX-NEXT: vmov r0, s9 +; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16 +; CHECK-CORTEX-FIX-NEXT: vmov r3, s12 ; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r7, s7 +; CHECK-CORTEX-FIX-NEXT: vmov r7, s5 ; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r6, s0 ; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s4 ; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16 -; 
CHECK-CORTEX-FIX-NEXT: vmov r5, s8 +; CHECK-CORTEX-FIX-NEXT: vmov r5, s14 ; CHECK-CORTEX-FIX-NEXT: vmov r2, s0 ; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov r4, s6 @@ -1953,7 +2034,7 @@ ; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr ; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12 ; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16 -; CHECK-CORTEX-FIX-NEXT: vmov r4, s5 +; CHECK-CORTEX-FIX-NEXT: vmov r4, s7 ; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r4, r2, lsl #16 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2 ; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0 @@ -1962,7 +2043,7 @@ ; CHECK-CORTEX-FIX-NEXT: aese.8 q9, q8 ; CHECK-CORTEX-FIX-NEXT: aesmc.8 q8, q9 ; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1] -; CHECK-CORTEX-FIX-NEXT: vpop {d8} +; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9, d10} ; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r11, pc} br i1 %0, label %5, label %11 @@ -3726,54 +3807,73 @@ ; CHECK-FIX-NOSCHED-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r8, lr} ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_2 +; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_3 ; CHECK-FIX-NOSCHED-NEXT: @ %bb.1: ; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r2] -; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: vmov r12, lr, d16 ; CHECK-FIX-NOSCHED-NEXT: vmov r3, r4, d17 -; CHECK-FIX-NOSCHED-NEXT: vmov lr, r12, d16 +; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r5, d16[0] +; CHECK-FIX-NOSCHED-NEXT: lsr r12, r12, #16 ; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3 ; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s6 +; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr ; CHECK-FIX-NOSCHED-NEXT: vmov s4, r4 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s6 ; CHECK-FIX-NOSCHED-NEXT: lsr r4, r4, #16 ; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, r12 -; CHECK-FIX-NOSCHED-NEXT: lsr r12, r12, #16 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s6 -; CHECK-FIX-NOSCHED-NEXT: vmov s6, lr -; CHECK-FIX-NOSCHED-NEXT: lsr lr, lr, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s14, r12 -; CHECK-FIX-NOSCHED-NEXT: vmov s7, lr +; CHECK-FIX-NOSCHED-NEXT: vmov s6, r5 +; CHECK-FIX-NOSCHED-NEXT: lsr r5, lr, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s14, r5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s6 ; CHECK-FIX-NOSCHED-NEXT: vmov s6, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 +; CHECK-FIX-NOSCHED-NEXT: vmov s7, r12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: bne .LBB82_3 -; CHECK-FIX-NOSCHED-NEXT: b .LBB82_4 +; CHECK-FIX-NOSCHED-NEXT: bne .LBB82_4 ; CHECK-FIX-NOSCHED-NEXT: .LBB82_2: -; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #10] -; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #6] +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s3, r3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 
+; CHECK-FIX-NOSCHED-NEXT: vmov s13, r1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 +; CHECK-FIX-NOSCHED-NEXT: b .LBB82_5 +; CHECK-FIX-NOSCHED-NEXT: .LBB82_3: +; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #10] +; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #6] ; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r2, #2] ; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r2, #14] -; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4 +; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5 ; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r2, #12] -; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr -; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r2, #8] +; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12 +; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r2, #8] ; CHECK-FIX-NOSCHED-NEXT: vmov s5, r6 -; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r2, #4] +; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r2, #4] ; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7 ; CHECK-FIX-NOSCHED-NEXT: ldrh r8, [r2] ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s4 ; CHECK-FIX-NOSCHED-NEXT: vmov s4, r3 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8 -; CHECK-FIX-NOSCHED-NEXT: vmov s8, r12 +; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12 -; CHECK-FIX-NOSCHED-NEXT: vmov s12, r5 +; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s5 ; CHECK-FIX-NOSCHED-NEXT: vmov s5, r8 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4 @@ -3781,44 +3881,46 @@ ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5 ; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0 -; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_4 -; CHECK-FIX-NOSCHED-NEXT: .LBB82_3: -; CHECK-FIX-NOSCHED-NEXT: vld1.16 {d0[0]}, [r1:16] +; CHECK-FIX-NOSCHED-NEXT: beq .LBB82_2 ; CHECK-FIX-NOSCHED-NEXT: .LBB82_4: -; CHECK-FIX-NOSCHED-NEXT: vmov r0, r1, d0 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d1 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 -; CHECK-FIX-NOSCHED-NEXT: lsr r1, r1, #16 +; CHECK-FIX-NOSCHED-NEXT: ldrh r1, [r1] +; CHECK-FIX-NOSCHED-NEXT: vmov r0, r3, d0 +; CHECK-FIX-NOSCHED-NEXT: vmov r7, r6, d1 +; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r1 +; CHECK-FIX-NOSCHED-NEXT: vmov.32 r1, d16[0] +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r3 +; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s2, r7 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s1 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r0 +; CHECK-FIX-NOSCHED-NEXT: vmov s0, r6 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 ; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16 -; CHECK-FIX-NOSCHED-NEXT: vmov s13, r1 -; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 +; CHECK-FIX-NOSCHED-NEXT: vmov s13, r3 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r1 +; CHECK-FIX-NOSCHED-NEXT: lsr r1, r7, #16 +; CHECK-FIX-NOSCHED-NEXT: lsr r7, r6, #16 +; CHECK-FIX-NOSCHED-NEXT: vmov s3, r1 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s1 +; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 +; CHECK-FIX-NOSCHED-NEXT: .LBB82_5: +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s5 +; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0 ; CHECK-FIX-NOSCHED-NEXT: vmov r0, s5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s7 ; CHECK-FIX-NOSCHED-NEXT: vmov r1, s5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15 -; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3 -; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s15 -; 
CHECK-FIX-NOSCHED-NEXT: vmov s3, r3 -; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11 +; CHECK-FIX-NOSCHED-NEXT: vmov r3, s5 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s5, s9 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3 +; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s2, s2 -; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7 -; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vmov s1, r7 ; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1 ; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r1, lsl #16 ; CHECK-FIX-NOSCHED-NEXT: vmov r1, s11 ; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0 @@ -3864,125 +3966,150 @@ ; CHECK-CORTEX-FIX: @ %bb.0: ; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r8, lr} -; CHECK-CORTEX-FIX-NEXT: .vsave {d8} -; CHECK-CORTEX-FIX-NEXT: vpush {d8} +; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9, d10} +; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9, d10} ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 ; CHECK-CORTEX-FIX-NEXT: beq .LBB82_2 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] -; CHECK-CORTEX-FIX-NEXT: vld1.16 {d16[0]}, [r1:16] +; CHECK-CORTEX-FIX-NEXT: vorr q9, q8, q8 +; CHECK-CORTEX-FIX-NEXT: vmov r3, lr, d16 ; CHECK-CORTEX-FIX-NEXT: vmov r5, r6, d17 +; CHECK-CORTEX-FIX-NEXT: vld1.16 {d18[0]}, [r1:16] +; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r4, lr, #16 ; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16 -; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16 +; CHECK-CORTEX-FIX-NEXT: lsr r3, r6, #16 ; CHECK-CORTEX-FIX-NEXT: vmov s4, r6 +; CHECK-CORTEX-FIX-NEXT: vmov s8, lr ; CHECK-CORTEX-FIX-NEXT: vmov s6, r5 -; CHECK-CORTEX-FIX-NEXT: vmov s14, r4 -; CHECK-CORTEX-FIX-NEXT: vmov s7, r7 -; CHECK-CORTEX-FIX-NEXT: vmov r12, r3, d16 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s4 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s7 -; CHECK-CORTEX-FIX-NEXT: lsr lr, r12, #16 -; CHECK-CORTEX-FIX-NEXT: lsr r8, r3, #16 -; CHECK-CORTEX-FIX-NEXT: vmov s8, r3 -; CHECK-CORTEX-FIX-NEXT: vmov s10, r12 +; CHECK-CORTEX-FIX-NEXT: vmov s12, r3 +; CHECK-CORTEX-FIX-NEXT: vmov s5, r4 +; CHECK-CORTEX-FIX-NEXT: vmov s14, r7 ; CHECK-CORTEX-FIX-NEXT: vmov s9, r8 -; CHECK-CORTEX-FIX-NEXT: vmov s11, lr -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s8 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9 -; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11 -; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 -; CHECK-CORTEX-FIX-NEXT: bne .LBB82_3 -; CHECK-CORTEX-FIX-NEXT: b .LBB82_4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s4 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s6 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s8 +; CHECK-CORTEX-FIX-NEXT: vmov.32 r12, d18[0] +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s12 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s14 +; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s5 +; CHECK-CORTEX-FIX-NEXT: vmov s11, r12 +; CHECK-CORTEX-FIX-NEXT: b .LBB82_3 ; CHECK-CORTEX-FIX-NEXT: .LBB82_2: -; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r2] -; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r2, #2] ; CHECK-CORTEX-FIX-NEXT: ldrh r8, [r2, #4] ; CHECK-CORTEX-FIX-NEXT: ldrh r5, [r2, #6] ; CHECK-CORTEX-FIX-NEXT: 
ldrh r4, [r2, #8]
; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r2, #10]
; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r2, #12]
; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r2, #14]
+; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r2]
+; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r2, #2]
+; CHECK-CORTEX-FIX-NEXT: vmov s5, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s7, r8
; CHECK-CORTEX-FIX-NEXT: vmov s4, r6
; CHECK-CORTEX-FIX-NEXT: vmov s6, r7
-; CHECK-CORTEX-FIX-NEXT: vmov s5, r5
; CHECK-CORTEX-FIX-NEXT: vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT: vmov s10, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s9, r8
-; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT: vmov s13, r12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s4
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s6
+; CHECK-CORTEX-FIX-NEXT: vmov s14, r4
+; CHECK-CORTEX-FIX-NEXT: vmov s9, lr
+; CHECK-CORTEX-FIX-NEXT: vmov s11, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s6
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s11
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s13
-; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: beq .LBB82_4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s5
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s7
; CHECK-CORTEX-FIX-NEXT: .LBB82_3:
-; CHECK-CORTEX-FIX-NEXT: vld1.16 {d0[0]}, [r1:16]
-; CHECK-CORTEX-FIX-NEXT: .LBB82_4:
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s9
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s11
+; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
+; CHECK-CORTEX-FIX-NEXT: beq .LBB82_5
+; CHECK-CORTEX-FIX-NEXT: @ %bb.4:
+; CHECK-CORTEX-FIX-NEXT: ldrh r0, [r1]
+; CHECK-CORTEX-FIX-NEXT: vmov r1, r3, d0
+; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1
+; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r0
+; CHECK-CORTEX-FIX-NEXT: lsr r1, r1, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r7, r3, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r0, r5, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s2, r6
+; CHECK-CORTEX-FIX-NEXT: vmov s0, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s3, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
+; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
+; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
+; CHECK-CORTEX-FIX-NEXT: vmov s18, r1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s18
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r12, d16[0]
+; CHECK-CORTEX-FIX-NEXT: vmov s20, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s20
+; CHECK-CORTEX-FIX-NEXT: b .LBB82_6
+; CHECK-CORTEX-FIX-NEXT: .LBB82_5:
; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d1
; CHECK-CORTEX-FIX-NEXT: vmov r0, r1, d0
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
-; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
; CHECK-CORTEX-FIX-NEXT: vmov s0, r5
; CHECK-CORTEX-FIX-NEXT: lsr r7, r1, #16
; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s1, r1
+; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
-; CHECK-CORTEX-FIX-NEXT: vmov r0, s12
-; CHECK-CORTEX-FIX-NEXT: vmov r1, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s7
; CHECK-CORTEX-FIX-NEXT: vmov s2, r6
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s6
+; CHECK-CORTEX-FIX-NEXT: vmov s3, r1
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
; CHECK-CORTEX-FIX-NEXT: vmov s13, r3
; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s0
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s0
; CHECK-CORTEX-FIX-NEXT: vmov s0, r12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s9
-; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r1, s5
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s9
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s3
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11
-; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-CORTEX-FIX-NEXT: .LBB82_6:
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s4, s4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s3
+; CHECK-CORTEX-FIX-NEXT: vmov r0, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r1, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s5
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s6
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s8
+; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r1, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r1, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s1
; CHECK-CORTEX-FIX-NEXT: vmov r6, s11
; CHECK-CORTEX-FIX-NEXT: vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r1, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s12
; CHECK-CORTEX-FIX-NEXT: vmov r1, s0
; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s5
; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r6, s4
; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s8
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s14
; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r4, s2
; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[0], r5
@@ -3990,7 +4117,7 @@
; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr
; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12
; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s1
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s3
; CHECK-CORTEX-FIX-NEXT: pkhbt r1, r4, r1, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r1
; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0
@@ -3999,7 +4126,7 @@
; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8
; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9
; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r2]
-; CHECK-CORTEX-FIX-NEXT: vpop {d8}
+; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9, d10}
; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r8, pc}
br i1 %0, label %5, label %12
@@ -4050,56 +4177,67 @@
; CHECK-FIX-NOSCHED-NEXT: push {r4, r5, r6, r7, r11, lr}
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s0
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_2
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_3
; CHECK-FIX-NOSCHED-NEXT: @ %bb.1:
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s9
; CHECK-FIX-NOSCHED-NEXT: vld1.64 {d16, d17}, [r1]
-; CHECK-FIX-NOSCHED-NEXT: vmov r2, s0
-; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r2
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, lr, d17
-; CHECK-FIX-NOSCHED-NEXT: vmov r2, r12, d16
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
-; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov r4, s0
+; CHECK-FIX-NOSCHED-NEXT: vmov r12, lr, d16
+; CHECK-FIX-NOSCHED-NEXT: vmov r2, r3, d17
+; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r4
+; CHECK-FIX-NOSCHED-NEXT: lsr r12, r12, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2
+; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r4, d16[0]
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s8, s2
-; CHECK-FIX-NOSCHED-NEXT: lsr r4, lr, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s0, lr
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r12
-; CHECK-FIX-NOSCHED-NEXT: lsr r5, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, lr
+; CHECK-FIX-NOSCHED-NEXT: vmov s0, r3
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s2
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r2
-; CHECK-FIX-NOSCHED-NEXT: lsr r2, r12, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s10, r3
-; CHECK-FIX-NOSCHED-NEXT: vmov s14, r2
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2
-; CHECK-FIX-NOSCHED-NEXT: vmov s2, r4
+; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s10, r2
+; CHECK-FIX-NOSCHED-NEXT: vmov s3, r12
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-FIX-NOSCHED-NEXT: vmov s3, r5
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s10
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s3
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r4
+; CHECK-FIX-NOSCHED-NEXT: lsr r4, lr, #16
+; CHECK-FIX-NOSCHED-NEXT: vmov s14, r4
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s2
+; CHECK-FIX-NOSCHED-NEXT: vmov s2, r3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s14
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: bne .LBB83_3
-; CHECK-FIX-NOSCHED-NEXT: b .LBB83_4
+; CHECK-FIX-NOSCHED-NEXT: bne .LBB83_4
; CHECK-FIX-NOSCHED-NEXT: .LBB83_2:
+; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2
+; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2
+; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT: b .LBB83_5
+; CHECK-FIX-NOSCHED-NEXT: .LBB83_3:
; CHECK-FIX-NOSCHED-NEXT: ldrh r3, [r1, #10]
-; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #6]
-; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #2]
+; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #6]
+; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #2]
; CHECK-FIX-NOSCHED-NEXT: ldrh r7, [r1, #14]
; CHECK-FIX-NOSCHED-NEXT: vmov s8, r3
; CHECK-FIX-NOSCHED-NEXT: ldrh r2, [r1, #12]
-; CHECK-FIX-NOSCHED-NEXT: vmov s12, r4
-; CHECK-FIX-NOSCHED-NEXT: ldrh r5, [r1, #8]
-; CHECK-FIX-NOSCHED-NEXT: vmov s1, lr
-; CHECK-FIX-NOSCHED-NEXT: ldrh r12, [r1, #4]
+; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT: ldrh r4, [r1, #8]
+; CHECK-FIX-NOSCHED-NEXT: vmov s1, r5
+; CHECK-FIX-NOSCHED-NEXT: ldrh lr, [r1, #4]
; CHECK-FIX-NOSCHED-NEXT: vmov s0, r7
; CHECK-FIX-NOSCHED-NEXT: ldrh r6, [r1]
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s2, s0
; CHECK-FIX-NOSCHED-NEXT: vmov s0, r2
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s10, s8
-; CHECK-FIX-NOSCHED-NEXT: vmov s8, r5
+; CHECK-FIX-NOSCHED-NEXT: vmov s8, r4
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s14, s12
-; CHECK-FIX-NOSCHED-NEXT: vmov s12, r12
+; CHECK-FIX-NOSCHED-NEXT: vmov s12, lr
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s3, s1
; CHECK-FIX-NOSCHED-NEXT: vmov s1, r6
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s0, s0
@@ -4107,47 +4245,48 @@
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s12, s12
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s1, s1
; CHECK-FIX-NOSCHED-NEXT: cmp r0, #0
-; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_4
-; CHECK-FIX-NOSCHED-NEXT: .LBB83_3:
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s9, s9
-; CHECK-FIX-NOSCHED-NEXT: vmov r0, s9
-; CHECK-FIX-NOSCHED-NEXT: vmov.16 d2[0], r0
+; CHECK-FIX-NOSCHED-NEXT: beq .LBB83_2
; CHECK-FIX-NOSCHED-NEXT: .LBB83_4:
-; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1
; CHECK-FIX-NOSCHED-NEXT: vmov r3, r7, d3
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-FIX-NOSCHED-NEXT: vmov r0, r2, d2
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s4, s9
+; CHECK-FIX-NOSCHED-NEXT: vmov r6, s4
+; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7
+; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
+; CHECK-FIX-NOSCHED-NEXT: vmov.16 d16[0], r6
; CHECK-FIX-NOSCHED-NEXT: vmov s5, r2
-; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s9, s5
-; CHECK-FIX-NOSCHED-NEXT: vmov s5, r0
+; CHECK-FIX-NOSCHED-NEXT: vmov.32 r6, d16[0]
+; CHECK-FIX-NOSCHED-NEXT: vmov s5, r6
+; CHECK-FIX-NOSCHED-NEXT: .LBB83_5:
; CHECK-FIX-NOSCHED-NEXT: lsr r0, r0, #16
-; CHECK-FIX-NOSCHED-NEXT: vmov s13, r2
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s1
; CHECK-FIX-NOSCHED-NEXT: vmov s15, r0
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5
+; CHECK-FIX-NOSCHED-NEXT: lsr r2, r2, #16
; CHECK-FIX-NOSCHED-NEXT: vmov r0, s1
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s3
-; CHECK-FIX-NOSCHED-NEXT: vmov r2, s1
+; CHECK-FIX-NOSCHED-NEXT: vmov s13, r2
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s15, s15
-; CHECK-FIX-NOSCHED-NEXT: vmov s6, r3
+; CHECK-FIX-NOSCHED-NEXT: vmov r2, s1
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s11, s5
; CHECK-FIX-NOSCHED-NEXT: lsr r3, r3, #16
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s15
; CHECK-FIX-NOSCHED-NEXT: vmov s7, r3
-; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s11, s11
+; CHECK-FIX-NOSCHED-NEXT: vmov r3, s1
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s12, s12
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s13, s13
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s1, s9
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s8, s8
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s7, s7
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s6, s6
-; CHECK-FIX-NOSCHED-NEXT: vmov s4, r7
; CHECK-FIX-NOSCHED-NEXT: lsr r7, r7, #16
-; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s4, s4
; CHECK-FIX-NOSCHED-NEXT: vmov s5, r7
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s4, s4
; CHECK-FIX-NOSCHED-NEXT: vcvtb.f32.f16 s5, s5
+; CHECK-FIX-NOSCHED-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-FIX-NOSCHED-NEXT: pkhbt r0, r0, r2, lsl #16
; CHECK-FIX-NOSCHED-NEXT: vmov r2, s11
; CHECK-FIX-NOSCHED-NEXT: vmov.32 d16[0], r0
@@ -4192,8 +4331,8 @@
; CHECK-CORTEX-FIX: @ %bb.0:
; CHECK-CORTEX-FIX-NEXT: .save {r4, r5, r6, r7, r11, lr}
; CHECK-CORTEX-FIX-NEXT: push {r4, r5, r6, r7, r11, lr}
-; CHECK-CORTEX-FIX-NEXT: .vsave {d8}
-; CHECK-CORTEX-FIX-NEXT: vpush {d8}
+; CHECK-CORTEX-FIX-NEXT: .vsave {d8, d9, d10}
+; CHECK-CORTEX-FIX-NEXT: vpush {d8, d9, d10}
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s0
; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
; CHECK-CORTEX-FIX-NEXT: beq .LBB83_2
@@ -4201,120 +4340,143 @@
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s9
; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-CORTEX-FIX-NEXT: vmov r2, s0
+; CHECK-CORTEX-FIX-NEXT: vmov r12, r3, d16
; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r2
; CHECK-CORTEX-FIX-NEXT: vmov r4, r5, d17
+; CHECK-CORTEX-FIX-NEXT: lsr r2, r3, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s8, r3
+; CHECK-CORTEX-FIX-NEXT: lsr r12, r12, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s1, r2
+; CHECK-CORTEX-FIX-NEXT: vmov s11, r12
; CHECK-CORTEX-FIX-NEXT: lsr r6, r4, #16
; CHECK-CORTEX-FIX-NEXT: lsr r7, r5, #16
; CHECK-CORTEX-FIX-NEXT: vmov s0, r5
; CHECK-CORTEX-FIX-NEXT: vmov s2, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s14, r7
-; CHECK-CORTEX-FIX-NEXT: vmov s3, r6
-; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d16
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s0
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s8
+; CHECK-CORTEX-FIX-NEXT: vmov s12, r7
+; CHECK-CORTEX-FIX-NEXT: vmov s14, r6
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s1
+; CHECK-CORTEX-FIX-NEXT: vmov.32 lr, d16[0]
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s0
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s3
-; CHECK-CORTEX-FIX-NEXT: lsr r12, r2, #16
-; CHECK-CORTEX-FIX-NEXT: lsr lr, r3, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s8, r3
-; CHECK-CORTEX-FIX-NEXT: vmov s10, r2
-; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
-; CHECK-CORTEX-FIX-NEXT: vmov s13, r12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s11
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13
-; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: bne .LBB83_3
-; CHECK-CORTEX-FIX-NEXT: b .LBB83_4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s14
+; CHECK-CORTEX-FIX-NEXT: vmov s13, lr
+; CHECK-CORTEX-FIX-NEXT: b .LBB83_3
; CHECK-CORTEX-FIX-NEXT: .LBB83_2:
-; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r1]
-; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r1, #2]
; CHECK-CORTEX-FIX-NEXT: ldrh r7, [r1, #4]
; CHECK-CORTEX-FIX-NEXT: ldrh r6, [r1, #6]
; CHECK-CORTEX-FIX-NEXT: ldrh r5, [r1, #8]
; CHECK-CORTEX-FIX-NEXT: ldrh r4, [r1, #10]
; CHECK-CORTEX-FIX-NEXT: ldrh r2, [r1, #12]
; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1, #14]
+; CHECK-CORTEX-FIX-NEXT: ldrh r12, [r1]
+; CHECK-CORTEX-FIX-NEXT: ldrh lr, [r1, #2]
+; CHECK-CORTEX-FIX-NEXT: vmov s1, r6
+; CHECK-CORTEX-FIX-NEXT: vmov s3, r7
; CHECK-CORTEX-FIX-NEXT: vmov s0, r3
; CHECK-CORTEX-FIX-NEXT: vmov s2, r2
-; CHECK-CORTEX-FIX-NEXT: vmov s1, r6
; CHECK-CORTEX-FIX-NEXT: vmov s8, r4
-; CHECK-CORTEX-FIX-NEXT: vmov s10, r5
-; CHECK-CORTEX-FIX-NEXT: vmov s11, r7
-; CHECK-CORTEX-FIX-NEXT: vmov s13, lr
-; CHECK-CORTEX-FIX-NEXT: vmov s15, r12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s0
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s2
+; CHECK-CORTEX-FIX-NEXT: vmov s14, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s11, lr
+; CHECK-CORTEX-FIX-NEXT: vmov s13, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s12, s0
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s2
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s2, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s11
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s10, s13
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s15
-; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
-; CHECK-CORTEX-FIX-NEXT: beq .LBB83_4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s0, s14
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s1, s1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s3, s3
; CHECK-CORTEX-FIX-NEXT: .LBB83_3:
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s8, s11
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s14, s13
+; CHECK-CORTEX-FIX-NEXT: cmp r0, #0
+; CHECK-CORTEX-FIX-NEXT: beq .LBB83_5
+; CHECK-CORTEX-FIX-NEXT: @ %bb.4:
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9
+; CHECK-CORTEX-FIX-NEXT: vmov r2, r3, d2
+; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3
; CHECK-CORTEX-FIX-NEXT: vmov r0, s9
-; CHECK-CORTEX-FIX-NEXT: vmov.16 d2[0], r0
-; CHECK-CORTEX-FIX-NEXT: .LBB83_4:
+; CHECK-CORTEX-FIX-NEXT: lsr r2, r2, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r7, r3, #16
+; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
+; CHECK-CORTEX-FIX-NEXT: vmov s6, r6
+; CHECK-CORTEX-FIX-NEXT: vmov s4, r5
+; CHECK-CORTEX-FIX-NEXT: vmov s7, r3
+; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
+; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
+; CHECK-CORTEX-FIX-NEXT: vmov s18, r2
+; CHECK-CORTEX-FIX-NEXT: vmov.16 d16[0], r0
+; CHECK-CORTEX-FIX-NEXT: lsr r0, r5, #16
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s18
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s9
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
+; CHECK-CORTEX-FIX-NEXT: vmov.32 r12, d16[0]
+; CHECK-CORTEX-FIX-NEXT: vmov s20, r12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s20
+; CHECK-CORTEX-FIX-NEXT: b .LBB83_6
+; CHECK-CORTEX-FIX-NEXT: .LBB83_5:
; CHECK-CORTEX-FIX-NEXT: vmov r6, r5, d3
; CHECK-CORTEX-FIX-NEXT: vmov r0, r2, d2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s1
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
-; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
; CHECK-CORTEX-FIX-NEXT: vmov s4, r5
; CHECK-CORTEX-FIX-NEXT: lsr r7, r2, #16
; CHECK-CORTEX-FIX-NEXT: lsr r4, r6, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s5, r2
+; CHECK-CORTEX-FIX-NEXT: lsr r3, r5, #16
; CHECK-CORTEX-FIX-NEXT: lsr r12, r0, #16
-; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
-; CHECK-CORTEX-FIX-NEXT: vmov r0, s12
-; CHECK-CORTEX-FIX-NEXT: vmov r2, s14
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s3
; CHECK-CORTEX-FIX-NEXT: vmov s6, r6
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s2
+; CHECK-CORTEX-FIX-NEXT: vmov s7, r2
+; CHECK-CORTEX-FIX-NEXT: vmov s9, r0
; CHECK-CORTEX-FIX-NEXT: vmov s13, r3
; CHECK-CORTEX-FIX-NEXT: vmov s15, r4
; CHECK-CORTEX-FIX-NEXT: vmov s16, r7
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s10
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s4
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s4
; CHECK-CORTEX-FIX-NEXT: vmov s4, r12
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s5, s9
-; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r2, s1
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s11, s7
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s7, s9
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s6, s6
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s9, s15
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s13, s13
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s15, s16
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s7
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s11
-; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16
; CHECK-CORTEX-FIX-NEXT: vcvtb.f32.f16 s4, s4
+; CHECK-CORTEX-FIX-NEXT: .LBB83_6:
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s10
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s2, s2
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s0
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s8, s8
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s14, s14
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s6, s6
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s5
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s3, s13
-; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s9, s9
-; CHECK-CORTEX-FIX-NEXT: vmov r3, s14
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s3
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s7, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r0, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r2, s12
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s10, s1
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s12, s5
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s1, s13
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s5, s11
+; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s11, s15
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s2
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s8
+; CHECK-CORTEX-FIX-NEXT: pkhbt r12, r0, r2, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r2, s3
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s10
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s1
; CHECK-CORTEX-FIX-NEXT: vmov r6, s11
; CHECK-CORTEX-FIX-NEXT: vmov r0, s9
+; CHECK-CORTEX-FIX-NEXT: pkhbt lr, r2, r3, lsl #16
+; CHECK-CORTEX-FIX-NEXT: vmov r3, s12
; CHECK-CORTEX-FIX-NEXT: pkhbt r3, r3, r7, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r7, s7
+; CHECK-CORTEX-FIX-NEXT: vmov r7, s5
; CHECK-CORTEX-FIX-NEXT: pkhbt r7, r7, r6, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r6, s0
; CHECK-CORTEX-FIX-NEXT: vcvtb.f16.f32 s0, s4
; CHECK-CORTEX-FIX-NEXT: pkhbt r6, r6, r5, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r5, s8
+; CHECK-CORTEX-FIX-NEXT: vmov r5, s14
; CHECK-CORTEX-FIX-NEXT: vmov r2, s0
; CHECK-CORTEX-FIX-NEXT: pkhbt r5, r5, r4, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov r4, s6
@@ -4323,7 +4485,7 @@
; CHECK-CORTEX-FIX-NEXT: vmov.32 d18[1], lr
; CHECK-CORTEX-FIX-NEXT: vmov.32 d19[1], r12
; CHECK-CORTEX-FIX-NEXT: pkhbt r0, r4, r0, lsl #16
-; CHECK-CORTEX-FIX-NEXT: vmov r4, s5
+; CHECK-CORTEX-FIX-NEXT: vmov r4, s7
; CHECK-CORTEX-FIX-NEXT: pkhbt r2, r4, r2, lsl #16
; CHECK-CORTEX-FIX-NEXT: vmov.32 d16[0], r2
; CHECK-CORTEX-FIX-NEXT: vmov.32 d17[0], r0
@@ -4332,7 +4494,7 @@
; CHECK-CORTEX-FIX-NEXT: aesd.8 q9, q8
; CHECK-CORTEX-FIX-NEXT: aesimc.8 q8, q9
; CHECK-CORTEX-FIX-NEXT: vst1.64 {d16, d17}, [r1]
-; CHECK-CORTEX-FIX-NEXT: vpop {d8}
+; CHECK-CORTEX-FIX-NEXT: vpop {d8, d9, d10}
; CHECK-CORTEX-FIX-NEXT: pop {r4, r5, r6, r7, r11, pc}
br i1 %0, label %5, label %11
diff --git a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
--- a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
+++ b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
@@ -99,7 +99,7 @@
; CHECKSOFT-LABEL: insert_v8f16:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov.f16 s0, r0
-; CHECKSOFT-NEXT: vmov r2, r3, d1
+; CHECKSOFT-NEXT: vmov r2, r3, d16
; CHECKSOFT-NEXT: vmov r0, r1, d0
; CHECKSOFT-NEXT: bx lr
entry:
@@ -134,11 +134,9 @@
;
; CHECKSOFT-LABEL: test_vset_laneq_f16_1:
; CHECKSOFT: @ %bb.0: @ %entry
-; CHECKSOFT-NEXT: vmov d1, r2, r3
; CHECKSOFT-NEXT: vldr s4, [sp]
; CHECKSOFT-NEXT: vmov d0, r0, r1
; CHECKSOFT-NEXT: vcvtt.f16.f32 s0, s4
-; CHECKSOFT-NEXT: vmov r2, r3, d1
; CHECKSOFT-NEXT: vmov r0, r1, d0
; CHECKSOFT-NEXT: bx lr
entry:
@@ -159,7 +157,6 @@
; CHECKSOFT-NEXT: vldr s4, [sp]
; CHECKSOFT-NEXT: vmov d0, r0, r1
; CHECKSOFT-NEXT: vcvtt.f16.f32 s3, s4
-; CHECKSOFT-NEXT: vmov r0, r1, d0
; CHECKSOFT-NEXT: vmov r2, r3, d1
; CHECKSOFT-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
--- a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll
@@ -37,14 +37,18 @@
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: cmp r4, #0
-; CHECK-NEXT: vmov.32 d10[1], r5
-; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d17, r0
+; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d16, r4
; CHECK-NEXT: mvn r4, #0
-; CHECK-NEXT: vbsl q8, q5, q4
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r3, r5, d17
+; CHECK-NEXT: vmov.32 d10[1], r5
+; CHECK-NEXT: vorr q9, q8, q8
+; CHECK-NEXT: vbsl q9, q5, q4
+; CHECK-NEXT: vand q8, q5, q8
+; CHECK-NEXT: vmov.32 r0, d18[0]
+; CHECK-NEXT: vmov.32 r1, d16[1]
+; CHECK-NEXT: vmov.32 r3, d19[0]
+; CHECK-NEXT: vmov.32 r5, d17[1]
; CHECK-NEXT: rsbs r0, r0, #-2147483648
; CHECK-NEXT: sbcs r0, r4, r1
; CHECK-NEXT: mov r0, #0
@@ -55,10 +59,10 @@
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: vdup.32 d19, r2
+; CHECK-NEXT: vdup.32 d17, r2
; CHECK-NEXT: mvnne r0, #0
-; CHECK-NEXT: vdup.32 d18, r0
-; CHECK-NEXT: vbif q8, q10, q9
+; CHECK-NEXT: vdup.32 d16, r0
+; CHECK-NEXT: vbsl q8, q9, q10
; CHECK-NEXT: vmovn.i64 d0, q8
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: pop {r4, r5, r11, pc}
@@ -158,9 +162,12 @@
; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d17, r0
; CHECK-NEXT: vdup.32 d16, r4
-; CHECK-NEXT: vbsl q8, q4, q9
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r3, r5, d17
+; CHECK-NEXT: vbit q9, q4, q8
+; CHECK-NEXT: vand q8, q4, q8
+; CHECK-NEXT: vmov.32 r0, d18[0]
+; CHECK-NEXT: vmov.32 r1, d16[1]
+; CHECK-NEXT: vmov.32 r3, d19[0]
+; CHECK-NEXT: vmov.32 r5, d17[1]
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: rscs r0, r1, #0
; CHECK-NEXT: mov r0, #0
@@ -171,10 +178,10 @@
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: vmov.32 d19[0], r2
+; CHECK-NEXT: vmov.32 d17[0], r2
; CHECK-NEXT: mvnne r0, #0
-; CHECK-NEXT: vmov.32 d18[0], r0
-; CHECK-NEXT: vand q8, q8, q9
+; CHECK-NEXT: vmov.32 d16[0], r0
+; CHECK-NEXT: vand q8, q9, q8
; CHECK-NEXT: vmovn.i64 d0, q8
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop {r4, r5, r11, pc}
@@ -191,112 +198,116 @@
define <4 x i32> @stest_f32i32(<4 x float> %x) {
; CHECK-LABEL: stest_f32i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, sp, #4
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vorr q4, q0, q0
-; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: mov r7, r0
; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: mov r8, r1
-; CHECK-NEXT: vmov r6, s17
-; CHECK-NEXT: vmov r10, s19
-; CHECK-NEXT: vmov.32 d8[0], r7
-; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: mov r5, r0
-; CHECK-NEXT: vmov.32 d10[0], r0
-; CHECK-NEXT: mov r0, r6
-; CHECK-NEXT: mov r4, r1
; CHECK-NEXT: bl __aeabi_f2lz
; CHECK-NEXT: mov r9, r0
+; CHECK-NEXT: vmov r0, s19
+; CHECK-NEXT: mov r8, r1
+; CHECK-NEXT: vmov r4, s17
+; CHECK-NEXT: vmov r5, s16
+; CHECK-NEXT: vmov.32 d8[0], r9
+; CHECK-NEXT: bl __aeabi_f2lz
+; CHECK-NEXT: mov r10, r0
; CHECK-NEXT: vmov.32 d9[0], r0
-; CHECK-NEXT: mov r0, r10
-; CHECK-NEXT: mov r11, r1
+; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: mov r7, r1
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: mvn r6, #-2147483648
-; CHECK-NEXT: subs r3, r7, r6
-; CHECK-NEXT: sbcs r3, r8, #0
+; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: vmov.32 d11[0], r0
-; CHECK-NEXT: mov r3, #0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r6, r1
+; CHECK-NEXT: bl __aeabi_f2lz
+; CHECK-NEXT: vmov.32 d10[0], r0
+; CHECK-NEXT: mvn r3, #-2147483648
+; CHECK-NEXT: subs r0, r0, r3
; CHECK-NEXT: adr r2, .LCPI3_0
-; CHECK-NEXT: movwlt r3, #1
-; CHECK-NEXT: subs r7, r5, r6
-; CHECK-NEXT: sbcs r7, r4, #0
-; CHECK-NEXT: vmov.32 d11[1], r1
-; CHECK-NEXT: mov r7, #0
-; CHECK-NEXT: movwlt r7, #1
-; CHECK-NEXT: cmp r7, #0
-; CHECK-NEXT: mvnne r7, #0
-; CHECK-NEXT: subs r0, r0, r6
; CHECK-NEXT: sbcs r0, r1, #0
-; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128]
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: movwlt r0, #1
-; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: mvnne r0, #0
-; CHECK-NEXT: vmov.32 d10[1], r4
-; CHECK-NEXT: vdup.32 d17, r0
-; CHECK-NEXT: subs r0, r9, r6
-; CHECK-NEXT: sbcs r0, r11, #0
-; CHECK-NEXT: vdup.32 d16, r7
+; CHECK-NEXT: vmov.32 d11[1], r6
; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: vbsl q8, q5, q9
; CHECK-NEXT: movwlt r0, #1
+; CHECK-NEXT: subs r5, r4, r3
+; CHECK-NEXT: vmov.32 d10[1], r1
+; CHECK-NEXT: sbcs r1, r6, #0
+; CHECK-NEXT: mov r1, #0
+; CHECK-NEXT: movwlt r1, #1
+; CHECK-NEXT: subs r6, r9, r3
+; CHECK-NEXT: sbcs r6, r8, #0
+; CHECK-NEXT: vmov.32 d9[1], r7
+; CHECK-NEXT: mov r6, #0
+; CHECK-NEXT: movwlt r6, #1
+; CHECK-NEXT: cmp r6, #0
+; CHECK-NEXT: mvnne r6, #0
+; CHECK-NEXT: subs r3, r10, r3
+; CHECK-NEXT: sbcs r3, r7, #0
+; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128]
+; CHECK-NEXT: mov r3, #0
+; CHECK-NEXT: mov r2, #0
+; CHECK-NEXT: movwlt r3, #1
; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: mvn r6, #0
-; CHECK-NEXT: vdup.32 d21, r0
; CHECK-NEXT: mvnne r3, #0
; CHECK-NEXT: vmov.32 d8[1], r8
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vdup.32 d20, r3
-; CHECK-NEXT: vbit q9, q4, q10
-; CHECK-NEXT: adr r5, .LCPI3_1
-; CHECK-NEXT: vld1.64 {d20, d21}, [r5:128]
-; CHECK-NEXT: vmov r5, r4, d17
-; CHECK-NEXT: vmov r3, r7, d18
-; CHECK-NEXT: rsbs r0, r0, #-2147483648
-; CHECK-NEXT: sbcs r0, r6, r1
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: vdup.32 d21, r3
+; CHECK-NEXT: mvnne r1, #0
+; CHECK-NEXT: vdup.32 d20, r6
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: vorr q8, q10, q10
+; CHECK-NEXT: vdup.32 d23, r1
+; CHECK-NEXT: vand q12, q4, q10
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: vbsl q8, q4, q9
+; CHECK-NEXT: vdup.32 d22, r0
+; CHECK-NEXT: vbit q9, q5, q11
+; CHECK-NEXT: adr r7, .LCPI3_1
+; CHECK-NEXT: vmov.32 r0, d24[1]
+; CHECK-NEXT: vand q11, q5, q11
+; CHECK-NEXT: vmov.32 r1, d16[0]
+; CHECK-NEXT: mvn r6, #0
+; CHECK-NEXT: vmov.32 r3, d18[0]
+; CHECK-NEXT: vmov.32 r4, d22[1]
+; CHECK-NEXT: vmov.32 r5, d17[0]
+; CHECK-NEXT: vld1.64 {d20, d21}, [r7:128]
+; CHECK-NEXT: vmov.32 r7, d25[1]
+; CHECK-NEXT: rsbs r1, r1, #-2147483648
+; CHECK-NEXT: sbcs r0, r6, r0
+; CHECK-NEXT: vmov.32 r1, d19[0]
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movwlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
-; CHECK-NEXT: rsbs r1, r3, #-2147483648
-; CHECK-NEXT: vmov r1, r3, d19
+; CHECK-NEXT: rsbs r3, r3, #-2147483648
+; CHECK-NEXT: vmov.32 r3, d23[1]
+; CHECK-NEXT: sbcs r4, r6, r4
+; CHECK-NEXT: mov r4, #0
+; CHECK-NEXT: movwlt r4, #1
+; CHECK-NEXT: rsbs r5, r5, #-2147483648
; CHECK-NEXT: sbcs r7, r6, r7
; CHECK-NEXT: mov r7, #0
; CHECK-NEXT: movwlt r7, #1
-; CHECK-NEXT: rsbs r5, r5, #-2147483648
-; CHECK-NEXT: sbcs r5, r6, r4
-; CHECK-NEXT: mov r5, #0
-; CHECK-NEXT: movwlt r5, #1
; CHECK-NEXT: rsbs r1, r1, #-2147483648
; CHECK-NEXT: sbcs r1, r6, r3
; CHECK-NEXT: movwlt r2, #1
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: cmp r5, #0
-; CHECK-NEXT: mvnne r5, #0
; CHECK-NEXT: cmp r7, #0
-; CHECK-NEXT: vdup.32 d25, r5
; CHECK-NEXT: mvnne r7, #0
+; CHECK-NEXT: cmp r4, #0
+; CHECK-NEXT: vdup.32 d25, r7
+; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: vdup.32 d23, r2
; CHECK-NEXT: vdup.32 d24, r0
; CHECK-NEXT: vbif q8, q10, q12
-; CHECK-NEXT: vdup.32 d22, r7
+; CHECK-NEXT: vdup.32 d22, r4
; CHECK-NEXT: vbif q9, q10, q11
; CHECK-NEXT: vmovn.i64 d1, q8
; CHECK-NEXT: vmovn.i64 d0, q9
; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: add sp, sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI3_0:
@@ -395,75 +406,82 @@
define <4 x i32> @ustest_f32i32(<4 x float> %x) {
; CHECK-LABEL: ustest_f32i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vorr q4, q0, q0
; CHECK-NEXT: vmov r0, s17
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: mov r5, r0
+; CHECK-NEXT: mov r6, r0
; CHECK-NEXT: vmov r0, s16
-; CHECK-NEXT: mov r6, r1
-; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: mov r2, r0
-; CHECK-NEXT: vmov r0, s18
-; CHECK-NEXT: vmov.32 d16[0], r2
-; CHECK-NEXT: mvn r4, #0
-; CHECK-NEXT: subs r2, r2, r4
+; CHECK-NEXT: mov r7, r1
+; CHECK-NEXT: vmov r5, s18
; CHECK-NEXT: vmov r8, s19
-; CHECK-NEXT: sbcs r2, r1, #0
-; CHECK-NEXT: vmov.32 d17[0], r5
-; CHECK-NEXT: mov r2, #0
-; CHECK-NEXT: vmov.i64 q5, #0xffffffff
-; CHECK-NEXT: movwlt r2, #1
-; CHECK-NEXT: subs r3, r5, r4
-; CHECK-NEXT: sbcs r3, r6, #0
-; CHECK-NEXT: vmov.32 d17[1], r6
-; CHECK-NEXT: mov r3, #0
-; CHECK-NEXT: mov r7, #0
-; CHECK-NEXT: movwlt r3, #1
-; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: mvnne r3, #0
-; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: vdup.32 d19, r3
-; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d18, r2
-; CHECK-NEXT: vmov.32 d16[1], r1
-; CHECK-NEXT: vorr q4, q9, q9
-; CHECK-NEXT: vbsl q4, q8, q5
-; CHECK-NEXT: vmov r10, r9, d8
+; CHECK-NEXT: vmov.32 d11[0], r6
+; CHECK-NEXT: bl __aeabi_f2lz
+; CHECK-NEXT: vmov.32 d10[0], r0
+; CHECK-NEXT: mvn r9, #0
+; CHECK-NEXT: subs r0, r0, r9
+; CHECK-NEXT: mov r4, #0
+; CHECK-NEXT: sbcs r0, r1, #0
+; CHECK-NEXT: vmov.32 d11[1], r7
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movwlt r0, #1
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: vmov.32 d10[1], r1
+; CHECK-NEXT: mvnne r0, #0
+; CHECK-NEXT: subs r1, r6, r9
+; CHECK-NEXT: sbcs r1, r7, #0
+; CHECK-NEXT: mov r1, #0
+; CHECK-NEXT: movwlt r1, #1
+; CHECK-NEXT: cmp r1, #0
+; CHECK-NEXT: mvnne r1, #0
+; CHECK-NEXT: vdup.32 d13, r1
+; CHECK-NEXT: vdup.32 d12, r0
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: bl __aeabi_f2lz
+; CHECK-NEXT: vorr q4, q6, q6
; CHECK-NEXT: mov r5, r0
-; CHECK-NEXT: vmov.32 d12[0], r0
+; CHECK-NEXT: vmov.i64 q8, #0xffffffff
+; CHECK-NEXT: vmov.32 d14[0], r0
+; CHECK-NEXT: vbsl q4, q5, q8
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: mov r6, r1
+; CHECK-NEXT: vmov.32 r7, d8[0]
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: subs r2, r5, r4
-; CHECK-NEXT: vmov.32 d13[0], r0
+; CHECK-NEXT: subs r2, r5, r9
+; CHECK-NEXT: vmov.32 d15[0], r0
; CHECK-NEXT: sbcs r2, r6, #0
+; CHECK-NEXT: vand q10, q5, q6
; CHECK-NEXT: mov r2, #0
+; CHECK-NEXT: vmov.i64 q8, #0xffffffff
; CHECK-NEXT: movwlt r2, #1
-; CHECK-NEXT: subs r0, r0, r4
+; CHECK-NEXT: subs r0, r0, r9
; CHECK-NEXT: sbcs r0, r1, #0
-; CHECK-NEXT: vmov.32 d13[1], r1
+; CHECK-NEXT: vmov.32 d15[1], r1
; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: vmov r5, r4, d9
; CHECK-NEXT: movwlt r0, #1
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mvnne r0, #0
; CHECK-NEXT: cmp r2, #0
-; CHECK-NEXT: vmov.32 d12[1], r6
+; CHECK-NEXT: vdup.32 d19, r0
; CHECK-NEXT: mvnne r2, #0
-; CHECK-NEXT: vdup.32 d17, r0
-; CHECK-NEXT: rsbs r0, r10, #0
-; CHECK-NEXT: vdup.32 d16, r2
-; CHECK-NEXT: rscs r0, r9, #0
-; CHECK-NEXT: vbsl q8, q6, q5
+; CHECK-NEXT: vmov.32 d14[1], r6
+; CHECK-NEXT: rsbs r7, r7, #0
+; CHECK-NEXT: vdup.32 d18, r2
+; CHECK-NEXT: vbit q8, q7, q9
+; CHECK-NEXT: vmov.32 r0, d20[1]
+; CHECK-NEXT: vand q9, q7, q9
+; CHECK-NEXT: vmov.32 r5, d9[0]
+; CHECK-NEXT: vmov.32 r7, d21[1]
+; CHECK-NEXT: vmov.32 r1, d16[0]
+; CHECK-NEXT: vmov.32 r2, d18[1]
+; CHECK-NEXT: vmov.32 r3, d17[0]
+; CHECK-NEXT: vmov.32 r6, d19[1]
+; CHECK-NEXT: rscs r0, r0, #0
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: movwlt r0, #1
-; CHECK-NEXT: vmov r1, r2, d16
-; CHECK-NEXT: vmov r3, r6, d17
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: rscs r1, r2, #0
; CHECK-NEXT: mov r1, #0
@@ -473,10 +491,10 @@
; CHECK-NEXT: mov r2, #0
; CHECK-NEXT: movwlt r2, #1
; CHECK-NEXT: rsbs r3, r5, #0
-; CHECK-NEXT: rscs r3, r4, #0
-; CHECK-NEXT: movwlt r7, #1
-; CHECK-NEXT: cmp r7, #0
-; CHECK-NEXT: mvnne r7, #0
+; CHECK-NEXT: rscs r3, r7, #0
+; CHECK-NEXT: movwlt r4, #1
+; CHECK-NEXT: cmp r4, #0
+; CHECK-NEXT: mvnne r4, #0
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: mvnne r2, #0
; CHECK-NEXT: cmp r1, #0
@@ -485,14 +503,14 @@
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: vmov.32 d20[0], r1
; CHECK-NEXT: mvnne r0, #0
-; CHECK-NEXT: vmov.32 d19[0], r7
+; CHECK-NEXT: vmov.32 d19[0], r4
; CHECK-NEXT: vand q8, q8, q10
; CHECK-NEXT: vmov.32 d18[0], r0
; CHECK-NEXT: vmovn.i64 d1, q8
; CHECK-NEXT: vand q9, q4, q9
; CHECK-NEXT: vmovn.i64 d0, q9
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc}
entry:
%conv = fptosi <4 x float> %x to <4 x i64>
%0 = icmp slt <4 x i64> %conv,
@@ -506,118 +524,122 @@
define <4 x i32> @stest_f16i32(<4 x half> %x) {
; CHECK-NEON-LABEL: stest_f16i32:
; CHECK-NEON: @ %bb.0: @ %entry
-; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEON-NEXT: .pad #4
-; CHECK-NEON-NEXT: sub sp, sp, #4
+; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEON-NEXT: vmov r0, s0
+; CHECK-NEON-NEXT: vmov r0, s2
; CHECK-NEON-NEXT: vmov.f32 s16, s3
-; CHECK-NEON-NEXT: vmov.f32 s18, s2
-; CHECK-NEON-NEXT: vmov.f32 s20, s1
+; CHECK-NEON-NEXT: vmov.f32 s18, s1
+; CHECK-NEON-NEXT: vmov.f32 s20, s0
; CHECK-NEON-NEXT: bl __aeabi_h2f
; CHECK-NEON-NEXT: bl __aeabi_f2lz
; CHECK-NEON-NEXT: mov r9, r0
-; CHECK-NEON-NEXT: vmov r0, s18
-; CHECK-NEON-NEXT: vmov r10, s16
+; CHECK-NEON-NEXT: vmov r0, s16
+; CHECK-NEON-NEXT: vmov r4, s18
; CHECK-NEON-NEXT: mov r8, r1
-; CHECK-NEON-NEXT: vmov r6, s20
+; CHECK-NEON-NEXT: vmov r5, s20
; CHECK-NEON-NEXT: vmov.32 d8[0], r9
; CHECK-NEON-NEXT: bl __aeabi_h2f
; CHECK-NEON-NEXT: bl __aeabi_f2lz
-; CHECK-NEON-NEXT: mov r5, r0
-; CHECK-NEON-NEXT: vmov.32 d10[0], r0
-; CHECK-NEON-NEXT: mov r0, r6
-; CHECK-NEON-NEXT: mov r4, r1
-; CHECK-NEON-NEXT: bl __aeabi_h2f
-; CHECK-NEON-NEXT: bl __aeabi_f2lz
-; CHECK-NEON-NEXT: mov r11, r0
+; CHECK-NEON-NEXT: mov r10, r0
; CHECK-NEON-NEXT: vmov.32 d9[0], r0
-; CHECK-NEON-NEXT: mov r0, r10
+; CHECK-NEON-NEXT: mov r0, r4
; CHECK-NEON-NEXT: mov r7, r1
; CHECK-NEON-NEXT: bl __aeabi_h2f
; CHECK-NEON-NEXT: bl __aeabi_f2lz
-; CHECK-NEON-NEXT: mvn r6, #-2147483648
-; CHECK-NEON-NEXT: subs r3, r9, r6
-; CHECK-NEON-NEXT: sbcs r3, r8, #0
+; CHECK-NEON-NEXT: mov r4, r0
; CHECK-NEON-NEXT: vmov.32 d11[0], r0
-; CHECK-NEON-NEXT: mov r3, #0
+; CHECK-NEON-NEXT: mov r0, r5
+; CHECK-NEON-NEXT: mov r6, r1
+; CHECK-NEON-NEXT: bl __aeabi_h2f
+; CHECK-NEON-NEXT: bl __aeabi_f2lz
+; CHECK-NEON-NEXT: vmov.32 d10[0], r0
+; CHECK-NEON-NEXT: mvn r3, #-2147483648
+; CHECK-NEON-NEXT: subs r0, r0, r3
; CHECK-NEON-NEXT: adr r2, .LCPI6_0
-; CHECK-NEON-NEXT: movwlt r3, #1
-; CHECK-NEON-NEXT: subs r5, r5, r6
-; CHECK-NEON-NEXT: sbcs r5, r4, #0
-; CHECK-NEON-NEXT: vmov.32 d11[1], r1
-; CHECK-NEON-NEXT: mov r5, #0
-; CHECK-NEON-NEXT: movwlt r5, #1
-; CHECK-NEON-NEXT: cmp r5, #0
-; CHECK-NEON-NEXT: mvnne r5, #0
-; CHECK-NEON-NEXT: subs r0, r0, r6
; CHECK-NEON-NEXT: sbcs r0, r1, #0
-; CHECK-NEON-NEXT: vld1.64 {d18, d19}, [r2:128]
-; CHECK-NEON-NEXT: mov r0, #0
-; CHECK-NEON-NEXT: mov r2, #0
-; CHECK-NEON-NEXT: movwlt r0, #1
-; CHECK-NEON-NEXT: cmp r0, #0
-; CHECK-NEON-NEXT: mvnne r0, #0
-; CHECK-NEON-NEXT: vmov.32 d10[1], r4
-; CHECK-NEON-NEXT: vdup.32 d17, r0
-; CHECK-NEON-NEXT: subs r0, r11, r6
-; CHECK-NEON-NEXT: sbcs r0, r7, #0
-; CHECK-NEON-NEXT: vdup.32 d16, r5
+; CHECK-NEON-NEXT: vmov.32 d11[1], r6
; CHECK-NEON-NEXT: mov r0, #0
-; CHECK-NEON-NEXT: vbsl q8, q5, q9
; CHECK-NEON-NEXT: movwlt r0, #1
-; CHECK-NEON-NEXT: cmp r0, #0
+; CHECK-NEON-NEXT: subs r5, r4, r3
+; CHECK-NEON-NEXT: vmov.32 d10[1], r1
+; CHECK-NEON-NEXT: sbcs r1, r6, #0
+; CHECK-NEON-NEXT: mov r1, #0
+; CHECK-NEON-NEXT: movwlt r1, #1
+; CHECK-NEON-NEXT: subs r6, r9, r3
+; CHECK-NEON-NEXT: sbcs r6, r8, #0
; CHECK-NEON-NEXT: vmov.32 d9[1], r7
-; CHECK-NEON-NEXT: mvnne r0, #0
+; CHECK-NEON-NEXT: mov r6, #0
+; CHECK-NEON-NEXT: movwlt r6, #1
+; CHECK-NEON-NEXT: cmp r6, #0
+; CHECK-NEON-NEXT: mvnne r6, #0
+; CHECK-NEON-NEXT: subs r3, r10, r3
+; CHECK-NEON-NEXT: sbcs r3, r7, #0
+; CHECK-NEON-NEXT: vld1.64 {d18, d19}, [r2:128]
+; CHECK-NEON-NEXT: mov r3, #0
+; CHECK-NEON-NEXT: mov r2, #0
+; CHECK-NEON-NEXT: movwlt r3, #1
; CHECK-NEON-NEXT: cmp r3, #0
-; CHECK-NEON-NEXT: mvn r6, #0
-; CHECK-NEON-NEXT: vdup.32 d21, r0
; CHECK-NEON-NEXT: mvnne r3, #0
; CHECK-NEON-NEXT: vmov.32 d8[1], r8
-; CHECK-NEON-NEXT: vmov r0, r1, d16
-; CHECK-NEON-NEXT: vdup.32 d20, r3
-; CHECK-NEON-NEXT: vbit q9, q4, q10
-; CHECK-NEON-NEXT: adr r5, .LCPI6_1
-; CHECK-NEON-NEXT: vld1.64 {d20, d21}, [r5:128]
-; CHECK-NEON-NEXT: vmov r5, r4, d17
-; CHECK-NEON-NEXT: vmov r3, r7, d18
-; CHECK-NEON-NEXT: rsbs r0, r0, #-2147483648
-; CHECK-NEON-NEXT: sbcs r0, r6, r1
+; CHECK-NEON-NEXT: cmp r1, #0
+; CHECK-NEON-NEXT: vdup.32 d21, r3
+; CHECK-NEON-NEXT: mvnne r1, #0
+; CHECK-NEON-NEXT: vdup.32 d20, r6
+; CHECK-NEON-NEXT: cmp r0, #0
+; CHECK-NEON-NEXT: vorr q8, q10, q10
+; CHECK-NEON-NEXT: vdup.32 d23, r1
+; CHECK-NEON-NEXT: vand q12, q4, q10
+; CHECK-NEON-NEXT: mvnne r0, #0
+; CHECK-NEON-NEXT: vbsl q8, q4, q9
+; CHECK-NEON-NEXT: vdup.32 d22, r0
+; CHECK-NEON-NEXT: vbit q9, q5, q11
+; CHECK-NEON-NEXT: adr r7, .LCPI6_1
+; CHECK-NEON-NEXT: vmov.32 r0, d24[1]
+; CHECK-NEON-NEXT: vand q11, q5, q11
+; CHECK-NEON-NEXT: vmov.32 r1, d16[0]
+; CHECK-NEON-NEXT: mvn r6, #0
+; CHECK-NEON-NEXT: vmov.32 r3, d18[0]
+; CHECK-NEON-NEXT: vmov.32 r4, d22[1]
+; CHECK-NEON-NEXT: vmov.32 r5, d17[0]
+; CHECK-NEON-NEXT: vld1.64 {d20, d21}, [r7:128]
+; CHECK-NEON-NEXT: vmov.32 r7, d25[1]
+; CHECK-NEON-NEXT: rsbs r1, r1, #-2147483648
+; CHECK-NEON-NEXT: sbcs r0, r6, r0
+; CHECK-NEON-NEXT: vmov.32 r1, d19[0]
; CHECK-NEON-NEXT: mov r0, #0
; CHECK-NEON-NEXT: movwlt r0, #1
; CHECK-NEON-NEXT: cmp r0, #0
; CHECK-NEON-NEXT: mvnne r0, #0
-; CHECK-NEON-NEXT: rsbs r1, r3, #-2147483648
-; CHECK-NEON-NEXT: vmov r1, r3, d19
+; CHECK-NEON-NEXT: rsbs r3, r3, #-2147483648
+; CHECK-NEON-NEXT: vmov.32 r3, d23[1]
+; CHECK-NEON-NEXT: sbcs r4, r6, r4
+; CHECK-NEON-NEXT: mov r4, #0
+; CHECK-NEON-NEXT: movwlt r4, #1
+; CHECK-NEON-NEXT: rsbs r5, r5, #-2147483648
; CHECK-NEON-NEXT: sbcs r7, r6, r7
; CHECK-NEON-NEXT: mov r7, #0
; CHECK-NEON-NEXT: movwlt r7, #1
-; CHECK-NEON-NEXT: rsbs r5, r5, #-2147483648
-; CHECK-NEON-NEXT: sbcs r5, r6, r4
-; CHECK-NEON-NEXT: mov r5, #0
-; CHECK-NEON-NEXT: movwlt r5, #1
; CHECK-NEON-NEXT: rsbs r1, r1, #-2147483648
; CHECK-NEON-NEXT: sbcs r1, r6, r3
; CHECK-NEON-NEXT: movwlt r2, #1
; CHECK-NEON-NEXT: cmp r2, #0
; CHECK-NEON-NEXT: mvnne r2, #0
-; CHECK-NEON-NEXT: cmp r5, #0
-; CHECK-NEON-NEXT: mvnne r5, #0
; CHECK-NEON-NEXT: cmp r7, #0
-; CHECK-NEON-NEXT: vdup.32 d25, r5
; CHECK-NEON-NEXT: mvnne r7, #0
+; CHECK-NEON-NEXT: cmp r4, #0
+; CHECK-NEON-NEXT: vdup.32 d25, r7
+; CHECK-NEON-NEXT: mvnne r4, #0
; CHECK-NEON-NEXT: vdup.32 d23, r2
; CHECK-NEON-NEXT: vdup.32 d24, r0
; CHECK-NEON-NEXT: vbif q8, q10, q12
-; CHECK-NEON-NEXT: vdup.32 d22, r7
+; CHECK-NEON-NEXT: vdup.32 d22, r4
; CHECK-NEON-NEXT: vbif q9, q10, q11
; CHECK-NEON-NEXT: vmovn.i64 d1, q8
; CHECK-NEON-NEXT: vmovn.i64 d0, q9
; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEON-NEXT: add sp, sp, #4
-; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc}
; CHECK-NEON-NEXT: .p2align 4
; CHECK-NEON-NEXT: @ %bb.1:
; CHECK-NEON-NEXT: .LCPI6_0:
@@ -633,114 +655,118 @@
;
; CHECK-FP16-LABEL: stest_f16i32:
; CHECK-FP16: @ %bb.0: @ %entry
-; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-FP16-NEXT: .vsave {d10, d11, d12, d13}
-; CHECK-FP16-NEXT: vpush {d10, d11, d12, d13}
-; CHECK-FP16-NEXT: .vsave {d8}
-; CHECK-FP16-NEXT: vpush {d8}
-; CHECK-FP16-NEXT: vmov.u16 r0, d0[0]
+; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-FP16-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-FP16-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-FP16-NEXT: vmov.u16 r0, d0[2]
; CHECK-FP16-NEXT: vorr d8, d0, d0
-; CHECK-FP16-NEXT: vmov.u16 r6, d0[1]
+; CHECK-FP16-NEXT: vmov.u16 r4, d0[1]
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: bl __fixhfdi
-; CHECK-FP16-NEXT: mov r4, r0
-; CHECK-FP16-NEXT: vmov.u16 r0, d8[2]
+; CHECK-FP16-NEXT: mov r9, r0
+; CHECK-FP16-NEXT: vmov.u16 r0, d8[3]
; CHECK-FP16-NEXT: mov r8, r1
-; CHECK-FP16-NEXT: vmov.32 d10[0], r4
+; CHECK-FP16-NEXT: vmov.32 d10[0], r9
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: bl __fixhfdi
-; CHECK-FP16-NEXT: vmov s0, r6
-; CHECK-FP16-NEXT: mov r5, r0
+; CHECK-FP16-NEXT: vmov s0, r4
+; CHECK-FP16-NEXT: mov r6, r0
; CHECK-FP16-NEXT: mov r7, r1
-; CHECK-FP16-NEXT: vmov.32 d12[0], r0
+; CHECK-FP16-NEXT: vmov.32 d11[0], r0
; CHECK-FP16-NEXT: bl __fixhfdi
-; CHECK-FP16-NEXT: mov r9, r0
-; CHECK-FP16-NEXT: vmov.u16 r0, d8[3]
-; CHECK-FP16-NEXT: mov r10, r1
-; CHECK-FP16-NEXT: vmov.32 d11[0], r9
+; CHECK-FP16-NEXT: mov r4, r0
+; CHECK-FP16-NEXT: vmov.u16 r0, d8[0]
+; CHECK-FP16-NEXT: mov r5, r1
+; CHECK-FP16-NEXT: vmov.32 d9[0], r4
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: bl __fixhfdi
-; CHECK-FP16-NEXT: mvn r6, #-2147483648
-; CHECK-FP16-NEXT: subs r3, r4, r6
-; CHECK-FP16-NEXT: sbcs r3, r8, #0
-; CHECK-FP16-NEXT: vmov.32 d13[0], r0
-; CHECK-FP16-NEXT: mov r3, #0
+; CHECK-FP16-NEXT: vmov.32 d8[0], r0
+; CHECK-FP16-NEXT: mvn r3, #-2147483648
+; CHECK-FP16-NEXT: subs r0, r0, r3
; CHECK-FP16-NEXT: adr r2, .LCPI6_0
-; CHECK-FP16-NEXT: movwlt r3, #1
-; CHECK-FP16-NEXT: subs r5, r5, r6
-; CHECK-FP16-NEXT: sbcs r5, r7, #0
-; CHECK-FP16-NEXT: vmov.32 d13[1], r1
+; CHECK-FP16-NEXT: sbcs r0, r1, #0
+; CHECK-FP16-NEXT: vmov.32 d9[1], r5
+; CHECK-FP16-NEXT: mov r0, #0
+; CHECK-FP16-NEXT: movwlt r0, #1
+; CHECK-FP16-NEXT: subs r4, r4, r3
+; CHECK-FP16-NEXT: vmov.32 d8[1], r1
+; CHECK-FP16-NEXT: sbcs r1, r5, #0
+; CHECK-FP16-NEXT: mov r1, #0
+; CHECK-FP16-NEXT: movwlt r1, #1
+; CHECK-FP16-NEXT: subs r5, r9, r3
+; CHECK-FP16-NEXT: sbcs r5, r8, #0
+; CHECK-FP16-NEXT: vmov.32 d11[1], r7
; CHECK-FP16-NEXT: mov r5, #0
; CHECK-FP16-NEXT: movwlt r5, #1
; CHECK-FP16-NEXT: cmp r5, #0
; CHECK-FP16-NEXT: mvnne r5, #0
-; CHECK-FP16-NEXT: subs r0, r0, r6
-; CHECK-FP16-NEXT: sbcs r0, r1, #0
+; CHECK-FP16-NEXT: subs r3, r6, r3
+; CHECK-FP16-NEXT: sbcs r3, r7, #0
; CHECK-FP16-NEXT: vld1.64 {d18, d19}, [r2:128]
-; CHECK-FP16-NEXT: mov r0, #0
-; CHECK-FP16-NEXT: mov r2, #0
-; CHECK-FP16-NEXT: movwlt r0, #1
-; CHECK-FP16-NEXT: cmp r0, #0
-; CHECK-FP16-NEXT: mvnne r0, #0
-; CHECK-FP16-NEXT: vmov.32 d12[1], r7
-; CHECK-FP16-NEXT: vdup.32 d17, r0
-; CHECK-FP16-NEXT: subs r0, r9, r6
-; CHECK-FP16-NEXT: sbcs r0, r10, #0
-; CHECK-FP16-NEXT: vdup.32 d16, r5
-; CHECK-FP16-NEXT: mov r0, #0
-; CHECK-FP16-NEXT: vbsl q8, q6, q9
-; CHECK-FP16-NEXT: movwlt r0, #1
-; CHECK-FP16-NEXT: cmp r0, #0
-; CHECK-FP16-NEXT: vmov.32 d11[1], r10
-; CHECK-FP16-NEXT: mvnne r0, #0
-; CHECK-FP16-NEXT: cmp r3, #0
+; CHECK-FP16-NEXT: mov r3, #0
; CHECK-FP16-NEXT: mvn r6, #0
-; CHECK-FP16-NEXT: vdup.32 d21, r0
+; CHECK-FP16-NEXT: movwlt r3, #1
+; CHECK-FP16-NEXT: cmp r3, #0
; CHECK-FP16-NEXT: mvnne r3, #0
; CHECK-FP16-NEXT: vmov.32 d10[1], r8
-; CHECK-FP16-NEXT: vmov r0, r1, d16
-; CHECK-FP16-NEXT: vdup.32 d20, r3
-; CHECK-FP16-NEXT: vbit q9, q5, q10
-; CHECK-FP16-NEXT: adr r5, .LCPI6_1
-; CHECK-FP16-NEXT: vld1.64 {d20, d21}, [r5:128]
-; CHECK-FP16-NEXT: vmov r5, r4, d17
-; CHECK-FP16-NEXT: vmov r3, r7, d18
-; CHECK-FP16-NEXT: rsbs r0, r0, #-2147483648
-; CHECK-FP16-NEXT: sbcs r0, r6, r1
+; CHECK-FP16-NEXT: cmp r1, #0
+; CHECK-FP16-NEXT: mov r2, #0
+; CHECK-FP16-NEXT: vdup.32 d21, r3
+; CHECK-FP16-NEXT: mvnne r1, #0
+; CHECK-FP16-NEXT: vdup.32 d20, r5
+; CHECK-FP16-NEXT: cmp r0, #0
+; CHECK-FP16-NEXT: vorr q8, q10, q10
+; CHECK-FP16-NEXT: vdup.32 d23, r1
+; CHECK-FP16-NEXT: vand q12, q5, q10
+; CHECK-FP16-NEXT: mvnne r0, #0
+; CHECK-FP16-NEXT: vbsl q8, q5, q9
+; CHECK-FP16-NEXT: vdup.32 d22, r0
+; CHECK-FP16-NEXT: vbit q9, q4, q11
+; CHECK-FP16-NEXT: adr r7, .LCPI6_1
+; CHECK-FP16-NEXT: vmov.32 r0, d24[1]
+; CHECK-FP16-NEXT: vand q11, q4, q11
+; CHECK-FP16-NEXT: vmov.32 r1, d16[0]
+; CHECK-FP16-NEXT: vmov.32 r3, d18[0]
+; CHECK-FP16-NEXT: vmov.32 r4, d22[1]
+; CHECK-FP16-NEXT: vmov.32 r5, d17[0]
+; CHECK-FP16-NEXT: vld1.64 {d20, d21}, [r7:128]
+; CHECK-FP16-NEXT: vmov.32 r7, d25[1]
+; CHECK-FP16-NEXT: rsbs r1, r1, #-2147483648
+; CHECK-FP16-NEXT: sbcs r0, r6, r0
+; CHECK-FP16-NEXT: vmov.32 r1, d19[0]
; CHECK-FP16-NEXT: mov r0, #0
; CHECK-FP16-NEXT: movwlt r0, #1
; CHECK-FP16-NEXT: cmp r0, #0
; CHECK-FP16-NEXT: mvnne r0, #0
-; CHECK-FP16-NEXT: rsbs r1, r3, #-2147483648
-; CHECK-FP16-NEXT: vmov r1, r3, d19
+; CHECK-FP16-NEXT: rsbs r3, r3, #-2147483648
+; CHECK-FP16-NEXT: vmov.32 r3, d23[1]
+; CHECK-FP16-NEXT: sbcs r4, r6, r4
+; CHECK-FP16-NEXT: mov r4, #0
+; CHECK-FP16-NEXT: movwlt r4, #1
+; CHECK-FP16-NEXT: rsbs r5, r5, #-2147483648
; CHECK-FP16-NEXT: sbcs r7, r6, r7
; CHECK-FP16-NEXT: mov r7, #0
; CHECK-FP16-NEXT: movwlt r7, #1
-; CHECK-FP16-NEXT: rsbs r5, r5, #-2147483648
-; CHECK-FP16-NEXT: sbcs r5, r6, r4
-; CHECK-FP16-NEXT: mov r5, #0
-; CHECK-FP16-NEXT: movwlt r5, #1
; CHECK-FP16-NEXT: rsbs r1, r1, #-2147483648
; CHECK-FP16-NEXT: sbcs r1, r6, r3
; CHECK-FP16-NEXT: movwlt r2, #1
; CHECK-FP16-NEXT: cmp r2, #0
; CHECK-FP16-NEXT: mvnne r2, #0
-; CHECK-FP16-NEXT: cmp r5, #0
-; CHECK-FP16-NEXT: mvnne r5, #0
; CHECK-FP16-NEXT: cmp r7, #0
-; CHECK-FP16-NEXT: vdup.32 d25, r5
; CHECK-FP16-NEXT: mvnne r7, #0
+; CHECK-FP16-NEXT: cmp r4, #0
+; CHECK-FP16-NEXT: vdup.32 d25, r7
+; CHECK-FP16-NEXT: mvnne r4, #0
; CHECK-FP16-NEXT: vdup.32 d23, r2
; CHECK-FP16-NEXT: vdup.32 d24, r0
; CHECK-FP16-NEXT: vbif q8, q10, q12
-; CHECK-FP16-NEXT: vdup.32 d22, r7
+; CHECK-FP16-NEXT: vdup.32 d22, r4
; CHECK-FP16-NEXT: vbif q9, q10, q11
; CHECK-FP16-NEXT: vmovn.i64 d1, q8
; CHECK-FP16-NEXT: vmovn.i64 d0, q9
-; CHECK-FP16-NEXT: vpop {d8}
-; CHECK-FP16-NEXT: vpop {d10, d11, d12, d13}
-; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-FP16-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc}
; CHECK-FP16-NEXT: .p2align 4
; CHECK-FP16-NEXT: @ %bb.1:
; CHECK-FP16-NEXT: .LCPI6_0:
@@ -913,10 +939,10 @@
define <4 x i32> @ustest_f16i32(<4 x half> %x) {
; CHECK-NEON-LABEL: ustest_f16i32:
; CHECK-NEON: @ %bb.0: @ %entry
-; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEON-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEON-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEON-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEON-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEON-NEXT: vmov r0, s1
; CHECK-NEON-NEXT: vmov.f32 s16, s3
; CHECK-NEON-NEXT: vmov.f32 s18, s2
@@ -928,19 +954,19 @@
; CHECK-NEON-NEXT: mov r7, r1
; CHECK-NEON-NEXT: vmov r5, s18
; CHECK-NEON-NEXT: vmov r8, s16
-; CHECK-NEON-NEXT: vmov.32 d9[0], r6
+; CHECK-NEON-NEXT: vmov.32 d11[0], r6
; CHECK-NEON-NEXT: bl __aeabi_h2f
; CHECK-NEON-NEXT: bl __aeabi_f2lz
-; CHECK-NEON-NEXT: vmov.32 d8[0], r0
+; CHECK-NEON-NEXT: vmov.32 d10[0], r0
; CHECK-NEON-NEXT: mvn r9, #0
; CHECK-NEON-NEXT: subs r0, r0, r9
; CHECK-NEON-NEXT: mov r4, #0
; CHECK-NEON-NEXT: sbcs r0, r1, #0
-; CHECK-NEON-NEXT: vmov.32 d9[1], r7
+; CHECK-NEON-NEXT: vmov.32 d11[1], r7
; CHECK-NEON-NEXT: mov r0, #0
; CHECK-NEON-NEXT: movwlt r0, #1
; CHECK-NEON-NEXT: cmp r0, #0
-; CHECK-NEON-NEXT: vmov.32 d8[1], r1
+; CHECK-NEON-NEXT: vmov.32 d10[1], r1
; CHECK-NEON-NEXT: mvnne r0, #0
; CHECK-NEON-NEXT: subs r1, r6, r9
; CHECK-NEON-NEXT: sbcs r1, r7, #0
@@ -952,41 +978,49 @@
; CHECK-NEON-NEXT: vdup.32 d12, r0
; CHECK-NEON-NEXT: mov r0, r5
; CHECK-NEON-NEXT: bl __aeabi_h2f
-; CHECK-NEON-NEXT: vmov.i64 q5, #0xffffffff
-; CHECK-NEON-NEXT: vbif q4, q5, q6
; CHECK-NEON-NEXT: bl __aeabi_f2lz
-; CHECK-NEON-NEXT: mov r5, r0
-; CHECK-NEON-NEXT: vmov.32 d12[0], r0
+; CHECK-NEON-NEXT: vorr q4, q6, q6
+; CHECK-NEON-NEXT: mov r6, r0
+; CHECK-NEON-NEXT: vmov.i64 q8, #0xffffffff
+; CHECK-NEON-NEXT: vmov.32 d14[0], r0
; CHECK-NEON-NEXT: mov r0, r8
-; CHECK-NEON-NEXT: mov r6, r1
-; CHECK-NEON-NEXT: vmov r7, r10, d8
+; CHECK-NEON-NEXT: mov r5, r1
+; CHECK-NEON-NEXT: vbsl q4, q5, q8
; CHECK-NEON-NEXT: bl __aeabi_h2f
+; CHECK-NEON-NEXT: vmov.32 r7, d8[0]
; CHECK-NEON-NEXT: bl __aeabi_f2lz
-; CHECK-NEON-NEXT: subs r2, r5, r9
-; CHECK-NEON-NEXT: vmov.32 d13[0], r0
-; CHECK-NEON-NEXT: sbcs r2, r6, #0
+; CHECK-NEON-NEXT: subs r2, r6, r9
+; CHECK-NEON-NEXT: vmov.32 d15[0], r0
+; CHECK-NEON-NEXT: sbcs r2, r5, #0
+; CHECK-NEON-NEXT: vand q10, q5, q6
; CHECK-NEON-NEXT: mov r2, #0
+; CHECK-NEON-NEXT: vmov.i64 q8, #0xffffffff
; CHECK-NEON-NEXT: movwlt r2, #1
; CHECK-NEON-NEXT: subs r0, r0, r9
; CHECK-NEON-NEXT: sbcs r0, r1, #0
-; CHECK-NEON-NEXT: vmov.32 d13[1], r1
+; CHECK-NEON-NEXT: vmov.32 d15[1], r1
; CHECK-NEON-NEXT: mov r0, #0
; CHECK-NEON-NEXT: movwlt r0, #1
; CHECK-NEON-NEXT: cmp r0, #0
; CHECK-NEON-NEXT: mvnne r0, #0
; CHECK-NEON-NEXT: cmp r2, #0
-; CHECK-NEON-NEXT: vmov.32 d12[1], r6
+; CHECK-NEON-NEXT: vmov.32 d14[1], r5
; CHECK-NEON-NEXT: mvnne r2, #0
-; CHECK-NEON-NEXT: vdup.32 d17, r0
-; CHECK-NEON-NEXT: rsbs r0, r7, #0
-; CHECK-NEON-NEXT: vdup.32 d16, r2
-; CHECK-NEON-NEXT: vmov r7, r5, d9
-; CHECK-NEON-NEXT: vbsl q8, q6, q5
-; CHECK-NEON-NEXT: rscs r0, r10, #0
+; CHECK-NEON-NEXT: rsbs r7, r7, #0
+; CHECK-NEON-NEXT: vdup.32 d19, r0
+; CHECK-NEON-NEXT: vdup.32 d18, r2
+; CHECK-NEON-NEXT: vbit q8, q7, q9
+; CHECK-NEON-NEXT: vmov.32 r0, d20[1]
+; CHECK-NEON-NEXT: vand q9, q7, q9
+; CHECK-NEON-NEXT: vmov.32 r5, d9[0]
+; CHECK-NEON-NEXT: vmov.32 r7, d21[1]
+; CHECK-NEON-NEXT: vmov.32 r1, d16[0]
+; CHECK-NEON-NEXT: vmov.32 r2, d18[1]
+; CHECK-NEON-NEXT: vmov.32 r3, d17[0]
+; CHECK-NEON-NEXT: vmov.32 r6, d19[1]
+; CHECK-NEON-NEXT: rscs r0, r0, #0
; CHECK-NEON-NEXT: mov r0, #0
; CHECK-NEON-NEXT: movwlt r0, #1
-; CHECK-NEON-NEXT: vmov r1, r2, d16
-; CHECK-NEON-NEXT: vmov r3, r6, d17
; CHECK-NEON-NEXT: rsbs r1, r1, #0
; CHECK-NEON-NEXT: rscs r1, r2, #0
; CHECK-NEON-NEXT: mov r1, #0
@@ -995,8 +1029,8 @@
; CHECK-NEON-NEXT: rscs r2, r6, #0
; CHECK-NEON-NEXT: mov r2, #0
; CHECK-NEON-NEXT: movwlt r2, #1
-; CHECK-NEON-NEXT: rsbs r3, r7, #0
-; CHECK-NEON-NEXT: rscs r3, r5, #0
+; CHECK-NEON-NEXT: rsbs r3, r5, #0
+; CHECK-NEON-NEXT: rscs r3, r7, #0
; CHECK-NEON-NEXT: movwlt r4, #1
; CHECK-NEON-NEXT: cmp r4, #0
; CHECK-NEON-NEXT: mvnne r4, #0
@@ -1014,21 +1048,19 @@
; CHECK-NEON-NEXT: vmovn.i64 d1, q8
; CHECK-NEON-NEXT: vand q9, q4, q9
; CHECK-NEON-NEXT: vmovn.i64 d0, q9
-; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEON-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEON-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc}
;
; CHECK-FP16-LABEL: ustest_f16i32:
; CHECK-FP16: @ %bb.0: @ %entry
; CHECK-FP16-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-FP16-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-FP16-NEXT: .vsave {d10, d11, d12, d13, d14, d15}
-; CHECK-FP16-NEXT: vpush {d10, d11, d12, d13, d14, d15}
-; CHECK-FP16-NEXT: .vsave {d8}
-; CHECK-FP16-NEXT: vpush {d8}
+; CHECK-FP16-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-FP16-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-FP16-NEXT: vmov.u16 r0, d0[1]
; CHECK-FP16-NEXT: vorr d8, d0, d0
-; CHECK-FP16-NEXT: vmov.u16 r8, d0[2]
-; CHECK-FP16-NEXT: vmov.u16 r9, d0[3]
+; CHECK-FP16-NEXT: vmov.u16 r8, d0[3]
+; CHECK-FP16-NEXT: vmov.u16 r7, d0[2]
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: bl __fixhfdi
; CHECK-FP16-NEXT: mov r4, r0
@@ -1038,41 +1070,43 @@
; CHECK-FP16-NEXT: vmov s0, r0
; CHECK-FP16-NEXT: bl __fixhfdi
; CHECK-FP16-NEXT: vmov.32 d10[0], r0
-; CHECK-FP16-NEXT: mvn r7, #0
-; CHECK-FP16-NEXT: subs r0, r0, r7
-; CHECK-FP16-NEXT: vmov.i64 q6, #0xffffffff
+; CHECK-FP16-NEXT: mvn r9, #0
+; CHECK-FP16-NEXT: subs r0, r0, r9
+; CHECK-FP16-NEXT: vmov s0, r7
; CHECK-FP16-NEXT: sbcs r0, r1, #0
; CHECK-FP16-NEXT: vmov.32 d11[1], r5
; CHECK-FP16-NEXT: mov r0, #0
-; CHECK-FP16-NEXT: vmov s0, r8
+; CHECK-FP16-NEXT: mov r6, #0
; CHECK-FP16-NEXT: movwlt r0, #1
; CHECK-FP16-NEXT: cmp r0, #0
; CHECK-FP16-NEXT: vmov.32 d10[1], r1
; CHECK-FP16-NEXT: mvnne r0, #0
-; CHECK-FP16-NEXT: subs r1, r4, r7
-; CHECK-FP16-NEXT: mov r6, #0
+; CHECK-FP16-NEXT: subs r1, r4, r9
; CHECK-FP16-NEXT: sbcs r1, r5, #0
-; CHECK-FP16-NEXT: vmov s16, r9
; CHECK-FP16-NEXT: mov r1, #0
; CHECK-FP16-NEXT: movwlt r1, #1
; CHECK-FP16-NEXT: cmp r1, #0
; CHECK-FP16-NEXT: mvnne r1, #0
-; CHECK-FP16-NEXT: vdup.32 d17, r1
-; CHECK-FP16-NEXT: vdup.32 d16, r0
-; CHECK-FP16-NEXT: vbif q5, q6, q8
-; CHECK-FP16-NEXT: vmov r9, r8, d10
+; CHECK-FP16-NEXT: vdup.32 d13, r1
+; CHECK-FP16-NEXT: vdup.32 d12, r0
; CHECK-FP16-NEXT: bl __fixhfdi
-; CHECK-FP16-NEXT: vmov.f32 s0, s16
-; CHECK-FP16-NEXT: mov r4, r0
-; CHECK-FP16-NEXT: mov r5, r1
+; CHECK-FP16-NEXT: vorr q4, q6, q6
+; CHECK-FP16-NEXT: mov r5, r0
+; CHECK-FP16-NEXT: vmov.i64 q8, #0xffffffff
+; CHECK-FP16-NEXT: mov r4, r1
+; CHECK-FP16-NEXT: vbsl q4, q5, q8
; CHECK-FP16-NEXT: vmov.32 d14[0], r0
+; CHECK-FP16-NEXT: vmov s0, r8
+; CHECK-FP16-NEXT: vmov.32 r8, d8[0]
; CHECK-FP16-NEXT: bl __fixhfdi
-; CHECK-FP16-NEXT: subs r2, r4, r7
+; CHECK-FP16-NEXT: subs r2, r5, r9
; CHECK-FP16-NEXT: vmov.32 d15[0], r0
-; CHECK-FP16-NEXT: sbcs r2, r5, #0
+; CHECK-FP16-NEXT: sbcs r2, r4, #0
+; CHECK-FP16-NEXT: vand q10, q5, q6
; CHECK-FP16-NEXT: mov r2, #0
+; CHECK-FP16-NEXT: vmov.i64 q8, #0xffffffff
; CHECK-FP16-NEXT: movwlt r2, #1
-; CHECK-FP16-NEXT: subs r0, r0, r7
+; CHECK-FP16-NEXT: subs r0, r0, r9
; CHECK-FP16-NEXT: sbcs r0, r1, #0
; CHECK-FP16-NEXT: vmov.32 d15[1], r1
; CHECK-FP16-NEXT: mov r0, #0
@@ -1080,18 +1114,23 @@
; CHECK-FP16-NEXT: cmp r0, #0
; CHECK-FP16-NEXT: mvnne r0, #0
; CHECK-FP16-NEXT: cmp r2, #0
-; CHECK-FP16-NEXT: vmov.32 d14[1], r5
+; CHECK-FP16-NEXT: vdup.32 d19, r0
; CHECK-FP16-NEXT: mvnne r2, #0
-; CHECK-FP16-NEXT: vmov r5, r4, d11
-; CHECK-FP16-NEXT: vdup.32 d17, r0
-; CHECK-FP16-NEXT: rsbs r0, r9, #0
-; CHECK-FP16-NEXT: vdup.32 d16, r2
-; CHECK-FP16-NEXT: rscs r0, r8, #0
-; CHECK-FP16-NEXT: vbsl q8, q7, q6
+; CHECK-FP16-NEXT: vmov.32 d14[1], r4
+; CHECK-FP16-NEXT: rsbs r4, r8, #0
+; CHECK-FP16-NEXT: vdup.32 d18, r2
+; CHECK-FP16-NEXT: vbit q8, q7, q9
+; CHECK-FP16-NEXT: vmov.32 r0, d20[1]
+; CHECK-FP16-NEXT: vand q9, q7, q9
+; CHECK-FP16-NEXT: vmov.32 r5, d9[0]
+; CHECK-FP16-NEXT: vmov.32 r4, d21[1]
+; CHECK-FP16-NEXT: vmov.32 r1, d16[0]
+; CHECK-FP16-NEXT: vmov.32 r2, d18[1]
+; CHECK-FP16-NEXT: vmov.32 r3, d17[0]
+; CHECK-FP16-NEXT: vmov.32 r7, d19[1]
+; CHECK-FP16-NEXT: rscs r0, r0, #0
; CHECK-FP16-NEXT: mov r0, #0
; CHECK-FP16-NEXT: movwlt r0, #1
-; CHECK-FP16-NEXT: vmov r1, r2, d16
-; CHECK-FP16-NEXT: vmov r3, r7, d17
; CHECK-FP16-NEXT: rsbs r1, r1, #0
; CHECK-FP16-NEXT: rscs r1, r2, #0
; CHECK-FP16-NEXT: mov r1, #0
@@ -1117,10 +1156,9 @@
; CHECK-FP16-NEXT: vand q8, q8, q10
; CHECK-FP16-NEXT: vmov.32 d18[0], r0
; CHECK-FP16-NEXT: vmovn.i64 d1, q8
-; CHECK-FP16-NEXT: vand q9, q5, q9
+; CHECK-FP16-NEXT: vand q9, q4, q9
; CHECK-FP16-NEXT: vmovn.i64 d0, q9
-; CHECK-FP16-NEXT: vpop {d8}
-; CHECK-FP16-NEXT: vpop {d10, d11, d12, d13, d14, d15}
+; CHECK-FP16-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-FP16-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, pc}
entry:
%conv = fptosi <4 x half> %x to <4 x i64>
diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll
--- a/llvm/test/CodeGen/ARM/vdup.ll
+++ b/llvm/test/CodeGen/ARM/vdup.ll
@@ -56,7 +56,16 @@
define <16 x i8> @v_dupQ8(i8 %A) nounwind {
; CHECK-LABEL: v_dupQ8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vdup.8 q8, r0
+; CHECK-NEXT: vmov.i32 d17, #0x0
+; CHECK-NEXT: vdup.8 d16, r0
+; CHECK-NEXT: vmov.8 d17[0], r0
+; CHECK-NEXT: vmov.8 d17[1], r0
+; CHECK-NEXT: vmov.8 d17[2], r0
+; CHECK-NEXT: vmov.8 d17[3], r0
+; CHECK-NEXT: vmov.8 d17[4], r0
+; CHECK-NEXT: vmov.8 d17[5], r0
+; CHECK-NEXT: vmov.8 d17[6], r0
+; CHECK-NEXT: vmov.8 d17[7], r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
@@ -82,7 +91,12 @@
define <8 x i16> @v_dupQ16(i16 %A) nounwind {
; CHECK-LABEL: v_dupQ16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vdup.16 q8, r0
+; CHECK-NEXT: vmov.i32 d17, #0x0
+; CHECK-NEXT: vdup.16 d16, r0
+; CHECK-NEXT: vmov.16 d17[0], r0
+; CHECK-NEXT: vmov.16 d17[1], r0
+; CHECK-NEXT: vmov.16 d17[2], r0
+; CHECK-NEXT: vmov.16 d17[3], r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
@@ -114,9 +128,12 @@
define <4 x float> @v_dupQfloat(float %A) nounwind {
; CHECK-LABEL: v_dupQfloat:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vdup.32 q8, r0
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vmov s0, r0
+; CHECK-NEXT: vmov.f32 s1, s0
+; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov.f32 s2, s0
+; CHECK-NEXT: vmov.f32 s3, s0
+; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
@@ -393,9 +410,11 @@
define <4 x float> @tdupf(float %x, float %y) {
; CHECK-LABEL: tdupf:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmov s3, r1
+; CHECK-NEXT: vmov.f32 s1, s0
; CHECK-NEXT: vmov r0, r1, d0
+; CHECK-NEXT: vmov.f32 s2, s0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: mov pc, lr
%1 = insertelement <4 x float> undef, float %x, i32 0
diff --git a/llvm/test/CodeGen/ARM/vldlane.ll b/llvm/test/CodeGen/ARM/vldlane.ll
--- a/llvm/test/CodeGen/ARM/vldlane.ll
+++ b/llvm/test/CodeGen/ARM/vldlane.ll
@@ -72,13 +72,23 @@
}
define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind {
-; CHECK-LABEL: vld1laneQi8:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT: vld1.8 {d17[1]}, [r0]
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; DEFAULT-LABEL: vld1laneQi8:
+; DEFAULT: @ %bb.0:
+; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1]
+; DEFAULT-NEXT: vorr q9, q8, q8
+; DEFAULT-NEXT: vld1.8 {d19[1]}, [r0]
+; DEFAULT-NEXT: vmov r0, r1, d16
+; DEFAULT-NEXT: vmov r2, r3, d19
+; DEFAULT-NEXT: mov pc, lr
+;
+; BASIC-LABEL: vld1laneQi8:
+; BASIC: @ %bb.0:
+; BASIC-NEXT: vld1.64 {d18, d19}, [r1]
+; BASIC-NEXT: vorr q8, q9, q9
+; BASIC-NEXT: vld1.8 {d17[1]}, [r0]
+; BASIC-NEXT: vmov r0, r1, d18
+; BASIC-NEXT: vmov r2, r3, d17
+; BASIC-NEXT: mov pc, lr
%tmp1 = load <16 x i8>, <16 x i8>* %B
%tmp2 = load i8, i8* %A, align 8
%tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
@@ -86,13 +96,23 @@
}
define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind {
-; CHECK-LABEL: vld1laneQi16:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT: vld1.16 {d17[1]}, [r0:16]
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; DEFAULT-LABEL: vld1laneQi16:
+; DEFAULT: @ %bb.0:
+; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1]
+; DEFAULT-NEXT: vorr q9, q8, q8
+; DEFAULT-NEXT: vld1.16 {d19[1]}, [r0:16]
+; DEFAULT-NEXT: vmov r0, r1, d16
+; DEFAULT-NEXT: vmov r2, r3, d19
+; DEFAULT-NEXT: mov pc, lr
+;
+; BASIC-LABEL: vld1laneQi16:
+; BASIC: @ %bb.0:
+; BASIC-NEXT: vld1.64 {d18, d19}, [r1]
+; BASIC-NEXT: vorr q8, q9, q9
+; BASIC-NEXT: vld1.16 {d17[1]}, [r0:16]
+; BASIC-NEXT: vmov r0, r1, d18
+; BASIC-NEXT: vmov r2, r3, d17
+; BASIC-NEXT: mov pc, lr
%tmp1 = load <8 x i16>, <8 x i16>* %B
%tmp2 = load i16, i16* %A, align 8
%tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
@@ -100,13 +120,23 @@
}
define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind {
-; CHECK-LABEL: vld1laneQi32:
-; CHECK: @ %bb.0:
-; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT: vld1.32 {d17[1]}, [r0:32]
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
-; CHECK-NEXT: mov pc, lr
+; DEFAULT-LABEL: vld1laneQi32:
+; DEFAULT: @ %bb.0:
+; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1]
+; DEFAULT-NEXT: vorr q9, q8, q8
+; DEFAULT-NEXT: vld1.32 {d19[1]}, [r0:32]
+; DEFAULT-NEXT: vmov r0, r1, d16
+; DEFAULT-NEXT: vmov r2, r3, d19
+; DEFAULT-NEXT: mov pc, lr
+;
+; BASIC-LABEL: vld1laneQi32:
+; BASIC: @ %bb.0:
+; BASIC-NEXT: vld1.64 {d18, d19}, [r1]
+; BASIC-NEXT: vorr q8, q9, q9
+; BASIC-NEXT: vld1.32 {d17[1]}, [r0:32]
+; BASIC-NEXT: vmov r0, r1, d18
+; BASIC-NEXT: vmov r2, r3, d17
+; BASIC-NEXT: mov pc, lr
%tmp1 = load <4 x i32>, <4 x i32>* %B
%tmp2 = load i32, i32* %A, align 8
%tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
@@ -114,13
+144,23 @@ } define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind { -; CHECK-LABEL: vld1laneQf: -; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; DEFAULT-LABEL: vld1laneQf: +; DEFAULT: @ %bb.0: +; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1] +; DEFAULT-NEXT: vorr q9, q8, q8 +; DEFAULT-NEXT: vmov r2, r3, d17 +; DEFAULT-NEXT: vld1.32 {d18[0]}, [r0:32] +; DEFAULT-NEXT: vmov r0, r1, d18 +; DEFAULT-NEXT: mov pc, lr +; +; BASIC-LABEL: vld1laneQf: +; BASIC: @ %bb.0: +; BASIC-NEXT: vld1.64 {d18, d19}, [r1] +; BASIC-NEXT: vorr q8, q9, q9 +; BASIC-NEXT: vmov r2, r3, d19 +; BASIC-NEXT: vld1.32 {d16[0]}, [r0:32] +; BASIC-NEXT: vmov r0, r1, d16 +; BASIC-NEXT: mov pc, lr %tmp1 = load <4 x float>, <4 x float>* %B %tmp2 = load float, float* %A %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0 diff --git a/llvm/test/CodeGen/ARM/vzip.ll b/llvm/test/CodeGen/ARM/vzip.ll --- a/llvm/test/CodeGen/ARM/vzip.ll +++ b/llvm/test/CodeGen/ARM/vzip.ll @@ -291,7 +291,7 @@ ; CHECK-NEXT: vldr d16, [r1] ; CHECK-NEXT: vldr d19, [r0] ; CHECK-NEXT: vtrn.16 d19, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d19 ; CHECK-NEXT: mov pc, lr entry: diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll --- a/llvm/test/CodeGen/Mips/cconv/vector.ll +++ b/llvm/test/CodeGen/Mips/cconv/vector.ll @@ -692,84 +692,68 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5EB-LABEL: i8_8: -; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EB-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 31, -4 -; MIPS32R5EB-NEXT: .cfi_offset 30, -8 -; MIPS32R5EB-NEXT: move $fp, $sp -; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EB-NEXT: addiu $1, $zero, -16 -; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: sw $6, 24($sp) -; MIPS32R5EB-NEXT: lbu $1, 25($sp) -; MIPS32R5EB-NEXT: lbu $2, 24($sp) -; MIPS32R5EB-NEXT: sw $7, 28($sp) -; MIPS32R5EB-NEXT: insert.h $w0[0], $2 -; MIPS32R5EB-NEXT: insert.h $w0[1], $1 -; MIPS32R5EB-NEXT: lbu $1, 26($sp) -; MIPS32R5EB-NEXT: sw $4, 32($sp) -; MIPS32R5EB-NEXT: insert.h $w0[2], $1 -; MIPS32R5EB-NEXT: lbu $1, 27($sp) -; MIPS32R5EB-NEXT: insert.h $w0[3], $1 -; MIPS32R5EB-NEXT: lbu $1, 28($sp) -; MIPS32R5EB-NEXT: sw $5, 36($sp) -; MIPS32R5EB-NEXT: insert.h $w0[4], $1 -; MIPS32R5EB-NEXT: lbu $1, 33($sp) -; MIPS32R5EB-NEXT: lbu $2, 32($sp) -; MIPS32R5EB-NEXT: insert.h $w1[0], $2 -; MIPS32R5EB-NEXT: insert.h $w1[1], $1 -; MIPS32R5EB-NEXT: lbu $1, 29($sp) -; MIPS32R5EB-NEXT: lbu $2, 34($sp) -; MIPS32R5EB-NEXT: insert.h $w1[2], $2 -; MIPS32R5EB-NEXT: insert.h $w0[5], $1 -; MIPS32R5EB-NEXT: lbu $1, 35($sp) -; MIPS32R5EB-NEXT: lbu $2, 31($sp) -; MIPS32R5EB-NEXT: lbu $3, 30($sp) -; MIPS32R5EB-NEXT: lbu $4, 39($sp) -; MIPS32R5EB-NEXT: insert.h $w0[6], $3 -; MIPS32R5EB-NEXT: insert.h $w0[7], $2 -; MIPS32R5EB-NEXT: insert.h $w1[3], $1 -; MIPS32R5EB-NEXT: lbu $1, 36($sp) -; MIPS32R5EB-NEXT: insert.h $w1[4], $1 -; MIPS32R5EB-NEXT: lbu $1, 37($sp) -; MIPS32R5EB-NEXT: insert.h $w1[5], $1 -; MIPS32R5EB-NEXT: lbu $1, 38($sp) -; MIPS32R5EB-NEXT: insert.h $w1[6], $1 -; MIPS32R5EB-NEXT: insert.h $w1[7], $4 -; MIPS32R5EB-NEXT: addv.h $w0, $w1, $w0 -; MIPS32R5EB-NEXT: copy_s.h $1, $w0[0] -; MIPS32R5EB-NEXT: copy_s.h $2, $w0[1] -; 
MIPS32R5EB-NEXT: copy_s.h $3, $w0[2] -; MIPS32R5EB-NEXT: copy_s.h $4, $w0[3] -; MIPS32R5EB-NEXT: copy_s.h $5, $w0[4] -; MIPS32R5EB-NEXT: copy_s.h $6, $w0[5] -; MIPS32R5EB-NEXT: copy_s.h $7, $w0[6] -; MIPS32R5EB-NEXT: copy_s.h $8, $w0[7] -; MIPS32R5EB-NEXT: sb $8, 23($sp) -; MIPS32R5EB-NEXT: sb $7, 22($sp) -; MIPS32R5EB-NEXT: sb $6, 21($sp) -; MIPS32R5EB-NEXT: sb $5, 20($sp) -; MIPS32R5EB-NEXT: sb $4, 19($sp) -; MIPS32R5EB-NEXT: sb $3, 18($sp) -; MIPS32R5EB-NEXT: sb $2, 17($sp) -; MIPS32R5EB-NEXT: sb $1, 16($sp) -; MIPS32R5EB-NEXT: lw $1, 20($sp) -; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: lw $1, 16($sp) -; MIPS32R5EB-NEXT: sw $1, 4($sp) -; MIPS32R5EB-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] -; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 48 -; MIPS32R5EB-NEXT: jr $ra -; MIPS32R5EB-NEXT: nop +; MIPS32R5-LABEL: i8_8: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: addiu $sp, $sp, -24 +; MIPS32R5-NEXT: .cfi_def_cfa_offset 24 +; MIPS32R5-NEXT: sw $6, 8($sp) +; MIPS32R5-NEXT: lbu $1, 9($sp) +; MIPS32R5-NEXT: lbu $2, 8($sp) +; MIPS32R5-NEXT: sw $7, 12($sp) +; MIPS32R5-NEXT: insert.h $w0[0], $2 +; MIPS32R5-NEXT: insert.h $w0[1], $1 +; MIPS32R5-NEXT: lbu $1, 10($sp) +; MIPS32R5-NEXT: sw $4, 16($sp) +; MIPS32R5-NEXT: insert.h $w0[2], $1 +; MIPS32R5-NEXT: lbu $1, 11($sp) +; MIPS32R5-NEXT: insert.h $w0[3], $1 +; MIPS32R5-NEXT: lbu $1, 12($sp) +; MIPS32R5-NEXT: sw $5, 20($sp) +; MIPS32R5-NEXT: insert.h $w0[4], $1 +; MIPS32R5-NEXT: lbu $1, 17($sp) +; MIPS32R5-NEXT: lbu $2, 16($sp) +; MIPS32R5-NEXT: insert.h $w1[0], $2 +; MIPS32R5-NEXT: insert.h $w1[1], $1 +; MIPS32R5-NEXT: lbu $1, 13($sp) +; MIPS32R5-NEXT: lbu $2, 18($sp) +; MIPS32R5-NEXT: insert.h $w1[2], $2 +; MIPS32R5-NEXT: insert.h $w0[5], $1 +; MIPS32R5-NEXT: lbu $1, 19($sp) +; MIPS32R5-NEXT: lbu $2, 15($sp) +; MIPS32R5-NEXT: lbu $3, 14($sp) +; MIPS32R5-NEXT: lbu $4, 23($sp) +; MIPS32R5-NEXT: insert.h $w0[6], $3 +; MIPS32R5-NEXT: insert.h $w0[7], $2 +; MIPS32R5-NEXT: insert.h $w1[3], $1 +; MIPS32R5-NEXT: lbu $1, 20($sp) +; MIPS32R5-NEXT: insert.h $w1[4], $1 +; MIPS32R5-NEXT: lbu $1, 21($sp) +; MIPS32R5-NEXT: insert.h $w1[5], $1 +; MIPS32R5-NEXT: lbu $1, 22($sp) +; MIPS32R5-NEXT: insert.h $w1[6], $1 +; MIPS32R5-NEXT: insert.h $w1[7], $4 +; MIPS32R5-NEXT: addv.h $w0, $w1, $w0 +; MIPS32R5-NEXT: copy_s.h $1, $w0[4] +; MIPS32R5-NEXT: copy_s.h $2, $w0[5] +; MIPS32R5-NEXT: copy_s.h $3, $w0[6] +; MIPS32R5-NEXT: copy_s.h $4, $w0[7] +; MIPS32R5-NEXT: copy_s.h $5, $w0[0] +; MIPS32R5-NEXT: copy_s.h $6, $w0[1] +; MIPS32R5-NEXT: copy_s.h $7, $w0[2] +; MIPS32R5-NEXT: copy_s.h $8, $w0[3] +; MIPS32R5-NEXT: sb $8, 3($sp) +; MIPS32R5-NEXT: sb $7, 2($sp) +; MIPS32R5-NEXT: sb $6, 1($sp) +; MIPS32R5-NEXT: sb $5, 0($sp) +; MIPS32R5-NEXT: sb $4, 7($sp) +; MIPS32R5-NEXT: sb $3, 6($sp) +; MIPS32R5-NEXT: sb $2, 5($sp) +; MIPS32R5-NEXT: sb $1, 4($sp) +; MIPS32R5-NEXT: lw $2, 0($sp) +; MIPS32R5-NEXT: lw $3, 4($sp) +; MIPS32R5-NEXT: addiu $sp, $sp, 24 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: nop ; ; MIPS64R5-LABEL: i8_8: ; MIPS64R5: # %bb.0: @@ -830,85 +814,6 @@ ; MIPS64R5-NEXT: daddiu $sp, $sp, 32 ; MIPS64R5-NEXT: jr $ra ; MIPS64R5-NEXT: nop -; -; MIPS32R5EL-LABEL: i8_8: -; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: sw $fp, 40($sp) # 
4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 31, -4 -; MIPS32R5EL-NEXT: .cfi_offset 30, -8 -; MIPS32R5EL-NEXT: move $fp, $sp -; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EL-NEXT: addiu $1, $zero, -16 -; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: sw $6, 24($sp) -; MIPS32R5EL-NEXT: lbu $1, 25($sp) -; MIPS32R5EL-NEXT: lbu $2, 24($sp) -; MIPS32R5EL-NEXT: sw $7, 28($sp) -; MIPS32R5EL-NEXT: insert.h $w0[0], $2 -; MIPS32R5EL-NEXT: insert.h $w0[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 26($sp) -; MIPS32R5EL-NEXT: sw $4, 32($sp) -; MIPS32R5EL-NEXT: insert.h $w0[2], $1 -; MIPS32R5EL-NEXT: lbu $1, 27($sp) -; MIPS32R5EL-NEXT: insert.h $w0[3], $1 -; MIPS32R5EL-NEXT: lbu $1, 28($sp) -; MIPS32R5EL-NEXT: sw $5, 36($sp) -; MIPS32R5EL-NEXT: insert.h $w0[4], $1 -; MIPS32R5EL-NEXT: lbu $1, 33($sp) -; MIPS32R5EL-NEXT: lbu $2, 32($sp) -; MIPS32R5EL-NEXT: insert.h $w1[0], $2 -; MIPS32R5EL-NEXT: insert.h $w1[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 29($sp) -; MIPS32R5EL-NEXT: lbu $2, 34($sp) -; MIPS32R5EL-NEXT: insert.h $w1[2], $2 -; MIPS32R5EL-NEXT: insert.h $w0[5], $1 -; MIPS32R5EL-NEXT: lbu $1, 35($sp) -; MIPS32R5EL-NEXT: lbu $2, 31($sp) -; MIPS32R5EL-NEXT: lbu $3, 30($sp) -; MIPS32R5EL-NEXT: lbu $4, 39($sp) -; MIPS32R5EL-NEXT: insert.h $w0[6], $3 -; MIPS32R5EL-NEXT: insert.h $w0[7], $2 -; MIPS32R5EL-NEXT: insert.h $w1[3], $1 -; MIPS32R5EL-NEXT: lbu $1, 36($sp) -; MIPS32R5EL-NEXT: insert.h $w1[4], $1 -; MIPS32R5EL-NEXT: lbu $1, 37($sp) -; MIPS32R5EL-NEXT: insert.h $w1[5], $1 -; MIPS32R5EL-NEXT: lbu $1, 38($sp) -; MIPS32R5EL-NEXT: insert.h $w1[6], $1 -; MIPS32R5EL-NEXT: insert.h $w1[7], $4 -; MIPS32R5EL-NEXT: addv.h $w0, $w1, $w0 -; MIPS32R5EL-NEXT: copy_s.h $1, $w0[0] -; MIPS32R5EL-NEXT: copy_s.h $2, $w0[1] -; MIPS32R5EL-NEXT: copy_s.h $3, $w0[2] -; MIPS32R5EL-NEXT: copy_s.h $4, $w0[3] -; MIPS32R5EL-NEXT: copy_s.h $5, $w0[4] -; MIPS32R5EL-NEXT: copy_s.h $6, $w0[5] -; MIPS32R5EL-NEXT: copy_s.h $7, $w0[6] -; MIPS32R5EL-NEXT: copy_s.h $8, $w0[7] -; MIPS32R5EL-NEXT: sb $8, 23($sp) -; MIPS32R5EL-NEXT: sb $7, 22($sp) -; MIPS32R5EL-NEXT: sb $6, 21($sp) -; MIPS32R5EL-NEXT: sb $5, 20($sp) -; MIPS32R5EL-NEXT: sb $4, 19($sp) -; MIPS32R5EL-NEXT: sb $3, 18($sp) -; MIPS32R5EL-NEXT: sb $2, 17($sp) -; MIPS32R5EL-NEXT: sb $1, 16($sp) -; MIPS32R5EL-NEXT: lw $1, 20($sp) -; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: lw $1, 16($sp) -; MIPS32R5EL-NEXT: sw $1, 0($sp) -; MIPS32R5EL-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 -; MIPS32R5EL-NEXT: jr $ra -; MIPS32R5EL-NEXT: nop %1 = add <8 x i8> %a, %b ret <8 x i8> %1 } @@ -1372,60 +1277,44 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5EB-LABEL: i16_4: -; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EB-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 31, -4 -; MIPS32R5EB-NEXT: .cfi_offset 30, -8 -; MIPS32R5EB-NEXT: move $fp, $sp -; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EB-NEXT: addiu $1, $zero, -16 -; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: sw $6, 24($sp) -; MIPS32R5EB-NEXT: sw $7, 28($sp) -; MIPS32R5EB-NEXT: lhu $1, 26($sp) -; MIPS32R5EB-NEXT: lhu $2, 24($sp) -; MIPS32R5EB-NEXT: sw $4, 32($sp) -; MIPS32R5EB-NEXT: insert.w 
$w0[0], $2 -; MIPS32R5EB-NEXT: insert.w $w0[1], $1 -; MIPS32R5EB-NEXT: lhu $1, 28($sp) -; MIPS32R5EB-NEXT: sw $5, 36($sp) -; MIPS32R5EB-NEXT: insert.w $w0[2], $1 -; MIPS32R5EB-NEXT: lhu $1, 30($sp) -; MIPS32R5EB-NEXT: insert.w $w0[3], $1 -; MIPS32R5EB-NEXT: lhu $1, 34($sp) -; MIPS32R5EB-NEXT: lhu $2, 32($sp) -; MIPS32R5EB-NEXT: insert.w $w1[0], $2 -; MIPS32R5EB-NEXT: insert.w $w1[1], $1 -; MIPS32R5EB-NEXT: lhu $1, 36($sp) -; MIPS32R5EB-NEXT: insert.w $w1[2], $1 -; MIPS32R5EB-NEXT: lhu $1, 38($sp) -; MIPS32R5EB-NEXT: insert.w $w1[3], $1 -; MIPS32R5EB-NEXT: addv.w $w0, $w1, $w0 -; MIPS32R5EB-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EB-NEXT: copy_s.w $4, $w0[3] -; MIPS32R5EB-NEXT: sh $4, 22($sp) -; MIPS32R5EB-NEXT: sh $3, 20($sp) -; MIPS32R5EB-NEXT: sh $2, 18($sp) -; MIPS32R5EB-NEXT: sh $1, 16($sp) -; MIPS32R5EB-NEXT: lw $1, 20($sp) -; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: lw $1, 16($sp) -; MIPS32R5EB-NEXT: sw $1, 4($sp) -; MIPS32R5EB-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] -; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 48 -; MIPS32R5EB-NEXT: jr $ra -; MIPS32R5EB-NEXT: nop +; MIPS32R5-LABEL: i16_4: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: addiu $sp, $sp, -24 +; MIPS32R5-NEXT: .cfi_def_cfa_offset 24 +; MIPS32R5-NEXT: sw $6, 8($sp) +; MIPS32R5-NEXT: sw $7, 12($sp) +; MIPS32R5-NEXT: lhu $1, 10($sp) +; MIPS32R5-NEXT: lhu $2, 8($sp) +; MIPS32R5-NEXT: sw $4, 16($sp) +; MIPS32R5-NEXT: insert.w $w0[0], $2 +; MIPS32R5-NEXT: insert.w $w0[1], $1 +; MIPS32R5-NEXT: lhu $1, 12($sp) +; MIPS32R5-NEXT: sw $5, 20($sp) +; MIPS32R5-NEXT: insert.w $w0[2], $1 +; MIPS32R5-NEXT: lhu $1, 14($sp) +; MIPS32R5-NEXT: insert.w $w0[3], $1 +; MIPS32R5-NEXT: lhu $1, 18($sp) +; MIPS32R5-NEXT: lhu $2, 16($sp) +; MIPS32R5-NEXT: insert.w $w1[0], $2 +; MIPS32R5-NEXT: insert.w $w1[1], $1 +; MIPS32R5-NEXT: lhu $1, 20($sp) +; MIPS32R5-NEXT: insert.w $w1[2], $1 +; MIPS32R5-NEXT: lhu $1, 22($sp) +; MIPS32R5-NEXT: insert.w $w1[3], $1 +; MIPS32R5-NEXT: addv.w $w0, $w1, $w0 +; MIPS32R5-NEXT: copy_s.w $1, $w0[2] +; MIPS32R5-NEXT: copy_s.w $2, $w0[3] +; MIPS32R5-NEXT: copy_s.w $3, $w0[0] +; MIPS32R5-NEXT: copy_s.w $4, $w0[1] +; MIPS32R5-NEXT: sh $4, 2($sp) +; MIPS32R5-NEXT: sh $3, 0($sp) +; MIPS32R5-NEXT: sh $2, 6($sp) +; MIPS32R5-NEXT: sh $1, 4($sp) +; MIPS32R5-NEXT: lw $2, 0($sp) +; MIPS32R5-NEXT: lw $3, 4($sp) +; MIPS32R5-NEXT: addiu $sp, $sp, 24 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: nop ; ; MIPS64R5-LABEL: i16_4: ; MIPS64R5: # %bb.0: @@ -1462,61 +1351,6 @@ ; MIPS64R5-NEXT: daddiu $sp, $sp, 32 ; MIPS64R5-NEXT: jr $ra ; MIPS64R5-NEXT: nop -; -; MIPS32R5EL-LABEL: i16_4: -; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 31, -4 -; MIPS32R5EL-NEXT: .cfi_offset 30, -8 -; MIPS32R5EL-NEXT: move $fp, $sp -; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EL-NEXT: addiu $1, $zero, -16 -; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: sw $6, 24($sp) -; MIPS32R5EL-NEXT: sw $7, 28($sp) -; MIPS32R5EL-NEXT: lhu $1, 26($sp) -; MIPS32R5EL-NEXT: lhu $2, 24($sp) -; MIPS32R5EL-NEXT: sw $4, 32($sp) -; MIPS32R5EL-NEXT: insert.w $w0[0], $2 -; 
MIPS32R5EL-NEXT: insert.w $w0[1], $1 -; MIPS32R5EL-NEXT: lhu $1, 28($sp) -; MIPS32R5EL-NEXT: sw $5, 36($sp) -; MIPS32R5EL-NEXT: insert.w $w0[2], $1 -; MIPS32R5EL-NEXT: lhu $1, 30($sp) -; MIPS32R5EL-NEXT: insert.w $w0[3], $1 -; MIPS32R5EL-NEXT: lhu $1, 34($sp) -; MIPS32R5EL-NEXT: lhu $2, 32($sp) -; MIPS32R5EL-NEXT: insert.w $w1[0], $2 -; MIPS32R5EL-NEXT: insert.w $w1[1], $1 -; MIPS32R5EL-NEXT: lhu $1, 36($sp) -; MIPS32R5EL-NEXT: insert.w $w1[2], $1 -; MIPS32R5EL-NEXT: lhu $1, 38($sp) -; MIPS32R5EL-NEXT: insert.w $w1[3], $1 -; MIPS32R5EL-NEXT: addv.w $w0, $w1, $w0 -; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: copy_s.w $4, $w0[3] -; MIPS32R5EL-NEXT: sh $4, 22($sp) -; MIPS32R5EL-NEXT: sh $3, 20($sp) -; MIPS32R5EL-NEXT: sh $2, 18($sp) -; MIPS32R5EL-NEXT: sh $1, 16($sp) -; MIPS32R5EL-NEXT: lw $1, 20($sp) -; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: lw $1, 16($sp) -; MIPS32R5EL-NEXT: sw $1, 0($sp) -; MIPS32R5EL-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 -; MIPS32R5EL-NEXT: jr $ra -; MIPS32R5EL-NEXT: nop %1 = add <4 x i16> %a, %b ret <4 x i16> %1 } @@ -2561,33 +2395,14 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5EB-LABEL: ret_8_i8: -; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 31, -4 -; MIPS32R5EB-NEXT: .cfi_offset 30, -8 -; MIPS32R5EB-NEXT: move $fp, $sp -; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EB-NEXT: addiu $1, $zero, -16 -; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: lui $1, %hi(gv8i8) -; MIPS32R5EB-NEXT: lw $2, %lo(gv8i8)($1) -; MIPS32R5EB-NEXT: sw $2, 4($sp) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv8i8) -; MIPS32R5EB-NEXT: lw $1, 4($1) -; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] -; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EB-NEXT: jr $ra -; MIPS32R5EB-NEXT: nop +; MIPS32R5-LABEL: ret_8_i8: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: lui $1, %hi(gv8i8) +; MIPS32R5-NEXT: lw $2, %lo(gv8i8)($1) +; MIPS32R5-NEXT: addiu $1, $1, %lo(gv8i8) +; MIPS32R5-NEXT: lw $3, 4($1) +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: nop ; ; MIPS64R5-LABEL: ret_8_i8: ; MIPS64R5: # %bb.0: @@ -2598,34 +2413,6 @@ ; MIPS64R5-NEXT: ld $2, 0($1) ; MIPS64R5-NEXT: jr $ra ; MIPS64R5-NEXT: nop -; -; MIPS32R5EL-LABEL: ret_8_i8: -; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 31, -4 -; MIPS32R5EL-NEXT: .cfi_offset 30, -8 -; MIPS32R5EL-NEXT: move $fp, $sp -; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EL-NEXT: addiu $1, $zero, -16 -; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: lui $1, %hi(gv8i8) -; MIPS32R5EL-NEXT: lw $2, %lo(gv8i8)($1) -; MIPS32R5EL-NEXT: sw $2, 
0($sp) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv8i8) -; MIPS32R5EL-NEXT: lw $1, 4($1) -; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EL-NEXT: jr $ra -; MIPS32R5EL-NEXT: nop %1 = load <8 x i8>, <8 x i8> * @gv8i8 ret <8 x i8> %1 } @@ -2738,33 +2525,14 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5EB-LABEL: ret_4_i16: -; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 31, -4 -; MIPS32R5EB-NEXT: .cfi_offset 30, -8 -; MIPS32R5EB-NEXT: move $fp, $sp -; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EB-NEXT: addiu $1, $zero, -16 -; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: lui $1, %hi(gv4i16) -; MIPS32R5EB-NEXT: lw $2, %lo(gv4i16)($1) -; MIPS32R5EB-NEXT: sw $2, 4($sp) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv4i16) -; MIPS32R5EB-NEXT: lw $1, 4($1) -; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] -; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EB-NEXT: jr $ra -; MIPS32R5EB-NEXT: nop +; MIPS32R5-LABEL: ret_4_i16: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: lui $1, %hi(gv4i16) +; MIPS32R5-NEXT: lw $2, %lo(gv4i16)($1) +; MIPS32R5-NEXT: addiu $1, $1, %lo(gv4i16) +; MIPS32R5-NEXT: lw $3, 4($1) +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: nop ; ; MIPS64R5-LABEL: ret_4_i16: ; MIPS64R5: # %bb.0: @@ -2775,34 +2543,6 @@ ; MIPS64R5-NEXT: ld $2, 0($1) ; MIPS64R5-NEXT: jr $ra ; MIPS64R5-NEXT: nop -; -; MIPS32R5EL-LABEL: ret_4_i16: -; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 31, -4 -; MIPS32R5EL-NEXT: .cfi_offset 30, -8 -; MIPS32R5EL-NEXT: move $fp, $sp -; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EL-NEXT: addiu $1, $zero, -16 -; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: lui $1, %hi(gv4i16) -; MIPS32R5EL-NEXT: lw $2, %lo(gv4i16)($1) -; MIPS32R5EL-NEXT: sw $2, 0($sp) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv4i16) -; MIPS32R5EL-NEXT: lw $1, 4($1) -; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EL-NEXT: jr $ra -; MIPS32R5EL-NEXT: nop %1 = load <4 x i16>, <4 x i16> * @gv4i16 ret <4 x i16> %1 } @@ -2877,33 +2617,14 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5EB-LABEL: ret_2_i32: -; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; 
MIPS32R5EB-NEXT: .cfi_offset 31, -4 -; MIPS32R5EB-NEXT: .cfi_offset 30, -8 -; MIPS32R5EB-NEXT: move $fp, $sp -; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EB-NEXT: addiu $1, $zero, -16 -; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: lui $1, %hi(gv2i32) -; MIPS32R5EB-NEXT: lw $2, %lo(gv2i32)($1) -; MIPS32R5EB-NEXT: sw $2, 4($sp) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv2i32) -; MIPS32R5EB-NEXT: lw $1, 4($1) -; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] -; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EB-NEXT: jr $ra -; MIPS32R5EB-NEXT: nop +; MIPS32R5-LABEL: ret_2_i32: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: lui $1, %hi(gv2i32) +; MIPS32R5-NEXT: lw $2, %lo(gv2i32)($1) +; MIPS32R5-NEXT: addiu $1, $1, %lo(gv2i32) +; MIPS32R5-NEXT: lw $3, 4($1) +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: nop ; ; MIPS64R5-LABEL: ret_2_i32: ; MIPS64R5: # %bb.0: @@ -2914,34 +2635,6 @@ ; MIPS64R5-NEXT: ld $2, 0($1) ; MIPS64R5-NEXT: jr $ra ; MIPS64R5-NEXT: nop -; -; MIPS32R5EL-LABEL: ret_2_i32: -; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 31, -4 -; MIPS32R5EL-NEXT: .cfi_offset 30, -8 -; MIPS32R5EL-NEXT: move $fp, $sp -; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EL-NEXT: addiu $1, $zero, -16 -; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: lui $1, %hi(gv2i32) -; MIPS32R5EL-NEXT: lw $2, %lo(gv2i32)($1) -; MIPS32R5EL-NEXT: sw $2, 0($sp) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv2i32) -; MIPS32R5EL-NEXT: lw $1, 4($1) -; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EL-NEXT: jr $ra -; MIPS32R5EL-NEXT: nop %1 = load <2 x i32>, <2 x i32> * @gv2i32 ret <2 x i32> %1 } @@ -3857,77 +3550,81 @@ ; MIPS64EB-NEXT: jr $ra ; MIPS64EB-NEXT: nop ; -; MIPS32R5-LABEL: calli8_16: -; MIPS32R5: # %bb.0: # %entry -; MIPS32R5-NEXT: addiu $sp, $sp, -40 -; MIPS32R5-NEXT: .cfi_def_cfa_offset 40 -; MIPS32R5-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 31, -4 -; MIPS32R5-NEXT: lui $1, %hi($CPI30_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI30_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5-NEXT: lui $1, %hi($CPI30_1) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI30_1) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) -; MIPS32R5-NEXT: sw $1, 16($sp) -; MIPS32R5-NEXT: jal i8_16 -; MIPS32R5-NEXT: nop -; MIPS32R5-NEXT: lui $1, %hi(gv16i8) -; MIPS32R5-NEXT: insert.w $w0[0], $2 -; MIPS32R5-NEXT: insert.w $w0[1], $3 -; MIPS32R5-NEXT: addiu $1, $1, %lo(gv16i8) 
-; MIPS32R5-NEXT: insert.w $w0[2], $4 -; MIPS32R5-NEXT: insert.w $w0[3], $5 -; MIPS32R5-NEXT: st.w $w0, 0($1) -; MIPS32R5-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: addiu $sp, $sp, 40 -; MIPS32R5-NEXT: jr $ra -; MIPS32R5-NEXT: nop +; MIPS32R5EB-LABEL: calli8_16: +; MIPS32R5EB: # %bb.0: # %entry +; MIPS32R5EB-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: lui $1, 3080 +; MIPS32R5EB-NEXT: ori $1, $1, 2314 +; MIPS32R5EB-NEXT: lui $2, 1801 +; MIPS32R5EB-NEXT: sw $1, 28($sp) +; MIPS32R5EB-NEXT: ori $1, $2, 1801 +; MIPS32R5EB-NEXT: sw $1, 24($sp) +; MIPS32R5EB-NEXT: sw $1, 20($sp) +; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: lui $1, 1543 +; MIPS32R5EB-NEXT: ori $4, $1, 1543 +; MIPS32R5EB-NEXT: ori $7, $1, 2314 +; MIPS32R5EB-NEXT: move $5, $4 +; MIPS32R5EB-NEXT: move $6, $4 +; MIPS32R5EB-NEXT: jal i8_16 +; MIPS32R5EB-NEXT: nop +; MIPS32R5EB-NEXT: insert.w $w0[0], $2 +; MIPS32R5EB-NEXT: insert.w $w0[1], $3 +; MIPS32R5EB-NEXT: insert.w $w0[2], $4 +; MIPS32R5EB-NEXT: lui $1, %hi(gv16i8) +; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv16i8) +; MIPS32R5EB-NEXT: st.w $w0, 0($1) +; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EB-NEXT: jr $ra +; MIPS32R5EB-NEXT: nop ; -; MIPS64R5-LABEL: calli8_16: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI30_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI30_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI30_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI30_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] -; MIPS64R5-NEXT: ld $25, %call16(i8_16)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: insert.d $w0[0], $2 -; MIPS64R5-NEXT: insert.d $w0[1], $3 -; MIPS64R5-NEXT: ld $1, %got_disp(gv16i8)($gp) -; MIPS64R5-NEXT: st.d $w0, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: calli8_16: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16))) +; MIPS64R5EB-NEXT: lui $1, 1801 +; MIPS64R5EB-NEXT: daddiu $1, $1, 1801 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $1, $1, 1801 +; MIPS64R5EB-NEXT: lui $2, 1543 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $2, $2, 
1543 +; MIPS64R5EB-NEXT: dsll $2, $2, 16 +; MIPS64R5EB-NEXT: daddiu $2, $2, 1543 +; MIPS64R5EB-NEXT: dsll $2, $2, 16 +; MIPS64R5EB-NEXT: daddiu $4, $2, 1543 +; MIPS64R5EB-NEXT: daddiu $5, $2, 2314 +; MIPS64R5EB-NEXT: daddiu $6, $1, 1801 +; MIPS64R5EB-NEXT: lui $1, 225 +; MIPS64R5EB-NEXT: daddiu $1, $1, 8417 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $1, $1, 8577 +; MIPS64R5EB-NEXT: dsll $1, $1, 19 +; MIPS64R5EB-NEXT: daddiu $7, $1, 2314 +; MIPS64R5EB-NEXT: ld $25, %call16(i8_16)($gp) +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv16i8)($gp) +; MIPS64R5EB-NEXT: insert.d $w0[0], $2 +; MIPS64R5EB-NEXT: insert.d $w0[1], $3 +; MIPS64R5EB-NEXT: st.d $w0, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS32EL-LABEL: calli8_16: ; MIPS32EL: # %bb.0: # %entry @@ -4007,6 +3704,87 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS32R5EL-LABEL: calli8_16: +; MIPS32R5EL: # %bb.0: # %entry +; MIPS32R5EL-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: lui $1, 2569 +; MIPS32R5EL-NEXT: ori $2, $1, 2060 +; MIPS32R5EL-NEXT: lui $3, 2311 +; MIPS32R5EL-NEXT: sw $2, 28($sp) +; MIPS32R5EL-NEXT: ori $2, $3, 2311 +; MIPS32R5EL-NEXT: sw $2, 24($sp) +; MIPS32R5EL-NEXT: sw $2, 20($sp) +; MIPS32R5EL-NEXT: sw $2, 16($sp) +; MIPS32R5EL-NEXT: lui $2, 1798 +; MIPS32R5EL-NEXT: ori $4, $2, 1798 +; MIPS32R5EL-NEXT: ori $7, $1, 1798 +; MIPS32R5EL-NEXT: move $5, $4 +; MIPS32R5EL-NEXT: move $6, $4 +; MIPS32R5EL-NEXT: jal i8_16 +; MIPS32R5EL-NEXT: nop +; MIPS32R5EL-NEXT: insert.w $w0[0], $2 +; MIPS32R5EL-NEXT: insert.w $w0[1], $3 +; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv16i8) +; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv16i8) +; MIPS32R5EL-NEXT: st.w $w0, 0($1) +; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop +; +; MIPS64R5EL-LABEL: calli8_16: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16))) +; MIPS64R5EL-NEXT: lui $1, 1285 +; MIPS64R5EL-NEXT: daddiu $1, $1, -31869 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 899 +; MIPS64R5EL-NEXT: lui $2, 2311 +; MIPS64R5EL-NEXT: daddiu $2, $2, 2311 +; MIPS64R5EL-NEXT: dsll $2, $2, 16 +; MIPS64R5EL-NEXT: daddiu $2, $2, 2311 +; MIPS64R5EL-NEXT: dsll $2, $2, 16 +; MIPS64R5EL-NEXT: dsll $1, $1, 17 +; MIPS64R5EL-NEXT: lui $3, 899 +; MIPS64R5EL-NEXT: daddiu $3, $3, 899 +; MIPS64R5EL-NEXT: dsll $3, $3, 16 +; MIPS64R5EL-NEXT: daddiu $3, $3, 899 +; MIPS64R5EL-NEXT: dsll $3, $3, 17 +; MIPS64R5EL-NEXT: daddiu $4, $3, 1798 +; MIPS64R5EL-NEXT: daddiu $5, $1, 1798 +; MIPS64R5EL-NEXT: daddiu $6, $2, 2311 +; MIPS64R5EL-NEXT: lui $1, 642 +; MIPS64R5EL-NEXT: daddiu $1, $1, 16899 +; 
MIPS64R5EL-NEXT: dsll $1, $1, 18 +; MIPS64R5EL-NEXT: daddiu $1, $1, 2311 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $7, $1, 2311 +; MIPS64R5EL-NEXT: ld $25, %call16(i8_16)($gp) +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv16i8)($gp) +; MIPS64R5EL-NEXT: insert.d $w0[0], $2 +; MIPS64R5EL-NEXT: insert.d $w0[1], $3 +; MIPS64R5EL-NEXT: st.d $w0, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <16 x i8> @i8_16(<16 x i8> , <16 x i8> ) store <16 x i8> %0, <16 x i8> * @gv16i8 @@ -4512,36 +4290,26 @@ ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40 ; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill ; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: lui $1, 9 +; MIPS32R5EB-NEXT: ori $5, $1, 10 +; MIPS32R5EB-NEXT: sw $5, 28($sp) +; MIPS32R5EB-NEXT: lui $1, 12 +; MIPS32R5EB-NEXT: ori $1, $1, 8 +; MIPS32R5EB-NEXT: sw $1, 24($sp) +; MIPS32R5EB-NEXT: sw $5, 20($sp) ; MIPS32R5EB-NEXT: lui $1, 6 -; MIPS32R5EB-NEXT: ori $1, $1, 7 -; MIPS32R5EB-NEXT: lui $2, 9 -; MIPS32R5EB-NEXT: ori $2, $2, 10 -; MIPS32R5EB-NEXT: fill.w $w0, $2 -; MIPS32R5EB-NEXT: insert.w $w0[1], $1 -; MIPS32R5EB-NEXT: splati.d $w0, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5EB-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5EB-NEXT: lui $1, %hi($CPI33_0) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo($CPI33_0) -; MIPS32R5EB-NEXT: ld.w $w0, 0($1) -; MIPS32R5EB-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EB-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5EB-NEXT: sw $8, 28($sp) -; MIPS32R5EB-NEXT: sw $3, 24($sp) -; MIPS32R5EB-NEXT: sw $2, 20($sp) -; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: ori $4, $1, 7 +; MIPS32R5EB-NEXT: sw $4, 16($sp) +; MIPS32R5EB-NEXT: move $6, $4 +; MIPS32R5EB-NEXT: move $7, $5 ; MIPS32R5EB-NEXT: jal i16_8 ; MIPS32R5EB-NEXT: nop -; MIPS32R5EB-NEXT: lui $1, %hi(gv8i16) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EB-NEXT: insert.w $w0[0], $2 ; MIPS32R5EB-NEXT: insert.w $w0[1], $3 ; MIPS32R5EB-NEXT: insert.w $w0[2], $4 +; MIPS32R5EB-NEXT: lui $1, %hi(gv8i16) ; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EB-NEXT: st.w $w0, 0($1) ; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload ; MIPS32R5EB-NEXT: addiu $sp, $sp, 40 @@ -4559,20 +4327,21 @@ ; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_8))) ; MIPS64R5EB-NEXT: daddu $1, $1, $25 ; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli16_8))) -; MIPS64R5EB-NEXT: lui $1, 9 -; MIPS64R5EB-NEXT: ori $1, $1, 10 -; MIPS64R5EB-NEXT: lui $2, 6 -; MIPS64R5EB-NEXT: ori $2, $2, 7 -; MIPS64R5EB-NEXT: dinsu $1, $2, 32, 32 -; MIPS64R5EB-NEXT: fill.d $w0, $1 -; MIPS64R5EB-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5EB-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5EB-NEXT: ld $1, %got_page(.LCPI33_0)($gp) -; MIPS64R5EB-NEXT: daddiu $1, $1, %got_ofst(.LCPI33_0) -; MIPS64R5EB-NEXT: ld.d $w0, 0($1) -; MIPS64R5EB-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5EB-NEXT: copy_s.d $7, $w0[1] +; MIPS64R5EB-NEXT: lui $1, 6 +; MIPS64R5EB-NEXT: daddiu $1, $1, 7 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $1, $1, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $4, $1, 10 +; MIPS64R5EB-NEXT: lui $1, 2 +; MIPS64R5EB-NEXT: daddiu 
$1, $1, -32767 +; MIPS64R5EB-NEXT: dsll $1, $1, 19 +; MIPS64R5EB-NEXT: daddiu $1, $1, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $7, $1, 10 ; MIPS64R5EB-NEXT: ld $25, %call16(i16_8)($gp) +; MIPS64R5EB-NEXT: move $5, $4 +; MIPS64R5EB-NEXT: move $6, $4 ; MIPS64R5EB-NEXT: jalr $25 ; MIPS64R5EB-NEXT: nop ; MIPS64R5EB-NEXT: ld $1, %got_disp(gv8i16)($gp) @@ -4660,35 +4429,25 @@ ; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill ; MIPS32R5EL-NEXT: .cfi_offset 31, -4 ; MIPS32R5EL-NEXT: lui $1, 10 -; MIPS32R5EL-NEXT: ori $1, $1, 9 -; MIPS32R5EL-NEXT: lui $2, 7 -; MIPS32R5EL-NEXT: ori $2, $2, 6 -; MIPS32R5EL-NEXT: fill.w $w0, $2 -; MIPS32R5EL-NEXT: insert.w $w0[1], $1 -; MIPS32R5EL-NEXT: splati.d $w0, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5EL-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5EL-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5EL-NEXT: lui $1, %hi($CPI33_0) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo($CPI33_0) -; MIPS32R5EL-NEXT: ld.w $w0, 0($1) -; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5EL-NEXT: sw $8, 28($sp) -; MIPS32R5EL-NEXT: sw $3, 24($sp) -; MIPS32R5EL-NEXT: sw $2, 20($sp) -; MIPS32R5EL-NEXT: sw $1, 16($sp) +; MIPS32R5EL-NEXT: ori $5, $1, 9 +; MIPS32R5EL-NEXT: sw $5, 28($sp) +; MIPS32R5EL-NEXT: lui $1, 8 +; MIPS32R5EL-NEXT: ori $1, $1, 12 +; MIPS32R5EL-NEXT: sw $1, 24($sp) +; MIPS32R5EL-NEXT: sw $5, 20($sp) +; MIPS32R5EL-NEXT: lui $1, 7 +; MIPS32R5EL-NEXT: ori $4, $1, 6 +; MIPS32R5EL-NEXT: sw $4, 16($sp) +; MIPS32R5EL-NEXT: move $6, $4 +; MIPS32R5EL-NEXT: move $7, $5 ; MIPS32R5EL-NEXT: jal i16_8 ; MIPS32R5EL-NEXT: nop -; MIPS32R5EL-NEXT: lui $1, %hi(gv8i16) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EL-NEXT: insert.w $w0[0], $2 ; MIPS32R5EL-NEXT: insert.w $w0[1], $3 ; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv8i16) ; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EL-NEXT: st.w $w0, 0($1) ; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload ; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 @@ -4706,20 +4465,21 @@ ; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_8))) ; MIPS64R5EL-NEXT: daddu $1, $1, $25 ; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli16_8))) -; MIPS64R5EL-NEXT: lui $1, 7 -; MIPS64R5EL-NEXT: ori $1, $1, 6 -; MIPS64R5EL-NEXT: lui $2, 10 -; MIPS64R5EL-NEXT: ori $2, $2, 9 -; MIPS64R5EL-NEXT: dinsu $1, $2, 32, 32 -; MIPS64R5EL-NEXT: fill.d $w0, $1 -; MIPS64R5EL-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5EL-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5EL-NEXT: ld $1, %got_page(.LCPI33_0)($gp) -; MIPS64R5EL-NEXT: daddiu $1, $1, %got_ofst(.LCPI33_0) -; MIPS64R5EL-NEXT: ld.d $w0, 0($1) -; MIPS64R5EL-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5EL-NEXT: copy_s.d $7, $w0[1] +; MIPS64R5EL-NEXT: lui $1, 10 +; MIPS64R5EL-NEXT: daddiu $1, $1, 9 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 7 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $4, $1, 6 +; MIPS64R5EL-NEXT: lui $1, 1 +; MIPS64R5EL-NEXT: daddiu $1, $1, 16385 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 8193 +; MIPS64R5EL-NEXT: dsll $1, $1, 19 +; MIPS64R5EL-NEXT: daddiu $7, $1, 12 ; MIPS64R5EL-NEXT: ld $25, %call16(i16_8)($gp) +; MIPS64R5EL-NEXT: move $5, $4 +; MIPS64R5EL-NEXT: move $6, $4 ; MIPS64R5EL-NEXT: jalr $25 ; MIPS64R5EL-NEXT: nop ; MIPS64R5EL-NEXT: ld $1, %got_disp(gv8i16)($gp) @@ -4991,39 
+4751,38 @@ ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; MIPS64R5-LABEL: calli32_4: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_4))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI35_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI35_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI35_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI35_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] -; MIPS64R5-NEXT: ld $25, %call16(i32_4)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: insert.d $w0[0], $2 -; MIPS64R5-NEXT: insert.d $w0[1], $3 -; MIPS64R5-NEXT: ld $1, %got_disp(gv4i32)($gp) -; MIPS64R5-NEXT: st.d $w0, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: calli32_4: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_4))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) +; MIPS64R5EB-NEXT: daddiu $1, $zero, 3 +; MIPS64R5EB-NEXT: dsll $2, $1, 33 +; MIPS64R5EB-NEXT: daddiu $4, $2, 7 +; MIPS64R5EB-NEXT: dsll $1, $1, 34 +; MIPS64R5EB-NEXT: daddiu $6, $1, 8 +; MIPS64R5EB-NEXT: daddiu $1, $zero, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 32 +; MIPS64R5EB-NEXT: daddiu $5, $1, 10 +; MIPS64R5EB-NEXT: ld $25, %call16(i32_4)($gp) +; MIPS64R5EB-NEXT: move $7, $5 +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: insert.d $w0[0], $2 +; MIPS64R5EB-NEXT: insert.d $w0[1], $3 +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv4i32)($gp) +; MIPS64R5EB-NEXT: st.d $w0, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS64EL-LABEL: calli32_4: ; MIPS64EL: # %bb.0: # %entry @@ -5057,6 +4816,40 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS64R5EL-LABEL: calli32_4: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_4))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) +; MIPS64R5EL-NEXT: daddiu $1, $zero, 7 +; MIPS64R5EL-NEXT: dsll $1, $1, 32 +; MIPS64R5EL-NEXT: daddiu $4, $1, 6 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 
1 +; MIPS64R5EL-NEXT: dsll $1, $1, 35 +; MIPS64R5EL-NEXT: daddiu $6, $1, 12 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 5 +; MIPS64R5EL-NEXT: dsll $1, $1, 33 +; MIPS64R5EL-NEXT: daddiu $5, $1, 9 +; MIPS64R5EL-NEXT: ld $25, %call16(i32_4)($gp) +; MIPS64R5EL-NEXT: move $7, $5 +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: insert.d $w0[0], $2 +; MIPS64R5EL-NEXT: insert.d $w0[1], $3 +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4i32)($gp) +; MIPS64R5EL-NEXT: st.d $w0, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <4 x i32> @i32_4(<4 x i32> , <4 x i32> ) store <4 x i32> %0, <4 x i32> * @gv4i32 @@ -5120,43 +4913,35 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5-LABEL: calli64_2: -; MIPS32R5: # %bb.0: # %entry -; MIPS32R5-NEXT: addiu $sp, $sp, -40 -; MIPS32R5-NEXT: .cfi_def_cfa_offset 40 -; MIPS32R5-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 31, -4 -; MIPS32R5-NEXT: lui $1, %hi($CPI36_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI36_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5-NEXT: lui $1, %hi($CPI36_1) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI36_1) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) -; MIPS32R5-NEXT: sw $1, 16($sp) -; MIPS32R5-NEXT: jal i64_2 -; MIPS32R5-NEXT: nop -; MIPS32R5-NEXT: lui $1, %hi(gv2i64) -; MIPS32R5-NEXT: insert.w $w0[0], $2 -; MIPS32R5-NEXT: insert.w $w0[1], $3 -; MIPS32R5-NEXT: addiu $1, $1, %lo(gv2i64) -; MIPS32R5-NEXT: insert.w $w0[2], $4 -; MIPS32R5-NEXT: insert.w $w0[3], $5 -; MIPS32R5-NEXT: st.w $w0, 0($1) -; MIPS32R5-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: addiu $sp, $sp, 40 -; MIPS32R5-NEXT: jr $ra -; MIPS32R5-NEXT: nop +; MIPS32R5EB-LABEL: calli64_2: +; MIPS32R5EB: # %bb.0: # %entry +; MIPS32R5EB-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: addiu $1, $zero, 8 +; MIPS32R5EB-NEXT: sw $1, 28($sp) +; MIPS32R5EB-NEXT: addiu $1, $zero, 12 +; MIPS32R5EB-NEXT: sw $1, 20($sp) +; MIPS32R5EB-NEXT: sw $zero, 24($sp) +; MIPS32R5EB-NEXT: sw $zero, 16($sp) +; MIPS32R5EB-NEXT: addiu $4, $zero, 0 +; MIPS32R5EB-NEXT: addiu $5, $zero, 6 +; MIPS32R5EB-NEXT: addiu $6, $zero, 0 +; MIPS32R5EB-NEXT: addiu $7, $zero, 7 +; MIPS32R5EB-NEXT: jal i64_2 +; MIPS32R5EB-NEXT: nop +; MIPS32R5EB-NEXT: insert.w $w0[0], $2 +; MIPS32R5EB-NEXT: insert.w $w0[1], $3 +; MIPS32R5EB-NEXT: insert.w $w0[2], $4 +; MIPS32R5EB-NEXT: lui $1, %hi(gv2i64) +; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv2i64) +; MIPS32R5EB-NEXT: st.w $w0, 0($1) +; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EB-NEXT: jr $ra +; MIPS32R5EB-NEXT: nop ; ; MIPS64R5-LABEL: calli64_2: ; MIPS64R5: # %bb.0: # %entry @@ -5214,6 +4999,36 @@ ; MIPS32EL-NEXT: addiu $sp, $sp, 40 ; MIPS32EL-NEXT: jr $ra ; MIPS32EL-NEXT: nop +; +; MIPS32R5EL-LABEL: calli64_2: +; MIPS32R5EL: 
# %bb.0: # %entry +; MIPS32R5EL-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: addiu $1, $zero, 8 +; MIPS32R5EL-NEXT: sw $1, 24($sp) +; MIPS32R5EL-NEXT: addiu $1, $zero, 12 +; MIPS32R5EL-NEXT: sw $1, 16($sp) +; MIPS32R5EL-NEXT: sw $zero, 28($sp) +; MIPS32R5EL-NEXT: sw $zero, 20($sp) +; MIPS32R5EL-NEXT: addiu $4, $zero, 6 +; MIPS32R5EL-NEXT: addiu $5, $zero, 0 +; MIPS32R5EL-NEXT: addiu $6, $zero, 7 +; MIPS32R5EL-NEXT: addiu $7, $zero, 0 +; MIPS32R5EL-NEXT: jal i64_2 +; MIPS32R5EL-NEXT: nop +; MIPS32R5EL-NEXT: insert.w $w0[0], $2 +; MIPS32R5EL-NEXT: insert.w $w0[1], $3 +; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv2i64) +; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv2i64) +; MIPS32R5EL-NEXT: st.w $w0, 0($1) +; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop entry: %0 = call <2 x i64> @i64_2(<2 x i64> , <2 x i64> ) store <2 x i64> %0, <2 x i64> * @gv2i64 @@ -5305,35 +5120,33 @@ ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; MIPS64R5-LABEL: callfloat_2: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI37_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI37_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI37_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI37_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $5, $w0[0] -; MIPS64R5-NEXT: ld $25, %call16(float2_extern)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: ld $1, %got_disp(gv2f32)($gp) -; MIPS64R5-NEXT: sd $2, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: callfloat_2: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EB-NEXT: daddiu $1, $zero, 383 +; MIPS64R5EB-NEXT: dsll $4, $1, 23 +; MIPS64R5EB-NEXT: daddiu $1, $zero, 261 +; MIPS64R5EB-NEXT: dsll $1, $1, 33 +; MIPS64R5EB-NEXT: daddiu $1, $1, 523 +; MIPS64R5EB-NEXT: dsll $5, $1, 21 +; MIPS64R5EB-NEXT: ld $25, %call16(float2_extern)($gp) +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv2f32)($gp) +; MIPS64R5EB-NEXT: sd $2, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; 
MIPS64R5EB-NEXT: jr $ra
+; MIPS64R5EB-NEXT: nop
;
; MIPS64EL-LABEL: callfloat_2:
; MIPS64EL: # %bb.0: # %entry
@@ -5362,6 +5175,34 @@
; MIPS64EL-NEXT: daddiu $sp, $sp, 16
; MIPS64EL-NEXT: jr $ra
; MIPS64EL-NEXT: nop
+;
+; MIPS64R5EL-LABEL: callfloat_2:
+; MIPS64R5EL: # %bb.0: # %entry
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: .cfi_offset 31, -8
+; MIPS64R5EL-NEXT: .cfi_offset 28, -16
+; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2)))
+; MIPS64R5EL-NEXT: daddu $1, $1, $25
+; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2)))
+; MIPS64R5EL-NEXT: daddiu $1, $zero, 383
+; MIPS64R5EL-NEXT: dsll $4, $1, 55
+; MIPS64R5EL-NEXT: daddiu $1, $zero, 523
+; MIPS64R5EL-NEXT: dsll $1, $1, 31
+; MIPS64R5EL-NEXT: daddiu $1, $1, 261
+; MIPS64R5EL-NEXT: dsll $5, $1, 22
+; MIPS64R5EL-NEXT: ld $25, %call16(float2_extern)($gp)
+; MIPS64R5EL-NEXT: jalr $25
+; MIPS64R5EL-NEXT: nop
+; MIPS64R5EL-NEXT: ld $1, %got_disp(gv2f32)($gp)
+; MIPS64R5EL-NEXT: sd $2, 0($1)
+; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16
+; MIPS64R5EL-NEXT: jr $ra
+; MIPS64R5EL-NEXT: nop
entry:
%0 = call <2 x float> @float2_extern(<2 x float> , <2 x float> )
store <2 x float> %0, <2 x float> * @gv2f32
@@ -5464,27 +5305,21 @@
; MIPS32R5-NEXT: .cfi_def_cfa_register 30
; MIPS32R5-NEXT: addiu $1, $zero, -16
; MIPS32R5-NEXT: and $sp, $sp, $1
-; MIPS32R5-NEXT: lui $1, %hi($CPI38_0)
-; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI38_0)
-; MIPS32R5-NEXT: ld.w $w0, 0($1)
-; MIPS32R5-NEXT: copy_s.w $6, $w0[0]
-; MIPS32R5-NEXT: copy_s.w $7, $w0[1]
-; MIPS32R5-NEXT: copy_s.w $1, $w0[2]
-; MIPS32R5-NEXT: copy_s.w $2, $w0[3]
-; MIPS32R5-NEXT: lui $3, %hi($CPI38_1)
-; MIPS32R5-NEXT: addiu $3, $3, %lo($CPI38_1)
-; MIPS32R5-NEXT: ld.w $w0, 0($3)
-; MIPS32R5-NEXT: copy_s.w $3, $w0[0]
-; MIPS32R5-NEXT: copy_s.w $4, $w0[1]
-; MIPS32R5-NEXT: copy_s.w $5, $w0[2]
-; MIPS32R5-NEXT: copy_s.w $8, $w0[3]
-; MIPS32R5-NEXT: sw $8, 36($sp)
-; MIPS32R5-NEXT: sw $5, 32($sp)
-; MIPS32R5-NEXT: sw $4, 28($sp)
-; MIPS32R5-NEXT: sw $3, 24($sp)
-; MIPS32R5-NEXT: sw $2, 20($sp)
+; MIPS32R5-NEXT: lui $1, 16704
+; MIPS32R5-NEXT: lui $2, 16736
+; MIPS32R5-NEXT: lui $3, 16752
+; MIPS32R5-NEXT: lui $4, 16768
+; MIPS32R5-NEXT: sw $4, 36($sp)
+; MIPS32R5-NEXT: sw $3, 32($sp)
+; MIPS32R5-NEXT: sw $2, 28($sp)
+; MIPS32R5-NEXT: sw $1, 24($sp)
+; MIPS32R5-NEXT: lui $1, 16512
+; MIPS32R5-NEXT: sw $1, 20($sp)
+; MIPS32R5-NEXT: lui $1, 16384
; MIPS32R5-NEXT: sw $1, 16($sp)
; MIPS32R5-NEXT: addiu $4, $sp, 48
+; MIPS32R5-NEXT: addiu $6, $zero, 0
+; MIPS32R5-NEXT: lui $7, 49024
; MIPS32R5-NEXT: jal float4_extern
; MIPS32R5-NEXT: nop
; MIPS32R5-NEXT: lui $1, %hi(gv4f32)
@@ -5498,39 +5333,43 @@
; MIPS32R5-NEXT: jr $ra
; MIPS32R5-NEXT: nop
;
-; MIPS64R5-LABEL: callfloat_4:
-; MIPS64R5: # %bb.0: # %entry
-; MIPS64R5-NEXT: daddiu $sp, $sp, -16
-; MIPS64R5-NEXT: .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
-; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
-; MIPS64R5-NEXT: .cfi_offset 31, -8
-; MIPS64R5-NEXT: .cfi_offset 28, -16
-; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4)))
-; MIPS64R5-NEXT: daddu $1, $1, $25
-; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4)))
-; MIPS64R5-NEXT: ld $1, %got_page(.LCPI38_0)($gp)
-; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI38_0)
-; MIPS64R5-NEXT: ld.d $w0, 0($1)
-; MIPS64R5-NEXT: copy_s.d $4, $w0[0]
-; MIPS64R5-NEXT: copy_s.d $5, $w0[1]
-; MIPS64R5-NEXT: ld $1, %got_page(.LCPI38_1)($gp)
-; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI38_1)
-; MIPS64R5-NEXT: ld.d $w0, 0($1)
-; MIPS64R5-NEXT: copy_s.d $6, $w0[0]
-; MIPS64R5-NEXT: copy_s.d $7, $w0[1]
-; MIPS64R5-NEXT: ld $25, %call16(float4_extern)($gp)
-; MIPS64R5-NEXT: jalr $25
-; MIPS64R5-NEXT: nop
-; MIPS64R5-NEXT: insert.d $w0[0], $2
-; MIPS64R5-NEXT: insert.d $w0[1], $3
-; MIPS64R5-NEXT: ld $1, %got_disp(gv4f32)($gp)
-; MIPS64R5-NEXT: st.d $w0, 0($1)
-; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
-; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
-; MIPS64R5-NEXT: daddiu $sp, $sp, 16
-; MIPS64R5-NEXT: jr $ra
-; MIPS64R5-NEXT: nop
+; MIPS64R5EB-LABEL: callfloat_4:
+; MIPS64R5EB: # %bb.0: # %entry
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT: .cfi_offset 31, -8
+; MIPS64R5EB-NEXT: .cfi_offset 28, -16
+; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4)))
+; MIPS64R5EB-NEXT: daddu $1, $1, $25
+; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4)))
+; MIPS64R5EB-NEXT: daddiu $1, $zero, 1
+; MIPS64R5EB-NEXT: dsll $1, $1, 39
+; MIPS64R5EB-NEXT: daddiu $1, $1, 129
+; MIPS64R5EB-NEXT: daddiu $2, $zero, 261
+; MIPS64R5EB-NEXT: dsll $2, $2, 33
+; MIPS64R5EB-NEXT: daddiu $3, $zero, 383
+; MIPS64R5EB-NEXT: dsll $4, $3, 23
+; MIPS64R5EB-NEXT: dsll $5, $1, 23
+; MIPS64R5EB-NEXT: daddiu $1, $2, 523
+; MIPS64R5EB-NEXT: dsll $6, $1, 21
+; MIPS64R5EB-NEXT: daddiu $1, $zero, 1047
+; MIPS64R5EB-NEXT: dsll $1, $1, 29
+; MIPS64R5EB-NEXT: daddiu $1, $1, 131
+; MIPS64R5EB-NEXT: dsll $7, $1, 23
+; MIPS64R5EB-NEXT: ld $25, %call16(float4_extern)($gp)
+; MIPS64R5EB-NEXT: jalr $25
+; MIPS64R5EB-NEXT: nop
+; MIPS64R5EB-NEXT: insert.d $w0[0], $2
+; MIPS64R5EB-NEXT: insert.d $w0[1], $3
+; MIPS64R5EB-NEXT: ld $1, %got_disp(gv4f32)($gp)
+; MIPS64R5EB-NEXT: st.d $w0, 0($1)
+; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16
+; MIPS64R5EB-NEXT: jr $ra
+; MIPS64R5EB-NEXT: nop
;
; MIPS64EL-LABEL: callfloat_4:
; MIPS64EL: # %bb.0: # %entry
@@ -5568,6 +5407,44 @@
; MIPS64EL-NEXT: daddiu $sp, $sp, 16
; MIPS64EL-NEXT: jr $ra
; MIPS64EL-NEXT: nop
+;
+; MIPS64R5EL-LABEL: callfloat_4:
+; MIPS64R5EL: # %bb.0: # %entry
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT: .cfi_offset 31, -8
+; MIPS64R5EL-NEXT: .cfi_offset 28, -16
+; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4)))
+; MIPS64R5EL-NEXT: daddu $1, $1, $25
+; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4)))
+; MIPS64R5EL-NEXT: daddiu $1, $zero, 129
+; MIPS64R5EL-NEXT: dsll $1, $1, 25
+; MIPS64R5EL-NEXT: daddiu $1, $1, 1
+; MIPS64R5EL-NEXT: daddiu $2, $zero, 523
+; MIPS64R5EL-NEXT: dsll $2, $2, 31
+; MIPS64R5EL-NEXT: daddiu $3, $zero, 383
+; MIPS64R5EL-NEXT: dsll $4, $3, 55
+; MIPS64R5EL-NEXT: dsll $5, $1, 30
+; MIPS64R5EL-NEXT: daddiu $1, $2, 261
+; MIPS64R5EL-NEXT: dsll $6, $1, 22
+; MIPS64R5EL-NEXT: daddiu $1, $zero, 131
+; MIPS64R5EL-NEXT: dsll $1, $1, 35
+; MIPS64R5EL-NEXT: daddiu $1, $1, 1047
+; MIPS64R5EL-NEXT: dsll $7, $1, 20
+; MIPS64R5EL-NEXT: ld $25, %call16(float4_extern)($gp)
+; MIPS64R5EL-NEXT: jalr $25
+; MIPS64R5EL-NEXT: nop
+; MIPS64R5EL-NEXT: insert.d $w0[0], $2
+; MIPS64R5EL-NEXT: insert.d $w0[1], $3
+; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4f32)($gp)
+; MIPS64R5EL-NEXT: st.d $w0, 0($1)
+; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16
+; MIPS64R5EL-NEXT: jr $ra
+; MIPS64R5EL-NEXT: nop
entry:
%0 = call <4 x float> @float4_extern(<4 x float> , <4 x float> )
store <4 x float> %0, <4 x float> * @gv4f32
@@ -5644,51 +5521,42 @@
; MIPS64-NEXT: jr $ra
; MIPS64-NEXT: nop
;
-; MIPS32R5-LABEL: calldouble_2:
-; MIPS32R5: # %bb.0: # %entry
-; MIPS32R5-NEXT: addiu $sp, $sp, -80
-; MIPS32R5-NEXT: .cfi_def_cfa_offset 80
-; MIPS32R5-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill
-; MIPS32R5-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill
-; MIPS32R5-NEXT: .cfi_offset 31, -4
-; MIPS32R5-NEXT: .cfi_offset 30, -8
-; MIPS32R5-NEXT: move $fp, $sp
-; MIPS32R5-NEXT: .cfi_def_cfa_register 30
-; MIPS32R5-NEXT: addiu $1, $zero, -16
-; MIPS32R5-NEXT: and $sp, $sp, $1
-; MIPS32R5-NEXT: lui $1, %hi($CPI39_0)
-; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI39_0)
-; MIPS32R5-NEXT: ld.w $w0, 0($1)
-; MIPS32R5-NEXT: copy_s.w $6, $w0[0]
-; MIPS32R5-NEXT: copy_s.w $7, $w0[1]
-; MIPS32R5-NEXT: copy_s.w $1, $w0[2]
-; MIPS32R5-NEXT: copy_s.w $2, $w0[3]
-; MIPS32R5-NEXT: lui $3, %hi($CPI39_1)
-; MIPS32R5-NEXT: addiu $3, $3, %lo($CPI39_1)
-; MIPS32R5-NEXT: ld.w $w0, 0($3)
-; MIPS32R5-NEXT: copy_s.w $3, $w0[0]
-; MIPS32R5-NEXT: copy_s.w $4, $w0[1]
-; MIPS32R5-NEXT: copy_s.w $5, $w0[2]
-; MIPS32R5-NEXT: copy_s.w $8, $w0[3]
-; MIPS32R5-NEXT: sw $8, 36($sp)
-; MIPS32R5-NEXT: sw $5, 32($sp)
-; MIPS32R5-NEXT: sw $4, 28($sp)
-; MIPS32R5-NEXT: sw $3, 24($sp)
-; MIPS32R5-NEXT: sw $2, 20($sp)
-; MIPS32R5-NEXT: sw $1, 16($sp)
-; MIPS32R5-NEXT: addiu $4, $sp, 48
-; MIPS32R5-NEXT: jal double2_extern
-; MIPS32R5-NEXT: nop
-; MIPS32R5-NEXT: lui $1, %hi(gv2f64)
-; MIPS32R5-NEXT: addiu $1, $1, %lo(gv2f64)
-; MIPS32R5-NEXT: ld.d $w0, 48($sp)
-; MIPS32R5-NEXT: st.d $w0, 0($1)
-; MIPS32R5-NEXT: move $sp, $fp
-; MIPS32R5-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload
-; MIPS32R5-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload
-; MIPS32R5-NEXT: addiu $sp, $sp, 80
-; MIPS32R5-NEXT: jr $ra
-; MIPS32R5-NEXT: nop
+; MIPS32R5EB-LABEL: calldouble_2:
+; MIPS32R5EB: # %bb.0: # %entry
+; MIPS32R5EB-NEXT: addiu $sp, $sp, -80
+; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 80
+; MIPS32R5EB-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT: .cfi_offset 31, -4
+; MIPS32R5EB-NEXT: .cfi_offset 30, -8
+; MIPS32R5EB-NEXT: move $fp, $sp
+; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30
+; MIPS32R5EB-NEXT: addiu $1, $zero, -16
+; MIPS32R5EB-NEXT: and $sp, $sp, $1
+; MIPS32R5EB-NEXT: lui $1, 16424
+; MIPS32R5EB-NEXT: lui $2, 16428
+; MIPS32R5EB-NEXT: sw $2, 32($sp)
+; MIPS32R5EB-NEXT: sw $1, 24($sp)
+; MIPS32R5EB-NEXT: lui $1, 49136
+; MIPS32R5EB-NEXT: sw $1, 16($sp)
+; MIPS32R5EB-NEXT: sw $zero, 36($sp)
+; MIPS32R5EB-NEXT: sw $zero, 28($sp)
+; MIPS32R5EB-NEXT: sw $zero, 20($sp)
+; MIPS32R5EB-NEXT: addiu $4, $sp, 48
+; MIPS32R5EB-NEXT: addiu $6, $zero, 0
+; MIPS32R5EB-NEXT: addiu $7, $zero, 0
+; MIPS32R5EB-NEXT: jal double2_extern
+; MIPS32R5EB-NEXT: nop
+; MIPS32R5EB-NEXT: lui $1, %hi(gv2f64)
+; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv2f64)
+; MIPS32R5EB-NEXT: ld.d $w0, 48($sp)
+; MIPS32R5EB-NEXT: st.d $w0, 0($1)
+; MIPS32R5EB-NEXT: move $sp, $fp
+; MIPS32R5EB-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT: addiu $sp, $sp, 80
+; MIPS32R5EB-NEXT: jr $ra
+; MIPS32R5EB-NEXT: nop
;
; MIPS64R5-LABEL: calldouble_2:
; MIPS64R5: # %bb.0: # %entry
@@ -5701,17 +5569,14 @@
; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calldouble_2)))
; MIPS64R5-NEXT: daddu $1, $1, $25
; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calldouble_2)))
-; MIPS64R5-NEXT: ld $1, %got_page(.LCPI39_0)($gp)
-; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI39_0)
-; MIPS64R5-NEXT: ld.d $w0, 0($1)
-; MIPS64R5-NEXT: copy_s.d $4, $w0[0]
-; MIPS64R5-NEXT: copy_s.d $5, $w0[1]
-; MIPS64R5-NEXT: ld $1, %got_page(.LCPI39_1)($gp)
-; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI39_1)
-; MIPS64R5-NEXT: ld.d $w0, 0($1)
-; MIPS64R5-NEXT: copy_s.d $6, $w0[0]
-; MIPS64R5-NEXT: copy_s.d $7, $w0[1]
+; MIPS64R5-NEXT: daddiu $1, $zero, 3071
+; MIPS64R5-NEXT: dsll $5, $1, 52
+; MIPS64R5-NEXT: daddiu $1, $zero, 2053
+; MIPS64R5-NEXT: dsll $6, $1, 51
+; MIPS64R5-NEXT: daddiu $1, $zero, 4107
+; MIPS64R5-NEXT: dsll $7, $1, 50
; MIPS64R5-NEXT: ld $25, %call16(double2_extern)($gp)
+; MIPS64R5-NEXT: daddiu $4, $zero, 0
; MIPS64R5-NEXT: jalr $25
; MIPS64R5-NEXT: nop
; MIPS64R5-NEXT: insert.d $w0[0], $2
@@ -5762,6 +5627,43 @@
; MIPS32EL-NEXT: addiu $sp, $sp, 80
; MIPS32EL-NEXT: jr $ra
; MIPS32EL-NEXT: nop
+;
+; MIPS32R5EL-LABEL: calldouble_2:
+; MIPS32R5EL: # %bb.0: # %entry
+; MIPS32R5EL-NEXT: addiu $sp, $sp, -80
+; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 80
+; MIPS32R5EL-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT: .cfi_offset 31, -4
+; MIPS32R5EL-NEXT: .cfi_offset 30, -8
+; MIPS32R5EL-NEXT: move $fp, $sp
+; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30
+; MIPS32R5EL-NEXT: addiu $1, $zero, -16
+; MIPS32R5EL-NEXT: and $sp, $sp, $1
+; MIPS32R5EL-NEXT: lui $1, 16424
+; MIPS32R5EL-NEXT: lui $2, 16428
+; MIPS32R5EL-NEXT: sw $2, 36($sp)
+; MIPS32R5EL-NEXT: sw $1, 28($sp)
+; MIPS32R5EL-NEXT: lui $1, 49136
+; MIPS32R5EL-NEXT: sw $1, 20($sp)
+; MIPS32R5EL-NEXT: sw $zero, 32($sp)
+; MIPS32R5EL-NEXT: sw $zero, 24($sp)
+; MIPS32R5EL-NEXT: sw $zero, 16($sp)
+; MIPS32R5EL-NEXT: addiu $4, $sp, 48
+; MIPS32R5EL-NEXT: addiu $6, $zero, 0
+; MIPS32R5EL-NEXT: addiu $7, $zero, 0
+; MIPS32R5EL-NEXT: jal double2_extern
+; MIPS32R5EL-NEXT: nop
+; MIPS32R5EL-NEXT: lui $1, %hi(gv2f64)
+; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv2f64)
+; MIPS32R5EL-NEXT: ld.d $w0, 48($sp)
+; MIPS32R5EL-NEXT: st.d $w0, 0($1)
+; MIPS32R5EL-NEXT: move $sp, $fp
+; MIPS32R5EL-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT: addiu $sp, $sp, 80
+; MIPS32R5EL-NEXT: jr $ra
+; MIPS32R5EL-NEXT: nop
entry:
%0 = call <2 x double> @double2_extern(<2 x double> , <2 x double> )
store <2 x double> %0, <2 x double> * @gv2f64
diff --git a/llvm/test/CodeGen/PowerPC/pr45709.ll b/llvm/test/CodeGen/PowerPC/pr45709.ll
--- a/llvm/test/CodeGen/PowerPC/pr45709.ll
+++ b/llvm/test/CodeGen/PowerPC/pr45709.ll
@@ -10,7 +10,7 @@
define dso_local void @_ZN1a1bEv(<4 x float> %in) local_unnamed_addr #0 align 2 {
; CHECK-LABEL: _ZN1a1bEv:
; CHECK: # %bb.0:
-; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB0_6
+; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB0_4
; CHECK-NEXT: b .LBB0_1
; CHECK-NEXT: .LBB0_1: # %.preheader
; CHECK-NEXT: b .LBB0_2
@@ -21,26 +21,18 @@
; CHECK-NEXT: addi r3, r3, .LCPI0_0@toc@l
; CHECK-NEXT: lvx v3, 0, r3
; CHECK-NEXT: vperm v2, v2, v2, v3
-; CHECK-NEXT: vxor v3,
v3, v3 -; CHECK-NEXT: addi r3, r1, -48 -; CHECK-NEXT: stvx v3, 0, r3 ; CHECK-NEXT: addi r3, r1, -32 ; CHECK-NEXT: stvx v2, 0, r3 -; CHECK-NEXT: lwz r3, -48(r1) -; CHECK-NEXT: lwz r4, -32(r1) -; CHECK-NEXT: cmpw r4, r3 -; CHECK-NEXT: bc 12, gt, .LBB0_4 -; CHECK-NEXT: b .LBB0_5 -; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: addi r3, r4, 0 -; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: lwz r3, -32(r1) +; CHECK-NEXT: srawi r4, r3, 31 +; CHECK-NEXT: andc r3, r3, r4 ; CHECK-NEXT: cmpw r3, r3 -; CHECK-NEXT: stw r3, -64(r1) -; CHECK-NEXT: addi r3, r1, -64 +; CHECK-NEXT: stw r3, -48(r1) +; CHECK-NEXT: addi r3, r1, -48 ; CHECK-NEXT: lvx v2, 0, r3 ; CHECK-NEXT: addi r3, r1, -16 ; CHECK-NEXT: stvx v2, 0, r3 -; CHECK-NEXT: .LBB0_6: +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: blr br i1 undef, label %7, label %1 diff --git a/llvm/test/CodeGen/SystemZ/memset-08.ll b/llvm/test/CodeGen/SystemZ/memset-08.ll --- a/llvm/test/CodeGen/SystemZ/memset-08.ll +++ b/llvm/test/CodeGen/SystemZ/memset-08.ll @@ -241,7 +241,8 @@ ; CHECK-LABEL: imm19: ; CHECK: # %bb.0: ; CHECK-NEXT: vrepib %v0, 1 -; CHECK-NEXT: vstef %v0, 15(%r2), 0 +; CHECK-NEXT: iilf %r0, 16843009 +; CHECK-NEXT: st %r0, 15(%r2) ; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 19, i1 false) @@ -263,8 +264,10 @@ ; CHECK-LABEL: imm21: ; CHECK: # %bb.0: ; CHECK-NEXT: vrepib %v0, 1 -; CHECK-NEXT: vsteg %v0, 13(%r2), 0 +; CHECK-NEXT: llihf %r0, 16843009 +; CHECK-NEXT: oilf %r0, 16843009 ; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: stg %r0, 13(%r2) ; CHECK-NEXT: br %r14 call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 21, i1 false) ret void @@ -285,8 +288,10 @@ ; CHECK-LABEL: imm23: ; CHECK: # %bb.0: ; CHECK-NEXT: vrepib %v0, 1 -; CHECK-NEXT: vsteg %v0, 15(%r2), 0 +; CHECK-NEXT: llihf %r0, 16843009 +; CHECK-NEXT: oilf %r0, 16843009 ; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: stg %r0, 15(%r2) ; CHECK-NEXT: br %r14 call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 23, i1 false) ret void diff --git a/llvm/test/CodeGen/SystemZ/store-replicated-vals.ll b/llvm/test/CodeGen/SystemZ/store-replicated-vals.ll --- a/llvm/test/CodeGen/SystemZ/store-replicated-vals.ll +++ b/llvm/test/CodeGen/SystemZ/store-replicated-vals.ll @@ -269,8 +269,8 @@ define void @fun_2x2i(i32* %Dst) { ; CHECK-LABEL: fun_2x2i: ; CHECK: # %bb.0: -; CHECK-NEXT: vrepih %v0, 1 -; CHECK-NEXT: vstef %v0, 0(%r2), 0 +; CHECK-NEXT: iilf %r0, 65537 +; CHECK-NEXT: st %r0, 0(%r2) ; CHECK-NEXT: br %r14 store i32 65537, i32* %Dst ret void @@ -279,8 +279,9 @@ define void @fun_4x2i(i64* %Dst) { ; CHECK-LABEL: fun_4x2i: ; CHECK: # %bb.0: -; CHECK-NEXT: vrepih %v0, 1 -; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: llihf %r0, 65537 +; CHECK-NEXT: oilf %r0, 65537 +; CHECK-NEXT: stg %r0, 0(%r2) ; CHECK-NEXT: br %r14 store i64 281479271743489, i64* %Dst ret void @@ -289,8 +290,9 @@ define void @fun_2x4i(i64* %Dst) { ; CHECK-LABEL: fun_2x4i: ; CHECK: # %bb.0: -; CHECK-NEXT: vrepif %v0, 1 -; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: llihl %r0, 1 +; CHECK-NEXT: oill %r0, 1 +; CHECK-NEXT: stg %r0, 0(%r2) ; CHECK-NEXT: br %r14 store i64 4294967297, i64* %Dst ret void @@ -300,9 +302,9 @@ define void @fun_4x1i(i32* %Dst, i32* %Dst2) { ; CHECK-LABEL: fun_4x1i: ; CHECK: # %bb.0: -; CHECK-NEXT: vrepib %v0, 3 -; CHECK-NEXT: vstef %v0, 0(%r2), 0 -; CHECK-NEXT: vstef %v0, 0(%r3), 0 +; CHECK-NEXT: iilf %r0, 50529027 +; CHECK-NEXT: st %r0, 0(%r2) +; CHECK-NEXT: st %r0, 0(%r3) ; CHECK-NEXT: br %r14 store i32 50529027, i32* %Dst store i32 50529027, i32* 
%Dst2 @@ -312,9 +314,10 @@ define void @fun_8x1i(i64* %Dst, i64* %Dst2) { ; CHECK-LABEL: fun_8x1i: ; CHECK: # %bb.0: -; CHECK-NEXT: vrepib %v0, 1 -; CHECK-NEXT: vsteg %v0, 0(%r2), 0 -; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: llihf %r0, 16843009 +; CHECK-NEXT: oilf %r0, 16843009 +; CHECK-NEXT: stg %r0, 0(%r2) +; CHECK-NEXT: stg %r0, 0(%r3) ; CHECK-NEXT: br %r14 store i64 72340172838076673, i64* %Dst store i64 72340172838076673, i64* %Dst2 @@ -326,8 +329,10 @@ ; CHECK-LABEL: fun_4Eltsx4x1i_2Eltsx4x1i: ; CHECK: # %bb.0: ; CHECK-NEXT: vrepib %v0, 3 +; CHECK-NEXT: llihf %r0, 50529027 +; CHECK-NEXT: oilf %r0, 50529027 ; CHECK-NEXT: vst %v0, 0(%r2), 3 -; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: stg %r0, 0(%r3) ; CHECK-NEXT: br %r14 %tmp = insertelement <4 x i32> undef, i32 50529027, i32 0 %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer @@ -343,8 +348,10 @@ ; CHECK-LABEL: fun_4Eltsx4x1i_8x1i: ; CHECK: # %bb.0: ; CHECK-NEXT: vrepib %v0, 3 +; CHECK-NEXT: llihf %r0, 50529027 +; CHECK-NEXT: oilf %r0, 50529027 ; CHECK-NEXT: vst %v0, 0(%r2), 3 -; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: stg %r0, 0(%r3) ; CHECK-NEXT: br %r14 %tmp = insertelement <4 x i32> undef, i32 50529027, i32 0 %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer @@ -357,8 +364,10 @@ ; CHECK-LABEL: fun_3Eltsx2x4i: ; CHECK: # %bb.0: ; CHECK-NEXT: vrepif %v0, 1 -; CHECK-NEXT: vsteg %v0, 16(%r2), 0 +; CHECK-NEXT: llihl %r0, 1 +; CHECK-NEXT: oill %r0, 1 ; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: stg %r0, 16(%r2) ; CHECK-NEXT: br %r14 %tmp = insertelement <3 x i64> undef, i64 4294967297, i32 0 %Val = shufflevector <3 x i64> %tmp, <3 x i64> undef, <3 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -4,51 +4,48 @@ define <2 x i64> @v2i64(i32 %index, i32 %TC, <2 x i64> %V1, <2 x i64> %V2) { ; CHECK-LABEL: v2i64: ; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov q1[2], q1[0], r0, r0 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: adds.w lr, r0, #1 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vmov q0[2], q0[0], r0, lr +; CHECK-NEXT: adc r4, r12, #0 +; CHECK-NEXT: subs.w r0, r0, #-1 +; CHECK-NEXT: ldr r5, .LCPI0_0 +; CHECK-NEXT: sbcs r0, r12, #0 +; CHECK-NEXT: vmov.i64 q1, #0xffffffff +; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 +; CHECK-NEXT: csetm r0, lo ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: vmov r0, r4, d3 -; CHECK-NEXT: vmov q2[2], q2[0], r1, r1 -; CHECK-NEXT: vmov lr, r12, d2 -; CHECK-NEXT: adds r6, r0, #1 -; CHECK-NEXT: adc r4, r4, #0 +; CHECK-NEXT: bfi r5, r0, #0, #8 ; CHECK-NEXT: subs.w r0, lr, #-1 -; CHECK-NEXT: vmov q1[2], q1[0], lr, r6 -; CHECK-NEXT: sbcs r0, r12, #0 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r4 -; CHECK-NEXT: csetm r12, lo -; CHECK-NEXT: subs.w r6, r6, #-1 -; CHECK-NEXT: bfi r5, r12, #0, #8 -; CHECK-NEXT: sbcs r6, r4, #0 -; CHECK-NEXT: mov.w r0, #0 -; CHECK-NEXT: csetm r6, lo -; CHECK-NEXT: bfi r5, r6, #8, #8 +; CHECK-NEXT: sbcs r0, r4, #0 +; CHECK-NEXT: csetm r0, lo +; CHECK-NEXT: bfi r5, r0, #8, #8 ; CHECK-NEXT: vmsr p0, r5 -; CHECK-NEXT: vpsel q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vmov r1, r4, d0 -; CHECK-NEXT: vmov r6, r5, d2 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r0, r5, d0 ; CHECK-NEXT: vmov d0, r2, r3 -; CHECK-NEXT: subs 
r1, r6, r1 -; CHECK-NEXT: sbcs.w r1, r5, r4 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: csetm r1, lo +; CHECK-NEXT: subs r0, r0, r1 +; CHECK-NEXT: sbcs r0, r5, #0 +; CHECK-NEXT: csetm r0, lo +; CHECK-NEXT: bfi r12, r0, #0, #8 +; CHECK-NEXT: vmov r0, r5, d1 ; CHECK-NEXT: vldr d1, [sp, #16] -; CHECK-NEXT: bfi r0, r1, #0, #8 -; CHECK-NEXT: vmov r1, r6, d3 -; CHECK-NEXT: subs r1, r1, r5 -; CHECK-NEXT: sbcs.w r1, r6, r4 -; CHECK-NEXT: csetm r1, lo -; CHECK-NEXT: bfi r0, r1, #8, #8 -; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: subs r0, r0, r1 +; CHECK-NEXT: sbcs r0, r5, #0 +; CHECK-NEXT: csetm r0, lo +; CHECK-NEXT: bfi r12, r0, #8, #8 ; CHECK-NEXT: add r0, sp, #24 ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, r12 ; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 0x00000000 @ float 0 %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 %index, i32 %TC) %select = select <2 x i1> %active.lane.mask, <2 x i64> %V1, <2 x i64> %V2 ret <2 x i64> %select @@ -119,12 +116,12 @@ ; CHECK-NEXT: ldr r1, [sp, #24] ; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 ; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: strd r3, r2, [r0, #16] -; CHECK-NEXT: str r1, [r0, #24] +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: strd r1, r3, [r0, #16] +; CHECK-NEXT: str r2, [r0, #24] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll --- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll @@ -422,23 +422,23 @@ ; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: movw r4, #65535 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov.i64 q1, #0xffff -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: subs r0, r0, r4 -; CHECK-NEXT: sbcs r0, r1, #0 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r4 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r1, r5 +; CHECK-NEXT: movw r1, #65535 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.i64 q0, #0xffff +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: subs r0, r0, r1 +; CHECK-NEXT: sbcs r0, r2, #0 ; CHECK-NEXT: csetm r0, lo -; CHECK-NEXT: bfi r5, r0, #0, #8 -; CHECK-NEXT: subs r0, r2, r4 -; CHECK-NEXT: sbcs r0, r3, #0 +; CHECK-NEXT: subs r1, r3, r1 +; CHECK-NEXT: sbcs r1, r2, #0 +; CHECK-NEXT: bfi r2, r0, #0, #8 ; CHECK-NEXT: csetm r0, lo -; CHECK-NEXT: bfi r5, r0, #8, #8 -; CHECK-NEXT: vmsr p0, r5 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -1573,23 +1573,23 @@ ; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2ulz -; CHECK-NEXT: vmov q0[2], q0[0], r0, r4 -; CHECK-NEXT: movw r4, #65535 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r5 -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov.i64 q1, #0xffff -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: subs r0, r0, r4 -; CHECK-NEXT: sbcs r0, 
r1, #0 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r4 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov q1[3], q1[1], r1, r5 +; CHECK-NEXT: movw r1, #65535 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.i64 q0, #0xffff +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: subs r0, r0, r1 +; CHECK-NEXT: sbcs r0, r2, #0 ; CHECK-NEXT: csetm r0, lo -; CHECK-NEXT: bfi r5, r0, #0, #8 -; CHECK-NEXT: subs r0, r2, r4 -; CHECK-NEXT: sbcs r0, r3, #0 +; CHECK-NEXT: subs r1, r3, r1 +; CHECK-NEXT: sbcs r1, r2, #0 +; CHECK-NEXT: bfi r2, r0, #0, #8 ; CHECK-NEXT: csetm r0, lo -; CHECK-NEXT: bfi r5, r0, #8, #8 -; CHECK-NEXT: vmsr p0, r5 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll @@ -4,58 +4,54 @@ define arm_aapcs_vfpcc <4 x i32> @loads_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C) { ; CHECK-LABEL: loads_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d9} ; CHECK-NEXT: vpush {d9} +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s18, s1 +; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov.i64 q0, #0xffffffff ; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vldrw.u32 q2, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.f32 s12, s10 +; CHECK-NEXT: vmov.f32 s10, s11 ; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vand q3, q2, q0 -; CHECK-NEXT: vand q0, q1, q0 -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r3, lr, d0 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov r4, r1, d6 -; CHECK-NEXT: vmov r0, r12, d7 -; CHECK-NEXT: vldrw.u32 q3, [r2] -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov.f32 s0, s12 -; CHECK-NEXT: vmov.f32 s6, s13 -; CHECK-NEXT: adds r2, r5, r4 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: asr.w r6, r5, #31 -; CHECK-NEXT: adcs r1, r6 -; CHECK-NEXT: asrl r2, r1, r4 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: adds r6, r1, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: asr.w r4, r1, #31 -; CHECK-NEXT: adc.w r1, r4, lr -; CHECK-NEXT: asrl r6, r1, r3 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmov q0[2], q0[0], r6, r2 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: asr.w r3, r1, #31 -; CHECK-NEXT: adc.w r1, r3, r12 -; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov.f32 s2, s9 +; CHECK-NEXT: asrs r3, r0, #31 +; CHECK-NEXT: adds.w r12, r0, r1 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: adc r1, r3, #0 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: asrl r12, r1, r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: adds r2, r2, r0 +; CHECK-NEXT: asr.w r1, r0, #31 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: asrl r2, r1, r0 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: asrl r0, r1, r3 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: adds r6, 
r1, r5 -; CHECK-NEXT: asr.w r2, r1, #31 -; CHECK-NEXT: adc.w r1, r2, r4 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: asrl r6, r1, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r6, r0 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds r4, r1, r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: asr.w lr, r1, #31 +; CHECK-NEXT: adc r1, lr, #0 +; CHECK-NEXT: asrl r4, r1, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r12 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 ; CHECK-NEXT: vpop {d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, pc} entry: %a = load <4 x i32>, <4 x i32> *%A, align 4 %b = load <4 x i32>, <4 x i32> *%B, align 4 @@ -138,62 +134,58 @@ define arm_aapcs_vfpcc void @load_store_i32(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D) { ; CHECK-LABEL: load_store_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d9} -; CHECK-NEXT: vpush {d9} -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vand q3, q2, q0 -; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: .save {r4, r6, r8, lr} +; CHECK-NEXT: push.w {r4, r6, r8, lr} +; CHECK-NEXT: .vsave {d10} +; CHECK-NEXT: vpush {d10} +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r4, lr, d2 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov r5, r1, d6 -; CHECK-NEXT: vmov r0, r12, d7 -; CHECK-NEXT: vldrw.u32 q3, [r2] -; CHECK-NEXT: vmov.f32 s10, s3 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s14, s1 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q2, [r2] +; CHECK-NEXT: vmov.f32 s12, s2 +; CHECK-NEXT: vmov.f32 s2, s3 +; CHECK-NEXT: vmov.f32 s16, s6 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmov.f32 s20, s10 +; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov.f32 s4, s12 -; CHECK-NEXT: vmov.f32 s2, s13 -; CHECK-NEXT: adds r2, r6, r5 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: asr.w r7, r6, #31 -; CHECK-NEXT: adcs r1, r7 -; CHECK-NEXT: asrl r2, r1, r5 -; CHECK-NEXT: vmov r7, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: adds r4, r4, r1 -; CHECK-NEXT: asr.w r5, r1, #31 -; CHECK-NEXT: adc.w r1, r5, lr -; CHECK-NEXT: asrl r4, r1, r7 -; CHECK-NEXT: vmov r6, r5, d3 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.f32 s2, s9 +; CHECK-NEXT: asr.w r12, r0, #31 +; CHECK-NEXT: adds.w r8, r0, r1 +; CHECK-NEXT: adc r1, r12, #0 +; CHECK-NEXT: asrl r8, r1, r2 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r2 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: asr.w r7, r1, #31 -; CHECK-NEXT: adc.w r1, r7, r12 -; CHECK-NEXT: vmov r7, s18 -; CHECK-NEXT: asrl r0, r1, r7 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: adds r2, r2, r1 +; CHECK-NEXT: asr.w r0, r1, #31 +; CHECK-NEXT: adc r1, r0, #0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: asrl r2, r1, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: asrl r0, r1, r4 +; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: adds r6, r6, r1 -; CHECK-NEXT: asr.w r2, r1, #31 -; CHECK-NEXT: adc.w r1, r2, r5 -; CHECK-NEXT: vmov r2, s2 -; 
CHECK-NEXT: asrl r6, r1, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r6, r0 -; CHECK-NEXT: vstrw.32 q1, [r3] -; CHECK-NEXT: vpop {d9} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: asr.w r4, r1, #31 +; CHECK-NEXT: adc r1, r4, #0 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: asrl r6, r1, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r8 +; CHECK-NEXT: vstrw.32 q0, [r3] +; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: vpop {d10} +; CHECK-NEXT: pop.w {r4, r6, r8, pc} entry: %a = load <4 x i32>, <4 x i32> *%A, align 4 %b = load <4 x i32>, <4 x i32> *%B, align 4 @@ -276,8 +268,8 @@ define arm_aapcs_vfpcc void @load_one_store_i32(<4 x i32> *%A, <4 x i32> *%D) { ; CHECK-LABEL: load_one_store_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s2, s3 @@ -285,27 +277,27 @@ ; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: adds.w r12, r2, r2 ; CHECK-NEXT: asr.w r3, r2, #31 -; CHECK-NEXT: adc.w r7, r3, r2, asr #31 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: asrl r12, r7, r2 -; CHECK-NEXT: adds r0, r3, r3 -; CHECK-NEXT: asr.w r5, r3, #31 -; CHECK-NEXT: adc.w r5, r5, r3, asr #31 -; CHECK-NEXT: asrl r0, r5, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: adds r4, r3, r3 -; CHECK-NEXT: asr.w r5, r3, #31 -; CHECK-NEXT: adc.w r5, r5, r3, asr #31 -; CHECK-NEXT: asrl r4, r5, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: adc.w r3, r3, r2, asr #31 +; CHECK-NEXT: asrl r12, r3, r2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: adds r2, r3, r3 +; CHECK-NEXT: asr.w r0, r3, #31 +; CHECK-NEXT: adc.w r5, r0, r3, asr #31 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: asrl r2, r5, r3 ; CHECK-NEXT: adds r4, r0, r0 -; CHECK-NEXT: asr.w r2, r0, #31 -; CHECK-NEXT: adc.w r3, r2, r0, asr #31 +; CHECK-NEXT: asr.w r3, r0, #31 +; CHECK-NEXT: adc.w r3, r3, r0, asr #31 ; CHECK-NEXT: asrl r4, r3, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r12 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: adds r6, r0, r0 +; CHECK-NEXT: asr.w r3, r0, #31 +; CHECK-NEXT: adc.w r3, r3, r0, asr #31 +; CHECK-NEXT: asrl r6, r3, r0 +; CHECK-NEXT: vmov q0[2], q0[0], r6, r4 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %a = load <4 x i32>, <4 x i32> *%A, align 4 %sa = sext <4 x i32> %a to <4 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll @@ -180,44 +180,41 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_ashr_trunc_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: ext_add_ashr_trunc_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vmov.f32 s6, s5 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov r0, r1, d6 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r12, lr, d7 -; CHECK-NEXT: vmov 
r4, s4 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: asr.w r5, r4, #31 -; CHECK-NEXT: adcs r1, r5 -; CHECK-NEXT: lsrl r0, r1, #1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: adds r2, r2, r1 -; CHECK-NEXT: asr.w r4, r1, #31 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: lsrl r2, r3, #1 -; CHECK-NEXT: vmov r1, r5, d3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adds.w r4, r3, r12 -; CHECK-NEXT: asr.w r6, r3, #31 -; CHECK-NEXT: adc.w r3, r6, lr -; CHECK-NEXT: asrs r2, r0, #31 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r5 -; CHECK-NEXT: lsrl r4, r3, #1 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds r2, r2, r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: lsrl r2, r1, #1 +; CHECK-NEXT: adds r4, r0, r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: asr.w r1, r0, #31 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: lsrl r4, r1, #1 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: asr.w r12, r3, #31 +; CHECK-NEXT: adc r3, r12, #0 +; CHECK-NEXT: lsrl r0, r3, #1 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: lsrl r0, r1, #1 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, pc} entry: %sa = sext <4 x i32> %a to <4 x i64> %sb = zext <4 x i32> %b to <4 x i64> @@ -328,113 +325,98 @@ define arm_aapcs_vfpcc <4 x i32> @ext_ops_trunc_i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: ext_ops_trunc_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r10, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s18, s1 -; CHECK-NEXT: vmov.i64 q3, #0xffffffff -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s10, s5 -; CHECK-NEXT: vand q2, q2, q3 -; CHECK-NEXT: vmov.f32 s4, s6 -; CHECK-NEXT: vmov r2, r0, d5 -; CHECK-NEXT: vmov r1, r12, d4 -; CHECK-NEXT: vmov.f32 s6, s7 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vand q1, q1, q3 -; CHECK-NEXT: adds r4, r3, r2 -; CHECK-NEXT: asr.w r6, r3, #31 -; CHECK-NEXT: adc.w r5, r6, r0 -; CHECK-NEXT: asrl r4, r5, r2 -; CHECK-NEXT: subs r6, r4, r2 -; CHECK-NEXT: sbc.w r8, r5, r0 -; CHECK-NEXT: umull r10, lr, r6, r2 -; CHECK-NEXT: muls r6, r0, r6 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.f32 s0, s2 +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: vmov.f32 s8, s2 ; CHECK-NEXT: vmov.f32 s2, s3 -; CHECK-NEXT: orr.w lr, lr, r6 -; CHECK-NEXT: adds r6, r0, r1 -; CHECK-NEXT: asr.w r5, r0, #31 -; CHECK-NEXT: adc.w r7, r5, r12 -; CHECK-NEXT: asrl r6, r7, r1 -; CHECK-NEXT: mla r5, r8, r2, lr -; CHECK-NEXT: subs r4, r6, r1 -; CHECK-NEXT: sbc.w lr, r7, r12 -; CHECK-NEXT: umull r6, r7, r4, r1 -; CHECK-NEXT: mul r4, r4, r12 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: orr.w r8, r7, r4 -; CHECK-NEXT: eor.w r7, r0, r1 -; CHECK-NEXT: orr.w r7, r7, r0, asr #31 -; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov r10, s8 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vmov.f32 s2, s1 +; 
CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: asr.w r0, r10, #31 +; CHECK-NEXT: asrs r7, r6, #31 +; CHECK-NEXT: adds.w r4, r10, r2 +; CHECK-NEXT: adc r3, r0, #0 +; CHECK-NEXT: asrl r4, r3, r2 +; CHECK-NEXT: subs r0, r4, r2 +; CHECK-NEXT: sbc lr, r3, #0 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: umull r0, r8, r0, r2 +; CHECK-NEXT: adds r4, r6, r3 +; CHECK-NEXT: eor.w r1, r6, r3 +; CHECK-NEXT: adc r5, r7, #0 +; CHECK-NEXT: eor.w r7, r10, r2 +; CHECK-NEXT: asrl r4, r5, r3 +; CHECK-NEXT: orr.w r7, r7, r10, asr #31 +; CHECK-NEXT: subs r4, r4, r3 +; CHECK-NEXT: orr.w r1, r1, r6, asr #31 +; CHECK-NEXT: sbc r5, r5, #0 ; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: rsb.w r0, r0, #0 -; CHECK-NEXT: csetm r7, eq -; CHECK-NEXT: bfi r4, r7, #0, #8 -; CHECK-NEXT: eor.w r7, r3, r2 -; CHECK-NEXT: orr.w r7, r7, r3, asr #31 +; CHECK-NEXT: umull r4, r12, r4, r3 +; CHECK-NEXT: csetm r9, eq +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: bfi r7, r9, #0, #8 +; CHECK-NEXT: csetm r1, eq +; CHECK-NEXT: bfi r7, r1, #8, #8 +; CHECK-NEXT: mla r5, r5, r3, r12 +; CHECK-NEXT: rsbs r1, r6, #0 +; CHECK-NEXT: vmsr p0, r7 +; CHECK-NEXT: mla r7, lr, r2, r8 +; CHECK-NEXT: lsll r4, r5, r1 +; CHECK-NEXT: rsb.w r1, r10, #0 +; CHECK-NEXT: lsll r0, r7, r1 +; CHECK-NEXT: vmov lr, s2 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: lsll r0, r7, r2 +; CHECK-NEXT: lsll r4, r5, r3 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vmov q3[2], q3[0], r0, r4 +; CHECK-NEXT: vpsel q2, q3, q2 +; CHECK-NEXT: adds.w r2, lr, r1 +; CHECK-NEXT: asr.w r0, lr, #31 +; CHECK-NEXT: adc r3, r0, #0 +; CHECK-NEXT: asrl r2, r3, r1 +; CHECK-NEXT: subs r0, r2, r1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: sbc r7, r3, #0 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: umull r0, r6, r0, r1 +; CHECK-NEXT: asrs r5, r2, #31 +; CHECK-NEXT: adds r4, r2, r3 +; CHECK-NEXT: adc r5, r5, #0 +; CHECK-NEXT: asrl r4, r5, r3 +; CHECK-NEXT: subs r4, r4, r3 +; CHECK-NEXT: sbc r8, r5, #0 +; CHECK-NEXT: mla r5, r7, r1, r6 +; CHECK-NEXT: eor.w r6, lr, r1 +; CHECK-NEXT: orr.w r6, r6, lr, asr #31 +; CHECK-NEXT: eor.w r7, r2, r3 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: orr.w r7, r7, r2, asr #31 +; CHECK-NEXT: csetm r6, eq ; CHECK-NEXT: cmp r7, #0 ; CHECK-NEXT: csetm r7, eq -; CHECK-NEXT: bfi r4, r7, #8, #8 -; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: rsbs r4, r3, #0 -; CHECK-NEXT: mla r3, lr, r1, r8 -; CHECK-NEXT: lsll r10, r5, r4 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: lsll r10, r5, r2 -; CHECK-NEXT: lsll r6, r3, r0 -; CHECK-NEXT: vmov r2, r7, d3 -; CHECK-NEXT: lsll r6, r3, r1 -; CHECK-NEXT: vmov r1, r3, d2 -; CHECK-NEXT: vmov q4[2], q4[0], r6, r10 -; CHECK-NEXT: vpsel q2, q4, q2 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: asrs r0, r4, #31 -; CHECK-NEXT: adds r6, r4, r1 -; CHECK-NEXT: adc.w r5, r0, r3 -; CHECK-NEXT: asrl r6, r5, r1 -; CHECK-NEXT: subs r0, r6, r1 -; CHECK-NEXT: sbc.w r3, r5, r3 -; CHECK-NEXT: umull r8, r6, r0, r1 -; CHECK-NEXT: mla r3, r3, r1, r6 -; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: adds r0, r6, r2 -; CHECK-NEXT: asr.w r5, r6, #31 -; CHECK-NEXT: adcs r5, r7 -; CHECK-NEXT: asrl r0, r5, r2 -; CHECK-NEXT: subs r0, r0, r2 -; CHECK-NEXT: sbc.w r7, r5, r7 -; CHECK-NEXT: eor.w r5, r4, r1 -; CHECK-NEXT: orr.w r5, r5, r4, asr #31 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: lsll r8, r3, r4 -; CHECK-NEXT: csetm r5, eq -; CHECK-NEXT: lsll r8, r3, r1 -; CHECK-NEXT: bfi r12, r5, #0, #8 -; CHECK-NEXT: eor.w r5, r6, r2 -; CHECK-NEXT: orr.w r5, r5, r6, asr #31 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, eq -; CHECK-NEXT: 
bfi r12, r5, #8, #8 -; CHECK-NEXT: umull r0, r5, r0, r2 +; CHECK-NEXT: rsb.w lr, lr, #0 +; CHECK-NEXT: bfi r12, r7, #0, #8 +; CHECK-NEXT: lsll r0, r5, lr +; CHECK-NEXT: bfi r12, r6, #8, #8 +; CHECK-NEXT: umull r4, r6, r4, r3 +; CHECK-NEXT: lsll r0, r5, r1 +; CHECK-NEXT: rsbs r1, r2, #0 ; CHECK-NEXT: vmsr p0, r12 -; CHECK-NEXT: mla r5, r7, r2, r5 -; CHECK-NEXT: rsbs r7, r6, #0 -; CHECK-NEXT: lsll r0, r5, r7 -; CHECK-NEXT: lsll r0, r5, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r8, r0 +; CHECK-NEXT: mla r7, r8, r3, r6 +; CHECK-NEXT: lsll r4, r7, r1 +; CHECK-NEXT: lsll r4, r7, r3 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r10, pc} +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s3, s10 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %sa = sext <4 x i32> %a to <4 x i64> %sb = zext <4 x i32> %b to <4 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll b/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll --- a/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll +++ b/llvm/test/CodeGen/Thumb2/mve-minmaxi.ll @@ -211,8 +211,6 @@ ; CHECK-NEXT: cmp r1, r3 ; CHECK-NEXT: csel r0, r0, r12, eq ; CHECK-NEXT: csel r1, r1, r3, gt -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr %c = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %a, <1 x i64> %b) @@ -429,24 +427,24 @@ define arm_aapcs_vfpcc <2 x i32> @umax2i32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: umax2i32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d2 -; CHECK-NEXT: subs r0, r2, r0 -; CHECK-NEXT: sbcs.w r0, r3, r1 -; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.i64 q3, #0xffffffff +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vand q2, q1, q3 +; CHECK-NEXT: vand q3, q0, q3 +; CHECK-NEXT: subs r0, r1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: sbcs r0, r2, #0 ; CHECK-NEXT: csetm r0, lo -; CHECK-NEXT: vmov r3, r2, d3 -; CHECK-NEXT: bfi r1, r0, #0, #8 -; CHECK-NEXT: vmov r0, r12, d1 -; CHECK-NEXT: subs r0, r3, r0 -; CHECK-NEXT: sbcs.w r0, r2, r12 +; CHECK-NEXT: subs r1, r3, r1 +; CHECK-NEXT: sbcs r1, r2, #0 +; CHECK-NEXT: bfi r2, r0, #0, #8 ; CHECK-NEXT: csetm r0, lo -; CHECK-NEXT: bfi r1, r0, #8, #8 -; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q3, q2 ; CHECK-NEXT: bx lr %c = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %a, <2 x i32> %b) ret <2 x i32> %c @@ -492,8 +490,6 @@ ; CHECK-NEXT: cmp r1, r3 ; CHECK-NEXT: csel r0, r0, r12, eq ; CHECK-NEXT: csel r1, r1, r3, hi -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr %c = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %a, <1 x i64> %b) @@ -780,8 +776,6 @@ ; CHECK-NEXT: cmp r1, r3 ; CHECK-NEXT: csel r0, r0, r12, eq ; CHECK-NEXT: csel r1, r1, r3, lt -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr %c = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %a, <1 x i64> %b) @@ -998,24 +992,24 @@ define arm_aapcs_vfpcc <2 x i32> @umin2i32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: umin2i32: ; 
CHECK: @ %bb.0: -; CHECK-NEXT: vmov.i64 q2, #0xffffffff -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r0, r1, d2 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: subs r0, r2, r0 -; CHECK-NEXT: sbcs.w r0, r3, r1 -; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.i64 q3, #0xffffffff +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vand q2, q1, q3 +; CHECK-NEXT: vand q3, q0, q3 +; CHECK-NEXT: subs r0, r1, r0 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: sbcs r0, r2, #0 ; CHECK-NEXT: csetm r0, lo -; CHECK-NEXT: vmov r3, r2, d1 -; CHECK-NEXT: bfi r1, r0, #0, #8 -; CHECK-NEXT: vmov r0, r12, d3 -; CHECK-NEXT: subs r0, r3, r0 -; CHECK-NEXT: sbcs.w r0, r2, r12 +; CHECK-NEXT: subs r1, r3, r1 +; CHECK-NEXT: sbcs r1, r2, #0 +; CHECK-NEXT: bfi r2, r0, #0, #8 ; CHECK-NEXT: csetm r0, lo -; CHECK-NEXT: bfi r1, r0, #8, #8 -; CHECK-NEXT: vmsr p0, r1 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bfi r2, r0, #8, #8 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpsel q0, q3, q2 ; CHECK-NEXT: bx lr %c = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %a, <2 x i32> %b) ret <2 x i32> %c @@ -1061,8 +1055,6 @@ ; CHECK-NEXT: cmp r1, r3 ; CHECK-NEXT: csel r0, r0, r12, eq ; CHECK-NEXT: csel r1, r1, r3, lo -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: bx lr %c = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %a, <1 x i64> %b) diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -193,150 +193,148 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB1_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB1_3 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r9, r5 +; CHECK-NEXT: mov r1, r9 ; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_3: @ %vector.ph -; CHECK-NEXT: bic r1, r3, #3 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: subs r7, r3, #4 ; CHECK-NEXT: adr r4, .LCPI1_0 -; CHECK-NEXT: subs r7, r1, #4 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI1_1 -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill -; CHECK-NEXT: add.w r11, r2, r1, lsl #2 -; CHECK-NEXT: add.w r9, r5, r1, lsl #2 -; CHECK-NEXT: add.w r12, r0, r1, lsl #2 +; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r11, r2, r3, lsl #2 +; CHECK-NEXT: add.w r1, r9, r3, lsl #2 +; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: mov.w r10, #-1 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [r5], #16 ; CHECK-NEXT: vldrw.u32 q3, [r0], #16 -; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: mov.w r2, #-1 -; CHECK-NEXT: 
vmov.f32 s8, s14 -; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: vmov.f32 s20, s18 -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmullb.s32 q6, q5, q2 -; CHECK-NEXT: vmov.f32 s18, s17 -; CHECK-NEXT: vmov r4, r7, d12 +; CHECK-NEXT: vldrw.u32 q4, [r9], #16 +; CHECK-NEXT: mvn r3, #-2147483648 +; CHECK-NEXT: vmov.f32 s10, s13 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r8, s16 +; CHECK-NEXT: vmov.f32 s12, s14 +; CHECK-NEXT: vmov.f32 s16, s18 +; CHECK-NEXT: vmov.f32 s14, s15 +; CHECK-NEXT: vmov.f32 s18, s19 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmullb.s32 q5, q4, q3 +; CHECK-NEXT: vmov.f32 s10, s17 +; CHECK-NEXT: vmov r7, s10 +; CHECK-NEXT: smull r6, r5, r8, r5 +; CHECK-NEXT: asrl r6, r5, #31 +; CHECK-NEXT: smull r4, r7, r7, r4 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 -; CHECK-NEXT: sbcs.w r5, r2, r7 +; CHECK-NEXT: vmov q2[2], q2[0], r6, r4 +; CHECK-NEXT: rsbs.w r4, r4, #-2147483648 +; CHECK-NEXT: sbcs.w r4, r10, r7 +; CHECK-NEXT: vmov q2[3], q2[1], r5, r7 +; CHECK-NEXT: csetm r4, lt +; CHECK-NEXT: rsbs.w r6, r6, #-2147483648 +; CHECK-NEXT: sbcs.w r5, r10, r5 +; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: csetm r5, lt -; CHECK-NEXT: bfi r8, r5, #0, #8 -; CHECK-NEXT: vmov r10, r5, d13 -; CHECK-NEXT: asrl r10, r5, #31 -; CHECK-NEXT: vmov r6, s18 -; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r10 -; CHECK-NEXT: sbcs.w r3, r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r7, r5 -; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: bfi r8, r3, #8, #8 -; CHECK-NEXT: vmsr p0, r8 -; CHECK-NEXT: mvn r8, #-2147483648 +; CHECK-NEXT: vmov r8, r7, d10 +; CHECK-NEXT: bfi r6, r5, #0, #8 +; CHECK-NEXT: asrl r8, r7, #31 +; CHECK-NEXT: bfi r6, r4, #8, #8 +; CHECK-NEXT: vmsr p0, r6 ; CHECK-NEXT: vpsel q2, q2, q0 -; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: subs.w r3, r3, r8 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: subs r4, r4, r3 +; CHECK-NEXT: sbcs r4, r5, #0 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: csetm r4, lt +; CHECK-NEXT: bfi r5, r4, #0, #8 +; CHECK-NEXT: vmov r4, r6, d5 +; CHECK-NEXT: subs r4, r4, r3 +; CHECK-NEXT: sbcs r4, r6, #0 +; CHECK-NEXT: mov.w r6, #0 +; CHECK-NEXT: csetm r4, lt +; CHECK-NEXT: bfi r5, r4, #8, #8 +; CHECK-NEXT: vmsr p0, r5 +; CHECK-NEXT: rsbs.w r5, r8, #-2147483648 +; CHECK-NEXT: sbcs.w r5, r10, r7 +; CHECK-NEXT: vpsel q2, q2, q1 +; CHECK-NEXT: csetm r5, lt +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: bfi r6, r5, #0, #8 +; CHECK-NEXT: vmov r4, r5, d11 +; CHECK-NEXT: asrl r4, r5, #31 +; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 +; CHECK-NEXT: vmov q3[2], q3[0], r8, r4 +; CHECK-NEXT: sbcs.w r3, r10, r5 +; CHECK-NEXT: vmov q3[3], q3[1], r7, r5 +; CHECK-NEXT: csetm r3, lt +; CHECK-NEXT: bfi r6, r3, #8, #8 +; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: mvn r6, #-2147483648 +; CHECK-NEXT: vpsel q3, q3, q0 +; CHECK-NEXT: vmov r3, r4, d6 +; CHECK-NEXT: subs r3, r3, r6 ; CHECK-NEXT: sbcs r3, r4, #0 ; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #0, #8 -; CHECK-NEXT: vmov r3, r5, d5 -; CHECK-NEXT: subs.w r3, r3, r8 +; CHECK-NEXT: vmov r3, r5, d7 +; CHECK-NEXT: subs r3, r3, r6 ; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #8, #8 -; CHECK-NEXT: vmov r3, s12 ; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vpsel q2, q2, q1 -; CHECK-NEXT: smull r4, r7, r4, r3 -; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: rsbs.w r3, 
r4, #-2147483648 -; CHECK-NEXT: sbcs.w r3, r2, r7 -; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: bfi r5, r3, #0, #8 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: smull r6, r3, r6, r3 -; CHECK-NEXT: asrl r6, r3, #31 -; CHECK-NEXT: rsbs.w r1, r6, #-2147483648 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r6 -; CHECK-NEXT: sbcs.w r1, r2, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r7, r3 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r5, r1, #8, #8 -; CHECK-NEXT: vmsr p0, r5 -; CHECK-NEXT: ldrd r5, r2, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: vpsel q3, q3, q0 -; CHECK-NEXT: vmov r1, r3, d6 -; CHECK-NEXT: subs.w r1, r1, r8 -; CHECK-NEXT: sbcs r1, r3, #0 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r3, r1, #0, #8 -; CHECK-NEXT: vmov r1, r4, d7 -; CHECK-NEXT: subs.w r1, r1, r8 -; CHECK-NEXT: sbcs r1, r4, #0 -; CHECK-NEXT: csetm r1, lt -; CHECK-NEXT: bfi r3, r1, #8, #8 -; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q3, q3, q1 -; CHECK-NEXT: vmov.f32 s13, s14 -; CHECK-NEXT: vmov.f32 s14, s8 -; CHECK-NEXT: vmov.f32 s15, s10 -; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldrd r1, r3, [sp] @ 8-byte Folded Reload -; CHECK-NEXT: cmp r1, r3 +; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB1_8 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r1 +; CHECK-NEXT: sub.w lr, r3, r7 ; CHECK-NEXT: mov.w r0, #-1 ; CHECK-NEXT: mov.w r3, #-2147483648 ; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: .LBB1_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r1, [r12], #4 -; CHECK-NEXT: ldr r4, [r9], #4 -; CHECK-NEXT: smull r4, r1, r4, r1 -; CHECK-NEXT: asrl r4, r1, #31 -; CHECK-NEXT: subs r5, r3, r4 -; CHECK-NEXT: sbcs.w r5, r0, r1 -; CHECK-NEXT: cset r5, lt -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: ldr r4, [r12], #4 +; CHECK-NEXT: ldr r5, [r1], #4 +; CHECK-NEXT: smull r4, r5, r5, r4 +; CHECK-NEXT: asrl r4, r5, #31 +; CHECK-NEXT: subs r6, r3, r4 +; CHECK-NEXT: sbcs.w r6, r0, r5 +; CHECK-NEXT: cset r6, lt +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: csel r4, r4, r3, ne -; CHECK-NEXT: csel r1, r1, r0, ne -; CHECK-NEXT: subs r5, r4, r2 -; CHECK-NEXT: sbcs r1, r1, #0 -; CHECK-NEXT: csel r1, r4, r2, lt -; CHECK-NEXT: str r1, [r11], #4 +; CHECK-NEXT: csel r5, r5, r0, ne +; CHECK-NEXT: subs r6, r4, r2 +; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: csel r4, r4, r2, lt +; CHECK-NEXT: str r4, [r11], #4 ; CHECK-NEXT: le lr, .LBB1_7 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 @@ -432,12 +430,12 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) { ; CHECK-LABEL: ssatmul_4t_q31: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r11, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; 
CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB2_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -457,100 +455,98 @@ ; CHECK-NEXT: vldrw.u32 q3, [r5] ; CHECK-NEXT: vdup.32 q1, r6 ; CHECK-NEXT: mvn r8, #-2147483648 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload ; CHECK-NEXT: vdup.32 q4, r9 -; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: add.w r9, r9, #4 ; CHECK-NEXT: vorr q4, q4, q0 ; CHECK-NEXT: vcmp.u32 cs, q1, q4 -; CHECK-NEXT: vstr p0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q5, [r0], #16 ; CHECK-NEXT: vldrwt.u32 q6, [r1], #16 -; CHECK-NEXT: vmov.f32 s16, s22 -; CHECK-NEXT: vmov.f32 s18, s23 -; CHECK-NEXT: vmov.f32 s28, s26 -; CHECK-NEXT: vmov.f32 s30, s27 -; CHECK-NEXT: vmullb.s32 q0, q7, q4 -; CHECK-NEXT: vmov.f32 s22, s25 -; CHECK-NEXT: vmov r10, r5, d0 -; CHECK-NEXT: asrl r10, r5, #31 -; CHECK-NEXT: rsbs.w r7, r10, #-2147483648 -; CHECK-NEXT: sbcs.w r7, r12, r5 -; CHECK-NEXT: csetm r7, lt -; CHECK-NEXT: bfi r4, r7, #0, #8 -; CHECK-NEXT: vmov r6, r7, d1 -; CHECK-NEXT: asrl r6, r7, #31 -; CHECK-NEXT: rsbs.w r3, r6, #-2147483648 -; CHECK-NEXT: vmov q0[2], q0[0], r10, r6 -; CHECK-NEXT: sbcs.w r3, r12, r7 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r7 -; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: vmov r7, s22 -; CHECK-NEXT: bfi r4, r3, #8, #8 -; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vmov r3, r4, d0 -; CHECK-NEXT: subs.w r3, r3, r8 -; CHECK-NEXT: sbcs r3, r4, #0 -; CHECK-NEXT: mov.w r4, #0 -; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: bfi r4, r3, #0, #8 -; CHECK-NEXT: vmov r3, r5, d1 -; CHECK-NEXT: subs.w r3, r3, r8 -; CHECK-NEXT: sbcs r3, r5, #0 -; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: bfi r4, r3, #8, #8 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vmov.f32 s18, s21 +; CHECK-NEXT: vmov r7, s20 ; CHECK-NEXT: vmov r4, s24 -; CHECK-NEXT: vpsel q4, q0, q3 -; CHECK-NEXT: vmov.f32 s2, s21 -; CHECK-NEXT: smull r10, r5, r4, r3 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: asrl r10, r5, #31 -; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 -; CHECK-NEXT: sbcs.w r3, r12, r5 -; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: bfi r4, r3, #0, #8 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: smull r6, r3, r7, r3 -; CHECK-NEXT: asrl r6, r3, #31 -; CHECK-NEXT: rsbs.w r7, r6, #-2147483648 -; CHECK-NEXT: vmov q0[2], q0[0], r10, r6 -; CHECK-NEXT: sbcs.w r7, r12, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r5, r3 -; CHECK-NEXT: csetm r7, lt -; CHECK-NEXT: bfi r4, r7, #8, #8 -; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: vpsel q0, q0, q2 -; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: vmov.f32 s20, s22 +; CHECK-NEXT: vmov.f32 s24, s26 +; CHECK-NEXT: vmov.f32 s22, s23 +; CHECK-NEXT: vmov.f32 s26, s27 +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmullb.s32 q7, q6, q5 +; CHECK-NEXT: vmov.f32 s18, s25 +; CHECK-NEXT: vmov r6, s18 +; CHECK-NEXT: smull r4, r7, r4, r7 +; CHECK-NEXT: asrl r4, r7, #31 +; CHECK-NEXT: smull r6, r5, r6, r5 +; CHECK-NEXT: asrl r6, r5, #31 +; CHECK-NEXT: vmov q4[2], q4[0], r4, r6 +; CHECK-NEXT: rsbs.w r6, r6, #-2147483648 +; CHECK-NEXT: vmov q4[3], q4[1], r7, r5 +; CHECK-NEXT: sbcs.w r5, r12, r5 +; CHECK-NEXT: csetm r5, lt +; CHECK-NEXT: rsbs.w r4, r4, #-2147483648 +; CHECK-NEXT: sbcs.w r4, r12, r7 +; CHECK-NEXT: mov.w r6, #0 +; CHECK-NEXT: csetm r4, lt +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: bfi r6, r4, #0, #8 +; CHECK-NEXT: bfi 
r6, r5, #8, #8
+; CHECK-NEXT: vmsr p0, r6
+; CHECK-NEXT: vpsel q4, q4, q2
+; CHECK-NEXT: vmov r4, r5, d8
+; CHECK-NEXT: subs.w r4, r4, r8
+; CHECK-NEXT: sbcs r4, r5, #0
+; CHECK-NEXT: mov.w r5, #0
+; CHECK-NEXT: csetm r4, lt
+; CHECK-NEXT: bfi r5, r4, #0, #8
+; CHECK-NEXT: vmov r4, r6, d9
+; CHECK-NEXT: subs.w r4, r4, r8
+; CHECK-NEXT: sbcs r4, r6, #0
+; CHECK-NEXT: vmov r6, r11, d14
+; CHECK-NEXT: csetm r4, lt
+; CHECK-NEXT: asrl r6, r11, #31
+; CHECK-NEXT: bfi r5, r4, #8, #8
+; CHECK-NEXT: rsbs.w r4, r6, #-2147483648
+; CHECK-NEXT: sbcs.w r4, r12, r11
+; CHECK-NEXT: vmsr p0, r5
+; CHECK-NEXT: csetm r4, lt
+; CHECK-NEXT: vpsel q4, q4, q3
+; CHECK-NEXT: bfi r7, r4, #0, #8
+; CHECK-NEXT: vmov r4, r3, d15
+; CHECK-NEXT: asrl r4, r3, #31
+; CHECK-NEXT: vmov.f32 s17, s18
+; CHECK-NEXT: rsbs.w r5, r4, #-2147483648
+; CHECK-NEXT: vmov q5[2], q5[0], r6, r4
+; CHECK-NEXT: sbcs.w r5, r12, r3
+; CHECK-NEXT: vmov q5[3], q5[1], r11, r3
+; CHECK-NEXT: csetm r5, lt
+; CHECK-NEXT: bfi r7, r5, #8, #8
+; CHECK-NEXT: vmsr p0, r7
+; CHECK-NEXT: vpsel q5, q5, q2
+; CHECK-NEXT: vmov r3, r4, d10
 ; CHECK-NEXT: subs.w r3, r3, r8
 ; CHECK-NEXT: sbcs r3, r4, #0
 ; CHECK-NEXT: mov.w r4, #0
 ; CHECK-NEXT: csetm r3, lt
 ; CHECK-NEXT: bfi r4, r3, #0, #8
-; CHECK-NEXT: vmov r3, r5, d1
+; CHECK-NEXT: vmov r3, r5, d11
 ; CHECK-NEXT: subs.w r3, r3, r8
 ; CHECK-NEXT: sbcs r3, r5, #0
 ; CHECK-NEXT: csetm r3, lt
 ; CHECK-NEXT: bfi r4, r3, #8, #8
 ; CHECK-NEXT: vmsr p0, r4
-; CHECK-NEXT: vpsel q0, q0, q3
-; CHECK-NEXT: vldr p0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: vmov.f32 s1, s2
-; CHECK-NEXT: vmov.f32 s2, s16
-; CHECK-NEXT: vmov.f32 s3, s18
+; CHECK-NEXT: vpsel q5, q5, q3
+; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.f32 s18, s20
+; CHECK-NEXT: vmov.f32 s19, s22
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q0, [r2], #16
+; CHECK-NEXT: vstrwt.32 q4, [r2], #16
 ; CHECK-NEXT: le lr, .LBB2_2
 ; CHECK-NEXT: .LBB2_3: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: add sp, #8
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r11, pc}
 ; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: @ %bb.4:
 ; CHECK-NEXT: .LCPI2_0:
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
@@ -344,12 +344,14 @@ define arm_aapcs_vfpcc void @ptr_v4f16_dup(half %v, <4 x half*> %offs) {
 ; CHECK-LABEL: ptr_v4f16_dup:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov r2, r3, d3
+; CHECK-NEXT: vmov.f16 r0, s0
+; CHECK-NEXT: vmov r1, r2, d2
+; CHECK-NEXT: vmov.16 q2[0], r0
+; CHECK-NEXT: vstr.16 s8, [r1]
+; CHECK-NEXT: vmov r0, r1, d3
+; CHECK-NEXT: vstr.16 s0, [r2]
 ; CHECK-NEXT: vstr.16 s0, [r0]
 ; CHECK-NEXT: vstr.16 s0, [r1]
-; CHECK-NEXT: vstr.16 s0, [r2]
-; CHECK-NEXT: vstr.16 s0, [r3]
 ; CHECK-NEXT: bx lr
 entry:
 %splatinsert = insertelement <4 x half> poison, half %v, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
--- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
@@ -57,19 +57,19 @@
 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
 ; CHECK-NEXT: vpt.s32 lt, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q5, [r0]
-; CHECK-NEXT: vmov.f32 s2, s21
+; CHECK-NEXT: vmov.f32 s2, s23
+; CHECK-NEXT: vmov.f32 s16, s22
 ; CHECK-NEXT: vmov r0, s2
 ; CHECK-NEXT: asrs r1, r0, #31
 ; CHECK-NEXT: bl __aeabi_l2d
-; CHECK-NEXT: vmov r2, s20
+; CHECK-NEXT: vmov r2, s16
 ; CHECK-NEXT: vmov d9, r0, r1
 ; CHECK-NEXT: asrs r3, r2, #31
 ; CHECK-NEXT: mov r0, r2
 ; CHECK-NEXT: mov r1, r3
 ; CHECK-NEXT: bl __aeabi_l2d
-; CHECK-NEXT: vmov.f32 s2, s23
+; CHECK-NEXT: vmov.f32 s2, s21
 ; CHECK-NEXT: vmov d8, r0, r1
-; CHECK-NEXT: vmov.f32 s20, s22
 ; CHECK-NEXT: vmov r2, s2
 ; CHECK-NEXT: asrs r3, r2, #31
 ; CHECK-NEXT: mov r0, r2
@@ -82,8 +82,8 @@
 ; CHECK-NEXT: mov r1, r3
 ; CHECK-NEXT: bl __aeabi_l2d
 ; CHECK-NEXT: vmov d10, r0, r1
-; CHECK-NEXT: vmov q0, q4
-; CHECK-NEXT: vmov q1, q5
+; CHECK-NEXT: vmov q1, q4
+; CHECK-NEXT: vmov q0, q5
 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: pop {r7, pc}
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -353,39 +353,39 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9}
 ; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vmovx.f16 s12, s0
-; CHECK-NEXT: vmov.f32 s16, s1
-; CHECK-NEXT: vins.f16 s12, s2
-; CHECK-NEXT: vmovx.f16 s2, s2
+; CHECK-NEXT: vmovx.f16 s14, s2
+; CHECK-NEXT: vmov.f32 s12, s1
+; CHECK-NEXT: vins.f16 s12, s14
+; CHECK-NEXT: vmovx.f16 s14, s5
+; CHECK-NEXT: vmov.f32 s13, s4
+; CHECK-NEXT: vmovx.f16 s15, s8
+; CHECK-NEXT: vins.f16 s13, s14
+; CHECK-NEXT: vmov.f32 s14, s7
+; CHECK-NEXT: vins.f16 s14, s15
+; CHECK-NEXT: vmovx.f16 s16, s11
+; CHECK-NEXT: vmov.f32 s15, s10
+; CHECK-NEXT: vmovx.f16 s17, s3
+; CHECK-NEXT: vins.f16 s15, s16
+; CHECK-NEXT: vmovx.f16 s16, s0
 ; CHECK-NEXT: vins.f16 s16, s2
-; CHECK-NEXT: vmovx.f16 s2, s5
-; CHECK-NEXT: vmov.f32 s17, s4
-; CHECK-NEXT: vmovx.f16 s13, s3
-; CHECK-NEXT: vins.f16 s17, s2
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmovx.f16 s2, s8
-; CHECK-NEXT: vmov.f32 s19, s10
-; CHECK-NEXT: vins.f16 s18, s2
-; CHECK-NEXT: vmovx.f16 s2, s11
-; CHECK-NEXT: vins.f16 s19, s2
 ; CHECK-NEXT: vmovx.f16 s2, s1
 ; CHECK-NEXT: vins.f16 s0, s2
 ; CHECK-NEXT: vmovx.f16 s2, s4
 ; CHECK-NEXT: vins.f16 s3, s2
 ; CHECK-NEXT: vmovx.f16 s2, s7
-; CHECK-NEXT: vmovx.f16 s4, s10
-; CHECK-NEXT: vmovx.f16 s14, s6
-; CHECK-NEXT: vmovx.f16 s15, s9
+; CHECK-NEXT: vmovx.f16 s18, s6
 ; CHECK-NEXT: vins.f16 s6, s2
-; CHECK-NEXT: vins.f16 s9, s4
+; CHECK-NEXT: vmovx.f16 s2, s10
+; CHECK-NEXT: vmovx.f16 s19, s9
+; CHECK-NEXT: vins.f16 s9, s2
 ; CHECK-NEXT: vmov.f32 s1, s3
-; CHECK-NEXT: vins.f16 s14, s8
-; CHECK-NEXT: vins.f16 s15, s11
-; CHECK-NEXT: vins.f16 s13, s5
-; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vins.f16 s18, s8
+; CHECK-NEXT: vins.f16 s19, s11
+; CHECK-NEXT: vins.f16 s17, s5
 ; CHECK-NEXT: vmov.f32 s3, s9
-; CHECK-NEXT: vadd.i16 q0, q0, q3
+; CHECK-NEXT: vmov.f32 s2, s6
 ; CHECK-NEXT: vadd.i16 q0, q0, q4
+; CHECK-NEXT: vadd.i16 q0, q0, q3
 ; CHECK-NEXT: vpop {d8, d9}
 ; CHECK-NEXT: bx lr
 entry:
@@ -691,8 +691,8 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle3step_i8(<64 x i8> %src) {
 ; CHECK-LABEL: shuffle3step_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vmov.u8 r0, q0[1]
 ; CHECK-NEXT: vmov.8 q3[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[4]
@@ -702,66 +702,72 @@
 ; CHECK-NEXT: vmov.u8 r0, q0[10]
 ; CHECK-NEXT: vmov.8 q3[3], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[13]
-; CHECK-NEXT: vmov.8 q3[4], r0
+; CHECK-NEXT: vmov q4, q3
+; CHECK-NEXT: vmov.8 q4[4], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[0]
-; CHECK-NEXT: vmov.8 q3[5], r0
+; CHECK-NEXT: vmov.8 q4[5], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[3]
-; CHECK-NEXT: vmov.8 q3[6], r0
+; CHECK-NEXT: vmov.8 q4[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[9]
-; CHECK-NEXT: vmov.8 q4[8], r0
+; CHECK-NEXT: vmov.8 q5[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[12]
-; CHECK-NEXT: vmov.8 q4[9], r0
+; CHECK-NEXT: vmov.8 q5[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[15]
-; CHECK-NEXT: vmov.8 q4[10], r0
+; CHECK-NEXT: vmov.8 q5[10], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[2]
-; CHECK-NEXT: vmov.8 q4[11], r0
+; CHECK-NEXT: vmov.8 q5[11], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[5]
-; CHECK-NEXT: vmov.8 q4[12], r0
+; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vmov.f32 s14, s22
+; CHECK-NEXT: vmov.8 q6[12], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[8]
-; CHECK-NEXT: vmov.8 q4[13], r0
+; CHECK-NEXT: vmov.8 q6[13], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[11]
-; CHECK-NEXT: vmov.8 q4[14], r0
+; CHECK-NEXT: vmov.8 q6[14], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[14]
-; CHECK-NEXT: vmov.8 q4[15], r0
+; CHECK-NEXT: vmov.8 q6[15], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[6]
-; CHECK-NEXT: vmov.8 q3[7], r0
+; CHECK-NEXT: vmov.8 q4[7], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[0]
-; CHECK-NEXT: vmov.f32 s14, s18
-; CHECK-NEXT: vmov.f32 s15, s19
+; CHECK-NEXT: vmov.f32 s13, s17
 ; CHECK-NEXT: vmov.8 q4[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[3]
+; CHECK-NEXT: vmov.f32 s15, s27
 ; CHECK-NEXT: vmov.8 q4[1], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[6]
 ; CHECK-NEXT: vmov.8 q4[2], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[9]
 ; CHECK-NEXT: vmov.8 q4[3], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[12]
-; CHECK-NEXT: vmov.8 q4[4], r0
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmov.8 q5[4], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[15]
-; CHECK-NEXT: vmov.8 q4[5], r0
+; CHECK-NEXT: vmov.8 q5[5], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[2]
-; CHECK-NEXT: vmov.8 q4[6], r0
+; CHECK-NEXT: vmov.8 q5[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[8]
-; CHECK-NEXT: vmov.8 q5[8], r0
+; CHECK-NEXT: vmov.8 q6[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[11]
-; CHECK-NEXT: vmov.8 q5[9], r0
+; CHECK-NEXT: vmov.8 q6[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[14]
-; CHECK-NEXT: vmov.8 q5[10], r0
+; CHECK-NEXT: vmov.8 q6[10], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[1]
-; CHECK-NEXT: vmov.8 q5[11], r0
+; CHECK-NEXT: vmov.8 q6[11], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[4]
-; CHECK-NEXT: vmov.8 q5[12], r0
+; CHECK-NEXT: vmov q7, q6
+; CHECK-NEXT: vmov.f32 s18, s26
+; CHECK-NEXT: vmov.8 q7[12], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[7]
-; CHECK-NEXT: vmov.8 q5[13], r0
+; CHECK-NEXT: vmov.8 q7[13], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[10]
-; CHECK-NEXT: vmov.8 q5[14], r0
+; CHECK-NEXT: vmov.8 q7[14], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[13]
-; CHECK-NEXT: vmov.8 q5[15], r0
+; CHECK-NEXT: vmov.8 q7[15], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[5]
-; CHECK-NEXT: vmov.8 q4[7], r0
+; CHECK-NEXT: vmov.8 q5[7], r0
+; CHECK-NEXT: vmov.f32 s19, s31
+; CHECK-NEXT: vmov.f32 s17, s21
 ; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.f32 s18, s22
-; CHECK-NEXT: vmov.f32 s19, s23
 ; CHECK-NEXT: vadd.i8 q3, q4, q3
 ; CHECK-NEXT: vmov.8 q4[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[5]
@@ -771,33 +777,36 @@
 ; CHECK-NEXT: vmov.u8 r0, q0[11]
 ; CHECK-NEXT: vmov.8 q4[3], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[14]
-; CHECK-NEXT: vmov.8 q4[4], r0
+; CHECK-NEXT: vmov q0, q4
+; CHECK-NEXT: vmov.8 q0[4], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[1]
-; CHECK-NEXT: vmov.8 q4[5], r0
+; CHECK-NEXT: vmov.8 q0[5], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[4]
-; CHECK-NEXT: vmov.8 q4[6], r0
+; CHECK-NEXT: vmov.8 q0[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[10]
-; CHECK-NEXT: vmov.8 q0[8], r0
+; CHECK-NEXT: vmov.8 q5[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[13]
-; CHECK-NEXT: vmov.8 q0[9], r0
+; CHECK-NEXT: vmov.8 q5[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[0]
-; CHECK-NEXT: vmov.8 q0[10], r0
+; CHECK-NEXT: vmov.8 q5[10], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[3]
-; CHECK-NEXT: vmov.8 q0[11], r0
+; CHECK-NEXT: vmov.8 q5[11], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[6]
-; CHECK-NEXT: vmov.8 q0[12], r0
+; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vmov.f32 s18, s22
+; CHECK-NEXT: vmov.8 q6[12], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[9]
-; CHECK-NEXT: vmov.8 q0[13], r0
+; CHECK-NEXT: vmov.8 q6[13], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[12]
-; CHECK-NEXT: vmov.8 q0[14], r0
+; CHECK-NEXT: vmov.8 q6[14], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[15]
-; CHECK-NEXT: vmov.8 q0[15], r0
+; CHECK-NEXT: vmov.8 q6[15], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[7]
-; CHECK-NEXT: vmov.8 q4[7], r0
-; CHECK-NEXT: vmov.f32 s18, s2
-; CHECK-NEXT: vmov.f32 s19, s3
+; CHECK-NEXT: vmov.8 q0[7], r0
+; CHECK-NEXT: vmov.f32 s19, s27
+; CHECK-NEXT: vmov.f32 s17, s1
 ; CHECK-NEXT: vadd.i8 q0, q3, q4
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32>
@@ -811,9 +820,12 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle4step_i8(<64 x i8> %src) {
 ; CHECK-LABEL: shuffle4step_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #48
+; CHECK-NEXT: sub sp, #48
 ; CHECK-NEXT: vmov.u8 r0, q0[3]
+; CHECK-NEXT: vmov q7, q3
 ; CHECK-NEXT: vmov.8 q4[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[7]
 ; CHECK-NEXT: vmov.8 q4[1], r0
@@ -822,137 +834,162 @@
 ; CHECK-NEXT: vmov.u8 r0, q0[15]
 ; CHECK-NEXT: vmov.8 q4[3], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[3]
-; CHECK-NEXT: vmov.8 q4[4], r0
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmov q3, q2
+; CHECK-NEXT: vmov.8 q5[4], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[7]
-; CHECK-NEXT: vmov.8 q4[5], r0
+; CHECK-NEXT: vmov.8 q5[5], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[11]
-; CHECK-NEXT: vmov.8 q4[6], r0
-; CHECK-NEXT: vmov.u8 r0, q2[3]
-; CHECK-NEXT: vmov.8 q5[8], r0
-; CHECK-NEXT: vmov.u8 r0, q2[7]
-; CHECK-NEXT: vmov.8 q5[9], r0
-; CHECK-NEXT: vmov.u8 r0, q2[11]
-; CHECK-NEXT: vmov.8 q5[10], r0
-; CHECK-NEXT: vmov.u8 r0, q2[15]
-; CHECK-NEXT: vmov.8 q5[11], r0
+; CHECK-NEXT: vmov.8 q5[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[3]
-; CHECK-NEXT: vmov.8 q5[12], r0
+; CHECK-NEXT: vmov.8 q6[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[7]
-; CHECK-NEXT: vmov.8 q5[13], r0
+; CHECK-NEXT: vmov.8 q6[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[11]
-; CHECK-NEXT: vmov.8 q5[14], r0
+; CHECK-NEXT: vmov.8 q6[10], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[15]
-; CHECK-NEXT: vmov.8 q5[15], r0
+; CHECK-NEXT: vmov.8 q6[11], r0
+; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov q1, q7
+; CHECK-NEXT: vmov.u8 r0, q7[3]
+; CHECK-NEXT: vmov q7, q6
+; CHECK-NEXT: vmov.8 q7[12], r0
+; CHECK-NEXT: vmov.u8 r0, q1[7]
+; CHECK-NEXT: vmov.8 q7[13], r0
+; CHECK-NEXT: vmov.u8 r0, q1[11]
+; CHECK-NEXT: vmov.8 q7[14], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[15]
-; CHECK-NEXT: vmov.8 q4[7], r0
-; CHECK-NEXT: vmov.u8 r0, q0[2]
-; CHECK-NEXT: vmov.f32 s18, s22
-; CHECK-NEXT: vmov.f32 s19, s23
-; CHECK-NEXT: vmov.8 q5[0], r0
-; CHECK-NEXT: vmov.u8 r0, q0[6]
-; CHECK-NEXT: vmov.8 q5[1], r0
-; CHECK-NEXT: vmov.u8 r0, q0[10]
-; CHECK-NEXT: vmov.8 q5[2], r0
-; CHECK-NEXT: vmov.u8 r0, q0[14]
-; CHECK-NEXT: vmov.8 q5[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[2]
-; CHECK-NEXT: vmov.8 q5[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[6]
-; CHECK-NEXT: vmov.8 q5[5], r0
-; CHECK-NEXT: vmov.u8 r0, q1[10]
-; CHECK-NEXT: vmov.8 q5[6], r0
+; CHECK-NEXT: vmov.8 q7[15], r0
+; CHECK-NEXT: vmov.u8 r0, q0[15]
+; CHECK-NEXT: vmov.8 q5[7], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[2]
-; CHECK-NEXT: vmov.8 q6[8], r0
+; CHECK-NEXT: vmov.f32 s17, s21
+; CHECK-NEXT: vmov.8 q5[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[6]
-; CHECK-NEXT: vmov.8 q6[9], r0
+; CHECK-NEXT: vmov.f32 s18, s26
+; CHECK-NEXT: vmov.8 q5[1], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[10]
-; CHECK-NEXT: vmov.8 q6[10], r0
+; CHECK-NEXT: vmov.8 q5[2], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[14]
-; CHECK-NEXT: vmov.8 q6[11], r0
+; CHECK-NEXT: vmov.8 q5[3], r0
+; CHECK-NEXT: vmov.u8 r0, q0[2]
+; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov.8 q6[4], r0
+; CHECK-NEXT: vmov.u8 r0, q0[6]
+; CHECK-NEXT: vmov.8 q6[5], r0
+; CHECK-NEXT: vmov.u8 r0, q0[10]
+; CHECK-NEXT: vmov.8 q6[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[2]
-; CHECK-NEXT: vmov.8 q6[12], r0
+; CHECK-NEXT: vmov.8 q0[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[6]
-; CHECK-NEXT: vmov.8 q6[13], r0
+; CHECK-NEXT: vmov.8 q0[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[10]
-; CHECK-NEXT: vmov.8 q6[14], r0
+; CHECK-NEXT: vmov.8 q0[10], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[14]
-; CHECK-NEXT: vmov.8 q6[15], r0
+; CHECK-NEXT: vmov.8 q0[11], r0
+; CHECK-NEXT: vmov.f32 s19, s31
+; CHECK-NEXT: vmov.u8 r0, q1[2]
+; CHECK-NEXT: vmov q7, q0
+; CHECK-NEXT: vmov.8 q7[12], r0
+; CHECK-NEXT: vmov.u8 r0, q1[6]
+; CHECK-NEXT: vmov.8 q7[13], r0
+; CHECK-NEXT: vmov.u8 r0, q1[10]
+; CHECK-NEXT: vmov.8 q7[14], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[14]
-; CHECK-NEXT: vmov.8 q5[7], r0
-; CHECK-NEXT: vmov.u8 r0, q0[1]
-; CHECK-NEXT: vmov.f32 s22, s26
-; CHECK-NEXT: vmov.f32 s23, s27
-; CHECK-NEXT: vadd.i8 q4, q5, q4
-; CHECK-NEXT: vmov.8 q5[0], r0
-; CHECK-NEXT: vmov.u8 r0, q0[5]
-; CHECK-NEXT: vmov.8 q5[1], r0
-; CHECK-NEXT: vmov.u8 r0, q0[9]
-; CHECK-NEXT: vmov.8 q5[2], r0
-; CHECK-NEXT: vmov.u8 r0, q0[13]
-; CHECK-NEXT: vmov.8 q5[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[1]
-; CHECK-NEXT: vmov.8 q5[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[5]
-; CHECK-NEXT: vmov.8 q5[5], r0
-; CHECK-NEXT: vmov.u8 r0, q1[9]
-; CHECK-NEXT: vmov.8 q5[6], r0
+; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.8 q7[15], r0
+; CHECK-NEXT: vmov.f32 s22, s2
+; CHECK-NEXT: vmov.u8 r0, q1[14]
+; CHECK-NEXT: vmov.f32 s23, s31
+; CHECK-NEXT: vmov.8 q6[7], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[1]
-; CHECK-NEXT: vmov.8 q6[8], r0
+; CHECK-NEXT: vmov.f32 s21, s25
+; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT: vadd.i8 q0, q5, q4
+; CHECK-NEXT: vmov.8 q5[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[5]
-; CHECK-NEXT: vmov.8 q6[9], r0
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.8 q5[1], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[9]
-; CHECK-NEXT: vmov.8 q6[10], r0
+; CHECK-NEXT: vmov.8 q5[2], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[13]
-; CHECK-NEXT: vmov.8 q6[11], r0
+; CHECK-NEXT: vmov.8 q5[3], r0
+; CHECK-NEXT: vmov.u8 r0, q4[1]
+; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vmov.8 q6[4], r0
+; CHECK-NEXT: vmov.u8 r0, q4[5]
+; CHECK-NEXT: vmov.8 q6[5], r0
+; CHECK-NEXT: vmov.u8 r0, q4[9]
+; CHECK-NEXT: vmov.8 q6[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[1]
-; CHECK-NEXT: vmov.8 q6[12], r0
+; CHECK-NEXT: vmov.8 q7[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[5]
-; CHECK-NEXT: vmov.8 q6[13], r0
+; CHECK-NEXT: vmov.8 q7[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[9]
-; CHECK-NEXT: vmov.8 q6[14], r0
+; CHECK-NEXT: vmov.8 q7[10], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[13]
-; CHECK-NEXT: vmov.8 q6[15], r0
+; CHECK-NEXT: vmov.8 q7[11], r0
+; CHECK-NEXT: vmov.u8 r0, q1[1]
+; CHECK-NEXT: vmov q0, q7
+; CHECK-NEXT: vmov.f32 s22, s30
+; CHECK-NEXT: vmov.8 q0[12], r0
+; CHECK-NEXT: vmov.u8 r0, q1[5]
+; CHECK-NEXT: vmov.8 q0[13], r0
+; CHECK-NEXT: vmov.u8 r0, q1[9]
+; CHECK-NEXT: vmov.8 q0[14], r0
 ; CHECK-NEXT: vmov.u8 r0, q1[13]
-; CHECK-NEXT: vmov.8 q5[7], r0
-; CHECK-NEXT: vmov.u8 r0, q0[0]
-; CHECK-NEXT: vmov.f32 s22, s26
-; CHECK-NEXT: vmov.f32 s23, s27
-; CHECK-NEXT: vmov.8 q6[0], r0
-; CHECK-NEXT: vmov.u8 r0, q0[4]
-; CHECK-NEXT: vmov.8 q6[1], r0
-; CHECK-NEXT: vmov.u8 r0, q0[8]
-; CHECK-NEXT: vmov.8 q6[2], r0
-; CHECK-NEXT: vmov.u8 r0, q0[12]
-; CHECK-NEXT: vmov.8 q6[3], r0
-; CHECK-NEXT: vmov.u8 r0, q1[0]
-; CHECK-NEXT: vmov.8 q6[4], r0
-; CHECK-NEXT: vmov.u8 r0, q1[4]
-; CHECK-NEXT: vmov.8 q6[5], r0
-; CHECK-NEXT: vmov.u8 r0, q1[8]
-; CHECK-NEXT: vmov.8 q6[6], r0
+; CHECK-NEXT: vmov.8 q0[15], r0
+; CHECK-NEXT: vmov.u8 r0, q4[13]
+; CHECK-NEXT: vmov.8 q6[7], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[0]
-; CHECK-NEXT: vmov.8 q0[8], r0
+; CHECK-NEXT: vmov.f32 s21, s25
+; CHECK-NEXT: vmov.8 q6[0], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[4]
-; CHECK-NEXT: vmov.8 q0[9], r0
+; CHECK-NEXT: vmov.f32 s23, s3
+; CHECK-NEXT: vmov.8 q6[1], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[8]
-; CHECK-NEXT: vmov.8 q0[10], r0
+; CHECK-NEXT: vmov.8 q6[2], r0
 ; CHECK-NEXT: vmov.u8 r0, q2[12]
-; CHECK-NEXT: vmov.8 q0[11], r0
+; CHECK-NEXT: vmov.8 q6[3], r0
+; CHECK-NEXT: vmov.u8 r0, q4[0]
+; CHECK-NEXT: vmov q0, q6
+; CHECK-NEXT: vmov q2, q1
+; CHECK-NEXT: vmov.8 q0[4], r0
+; CHECK-NEXT: vmov.u8 r0, q4[4]
+; CHECK-NEXT: vmov.8 q0[5], r0
+; CHECK-NEXT: vmov.u8 r0, q4[8]
+; CHECK-NEXT: vmov.8 q0[6], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[0]
-; CHECK-NEXT: vmov.8 q0[12], r0
+; CHECK-NEXT: vmov.8 q7[8], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[4]
-; CHECK-NEXT: vmov.8 q0[13], r0
+; CHECK-NEXT: vmov.8 q7[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[8]
-; CHECK-NEXT: vmov.8 q0[14], r0
+; CHECK-NEXT: vmov.8 q7[10], r0
 ; CHECK-NEXT: vmov.u8 r0, q3[12]
-; CHECK-NEXT: vmov.8 q0[15], r0
-; CHECK-NEXT: vmov.u8 r0, q1[12]
-; CHECK-NEXT: vmov.8 q6[7], r0
-; CHECK-NEXT: vmov.f32 s26, s2
-; CHECK-NEXT: vmov.f32 s27, s3
+; CHECK-NEXT: vmov.8 q7[11], r0
+; CHECK-NEXT: vmov.u8 r0, q1[0]
+; CHECK-NEXT: vmov q1, q7
+; CHECK-NEXT: vmov.f32 s26, s30
+; CHECK-NEXT: vmov.8 q1[12], r0
+; CHECK-NEXT: vmov.u8 r0, q2[4]
+; CHECK-NEXT: vmov.8 q1[13], r0
+; CHECK-NEXT: vmov.u8 r0, q2[8]
+; CHECK-NEXT: vmov.8 q1[14], r0
+; CHECK-NEXT: vmov.u8 r0, q2[12]
+; CHECK-NEXT: vmov.8 q1[15], r0
+; CHECK-NEXT: vmov.u8 r0, q4[12]
+; CHECK-NEXT: vmov.8 q0[7], r0
+; CHECK-NEXT: vmov.f32 s27, s7
+; CHECK-NEXT: vmov.f32 s25, s1
+; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT: vadd.i8 q0, q6, q5
-; CHECK-NEXT: vadd.i8 q0, q0, q4
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vadd.i8 q0, q0, q1
+; CHECK-NEXT: add sp, #48
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = shufflevector <64 x i8> %src, <64 x i8> undef, <16 x i32>
@@ -1321,37 +1358,37 @@
 ; CHECKFP: @ %bb.0: @ %entry
 ; CHECKFP-NEXT: .vsave {d8, d9}
 ; CHECKFP-NEXT: vpush {d8, d9}
-; CHECKFP-NEXT: vmov.f32 s13, s4
-; CHECKFP-NEXT: vmovx.f16 s4, s4
-; CHECKFP-NEXT: vmovx.f16 s17, s3
-; CHECKFP-NEXT: vins.f16 s3, s4
-; CHECKFP-NEXT: vmovx.f16 s4, s7
-; CHECKFP-NEXT: vmovx.f16 s18, s6
-; CHECKFP-NEXT: vmovx.f16 s16, s0
-; CHECKFP-NEXT: vins.f16 s6, s4
 ; CHECKFP-NEXT: vmovx.f16 s14, s2
 ; CHECKFP-NEXT: vmov.f32 s12, s1
-; CHECKFP-NEXT: vmovx.f16 s4, s10
-; CHECKFP-NEXT: vmovx.f16 s19, s9
 ; CHECKFP-NEXT: vins.f16 s12, s14
 ; CHECKFP-NEXT: vmovx.f16 s14, s5
-; CHECKFP-NEXT: vins.f16 s16, s2
-; CHECKFP-NEXT: vmovx.f16 s2, s11
+; CHECKFP-NEXT: vmov.f32 s13, s4
 ; CHECKFP-NEXT: vmovx.f16 s15, s8
-; CHECKFP-NEXT: vins.f16 s18, s8
-; CHECKFP-NEXT: vmovx.f16 s8, s1
-; CHECKFP-NEXT: vins.f16 s9, s4
 ; CHECKFP-NEXT: vins.f16 s13, s14
 ; CHECKFP-NEXT: vmov.f32 s14, s7
-; CHECKFP-NEXT: vins.f16 s10, s2
+; CHECKFP-NEXT: vins.f16 s14, s15
+; CHECKFP-NEXT: vmovx.f16 s16, s11
+; CHECKFP-NEXT: vmov.f32 s15, s10
+; CHECKFP-NEXT: vmovx.f16 s17, s3
+; CHECKFP-NEXT: vins.f16 s15, s16
+; CHECKFP-NEXT: vmovx.f16 s16, s0
+; CHECKFP-NEXT: vins.f16 s16, s2
+; CHECKFP-NEXT: vmovx.f16 s2, s1
+; CHECKFP-NEXT: vins.f16 s0, s2
+; CHECKFP-NEXT: vmovx.f16 s2, s4
+; CHECKFP-NEXT: vins.f16 s3, s2
+; CHECKFP-NEXT: vmovx.f16 s2, s7
+; CHECKFP-NEXT: vmovx.f16 s18, s6
+; CHECKFP-NEXT: vins.f16 s6, s2
+; CHECKFP-NEXT: vmovx.f16 s2, s10
+; CHECKFP-NEXT: vmovx.f16 s19, s9
+; CHECKFP-NEXT: vins.f16 s9, s2
 ; CHECKFP-NEXT: vmov.f32 s1, s3
+; CHECKFP-NEXT: vins.f16 s18, s8
 ; CHECKFP-NEXT: vins.f16 s19, s11
 ; CHECKFP-NEXT: vins.f16 s17, s5
-; CHECKFP-NEXT: vins.f16 s0, s8
-; CHECKFP-NEXT: vmov.f32 s2, s6
 ; CHECKFP-NEXT: vmov.f32 s3, s9
-; CHECKFP-NEXT: vins.f16 s14, s15
-; CHECKFP-NEXT: vmov.f32 s15, s10
+; CHECKFP-NEXT: vmov.f32 s2, s6
 ; CHECKFP-NEXT: vadd.f16 q0, q0, q4
 ; CHECKFP-NEXT: vadd.f16 q0, q0, q3
 ; CHECKFP-NEXT: vpop {d8, d9}
diff --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
--- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll
@@ -140,12 +140,14 @@ define <8 x half> @vector_add_f16(<8 x half> %lhs, <8 x half> %rhs) {
 ; CHECK-MVE-LABEL: vector_add_f16:
 ; CHECK-MVE: @ %bb.0: @ %entry
-; CHECK-MVE-NEXT: .save {r4, r5, r7, lr}
-; CHECK-MVE-NEXT: push {r4, r5, r7, lr}
+; CHECK-MVE-NEXT: .save {r4, r5, r6, r7, lr}
+; CHECK-MVE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-MVE-NEXT: .pad #4
+; CHECK-MVE-NEXT: sub sp, #4
 ; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-MVE-NEXT: vmov d8, r0, r1
-; CHECK-MVE-NEXT: add r0, sp, #64
+; CHECK-MVE-NEXT: add r0, sp, #72
 ; CHECK-MVE-NEXT: vldrw.u32 q6, [r0]
 ; CHECK-MVE-NEXT: vmov d9, r2, r3
 ; CHECK-MVE-NEXT: vmov.u16 r4, q4[0]
@@ -189,58 +191,63 @@
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
 ; CHECK-MVE-NEXT: vmov.16 q5[3], r0
 ; CHECK-MVE-NEXT: vmov.u16 r0, q6[4]
-; CHECK-MVE-NEXT: vmov.u16 r4, q4[4]
+; CHECK-MVE-NEXT: vmov r5, r4, d10
+; CHECK-MVE-NEXT: vmov.u16 r6, q4[4]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: mov r5, r0
-; CHECK-MVE-NEXT: mov r0, r4
+; CHECK-MVE-NEXT: mov r7, r0
+; CHECK-MVE-NEXT: mov r0, r6
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: mov r1, r5
+; CHECK-MVE-NEXT: mov r1, r7
 ; CHECK-MVE-NEXT: bl __aeabi_fadd
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
 ; CHECK-MVE-NEXT: vmov.16 q5[4], r0
 ; CHECK-MVE-NEXT: vmov.u16 r0, q6[5]
-; CHECK-MVE-NEXT: vmov.u16 r4, q4[5]
+; CHECK-MVE-NEXT: vmov.u16 r6, q4[5]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: mov r5, r0
-; CHECK-MVE-NEXT: mov r0, r4
+; CHECK-MVE-NEXT: mov r7, r0
+; CHECK-MVE-NEXT: mov r0, r6
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: mov r1, r5
+; CHECK-MVE-NEXT: mov r1, r7
 ; CHECK-MVE-NEXT: bl __aeabi_fadd
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
 ; CHECK-MVE-NEXT: vmov.16 q5[5], r0
 ; CHECK-MVE-NEXT: vmov.u16 r0, q6[6]
-; CHECK-MVE-NEXT: vmov.u16 r4, q4[6]
+; CHECK-MVE-NEXT: vmov.u16 r6, q4[6]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: mov r5, r0
-; CHECK-MVE-NEXT: mov r0, r4
+; CHECK-MVE-NEXT: mov r7, r0
+; CHECK-MVE-NEXT: mov r0, r6
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: mov r1, r5
+; CHECK-MVE-NEXT: mov r1, r7
 ; CHECK-MVE-NEXT: bl __aeabi_fadd
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
 ; CHECK-MVE-NEXT: vmov.16 q5[6], r0
 ; CHECK-MVE-NEXT: vmov.u16 r0, q6[7]
-; CHECK-MVE-NEXT: vmov.u16 r4, q4[7]
+; CHECK-MVE-NEXT: vmov.u16 r6, q4[7]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: mov r5, r0
-; CHECK-MVE-NEXT: mov r0, r4
+; CHECK-MVE-NEXT: mov r7, r0
+; CHECK-MVE-NEXT: mov r0, r6
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: mov r1, r5
+; CHECK-MVE-NEXT: mov r1, r7
 ; CHECK-MVE-NEXT: bl __aeabi_fadd
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
 ; CHECK-MVE-NEXT: vmov.16 q5[7], r0
-; CHECK-MVE-NEXT: vmov r0, r1, d10
+; CHECK-MVE-NEXT: mov r0, r5
 ; CHECK-MVE-NEXT: vmov r2, r3, d11
+; CHECK-MVE-NEXT: mov r1, r4
 ; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-MVE-NEXT: pop {r4, r5, r7, pc}
+; CHECK-MVE-NEXT: add sp, #4
+; CHECK-MVE-NEXT: pop {r4, r5, r6, r7, pc}
 ;
 ; CHECK-BE-LABEL: vector_add_f16:
 ; CHECK-BE: @ %bb.0: @ %entry
-; CHECK-BE-NEXT: .save {r4, r5, r7, lr}
-; CHECK-BE-NEXT: push {r4, r5, r7, lr}
+; CHECK-BE-NEXT: .save {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-BE-NEXT: .pad #4
+; CHECK-BE-NEXT: sub sp, #4
 ; CHECK-BE-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-BE-NEXT: vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-BE-NEXT: vmov d0, r1, r0
-; CHECK-BE-NEXT: add r0, sp, #64
+; CHECK-BE-NEXT: add r0, sp, #72
 ; CHECK-BE-NEXT: vldrh.u16 q6, [r0]
 ; CHECK-BE-NEXT: vmov d1, r3, r2
 ; CHECK-BE-NEXT: vrev64.16 q4, q0
@@ -285,50 +292,54 @@
 ; CHECK-BE-NEXT: bl __aeabi_f2h
 ; CHECK-BE-NEXT: vmov.16 q5[3], r0
 ; CHECK-BE-NEXT: vmov.u16 r0, q6[4]
-; CHECK-BE-NEXT: vmov.u16 r4, q4[4]
+; CHECK-BE-NEXT: vrev64.16 q0, q5
+; CHECK-BE-NEXT: vmov.u16 r6, q4[4]
+; CHECK-BE-NEXT: vmov r4, r5, d0
 ; CHECK-BE-NEXT: bl __aeabi_h2f
-; CHECK-BE-NEXT: mov r5, r0
-; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r7, r0
+; CHECK-BE-NEXT: mov r0, r6
 ; CHECK-BE-NEXT: bl __aeabi_h2f
-; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: mov r1, r7
 ; CHECK-BE-NEXT: bl __aeabi_fadd
 ; CHECK-BE-NEXT: bl __aeabi_f2h
 ; CHECK-BE-NEXT: vmov.16 q5[4], r0
 ; CHECK-BE-NEXT: vmov.u16 r0, q6[5]
-; CHECK-BE-NEXT: vmov.u16 r4, q4[5]
+; CHECK-BE-NEXT: vmov.u16 r6, q4[5]
 ; CHECK-BE-NEXT: bl __aeabi_h2f
-; CHECK-BE-NEXT: mov r5, r0
-; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r7, r0
+; CHECK-BE-NEXT: mov r0, r6
 ; CHECK-BE-NEXT: bl __aeabi_h2f
-; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: mov r1, r7
 ; CHECK-BE-NEXT: bl __aeabi_fadd
 ; CHECK-BE-NEXT: bl __aeabi_f2h
 ; CHECK-BE-NEXT: vmov.16 q5[5], r0
 ; CHECK-BE-NEXT: vmov.u16 r0, q6[6]
-; CHECK-BE-NEXT: vmov.u16 r4, q4[6]
+; CHECK-BE-NEXT: vmov.u16 r6, q4[6]
 ; CHECK-BE-NEXT: bl __aeabi_h2f
-; CHECK-BE-NEXT: mov r5, r0
-; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r7, r0
+; CHECK-BE-NEXT: mov r0, r6
 ; CHECK-BE-NEXT: bl __aeabi_h2f
-; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: mov r1, r7
 ; CHECK-BE-NEXT: bl __aeabi_fadd
 ; CHECK-BE-NEXT: bl __aeabi_f2h
 ; CHECK-BE-NEXT: vmov.16 q5[6], r0
 ; CHECK-BE-NEXT: vmov.u16 r0, q6[7]
-; CHECK-BE-NEXT: vmov.u16 r4, q4[7]
+; CHECK-BE-NEXT: vmov.u16 r6, q4[7]
 ; CHECK-BE-NEXT: bl __aeabi_h2f
-; CHECK-BE-NEXT: mov r5, r0
-; CHECK-BE-NEXT: mov r0, r4
+; CHECK-BE-NEXT: mov r7, r0
+; CHECK-BE-NEXT: mov r0, r6
 ; CHECK-BE-NEXT: bl __aeabi_h2f
-; CHECK-BE-NEXT: mov r1, r5
+; CHECK-BE-NEXT: mov r1, r7
 ; CHECK-BE-NEXT: bl __aeabi_fadd
 ; CHECK-BE-NEXT: bl __aeabi_f2h
 ; CHECK-BE-NEXT: vmov.16 q5[7], r0
+; CHECK-BE-NEXT: mov r0, r5
 ; CHECK-BE-NEXT: vrev64.16 q0, q5
-; CHECK-BE-NEXT: vmov r1, r0, d0
+; CHECK-BE-NEXT: mov r1, r4
 ; CHECK-BE-NEXT: vmov r3, r2, d1
 ; CHECK-BE-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-BE-NEXT: pop {r4, r5, r7, pc}
+; CHECK-BE-NEXT: add sp, #4
+; CHECK-BE-NEXT: pop {r4, r5, r6, r7, pc}
 ;
 ; CHECK-FP-LABEL: vector_add_f16:
 ; CHECK-FP: @ %bb.0: @ %entry
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -250,30 +250,26 @@ define arm_aapcs_vfpcc <2 x i32> @vabd_v2u32(<2 x i32> %src1, <2 x i32> %src2) {
 ; CHECK-LABEL: vabd_v2u32:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov.i64 q2, #0xffffffff
-; CHECK-NEXT: vand q1, q1, q2
-; CHECK-NEXT: vand q0, q0, q2
-; CHECK-NEXT: vmov r0, r1, d3
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: subs r0, r2, r0
-; CHECK-NEXT: sbc.w r1, r3, r1
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: subs r0, r1, r0
+; CHECK-NEXT: sbc r1, r12, #0
 ; CHECK-NEXT: eor.w r0, r0, r1, asr #31
-; CHECK-NEXT: eor.w r2, r1, r1, asr #31
-; CHECK-NEXT: subs.w lr, r0, r1, asr #31
-; CHECK-NEXT: sbc.w r12, r2, r1, asr #31
-; CHECK-NEXT: vmov r2, r3, d2
-; CHECK-NEXT: vmov r1, r0, d0
-; CHECK-NEXT: subs r1, r1, r2
-; CHECK-NEXT: sbcs r0, r3
-; CHECK-NEXT: eor.w r1, r1, r0, asr #31
-; CHECK-NEXT: subs.w r1, r1, r0, asr #31
-; CHECK-NEXT: vmov q0[2], q0[0], r1, lr
-; CHECK-NEXT: eor.w r1, r0, r0, asr #31
-; CHECK-NEXT: sbc.w r0, r1, r0, asr #31
-; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: eor.w r3, r1, r1, asr #31
+; CHECK-NEXT: subs.w r0, r0, r1, asr #31
+; CHECK-NEXT: sbc.w r1, r3, r1, asr #31
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: subs r2, r2, r3
+; CHECK-NEXT: sbc r3, r12, #0
+; CHECK-NEXT: eor.w r2, r2, r3, asr #31
+; CHECK-NEXT: subs.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT: eor.w r0, r3, r3, asr #31
+; CHECK-NEXT: sbc.w r0, r0, r3, asr #31
+; CHECK-NEXT: vmov q0[3], q0[1], r0, r1
+; CHECK-NEXT: bx lr
 %zextsrc1 = zext <2 x i32> %src1 to <2 x i64>
 %zextsrc2 = zext <2 x i32> %src2 to <2 x i64>
 %add1 = sub <2 x i64> %zextsrc1, %zextsrc2
@@ -401,26 +397,26 @@
 ; CHECK-NEXT: subs r4, r4, r6
 ; CHECK-NEXT: sbc.w r9, r3, r6, asr #31
 ; CHECK-NEXT: vmov r6, s8
+; CHECK-NEXT: vmov r3, s6
 ; CHECK-NEXT: subs r5, r7, r6
+; CHECK-NEXT: asr.w r7, r7, #31
 ; CHECK-NEXT: vmov q2[2], q2[0], r5, r8
-; CHECK-NEXT: asr.w r5, r7, #31
-; CHECK-NEXT: sbc.w r5, r5, r6, asr #31
-; CHECK-NEXT: vmov r6, s14
-; CHECK-NEXT: vmov r7, s6
-; CHECK-NEXT: subs r3, r7, r6
-; CHECK-NEXT: vmov q2[3], q2[1], r4, r3
-; CHECK-NEXT: asr.w r3, r5, #31
-; CHECK-NEXT: mov.w r4, #0
-; CHECK-NEXT: bfi r4, r3, #0, #4
-; CHECK-NEXT: asr.w r3, r9, #31
-; CHECK-NEXT: bfi r4, r3, #4, #4
-; CHECK-NEXT: asr.w r3, r12, #31
-; CHECK-NEXT: bfi r4, r3, #8, #4
-; CHECK-NEXT: asr.w r3, r7, #31
-; CHECK-NEXT: sbc.w r3, r3, r6, asr #31
+; CHECK-NEXT: vmov r5, s14
+; CHECK-NEXT: sbc.w r6, r7, r6, asr #31
+; CHECK-NEXT: asrs r6, r6, #31
+; CHECK-NEXT: subs r7, r3, r5
+; CHECK-NEXT: asr.w r3, r3, #31
+; CHECK-NEXT: vmov q2[3], q2[1], r4, r7
+; CHECK-NEXT: mov.w r7, #0
+; CHECK-NEXT: sbc.w r3, r3, r5, asr #31
+; CHECK-NEXT: bfi r7, r6, #0, #4
+; CHECK-NEXT: asr.w r4, r9, #31
+; CHECK-NEXT: asr.w r6, r12, #31
+; CHECK-NEXT: bfi r7, r4, #4, #4
 ; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: bfi r4, r3, #12, #4
-; CHECK-NEXT: vmsr p0, r4
+; CHECK-NEXT: bfi r7, r6, #8, #4
+; CHECK-NEXT: bfi r7, r3, #12, #4
+; CHECK-NEXT: vmsr p0, r7
 ; CHECK-NEXT: vpst
 ; CHECK-NEXT: vsubt.i32 q2, q0, q2
 ; CHECK-NEXT: vstrb.8 q2, [r2], #16
@@ -547,62 +543,55 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
 ; CHECK-LABEL: vabd_loop_u32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT: mov.w lr, #256
-; CHECK-NEXT: vmov.i64 q0, #0xffffffff
-; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: .LBB20_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q4, [r1], #16
-; CHECK-NEXT: vldrw.u32 q5, [r0], #16
-; CHECK-NEXT: vmov.f32 s8, s18
-; CHECK-NEXT: vmov.f32 s10, s19
-; CHECK-NEXT: vmov.f32 s12, s22
-; CHECK-NEXT: vand q2, q2, q0
-; CHECK-NEXT: vmov.f32 s14, s23
-; CHECK-NEXT: vand q3, q3, q0
-; CHECK-NEXT: vmov r3, r12, d4
-; CHECK-NEXT: vmov r4, r5, d6
-; CHECK-NEXT: vmov.f32 s18, s17
-; CHECK-NEXT: vmov.f32 s22, s21
-; CHECK-NEXT: vand q4, q4, q0
-; CHECK-NEXT: vand q5, q5, q0
-; CHECK-NEXT: vmov r6, r7, d11
-; CHECK-NEXT: subs.w r8, r4, r3
-; CHECK-NEXT: sbc.w r12, r5, r12
-; CHECK-NEXT: vmov r5, r3, d9
-; CHECK-NEXT: subs.w r10, r6, r5
-; CHECK-NEXT: sbc.w r9, r7, r3
-; CHECK-NEXT: vmov r6, r7, d8
-; CHECK-NEXT: vmov r4, r3, d10
-; CHECK-NEXT: subs r4, r4, r6
-; CHECK-NEXT: sbcs r3, r7
-; CHECK-NEXT: vmov q4[2], q4[0], r4, r8
-; CHECK-NEXT: vmov r4, r6, d5
-; CHECK-NEXT: vmov r7, r5, d7
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: subs r4, r7, r4
-; CHECK-NEXT: vmov q4[3], q4[1], r10, r4
-; CHECK-NEXT: mov.w r4, #0
-; CHECK-NEXT: bfi r4, r3, #0, #4
-; CHECK-NEXT: asr.w r3, r9, #31
-; CHECK-NEXT: bfi r4, r3, #4, #4
-; CHECK-NEXT: asr.w r3, r12, #31
-; CHECK-NEXT: bfi r4, r3, #8, #4
-; CHECK-NEXT: sbc.w r3, r5, r6
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
+; CHECK-NEXT: vmov.f32 s8, s6
+; CHECK-NEXT: vmov r7, s4
+; CHECK-NEXT: vmov.f32 s6, s7
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vldrw.u32 q2, [r0], #16
+; CHECK-NEXT: vmov.f32 s12, s10
+; CHECK-NEXT: vmov.f32 s10, s5
+; CHECK-NEXT: vmov.f32 s14, s11
+; CHECK-NEXT: vmov r4, s12
+; CHECK-NEXT: vmov r5, s10
+; CHECK-NEXT: vmov.f32 s10, s9
+; CHECK-NEXT: vmov r6, s10
+; CHECK-NEXT: subs r3, r4, r3
+; CHECK-NEXT: vmov r4, s8
+; CHECK-NEXT: sbc r8, r12, #0
+; CHECK-NEXT: subs r5, r6, r5
+; CHECK-NEXT: sbc r6, r12, #0
+; CHECK-NEXT: subs r4, r4, r7
+; CHECK-NEXT: vmov q2[2], q2[0], r4, r3
+; CHECK-NEXT: vmov r3, s6
+; CHECK-NEXT: vmov r4, s14
+; CHECK-NEXT: sbc r7, r12, #0
+; CHECK-NEXT: subs r3, r4, r3
+; CHECK-NEXT: asr.w r4, r7, #31
+; CHECK-NEXT: vmov q2[3], q2[1], r5, r3
+; CHECK-NEXT: mov.w r5, #0
+; CHECK-NEXT: asr.w r3, r6, #31
+; CHECK-NEXT: bfi r5, r4, #0, #4
+; CHECK-NEXT: bfi r5, r3, #4, #4
+; CHECK-NEXT: asr.w r3, r8, #31
+; CHECK-NEXT: bfi r5, r3, #8, #4
+; CHECK-NEXT: sbc r3, r12, #0
 ; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: bfi r4, r3, #12, #4
-; CHECK-NEXT: vmsr p0, r4
+; CHECK-NEXT: bfi r5, r3, #12, #4
+; CHECK-NEXT: vmsr p0, r5
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vsubt.i32 q4, q1, q4
-; CHECK-NEXT: vstrb.8 q4, [r2], #16
+; CHECK-NEXT: vsubt.i32 q2, q0, q2
+; CHECK-NEXT: vstrb.8 q2, [r2], #16
 ; CHECK-NEXT: le lr, .LBB20_1
 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
 entry:
 br label %vector.body
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
@@ -36,12 +36,11 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
 ; CHECK-LABEL: add_v2i32_v2i64_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xffffffff
-; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
 ; CHECK-NEXT: bx lr
 entry:
 %xx = zext <2 x i32> %x to <2 x i64>
@@ -129,36 +128,37 @@
 ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT: vmov.i64 q1, #0xffff
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
-; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u16 r3, q0[6]
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: vmov.u16 r2, q0[4]
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
 ; CHECK-NEXT: vmov.u16 r2, q0[7]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u16 r3, q0[6]
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: bx lr
 entry:
 %xx = zext <8 x i16> %x to <8 x i64>
@@ -226,11 +226,12 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
 ; CHECK-LABEL: add_v2i16_v2i64_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xffff
+; CHECK-NEXT: vmov.i32 q1, #0xffff
 ; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: movs r1, #0
 ; CHECK-NEXT: bx lr
 entry:
 %xx = zext <2 x i16> %x to <2 x i64>
@@ -396,76 +397,77 @@
 ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
 ; CHECK-NEXT: vmov.i64 q1, #0xff
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov.u8 r3, q0[2]
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
-; CHECK-NEXT: vmov.u8 r3, q0[4]
+; CHECK-NEXT: vmov.u8 r2, q0[2]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov.u8 r3, q0[6]
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u8 r1, q0[3]
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: vmov.u8 r2, q0[4]
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u8 r1, q0[5]
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
 ; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u8 r3, q0[6]
 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov.u8 r3, q0[8]
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r3, d4
+; CHECK-NEXT: vmov r2, s8
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d5
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: vmov.u8 r2, q0[9]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u8 r3, q0[8]
 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r3, d4
+; CHECK-NEXT: vmov.u8 r3, q0[10]
+; CHECK-NEXT: vmov r2, s8
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d5
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: vmov.u8 r2, q0[11]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u8 r3, q0[10]
 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r3, d4
+; CHECK-NEXT: vmov.u8 r3, q0[12]
+; CHECK-NEXT: vmov r2, s8
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d5
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: vmov.u8 r2, q0[13]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u8 r3, q0[12]
 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r3, d4
+; CHECK-NEXT: vmov.u8 r3, q0[14]
+; CHECK-NEXT: vmov r2, s8
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d5
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: vmov.u8 r2, q0[15]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u8 r3, q0[14]
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: bx lr
 entry:
 %xx = zext <16 x i8> %x to <16 x i64>
@@ -538,37 +540,38 @@
 ; CHECK-NEXT: vmov.u16 r0, q0[1]
 ; CHECK-NEXT: vmov.u16 r1, q0[0]
 ; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
-; CHECK-NEXT: vmov.u16 r3, q0[2]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
-; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: vmov.u16 r3, q0[6]
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: vmov.u16 r2, q0[4]
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
 ; CHECK-NEXT: vmov.u16 r2, q0[7]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u16 r3, q0[6]
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: bx lr
 entry:
 %xx = zext <8 x i8> %x to <8 x i64>
@@ -646,11 +649,12 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
 ; CHECK-LABEL: add_v2i8_v2i64_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xff
+; CHECK-NEXT: vmov.i32 q1, #0xff
 ; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: movs r1, #0
 ; CHECK-NEXT: bx lr
 entry:
 %xx = zext <2 x i8> %x to <2 x i64>
@@ -726,17 +730,14 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
 ; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov.i64 q1, #0xffffffff
-; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov lr, r12, d1
-; CHECK-NEXT: vmov r3, r2, d0
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r2, r2, r12
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc r3, r12, #0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: bx lr
 entry:
 %xx = zext <2 x i32> %x to <2 x i64>
 %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -834,36 +835,37 @@
 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT: vmov.i64 q1, #0xffff
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r12, d5
+; CHECK-NEXT: vmov r2, s10
 ; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: add.w lr, r3, r2
+; CHECK-NEXT: add.w r12, r3, r2
 ; CHECK-NEXT: vmov.u16 r3, q0[3]
 ; CHECK-NEXT: vmov.u16 r2, q0[2]
 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
 ; CHECK-NEXT: vand q2, q2, q1
 ; CHECK-NEXT: vmov r2, s8
 ; CHECK-NEXT: vmov r3, s10
-; CHECK-NEXT: add r2, lr
-; CHECK-NEXT: add.w lr, r2, r3
+; CHECK-NEXT: add r2, r12
+; CHECK-NEXT: add.w r12, r2, r3
 ; CHECK-NEXT: vmov.u16 r3, q0[5]
 ; CHECK-NEXT: vmov.u16 r2, q0[4]
 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
 ; CHECK-NEXT: vand q2, q2, q1
 ; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add lr, r2
-; CHECK-NEXT: vmov r3, r2, d5
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: vmov.u16 r3, q0[6]
-; CHECK-NEXT: adc.w r12, r12, r2
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: add r2, r12
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: adds.w lr, r2, r3
 ; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r3, q0[6]
+; CHECK-NEXT: adc r12, r12, #0
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds.w lr, lr, r2
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r3, r12, #0
 ; CHECK-NEXT: adds.w r2, r2, lr
-; CHECK-NEXT: adc.w r3, r3, r12
+; CHECK-NEXT: adc r3, r3, #0
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: adcs r1, r3
 ; CHECK-NEXT: pop {r7, pc}
@@ -941,13 +943,13 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
 ; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xffff
+; CHECK-NEXT: vmov.i32 q1, #0xffff
 ; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r12, d1
+; CHECK-NEXT: vmov r2, s2
 ; CHECK-NEXT: vmov r3, s0
 ; CHECK-NEXT: add r2, r3
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r1, r1, r12
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: bx lr
 entry:
 %xx = zext <2 x i16> %x to <2 x i64>
@@ -1130,76 +1132,77 @@
 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT: vmov.i64 q1, #0xff
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r12, d5
+; CHECK-NEXT: vmov r2, s10
 ; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: add.w lr, r3, r2
+; CHECK-NEXT: add.w r12, r3, r2
 ; CHECK-NEXT: vmov.u8 r3, q0[3]
 ; CHECK-NEXT: vmov.u8 r2, q0[2]
 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
 ; CHECK-NEXT: vand q2, q2, q1
 ; CHECK-NEXT: vmov r2, s8
 ; CHECK-NEXT: vmov r3, s10
-; CHECK-NEXT: add r2, lr
-; CHECK-NEXT: add.w lr, r2, r3
+; CHECK-NEXT: add r2, r12
+; CHECK-NEXT: add.w r12, r2, r3
 ; CHECK-NEXT: vmov.u8 r3, q0[5]
 ; CHECK-NEXT: vmov.u8 r2, q0[4]
 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
 ; CHECK-NEXT: vand q2, q2, q1
 ; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add lr, r2
-; CHECK-NEXT: vmov r3, r2, d5
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: vmov.u8 r3, q0[6]
-; CHECK-NEXT: adc.w r12, r12, r2
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: add r2, r12
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: adds.w lr, r2, r3
 ; CHECK-NEXT: vmov.u8 r2, q0[7]
+; CHECK-NEXT: vmov.u8 r3, q0[6]
+; CHECK-NEXT: adc r12, r12, #0
 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r3, d4
-; CHECK-NEXT: adds.w lr, lr, r2
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r2, r3, d5
+; CHECK-NEXT: vmov r2, s8
 ; CHECK-NEXT: adds.w lr, lr, r2
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r3, r12, #0
+; CHECK-NEXT: adds.w r12, lr, r2
 ; CHECK-NEXT: vmov.u8 r2, q0[9]
-; CHECK-NEXT: adc.w r12, r12, r3
+; CHECK-NEXT: adc lr, r3, #0
 ; CHECK-NEXT: vmov.u8 r3, q0[8]
 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r3, d4
-; CHECK-NEXT: adds.w lr, lr, r2
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds.w lr, lr, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r3, lr, #0
+; CHECK-NEXT: adds.w r12, r12, r2
 ; CHECK-NEXT: vmov.u8 r2, q0[11]
-; CHECK-NEXT: adc.w r12, r12, r3
+; CHECK-NEXT: adc lr, r3, #0
 ; CHECK-NEXT: vmov.u8 r3, q0[10]
 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r3, d4
-; CHECK-NEXT: adds.w lr, lr, r2
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds.w lr, lr, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r3, lr, #0
+; CHECK-NEXT: adds.w r12, r12, r2
 ; CHECK-NEXT: vmov.u8 r2, q0[13]
-; CHECK-NEXT: adc.w r12, r12, r3
+; CHECK-NEXT: adc lr, r3, #0
 ; CHECK-NEXT: vmov.u8 r3, q0[12]
 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r3, d4
-; CHECK-NEXT: adds.w lr, lr, r2
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds.w lr, lr, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: adc r3, lr, #0
+; CHECK-NEXT: adds.w r12, r12, r2
 ; CHECK-NEXT: vmov.u8 r2, q0[15]
-; CHECK-NEXT: adc.w r12, r12, r3
+; CHECK-NEXT: adc lr, r3, #0
 ; CHECK-NEXT: vmov.u8 r3, q0[14]
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: adds.w lr, lr, r2
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: adds.w r2, r2, lr
-; CHECK-NEXT: adc.w r3, r3, r12
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r3, lr, #0
+; CHECK-NEXT: adds.w r2, r2, r12
+; CHECK-NEXT: adc r3, r3, #0
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: adcs r1, r3
 ; CHECK-NEXT: pop {r7, pc}
@@ -1283,36 +1286,37 @@
 ; CHECK-NEXT: vmov.u16 r3, q0[0]
 ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r12, d5
+; CHECK-NEXT: vmov r2, s10
 ; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: add.w lr, r3, r2
+; CHECK-NEXT: add.w r12, r3, r2
 ; CHECK-NEXT: vmov.u16 r3, q0[3]
 ; CHECK-NEXT: vmov.u16 r2, q0[2]
 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
 ; CHECK-NEXT: vand q2, q2, q1
 ; CHECK-NEXT: vmov r2, s8
 ; CHECK-NEXT: vmov r3, s10
-; CHECK-NEXT: add r2, lr
-; CHECK-NEXT: add.w lr, r2, r3
+; CHECK-NEXT: add r2, r12
+; CHECK-NEXT: add.w r12, r2, r3
 ; CHECK-NEXT: vmov.u16 r3, q0[5]
 ; CHECK-NEXT: vmov.u16 r2, q0[4]
 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
 ; CHECK-NEXT: vand q2, q2, q1
 ; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add lr, r2
-; CHECK-NEXT: vmov r3, r2, d5
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: vmov.u16 r3, q0[6]
-; CHECK-NEXT: adc.w r12, r12, r2
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: add r2, r12
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: adds.w lr, r2, r3
 ; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r3, q0[6]
+; CHECK-NEXT: adc r12, r12, #0
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds.w lr, lr, r2
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r3, r12, #0
 ; CHECK-NEXT: adds.w r2, r2, lr
-; CHECK-NEXT: adc.w r3, r3, r12
+; CHECK-NEXT: adc r3, r3, #0
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: adcs r1, r3
 ; CHECK-NEXT: pop {r7, pc}
@@ -1400,13 +1404,13 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
 ; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xff
+; CHECK-NEXT: vmov.i32 q1, #0xff
 ; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r12, d1
+; CHECK-NEXT: vmov r2, s2
 ; CHECK-NEXT: vmov r3, s0
 ; CHECK-NEXT: add r2, r3
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r1, r1, r12
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: bx lr
 entry:
 %xx = zext <2 x i8> %x to <2 x i64>
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -46,23 +46,22 @@
 ; CHECK-LABEL: add_v2i32_v2i64_zext:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: vmov.i64 q2, #0xffffffff
-; CHECK-NEXT: vand q0, q0, q2
 ; CHECK-NEXT: cmp r0, #0
 ; CHECK-NEXT: csetm r0, eq
-; CHECK-NEXT: bfi r1, r0, #0, #8
+; CHECK-NEXT: bfi r2, r0, #0, #8
 ; CHECK-NEXT: vmov r0, s6
 ; CHECK-NEXT: vmov.i32 q1, #0x0
 ; CHECK-NEXT: cmp r0, #0
 ; CHECK-NEXT: csetm r0, eq
-; CHECK-NEXT: bfi r1, r0, #8, #8
-; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: bfi r2, r0, #8, #8
+; CHECK-NEXT: vmsr p0, r2
 ; CHECK-NEXT: vpsel q0, q0, q1
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: bx lr
 entry:
 %c = icmp eq <2 x i32> %b, zeroinitializer
@@ -194,6 +193,7 @@
 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT: vmov.u16 r0, q5[3]
 ; CHECK-NEXT: vmov.u16 r1, q5[1]
+; CHECK-NEXT: vmov.u16 r3, q5[4]
 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT: vcmp.i32 ne, q1, zr
 ; CHECK-NEXT: vpsel q6, q4, q3
@@ -208,30 +208,29 @@
 ; CHECK-NEXT: vand q7, q2, q1
 ; CHECK-NEXT: vmov.i32 q2, #0x0
 ; CHECK-NEXT: vpsel q7, q7, q2
-; CHECK-NEXT: vmov r0, r1, d15
-; CHECK-NEXT: vmov r2, r3, d14
-; CHECK-NEXT: orrs r1, r3
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d13
-; CHECK-NEXT: vmov q6[2], q6[0], r2, r3
-; CHECK-NEXT: vmov q6[3], q6[1], r2, r3
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov.u16 r3, q0[2]
+; CHECK-NEXT: vmov r0, s30
+; CHECK-NEXT: vmov r1, s28
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, r2, d13
+; CHECK-NEXT: vmov q6[2], q6[0], r1, r2
+; CHECK-NEXT: vmov q6[3], q6[1], r1, r2
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
 ; CHECK-NEXT: vcmp.i32 ne, q6, zr
-; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT: vmov q6[2], q6[0], r2, r1
+; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: vand q6, q6, q1
 ; CHECK-NEXT: vpsel q6, q6, q2
-; CHECK-NEXT: vmov r2, r3, d12
-; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d13
+; CHECK-NEXT: vmov r1, s24
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: vmov r2, s26
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: vmov.u16 r2, q5[6]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u16 r3, q5[4]
 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
 ; CHECK-NEXT: vmov.u16 r2, q5[7]
 ; CHECK-NEXT: vmov.u16 r3, q5[5]
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT: vcmp.i32 ne, q6, zr
 ; CHECK-NEXT: vpsel q3, q4, q3
@@ -244,14 +243,14 @@
 ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
 ; CHECK-NEXT: vand q4, q4, q1
 ; CHECK-NEXT: vpsel q4, q4, q2
-; CHECK-NEXT: vmov r2, r3, d8
+; CHECK-NEXT: vmov r2, s16
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d9
+; CHECK-NEXT: vmov r2, s18
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
 ; CHECK-NEXT: vmov r2, r3, d7
 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r3
 ; CHECK-NEXT: vmov.u16 r2, q0[7]
 ; CHECK-NEXT: vmov.u16 r3, q0[6]
@@ -259,12 +258,12 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q1
 ; CHECK-NEXT: vpsel q0, q0, q2
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
@@ -427,10 +426,10 @@
 ; CHECK-NEXT: bfi r1, r0, #8, #8
 ; CHECK-NEXT: vmsr p0, r1
 ; CHECK-NEXT: vpsel q0, q0, q1
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: orrs r1, r3
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: movs r1, #0
 ; CHECK-NEXT: bx lr
 entry:
 %c = icmp eq <2 x i16> %b, zeroinitializer
@@ -687,6 +686,7 @@
 ; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
 ; CHECK-NEXT: vmov.u16 r0, q6[3]
 ; CHECK-NEXT: vmov.u16 r1, q6[1]
+; CHECK-NEXT: vmov.u16 r3, q6[4]
 ; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
 ; CHECK-NEXT: vcmp.i32 ne, q3, zr
 ; CHECK-NEXT: vpsel q7, q1, q0
@@ -701,31 +701,30 @@
 ; CHECK-NEXT: vand q0, q4, q3
 ; CHECK-NEXT: vmov.i32 q4, #0x0
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: orrs r1, r3
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d15
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, r2, d15
+; CHECK-NEXT: vmov q0[2], q0[0], r1, r2
 ; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
-; CHECK-NEXT: vmov.u8 r2, q2[3]
-; CHECK-NEXT: vmov.u8 r3, q2[2]
+; CHECK-NEXT: vmov q0[3], q0[1], r1, r2
+; CHECK-NEXT: vmov.u8 r1, q2[3]
+; CHECK-NEXT: vmov.u8 r2, q2[2]
 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
+; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: vmov r2, s2
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: vmov.u16 r2, q6[6]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u16 r3, q6[4]
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vmov.u16 r2, q6[7]
 ; CHECK-NEXT: vmov.u16 r3, q6[5]
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
 ; CHECK-NEXT: vpsel q6, q1, q7
@@ -738,14 +737,14 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
 ; CHECK-NEXT: vmov r2, r3, d13
 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
 ; CHECK-NEXT: vmov.u8 r2, q2[7]
 ; CHECK-NEXT: vmov.u8 r3, q2[6]
@@ -753,10 +752,10 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: vmov.u8 r2, q5[8]
 ; CHECK-NEXT: vmov.16 q6[0], r2
@@ -774,7 +773,7 @@
 ; CHECK-NEXT: vmov.16 q6[6], r2
 ; CHECK-NEXT: vmov.u8 r2, q5[15]
 ; CHECK-NEXT: vmov.16 q6[7], r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vcmp.i16 ne, q6, zr
 ; CHECK-NEXT: vpsel q5, q1, q7
 ; CHECK-NEXT: vmov.u16 r2, q5[2]
@@ -794,32 +793,32 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
 ; CHECK-NEXT: vmov r2, r3, d13
 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
 ; CHECK-NEXT: vmov.u8 r2, q2[11]
 ; CHECK-NEXT: vmov.u8 r3, q2[10]
 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT: vmov.u16 r3, q5[4]
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: vmov.u16 r2, q5[6]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u16 r3, q5[4]
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vmov.u16 r2, q5[7]
 ; CHECK-NEXT: vmov.u16 r3, q5[5]
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
 ; CHECK-NEXT: vpsel q1, q1, q7
@@ -832,14 +831,14 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
 ; CHECK-NEXT: vmov r2, r3, d3
 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
 ; CHECK-NEXT: vmov.u8 r2, q2[15]
 ; CHECK-NEXT: vmov.u8 r3, q2[14]
@@ -847,12 +846,12 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: add sp, #16
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
@@ -1102,6 +1101,7 @@
 ; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
 ; CHECK-NEXT: vmov.u16 r0, q5[3]
 ; CHECK-NEXT: vmov.u16 r1, q5[1]
+; CHECK-NEXT: vmov.u16 r3, q5[4]
 ; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
 ; CHECK-NEXT: vcmp.i32 ne, q1, zr
 ; CHECK-NEXT: vpsel q6, q4, q3
@@ -1116,30 +1116,29 @@
 ; CHECK-NEXT: vand q7, q2, q1
 ; CHECK-NEXT: vmov.i32 q2, #0x0
 ; CHECK-NEXT: vpsel q7, q7, q2
-; CHECK-NEXT: vmov r0, r1, d15
-; CHECK-NEXT: vmov r2, r3, d14
-; CHECK-NEXT: orrs r1, r3
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d13
-; CHECK-NEXT: vmov q6[2], q6[0], r2, r3
-; CHECK-NEXT: vmov q6[3], q6[1], r2, r3
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov.u16 r3, q0[2]
+; CHECK-NEXT: vmov r0, s30
+; CHECK-NEXT: vmov r1, s28
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, r2, d13
+; CHECK-NEXT: vmov q6[2], q6[0], r1, r2
+; CHECK-NEXT: vmov q6[3], q6[1], r1, r2
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: vmov.u16 r2, q0[2]
 ; CHECK-NEXT: vcmp.i32 ne, q6, zr
-; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT: vmov q6[2], q6[0], r2, r1
+; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: vand q6, q6, q1
 ; CHECK-NEXT: vpsel q6, q6, q2
-; CHECK-NEXT: vmov r2, r3, d12
-; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d13
+; CHECK-NEXT: vmov r1, s24
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: vmov r2, s26
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: vmov.u16 r2, q5[6]
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov.u16 r3, q5[4]
 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
 ; CHECK-NEXT: vmov.u16 r2, q5[7]
 ; CHECK-NEXT: vmov.u16 r3, q5[5]
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT: vcmp.i32 ne, q6, zr
 ; CHECK-NEXT: vpsel q3, q4, q3
@@ -1152,14 +1151,14 @@
 ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
 ; CHECK-NEXT: vand q4, q4, q1
 ; CHECK-NEXT: vpsel q4, q4, q2
-; CHECK-NEXT: vmov r2, r3, d8
+; CHECK-NEXT: vmov r2, s16
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d9
+; CHECK-NEXT: vmov r2, s18
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
 ; CHECK-NEXT: vmov r2, r3, d7
 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r3
 ; CHECK-NEXT: vmov.u16 r2, q0[7]
 ; CHECK-NEXT: vmov.u16 r3, q0[6]
@@ -1167,12 +1166,12 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q1
 ; CHECK-NEXT: vpsel q0, q0, q2
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
@@ -1347,10 +1346,10 @@
 ; CHECK-NEXT: bfi r1, r0, #8, #8
 ; CHECK-NEXT: vmsr p0, r1
 ; CHECK-NEXT: vpsel q0, q0, q1
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: orrs r1, r3
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: movs r1, #0
 ; CHECK-NEXT: bx lr
 entry:
 %c = icmp eq <2 x i8> %b, zeroinitializer
@@ -1472,12 +1471,9 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %b, i64 %a) {
 ; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
 ; CHECK-NEXT: vmov r2, s4
 ; CHECK-NEXT: movs r3, #0
-; CHECK-NEXT: vmov.i64 q2, #0xffffffff
-; CHECK-NEXT: vand q0, q0, q2
+; CHECK-NEXT: mov.w r12, #0
 ; CHECK-NEXT: cmp r2, #0
 ; CHECK-NEXT: csetm r2, eq
 ; CHECK-NEXT: bfi r3, r2, #0, #8
@@ -1488,13 +1484,13 @@
 ; CHECK-NEXT: bfi r3, r2, #8, #8
 ; CHECK-NEXT: vmsr p0, r3
 ; CHECK-NEXT: vpsel q0, q0, q1
-; CHECK-NEXT: vmov lr, r12, d1
-; CHECK-NEXT: vmov r3, r2, d0
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r2, r2, r12
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc r3, r12, #0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: bx lr
 entry:
 %c = icmp eq <2 x i32> %b, zeroinitializer
 %xx = zext <2 x i32> %x to <2 x i64>
@@ -1652,10 +1648,9 @@
 ; CHECK-NEXT: vand q7, q2, q1
 ; CHECK-NEXT: vmov.i32 q2, #0x0
 ; CHECK-NEXT: vpsel q7, q7, q2
-; CHECK-NEXT: vmov r12, lr, d15
-; CHECK-NEXT: vmov r2, r3, d14
-; CHECK-NEXT: orr.w lr, lr, r3
-; CHECK-NEXT: add r12, r2
+; CHECK-NEXT: vmov r2, s30
+; CHECK-NEXT: vmov r3, s28
+; CHECK-NEXT: add.w r12, r3, r2
 ; CHECK-NEXT: vmov r3, r2, d13
 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
@@ -1663,15 +1658,16 @@
 ; CHECK-NEXT: vmov.u16 r3, q0[2]
 ; CHECK-NEXT: vcmp.i32 ne, q6, zr
 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT: movs r3, #0
 ; CHECK-NEXT: vand q6, q6, q1
 ; CHECK-NEXT: vpsel q6, q6, q2
-; CHECK-NEXT: vmov r2, r3, d12
+; CHECK-NEXT: vmov r2, s24
 ; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adc.w lr, lr, r3
-; CHECK-NEXT: vmov r2, r3, d13
+; CHECK-NEXT: vmov r2, s26
+; CHECK-NEXT: adc r3, r3, #0
 ; CHECK-NEXT: adds.w r12, r12, r2
 ; CHECK-NEXT: vmov.u16 r2, q5[6]
-; CHECK-NEXT: adc.w lr, lr, r3
+; CHECK-NEXT: adc lr, r3, #0
 ; CHECK-NEXT: vmov.u16 r3, q5[4]
 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
 ; CHECK-NEXT: vmov.u16 r2, q5[7]
@@ -1688,12 +1684,12 @@
 ; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
 ; CHECK-NEXT: vand q4, q4, q1
 ; CHECK-NEXT: vpsel q4, q4, q2
-; CHECK-NEXT: vmov r2, r3, d8
+; CHECK-NEXT: vmov r2, s16
 ; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adc.w lr, lr, r3
-; CHECK-NEXT: vmov r2, r3, d9
+; CHECK-NEXT: vmov r2, s18
+; CHECK-NEXT: adc r3, lr, #0
 ; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adc.w lr, lr, r3
+; CHECK-NEXT: adc lr, r3, #0
 ; CHECK-NEXT: vmov r2, r3, d7
 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
 ; CHECK-NEXT: vmov q3[3], q3[1], r2, r3
@@ -1703,12 +1699,12 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q1
 ; CHECK-NEXT: vpsel q0, q0, q2
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adc.w lr, lr, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r3, lr, #0
 ; CHECK-NEXT: adds.w r2, r2, r12
-; CHECK-NEXT: adc.w r3, r3, lr
+; CHECK-NEXT: adc r3, r3, #0
 ; CHECK-NEXT: adds r0, r0, r2
 ; CHECK-NEXT: adcs r1, r3
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -1832,8 +1828,6 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %b, i64 %a) {
 ; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
 ; CHECK-NEXT: vmov.i64 q2, #0xffff
 ; CHECK-NEXT: movs r3, #0
 ; CHECK-NEXT: vand q1, q1, q2
@@ -1849,13 +1843,12 @@
 ; CHECK-NEXT: bfi r3, r2, #8, #8
 ; CHECK-NEXT: vmsr p0, r3
 ; CHECK-NEXT: vpsel q0, q0, q1
-; CHECK-NEXT: vmov r12, lr, d1
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: add r2, r12
-; CHECK-NEXT: orr.w r3, r3, lr
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: add r2, r3
 ; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: bx lr
 entry:
 %c = icmp eq <2 x i16> %b, zeroinitializer
 %xx = zext <2 x i16> %x to <2 x i64>
@@ -2110,27 +2103,27 @@
 ; CHECK-NEXT: vand q0, q4, q3
 ; CHECK-NEXT: vmov.i32 q4, #0x0
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r12, lr, d1
-; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: orr.w lr, lr, r3
-; CHECK-NEXT: add r12, r2
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: add.w r12, r3, r2
 ; CHECK-NEXT: vmov r3, r2, d15
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT: vmov.u8 r2, q2[3]
 ; CHECK-NEXT: vmov.u8 r3, q2[2]
 ; CHECK-NEXT: vcmp.i32 ne, q0, zr
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT: movs r3, #0
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adc.w lr, lr, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r3, r3, #0
 ; CHECK-NEXT: adds.w r12, r12, r2
 ; CHECK-NEXT: vmov.u16 r2, q6[6]
-; CHECK-NEXT: adc.w lr, lr, r3
+; CHECK-NEXT: adc lr, r3, #0
 ; CHECK-NEXT: vmov.u16 r3, q6[4]
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vmov.u16 r2, q6[7]
@@ -2147,12 +2140,12 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adc.w lr, lr, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r3, lr, #0
 ; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adc.w lr, lr, r3
+; CHECK-NEXT: adc lr, r3, #0
 ; CHECK-NEXT: vmov r2, r3, d13
 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -2162,10 +2155,10 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adc.w lr, lr, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r3, lr, #0
 ; CHECK-NEXT: adds.w r12, r12, r2
 ; CHECK-NEXT: vmov.u8 r2, q5[8]
 ; CHECK-NEXT: vmov.16 q6[0], r2
@@ -2183,7 +2176,7 @@
 ; CHECK-NEXT: vmov.16 q6[6], r2
 ; CHECK-NEXT: vmov.u8 r2, q5[15]
 ; CHECK-NEXT: vmov.16 q6[7], r2
-; CHECK-NEXT: adc.w lr, lr, r3
+; CHECK-NEXT: adc lr, r3, #0
 ; CHECK-NEXT: vcmp.i16 ne, q6, zr
 ; CHECK-NEXT: vpsel q5, q1, q7
 ; CHECK-NEXT: vmov.u16 r2, q5[2]
@@ -2203,12 +2196,12 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adc.w lr, lr, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r3, lr, #0
 ; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adc.w lr, lr, r3
+; CHECK-NEXT: adc lr, r3, #0
 ; CHECK-NEXT: vmov r2, r3, d13
 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -2218,13 +2211,13 @@
 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT: vand q0, q0, q3
 ; CHECK-NEXT: vpsel q0, q0, q4
-; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: adds.w r12, r12, r2
-; CHECK-NEXT: adc.w lr, lr, r3
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r3, lr, #0
 ; CHECK-NEXT: adds.w r12, r12, r2
;
CHECK-NEXT: vmov.u16 r2, q5[6] -; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: adc lr, r3, #0 ; CHECK-NEXT: vmov.u16 r3, q5[4] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q5[7] @@ -2241,12 +2234,12 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r3, lr, #0 ; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 +; CHECK-NEXT: adc lr, r3, #0 ; CHECK-NEXT: vmov r2, r3, d3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 @@ -2256,12 +2249,12 @@ ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q3 ; CHECK-NEXT: vpsel q0, q0, q4 -; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: adds.w r12, r12, r2 -; CHECK-NEXT: adc.w lr, lr, r3 -; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r3, lr, #0 ; CHECK-NEXT: adds.w r2, r2, r12 -; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: add sp, #16 @@ -2506,8 +2499,6 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %b, i64 %a) { ; CHECK-LABEL: add_v2i8_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vand q1, q1, q2 @@ -2523,13 +2514,12 @@ ; CHECK-NEXT: bfi r3, r2, #8, #8 ; CHECK-NEXT: vmsr p0, r3 ; CHECK-NEXT: vpsel q0, q0, q1 -; CHECK-NEXT: vmov r12, lr, d1 -; CHECK-NEXT: vmov r2, r3, d0 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: orr.w r3, r3, lr +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer %xx = zext <2 x i8> %x to <2 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -269,14 +269,17 @@ ; CHECK-LABEL: add_v2i16_v2i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q2, #0xffff -; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umlal r0, r1, r3, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -808,16 +811,15 @@ ; CHECK-LABEL: add_v2i8_v2i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q2, #0xff -; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: mla r0, r2, r1, r0 +; 
CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> @@ -1105,20 +1107,21 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xffff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r12, s4 -; CHECK-NEXT: umull r2, lr, r3, r2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umlal r2, lr, r3, r12 +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> %yy = zext <2 x i16> %y to <2 x i64> @@ -1440,22 +1443,18 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v2i8_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umull r12, lr, r3, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: orr.w r3, r3, lr +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: mla r2, r2, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> %yy = zext <2 x i8> %y to <2 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -348,43 +348,40 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov r3, r2, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vmov lr, r12, d5 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.f32 s16, s10 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov r5, r6, d4 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vmov r0, r7, d8 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r3 -; CHECK-NEXT: vmov r3, r4, d7 +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov r0, r4, d7 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, r3, d4 ; CHECK-NEXT: adds r0, r0, r5 -; CHECK-NEXT: adc.w r8, r6, r7 -; CHECK-NEXT: vmov r6, r5, d1 -; CHECK-NEXT: vmov r2, r7, d0 -; 
CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: adc.w r6, r5, r4 -; CHECK-NEXT: vmov r5, r4, d2 -; CHECK-NEXT: vmov q1[2], q1[0], r0, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r8, r6 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: adc.w r8, r4, r6 +; CHECK-NEXT: vmov r5, r6, d2 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov r7, r4, d0 ; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: adc.w r0, r7, r4 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov r5, r6, d2 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r0 +; CHECK-NEXT: vmov q1[3], q1[1], r3, r8 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: adds r5, r5, r7 +; CHECK-NEXT: vmov q0[2], q0[0], r5, lr +; CHECK-NEXT: adc.w r0, r4, r6 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 8 @@ -520,11 +517,12 @@ ; CHECK-NEXT: ldr r2, [r0] ; CHECK-NEXT: ldr r0, [r0, #4] ; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.32 q1[1], r0 ; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vmovx.f16 s2, s5 ; CHECK-NEXT: vins.f16 s4, s2 -; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s0, s5 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -312,17 +312,17 @@ ; CHECK-NEXT: vmovx.f16 s2, s8 ; CHECK-NEXT: vins.f16 s3, s2 ; CHECK-NEXT: vmovx.f16 s2, s11 -; CHECK-NEXT: vmovx.f16 s8, s14 ; CHECK-NEXT: vmovx.f16 s18, s10 -; CHECK-NEXT: vmovx.f16 s19, s13 ; CHECK-NEXT: vins.f16 s10, s2 -; CHECK-NEXT: vins.f16 s13, s8 +; CHECK-NEXT: vmovx.f16 s2, s14 +; CHECK-NEXT: vmovx.f16 s19, s13 +; CHECK-NEXT: vins.f16 s13, s2 ; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vins.f16 s18, s12 ; CHECK-NEXT: vins.f16 s19, s15 ; CHECK-NEXT: vins.f16 s17, s9 -; CHECK-NEXT: vmov.f32 s2, s10 ; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov.f32 s2, s10 ; CHECK-NEXT: vadd.i16 q0, q0, q4 ; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -367,56 +367,56 @@ ; CHECK-NEXT: vmovx.f16 s2, s8 ; CHECK-NEXT: vins.f16 s3, s2 ; CHECK-NEXT: vmovx.f16 s2, s11 -; CHECK-NEXT: vmovx.f16 s8, s14 ; CHECK-NEXT: vmovx.f16 s18, s10 -; CHECK-NEXT: vmovx.f16 s19, s13 ; CHECK-NEXT: vins.f16 s10, s2 -; CHECK-NEXT: vins.f16 s13, s8 +; CHECK-NEXT: vmovx.f16 s2, s14 +; CHECK-NEXT: vmovx.f16 s19, s13 +; CHECK-NEXT: vins.f16 s13, s2 ; CHECK-NEXT: vmov.f32 s1, s3 ; CHECK-NEXT: vins.f16 s18, s12 ; CHECK-NEXT: vins.f16 s19, s15 ; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vins.f16 s17, s9 ; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] ; CHECK-NEXT: vadd.i16 q0, q0, q4 -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vmovx.f16 s6, s14 -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmov.f32 s22, s15 -; CHECK-NEXT: vmovx.f16 s8, s8 -; CHECK-NEXT: vins.f16 s22, s8 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmov.f32 s23, s10 -; CHECK-NEXT: vmovx.f16 s4, s16 -; CHECK-NEXT: vins.f16 s23, s8 -; CHECK-NEXT: vmovx.f16 s8, s17 -; CHECK-NEXT: vins.f16 s16, s8 -; CHECK-NEXT: vmovx.f16 s8, s12 -; CHECK-NEXT: vmovx.f16 s5, s19 -; 
CHECK-NEXT: vins.f16 s19, s8 -; CHECK-NEXT: vmovx.f16 s8, s15 -; CHECK-NEXT: vmovx.f16 s7, s9 -; CHECK-NEXT: vins.f16 s14, s8 -; CHECK-NEXT: vmovx.f16 s8, s10 -; CHECK-NEXT: vins.f16 s4, s18 -; CHECK-NEXT: vmov.f32 s20, s17 -; CHECK-NEXT: vmovx.f16 s18, s18 -; CHECK-NEXT: vins.f16 s9, s8 -; CHECK-NEXT: vins.f16 s5, s13 -; CHECK-NEXT: vins.f16 s20, s18 -; CHECK-NEXT: vmov.f32 s17, s19 -; CHECK-NEXT: vins.f16 s7, s11 -; CHECK-NEXT: vmovx.f16 s13, s13 -; CHECK-NEXT: vmov.f32 s21, s12 -; CHECK-NEXT: vmov.f32 s18, s14 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vmovx.f16 s11, s16 +; CHECK-NEXT: vmovx.f16 s10, s6 +; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vmovx.f16 s10, s13 +; CHECK-NEXT: vins.f16 s9, s10 +; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vins.f16 s10, s11 +; CHECK-NEXT: vmovx.f16 s20, s19 +; CHECK-NEXT: vmov.f32 s11, s18 +; CHECK-NEXT: vmovx.f16 s21, s7 +; CHECK-NEXT: vins.f16 s11, s20 +; CHECK-NEXT: vmovx.f16 s20, s4 +; CHECK-NEXT: vins.f16 s20, s6 +; CHECK-NEXT: vmovx.f16 s6, s5 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s12 +; CHECK-NEXT: vins.f16 s7, s6 +; CHECK-NEXT: vmovx.f16 s6, s15 +; CHECK-NEXT: vmovx.f16 s22, s14 +; CHECK-NEXT: vins.f16 s14, s6 +; CHECK-NEXT: vmovx.f16 s6, s18 +; CHECK-NEXT: vmovx.f16 s23, s17 +; CHECK-NEXT: vins.f16 s17, s6 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vins.f16 s22, s16 +; CHECK-NEXT: vins.f16 s23, s19 ; CHECK-NEXT: vins.f16 s21, s13 -; CHECK-NEXT: vmov.f32 s19, s9 +; CHECK-NEXT: vmov.f32 s7, s17 +; CHECK-NEXT: vmov.f32 s6, s14 ; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vadd.i16 q1, q4, q1 ; CHECK-NEXT: vadd.i16 q1, q1, q5 +; CHECK-NEXT: vadd.i16 q1, q1, q2 ; CHECK-NEXT: vstrw.32 q1, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr @@ -584,8 +584,8 @@ define void @vld3_v16i8(<48 x i8> *%src, <16 x i8> *%dst) { ; CHECK-LABEL: vld3_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] ; CHECK-NEXT: vldrw.u32 q2, [r0, #32] @@ -598,66 +598,72 @@ ; CHECK-NEXT: vmov.u8 r2, q1[10] ; CHECK-NEXT: vmov.8 q3[3], r2 ; CHECK-NEXT: vmov.u8 r2, q1[13] -; CHECK-NEXT: vmov.8 q3[4], r2 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vmov.8 q4[4], r2 ; CHECK-NEXT: vmov.u8 r2, q0[0] -; CHECK-NEXT: vmov.8 q3[5], r2 +; CHECK-NEXT: vmov.8 q4[5], r2 ; CHECK-NEXT: vmov.u8 r2, q0[3] -; CHECK-NEXT: vmov.8 q3[6], r2 +; CHECK-NEXT: vmov.8 q4[6], r2 ; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: vmov.8 q4[8], r2 +; CHECK-NEXT: vmov.8 q5[8], r2 ; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: vmov.8 q4[9], r2 +; CHECK-NEXT: vmov.8 q5[9], r2 ; CHECK-NEXT: vmov.u8 r2, q0[15] -; CHECK-NEXT: vmov.8 q4[10], r2 -; CHECK-NEXT: vmov.u8 r0, q2[2] -; CHECK-NEXT: vmov.8 q4[11], r0 +; CHECK-NEXT: vmov.8 q5[10], r2 +; CHECK-NEXT: vmov.8 q5[11], r0 ; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.8 q4[12], r0 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vmov.8 q6[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[8] -; CHECK-NEXT: vmov.8 q4[13], r0 +; CHECK-NEXT: vmov.8 q6[13], r0 ; CHECK-NEXT: vmov.u8 r0, q2[11] -; CHECK-NEXT: vmov.8 q4[14], r0 +; CHECK-NEXT: vmov.8 q6[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[14] -; CHECK-NEXT: vmov.8 q4[15], r0 +; CHECK-NEXT: vmov.8 q6[15], r0 ; 
CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.8 q3[7], r0 +; CHECK-NEXT: vmov.8 q4[7], r0 ; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vmov.f32 s13, s17 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.f32 s15, s27 ; CHECK-NEXT: vmov.8 q4[1], r0 ; CHECK-NEXT: vmov.u8 r0, q1[6] ; CHECK-NEXT: vmov.8 q4[2], r0 ; CHECK-NEXT: vmov.u8 r0, q1[9] ; CHECK-NEXT: vmov.8 q4[3], r0 ; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q4[4], r0 +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmov.8 q5[4], r0 ; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q4[5], r0 +; CHECK-NEXT: vmov.8 q5[5], r0 ; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.8 q4[6], r0 +; CHECK-NEXT: vmov.8 q5[6], r0 ; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmov.8 q5[8], r0 +; CHECK-NEXT: vmov.8 q6[8], r0 ; CHECK-NEXT: vmov.u8 r0, q0[11] -; CHECK-NEXT: vmov.8 q5[9], r0 +; CHECK-NEXT: vmov.8 q6[9], r0 ; CHECK-NEXT: vmov.u8 r0, q0[14] -; CHECK-NEXT: vmov.8 q5[10], r0 +; CHECK-NEXT: vmov.8 q6[10], r0 ; CHECK-NEXT: vmov.u8 r0, q2[1] -; CHECK-NEXT: vmov.8 q5[11], r0 +; CHECK-NEXT: vmov.8 q6[11], r0 ; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q5[12], r0 +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov.8 q7[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[7] -; CHECK-NEXT: vmov.8 q5[13], r0 +; CHECK-NEXT: vmov.8 q7[13], r0 ; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vmov.8 q5[14], r0 +; CHECK-NEXT: vmov.8 q7[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[13] -; CHECK-NEXT: vmov.8 q5[15], r0 +; CHECK-NEXT: vmov.8 q7[15], r0 ; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.8 q4[7], r0 +; CHECK-NEXT: vmov.8 q5[7], r0 +; CHECK-NEXT: vmov.f32 s19, s31 +; CHECK-NEXT: vmov.f32 s17, s21 ; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmov.f32 s19, s23 ; CHECK-NEXT: vadd.i8 q3, q4, q3 ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q1[5] @@ -667,34 +673,37 @@ ; CHECK-NEXT: vmov.u8 r0, q1[11] ; CHECK-NEXT: vmov.8 q4[3], r0 ; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q4[4], r0 +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov.8 q1[4], r0 ; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.8 q4[5], r0 +; CHECK-NEXT: vmov.8 q1[5], r0 ; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.8 q4[6], r0 +; CHECK-NEXT: vmov.8 q1[6], r0 ; CHECK-NEXT: vmov.u8 r0, q0[10] -; CHECK-NEXT: vmov.8 q1[8], r0 +; CHECK-NEXT: vmov.8 q5[8], r0 ; CHECK-NEXT: vmov.u8 r0, q0[13] -; CHECK-NEXT: vmov.8 q1[9], r0 +; CHECK-NEXT: vmov.8 q5[9], r0 ; CHECK-NEXT: vmov.u8 r0, q2[0] -; CHECK-NEXT: vmov.8 q1[10], r0 +; CHECK-NEXT: vmov.8 q5[10], r0 ; CHECK-NEXT: vmov.u8 r0, q2[3] -; CHECK-NEXT: vmov.8 q1[11], r0 +; CHECK-NEXT: vmov.8 q5[11], r0 ; CHECK-NEXT: vmov.u8 r0, q2[6] -; CHECK-NEXT: vmov.8 q1[12], r0 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vmov.8 q6[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[9] -; CHECK-NEXT: vmov.8 q1[13], r0 +; CHECK-NEXT: vmov.8 q6[13], r0 ; CHECK-NEXT: vmov.u8 r0, q2[12] -; CHECK-NEXT: vmov.8 q1[14], r0 +; CHECK-NEXT: vmov.8 q6[14], r0 ; CHECK-NEXT: vmov.u8 r0, q2[15] -; CHECK-NEXT: vmov.8 q1[15], r0 +; CHECK-NEXT: vmov.8 q6[15], r0 ; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.8 q4[7], r0 -; CHECK-NEXT: vmov.f32 s18, s6 -; CHECK-NEXT: vmov.f32 s19, s7 +; CHECK-NEXT: vmov.8 q1[7], r0 +; CHECK-NEXT: vmov.f32 s19, s27 +; CHECK-NEXT: vmov.f32 s17, s5 ; CHECK-NEXT: vadd.i8 q0, q3, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop 
{d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %l1 = load <48 x i8>, <48 x i8>* %src, align 4 @@ -716,21 +725,21 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s9, s3 ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov r0, r3, d5 +; CHECK-NEXT: vmov r0, r3, d7 ; CHECK-NEXT: vmov r2, r4, d3 ; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: vmov r5, r8, d6 +; CHECK-NEXT: vmov r5, r8, d4 ; CHECK-NEXT: vmov lr, r12, d1 ; CHECK-NEXT: adds.w r0, r0, lr ; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r2, r3, r4 -; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: vmov r3, r4, d6 ; CHECK-NEXT: adds r6, r6, r5 ; CHECK-NEXT: adc.w r7, r7, r8 ; CHECK-NEXT: adds r3, r3, r6 @@ -755,57 +764,57 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: vmov r3, r8, d7 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.f32 s24, s22 -; CHECK-NEXT: vmov.f32 s25, s23 +; CHECK-NEXT: .vsave {d8, d9, d10} +; CHECK-NEXT: vpush {d8, d9, d10} +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.f32 s20, s14 +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vmov r4, r8, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] ; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vmov r6, r7, d12 -; CHECK-NEXT: adds.w r0, r5, lr -; CHECK-NEXT: adc.w r5, r4, r12 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r4, r2, d10 -; CHECK-NEXT: adc.w r12, r5, r8 -; CHECK-NEXT: vmov r5, r0, d8 -; CHECK-NEXT: adds r6, r6, r4 -; CHECK-NEXT: adcs r2, r7 -; CHECK-NEXT: adds r6, r6, r5 -; CHECK-NEXT: adc.w r8, r2, r0 -; CHECK-NEXT: vmov r7, r4, d1 -; CHECK-NEXT: vmov r2, r5, d9 -; CHECK-NEXT: vmov r3, r0, d0 -; CHECK-NEXT: adds r2, r2, r7 -; CHECK-NEXT: adc.w r7, r5, r4 -; CHECK-NEXT: vmov r5, r4, d7 -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: adcs r7, r4 -; CHECK-NEXT: vmov r5, r4, d2 -; CHECK-NEXT: vmov q1[2], q1[0], r6, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r8, r7 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r3, r2, d1 +; CHECK-NEXT: vmov r6, r7, d10 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: adds.w r0, r3, lr +; CHECK-NEXT: vmov r3, r5, d6 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: vmov.f32 s13, s19 +; CHECK-NEXT: adds.w lr, r0, r4 +; CHECK-NEXT: adc.w r12, r2, r8 +; CHECK-NEXT: vmov r2, r0, d6 +; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: adc.w r8, r5, r7 +; CHECK-NEXT: vmov r6, r5, d8 +; CHECK-NEXT: vmov r4, r7, d5 +; CHECK-NEXT: adds 
r2, r2, r6 +; CHECK-NEXT: adcs r0, r5 +; CHECK-NEXT: vmov r6, r5, d4 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: adcs r0, r5 +; CHECK-NEXT: vmov r6, r5, d1 +; CHECK-NEXT: adds r4, r4, r6 +; CHECK-NEXT: adcs r5, r7 +; CHECK-NEXT: vmov r6, r7, d3 +; CHECK-NEXT: adds r4, r4, r6 +; CHECK-NEXT: adcs r5, r7 +; CHECK-NEXT: vmov r6, r7, d0 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r5 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r0, r4 -; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adds r3, r3, r6 ; CHECK-NEXT: vmov q0[2], q0[0], r3, lr -; CHECK-NEXT: adcs r0, r5 +; CHECK-NEXT: adc.w r0, r8, r7 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <12 x i64>, <12 x i64>* %src, align 4 @@ -1025,16 +1034,19 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrd r2, r3, [r0] ; CHECK-NEXT: ldr r0, [r0, #8] +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 ; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s4, s2 -; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s8, s4 ; CHECK-NEXT: vmovx.f16 s2, s1 -; CHECK-NEXT: vins.f16 s1, s4 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vadd.f16 q1, q0, q2 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vins.f16 s8, s10 +; CHECK-NEXT: vins.f16 s4, s2 ; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vadd.f16 q1, q1, q2 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] @@ -1055,24 +1067,25 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrd r2, r3, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov.32 q2[1], r3 -; CHECK-NEXT: vmovx.f16 s13, s7 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vins.f16 s1, s0 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s6 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov.f32 s1, s0 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmovx.f16 s2, s13 +; CHECK-NEXT: vmovx.f16 s9, s7 +; CHECK-NEXT: vins.f16 s1, s2 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s2, s6 +; CHECK-NEXT: vins.f16 s5, s2 ; CHECK-NEXT: vins.f16 s7, s0 ; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vins.f16 s12, s6 -; CHECK-NEXT: vins.f16 s13, s9 +; CHECK-NEXT: vins.f16 s8, s6 +; CHECK-NEXT: vins.f16 s9, s13 ; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vadd.f16 q1, q1, q3 +; CHECK-NEXT: vadd.f16 q1, q1, q2 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vmov r0, r2, d0 ; CHECK-NEXT: strd r0, r2, [r1] @@ -1093,40 +1106,40 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.f32 s5, s8 -; CHECK-NEXT: vmovx.f16 s8, s8 -; CHECK-NEXT: vmovx.f16 s17, s3 -; CHECK-NEXT: vins.f16 s3, s8 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmovx.f16 
s18, s10 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vins.f16 s10, s8 ; CHECK-NEXT: vmovx.f16 s6, s2 ; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vmovx.f16 s8, s14 -; CHECK-NEXT: vmovx.f16 s19, s13 ; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vins.f16 s16, s2 -; CHECK-NEXT: vmovx.f16 s2, s15 +; CHECK-NEXT: vmov.f32 s5, s8 ; CHECK-NEXT: vmovx.f16 s7, s12 -; CHECK-NEXT: vins.f16 s18, s12 -; CHECK-NEXT: vmovx.f16 s12, s1 -; CHECK-NEXT: vins.f16 s13, s8 ; CHECK-NEXT: vins.f16 s5, s6 ; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vins.f16 s14, s2 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vmovx.f16 s16, s15 +; CHECK-NEXT: vmov.f32 s7, s14 +; CHECK-NEXT: vmovx.f16 s17, s3 +; CHECK-NEXT: vins.f16 s7, s16 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vins.f16 s16, s2 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vins.f16 s3, s2 +; CHECK-NEXT: vmovx.f16 s2, s11 +; CHECK-NEXT: vmovx.f16 s18, s10 +; CHECK-NEXT: vins.f16 s10, s2 +; CHECK-NEXT: vmovx.f16 s2, s14 +; CHECK-NEXT: vmovx.f16 s19, s13 +; CHECK-NEXT: vins.f16 s13, s2 ; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vins.f16 s18, s12 ; CHECK-NEXT: vins.f16 s19, s15 ; CHECK-NEXT: vins.f16 s17, s9 -; CHECK-NEXT: vins.f16 s0, s12 -; CHECK-NEXT: vmov.f32 s2, s10 ; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vmov.f32 s7, s14 +; CHECK-NEXT: vmov.f32 s2, s10 ; CHECK-NEXT: vadd.f16 q0, q0, q4 ; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] @@ -1182,43 +1195,43 @@ ; CHECK-NEXT: vmov.f32 s3, s13 ; CHECK-NEXT: vins.f16 s17, s9 ; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] ; CHECK-NEXT: vadd.f16 q0, q0, q4 ; CHECK-NEXT: vadd.f16 q2, q0, q1 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vstrw.32 q2, [r1, #16] ; CHECK-NEXT: vmovx.f16 s10, s2 ; CHECK-NEXT: vmov.f32 s8, s1 ; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s10, s13 -; CHECK-NEXT: vmov.f32 s9, s12 -; CHECK-NEXT: vmovx.f16 s11, s4 +; CHECK-NEXT: vmovx.f16 s10, s5 +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vmovx.f16 s11, s12 ; CHECK-NEXT: vins.f16 s9, s10 -; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vmov.f32 s10, s7 ; CHECK-NEXT: vins.f16 s10, s11 -; CHECK-NEXT: vmovx.f16 s16, s7 -; CHECK-NEXT: vmov.f32 s11, s6 +; CHECK-NEXT: vmovx.f16 s16, s15 +; CHECK-NEXT: vmov.f32 s11, s14 ; CHECK-NEXT: vmovx.f16 s17, s3 ; CHECK-NEXT: vins.f16 s11, s16 ; CHECK-NEXT: vmovx.f16 s16, s0 ; CHECK-NEXT: vins.f16 s16, s2 ; CHECK-NEXT: vmovx.f16 s2, s1 ; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s2, s12 +; CHECK-NEXT: vmovx.f16 s2, s4 ; CHECK-NEXT: vins.f16 s3, s2 -; CHECK-NEXT: vmovx.f16 s2, s15 -; CHECK-NEXT: vmovx.f16 s18, s14 -; CHECK-NEXT: vins.f16 s14, s2 -; CHECK-NEXT: vmovx.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s19, s5 -; CHECK-NEXT: vins.f16 s5, s2 +; CHECK-NEXT: vmovx.f16 s2, s7 +; CHECK-NEXT: vmovx.f16 s18, s6 +; CHECK-NEXT: vins.f16 s6, s2 +; CHECK-NEXT: vmovx.f16 s2, s14 +; CHECK-NEXT: vmovx.f16 s19, s13 +; CHECK-NEXT: vins.f16 s13, s2 ; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vins.f16 s18, s4 -; CHECK-NEXT: vins.f16 s19, s7 -; CHECK-NEXT: vins.f16 s17, s13 -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vins.f16 s18, s12 +; CHECK-NEXT: vins.f16 s19, s15 +; CHECK-NEXT: vins.f16 s17, s5 +; CHECK-NEXT: vmov.f32 s3, s13 +; CHECK-NEXT: vmov.f32 s2, s6 ; CHECK-NEXT: 
vadd.f16 q0, q0, q4 ; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll @@ -92,44 +92,41 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov lr, r12, d5 -; CHECK-NEXT: vldrw.u32 q2, [r0], #64 -; CHECK-NEXT: vmov r4, r8, d9 -; CHECK-NEXT: vmov.f32 s12, s10 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov r2, r7, d1 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vmov r3, r6, d1 -; CHECK-NEXT: adds.w r2, r2, lr -; CHECK-NEXT: adc.w r7, r7, r12 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov r3, lr, d3 +; CHECK-NEXT: vmov r2, r12, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0], #64 +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov r4, r7, d7 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov r3, r6, d5 +; CHECK-NEXT: adc.w r5, r12, lr ; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, r5, d2 -; CHECK-NEXT: adc.w r6, r6, r8 -; CHECK-NEXT: adds.w r12, r3, r2 -; CHECK-NEXT: vmov r3, r2, d0 -; CHECK-NEXT: adc.w lr, r6, r7 +; CHECK-NEXT: vmov r4, r12, d4 +; CHECK-NEXT: adcs r7, r6 +; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: vmov r3, r6, d0 +; CHECK-NEXT: adc.w r8, r7, r5 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s1, s7 +; CHECK-NEXT: vmov r2, r7, d2 ; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r6, r4, d6 -; CHECK-NEXT: adcs r2, r5 -; CHECK-NEXT: vmov r5, r7, d4 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: adcs r4, r7 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r12 -; CHECK-NEXT: vmov q0[3], q0[1], r2, lr +; CHECK-NEXT: vmov r5, r4, d0 +; CHECK-NEXT: adc.w r6, r6, r12 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r7, r4 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r7, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: vmov q0[3], q0[1], r3, r8 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -359,51 +359,51 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.u8 q1, [r0, #32] +; CHECK-NEXT: vldrb.u8 q0, [r0, #32] ; CHECK-NEXT: vldrb.u8 q2, [r0, #48] -; CHECK-NEXT: vmovx.f16 s18, s5 -; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vins.f16 s18, s0 +; CHECK-NEXT: vmovx.f16 s18, s1 +; CHECK-NEXT: vmovx.f16 s4, s3 +; CHECK-NEXT: vins.f16 s18, s4 ; CHECK-NEXT: vmovx.f16 s19, s9 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vins.f16 s19, s0 -; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: vmovx.f16 
s4, s11 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vins.f16 s19, s4 +; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vins.f16 s9, s11 -; CHECK-NEXT: vmov.f32 s22, s5 -; CHECK-NEXT: vmovx.f16 s16, s1 -; CHECK-NEXT: vmovx.f16 s12, s3 +; CHECK-NEXT: vmov.f32 s22, s1 +; CHECK-NEXT: vmovx.f16 s16, s5 +; CHECK-NEXT: vmovx.f16 s12, s7 ; CHECK-NEXT: vins.f16 s16, s12 ; CHECK-NEXT: vldrb.u8 q3, [r0, #16] -; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vins.f16 s5, s7 ; CHECK-NEXT: vmov.f32 s23, s9 ; CHECK-NEXT: vmovx.f16 s17, s13 ; CHECK-NEXT: vmovx.f16 s20, s15 ; CHECK-NEXT: vins.f16 s13, s15 ; CHECK-NEXT: vins.f16 s17, s20 -; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vmov.f32 s20, s5 +; CHECK-NEXT: vmovx.f16 s1, s2 ; CHECK-NEXT: vmov.f32 s21, s13 ; CHECK-NEXT: vadd.i16 q4, q5, q4 -; CHECK-NEXT: vmovx.f16 s22, s4 +; CHECK-NEXT: vmovx.f16 s22, s0 ; CHECK-NEXT: vins.f16 s22, s1 ; CHECK-NEXT: vmovx.f16 s23, s8 ; CHECK-NEXT: vmovx.f16 s1, s10 -; CHECK-NEXT: vmovx.f16 s20, s0 +; CHECK-NEXT: vmovx.f16 s20, s4 ; CHECK-NEXT: vins.f16 s23, s1 -; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vmovx.f16 s1, s6 ; CHECK-NEXT: vins.f16 s20, s1 ; CHECK-NEXT: vmovx.f16 s21, s12 ; CHECK-NEXT: vmovx.f16 s1, s14 ; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vins.f16 s21, s1 ; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmov.f32 s3, s8 -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vadd.i16 q0, q0, q5 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vins.f16 s21, s1 +; CHECK-NEXT: vmov.f32 s7, s8 +; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vadd.i16 q0, q1, q5 ; CHECK-NEXT: vadd.i16 q0, q0, q4 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} @@ -609,44 +609,41 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov r3, r2, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov lr, r12, d5 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov r0, r8, d9 -; CHECK-NEXT: vmov.f32 s12, s10 -; CHECK-NEXT: vmov.f32 s13, s11 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vmov r5, r6, d1 -; CHECK-NEXT: adds.w r2, r2, lr -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov r4, r12, d2 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov r5, r6, d5 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov r0, r4, d7 +; CHECK-NEXT: adds.w r3, r3, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: vmov r12, lr, d4 ; CHECK-NEXT: adds r0, r0, r5 -; CHECK-NEXT: vmov r5, r7, d0 -; CHECK-NEXT: adc.w r6, r6, r8 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w lr, r6, r3 -; CHECK-NEXT: vmov r3, r6, d6 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, r2, d4 -; CHECK-NEXT: adc.w r7, r7, r12 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: adcs r2, r6 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r2, r7 -; CHECK-NEXT: vmov 
q0[2], q0[0], r3, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r2, lr +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: vmov r5, r6, d0 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: adds r0, r0, r3 +; CHECK-NEXT: vmov.f32 s1, s7 +; CHECK-NEXT: adc.w r8, r4, r2 +; CHECK-NEXT: vmov r2, r7, d2 +; CHECK-NEXT: vmov r3, r4, d0 +; CHECK-NEXT: adds.w r5, r5, r12 +; CHECK-NEXT: adc.w r6, r6, lr +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r7, r4 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r3, r8 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 8 @@ -666,84 +663,79 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: vldrw.u32 q6, [r0, #112] ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vldrw.u32 q6, [r0, #80] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov r4, r5, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-NEXT: vmov.f32 s6, s20 +; CHECK-NEXT: vmov.f32 s7, s21 +; CHECK-NEXT: vmov r2, r3, d7 +; CHECK-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-NEXT: vmov lr, r12, d3 ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s2, s20 -; CHECK-NEXT: vldrw.u32 q7, [r0, #112] -; CHECK-NEXT: vmov.f32 s3, s21 -; CHECK-NEXT: vmov r3, r2, d11 -; CHECK-NEXT: vldrw.u32 q5, [r0, #96] -; CHECK-NEXT: vmov.f32 s0, s26 -; CHECK-NEXT: vmov.f32 s1, s27 -; CHECK-NEXT: vmov lr, r12, d9 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmov.f32 s6, s28 -; CHECK-NEXT: vmov.f32 s7, s29 -; CHECK-NEXT: vmov.f32 s10, s20 -; CHECK-NEXT: vmov.f32 s11, s21 -; CHECK-NEXT: vmov r0, r6, d1 -; CHECK-NEXT: adds r7, r4, r3 -; CHECK-NEXT: vmov r4, r8, d0 -; CHECK-NEXT: adcs r5, r2 -; CHECK-NEXT: vmov r2, r3, d12 -; CHECK-NEXT: vmov.f32 s0, s18 -; CHECK-NEXT: vmov.f32 s1, s19 +; CHECK-NEXT: vmov r0, r6, d11 +; CHECK-NEXT: vmov.f32 s22, s24 +; CHECK-NEXT: vmov.f32 s23, s25 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov r4, r8, d11 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r5, r7, d13 +; CHECK-NEXT: vmov.f32 s22, s16 +; CHECK-NEXT: vmov.f32 s23, s17 ; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: vmov.f32 s16, s14 ; CHECK-NEXT: adc.w r6, r6, r12 -; CHECK-NEXT: adds.w lr, r0, r7 -; CHECK-NEXT: adc.w r12, r6, r5 -; CHECK-NEXT: vmov r6, r5, d0 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, r0, d8 -; CHECK-NEXT: adc.w r3, r3, r8 -; CHECK-NEXT: adds r6, r6, r4 -; CHECK-NEXT: adcs r0, r5 -; CHECK-NEXT: adds.w r9, r6, r2 -; CHECK-NEXT: adc.w r8, r0, r3 -; CHECK-NEXT: vmov r5, r4, d15 -; CHECK-NEXT: vmov r3, r6, d3 -; 
CHECK-NEXT: vmov r7, r0, d5 -; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adds.w lr, r0, r2 +; CHECK-NEXT: adc.w r12, r6, r3 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov r0, r3, d11 +; CHECK-NEXT: adds r6, r4, r5 +; CHECK-NEXT: vmov r5, r4, d9 +; CHECK-NEXT: adc.w r7, r7, r8 +; CHECK-NEXT: adds r0, r0, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: adds.w r9, r0, r6 +; CHECK-NEXT: adc.w r8, r3, r7 +; CHECK-NEXT: vmov r7, r6, d6 +; CHECK-NEXT: vmov r5, r4, d8 +; CHECK-NEXT: vmov.f32 s12, s10 +; CHECK-NEXT: vmov.f32 s13, s11 +; CHECK-NEXT: vmov r2, r3, d4 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: adds r7, r7, r5 ; CHECK-NEXT: adcs r6, r4 -; CHECK-NEXT: vmov r5, r4, d11 -; CHECK-NEXT: adds r5, r5, r7 -; CHECK-NEXT: adcs r0, r4 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adc.w r10, r0, r6 -; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vmov r5, r4, d6 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: adcs r3, r4 +; CHECK-NEXT: adds r2, r2, r7 +; CHECK-NEXT: adc.w r10, r3, r6 ; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: vmov r2, r0, d2 -; CHECK-NEXT: vmov q1[2], q1[0], r9, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r8, r10 +; CHECK-NEXT: vmov r4, r5, d4 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s1, s7 +; CHECK-NEXT: vmov r0, r3, d2 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r9 +; CHECK-NEXT: vmov q1[3], q1[1], r10, r8 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: adds r4, r4, r6 ; CHECK-NEXT: adcs r5, r7 -; CHECK-NEXT: vmov r6, r7, d6 -; CHECK-NEXT: adds r2, r2, r6 -; CHECK-NEXT: adcs r0, r7 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: adcs r0, r5 +; CHECK-NEXT: vmov r6, r7, d0 +; CHECK-NEXT: adds r0, r0, r6 +; CHECK-NEXT: adcs r3, r7 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r0, lr +; CHECK-NEXT: adc.w r0, r5, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %l1 = load <16 x i64>, <16 x i64>* %src, align 8 @@ -1104,50 +1096,50 @@ ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrb.u8 q0, [r0, #32] ; CHECK-NEXT: vldrb.u8 q2, [r0, #48] -; CHECK-NEXT: vmovx.f16 s18, s1 +; CHECK-NEXT: vldrb.u8 q4, [r0, #16] +; CHECK-NEXT: vmovx.f16 s14, s1 ; CHECK-NEXT: vmovx.f16 s4, s3 -; CHECK-NEXT: vins.f16 s18, s4 -; CHECK-NEXT: vmovx.f16 s19, s9 +; CHECK-NEXT: vins.f16 s14, s4 +; CHECK-NEXT: vmovx.f16 s15, s9 ; CHECK-NEXT: vmovx.f16 s4, s11 ; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vins.f16 s19, s4 +; CHECK-NEXT: vins.f16 s15, s4 ; CHECK-NEXT: vldrb.u8 q1, [r0] ; CHECK-NEXT: vmovx.f16 s22, s0 ; CHECK-NEXT: vmovx.f16 s3, s2 -; CHECK-NEXT: vmovx.f16 s16, s5 -; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vins.f16 s16, s12 -; CHECK-NEXT: vldrb.u8 q3, [r0, #16] +; CHECK-NEXT: vmovx.f16 s12, s5 +; CHECK-NEXT: vmovx.f16 s13, s7 +; CHECK-NEXT: vins.f16 s12, s13 +; CHECK-NEXT: vmovx.f16 s13, s17 +; CHECK-NEXT: vmovx.f16 s20, s19 ; CHECK-NEXT: vins.f16 s22, s3 ; CHECK-NEXT: vmovx.f16 s23, s8 -; CHECK-NEXT: vmovx.f16 s17, s13 -; CHECK-NEXT: vmovx.f16 s20, s15 ; CHECK-NEXT: vmovx.f16 s3, s10 -; CHECK-NEXT: vins.f16 s17, s20 +; CHECK-NEXT: vins.f16 s13, s20 ; CHECK-NEXT: vins.f16 s23, s3 ; CHECK-NEXT: vmovx.f16 s20, s4 ; CHECK-NEXT: vmovx.f16 s3, s6 ; CHECK-NEXT: vins.f16 s9, s11 ; CHECK-NEXT: vins.f16 s5, s7 -; CHECK-NEXT: vins.f16 s13, s15 +; CHECK-NEXT: vins.f16 s17, s19 ; 
CHECK-NEXT: vins.f16 s20, s3 -; CHECK-NEXT: vmovx.f16 s21, s12 -; CHECK-NEXT: vmovx.f16 s3, s14 +; CHECK-NEXT: vmovx.f16 s21, s16 +; CHECK-NEXT: vmovx.f16 s3, s18 ; CHECK-NEXT: vins.f16 s8, s10 ; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vins.f16 s12, s14 +; CHECK-NEXT: vins.f16 s16, s18 ; CHECK-NEXT: vins.f16 s4, s6 ; CHECK-NEXT: vmov.f32 s24, s5 ; CHECK-NEXT: vins.f16 s21, s3 ; CHECK-NEXT: vmov.f32 s26, s1 ; CHECK-NEXT: vmov.f32 s27, s9 -; CHECK-NEXT: vmov.f32 s25, s13 +; CHECK-NEXT: vmov.f32 s25, s17 ; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vadd.f16 q4, q6, q4 +; CHECK-NEXT: vadd.f16 q3, q6, q3 ; CHECK-NEXT: vmov.f32 s7, s8 -; CHECK-NEXT: vmov.f32 s5, s12 +; CHECK-NEXT: vmov.f32 s5, s16 ; CHECK-NEXT: vadd.f16 q0, q1, q5 -; CHECK-NEXT: vadd.f16 q0, q0, q4 +; CHECK-NEXT: vadd.f16 q0, q0, q3 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -6,119 +6,114 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: mul r12, r3, r2 ; CHECK-NEXT: lsrs.w r2, r12, #2 ; CHECK-NEXT: beq.w .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 -; CHECK-NEXT: ldr r2, [sp, #56] +; CHECK-NEXT: ldr r2, [sp, #72] ; CHECK-NEXT: and.w r3, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q1, [r0, #32] +; CHECK-NEXT: vldrh.u16 q0, [r0, #32] ; CHECK-NEXT: vldrh.u16 q4, [r0, #48] ; CHECK-NEXT: vldrh.u16 q3, [r0], #64 -; CHECK-NEXT: vmovx.f16 s26, s4 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vmovx.f16 s26, s0 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s2 ; CHECK-NEXT: vldrh.u16 q5, [r0, #-48] ; CHECK-NEXT: vmovx.f16 s27, s16 -; CHECK-NEXT: vins.f16 s26, s6 -; CHECK-NEXT: vmovx.f16 s6, s18 -; CHECK-NEXT: vmovx.f16 s8, s7 -; CHECK-NEXT: vmovx.f16 s10, s5 +; CHECK-NEXT: vins.f16 s26, s2 +; CHECK-NEXT: vmovx.f16 s2, s18 +; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vmovx.f16 s10, s1 ; CHECK-NEXT: vmovx.f16 s24, s12 ; CHECK-NEXT: vins.f16 s10, s8 -; CHECK-NEXT: vins.f16 s27, s6 -; CHECK-NEXT: vmovx.f16 s6, s14 +; CHECK-NEXT: vins.f16 s27, s2 +; CHECK-NEXT: vmovx.f16 s2, s14 ; CHECK-NEXT: vmovx.f16 s8, s19 ; CHECK-NEXT: vmovx.f16 s11, s17 -; CHECK-NEXT: vmov.f32 s0, s13 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s4, s13 ; CHECK-NEXT: vins.f16 s11, s8 ; CHECK-NEXT: vmovx.f16 s25, s20 -; CHECK-NEXT: vins.f16 s24, s6 -; CHECK-NEXT: vmovx.f16 s6, s22 +; CHECK-NEXT: vins.f16 s24, s2 +; CHECK-NEXT: vmovx.f16 s2, s22 ; CHECK-NEXT: vmovx.f16 s1, s15 ; CHECK-NEXT: vmovx.f16 s8, s13 ; CHECK-NEXT: vins.f16 s20, s22 ; CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vins.f16 s25, s6 -; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vmov.f32 s3, s17 -; CHECK-NEXT: vins.f16 s0, s15 +; CHECK-NEXT: vins.f16 s25, s2 +; CHECK-NEXT: vmov.f32 s7, s17 +; CHECK-NEXT: vins.f16 s4, s15 ; CHECK-NEXT: vmovx.f16 s9, s21 +; CHECK-NEXT: vins.f16 s21, s23 ; CHECK-NEXT: vins.f16 s8, s1 ; CHECK-NEXT: 
vmovx.f16 s1, s23 ; CHECK-NEXT: vins.f16 s12, s14 -; CHECK-NEXT: vins.f16 s21, s23 -; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vmov.f32 s14, s0 ; CHECK-NEXT: vmov.f32 s15, s16 ; CHECK-NEXT: vins.f16 s9, s1 ; CHECK-NEXT: vmov.f32 s13, s20 +; CHECK-NEXT: vins.f16 s7, s19 +; CHECK-NEXT: vmul.f16 q4, q3, r2 ; CHECK-NEXT: vmul.f16 q6, q6, r2 -; CHECK-NEXT: vmul.f16 q3, q3, r2 -; CHECK-NEXT: vins.f16 s2, s7 -; CHECK-NEXT: vins.f16 s3, s19 -; CHECK-NEXT: vmov.f32 s1, s21 -; CHECK-NEXT: vmul.f16 q0, q0, r2 -; CHECK-NEXT: vmovx.f16 s4, s12 -; CHECK-NEXT: vmovx.f16 s6, s24 +; CHECK-NEXT: vins.f16 s6, s3 +; CHECK-NEXT: vmov.f32 s5, s21 +; CHECK-NEXT: vmul.f16 q1, q1, r2 +; CHECK-NEXT: vmovx.f16 s2, s16 +; CHECK-NEXT: vmovx.f16 s0, s24 ; CHECK-NEXT: vmul.f16 q2, q2, r2 -; CHECK-NEXT: vmovx.f16 s7, s0 -; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmovx.f16 s15, s5 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmovx.f16 s14, s17 +; CHECK-NEXT: vins.f16 s15, s0 +; CHECK-NEXT: vmovx.f16 s0, s25 +; CHECK-NEXT: vins.f16 s14, s0 +; CHECK-NEXT: vmovx.f16 s23, s6 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmovx.f16 s22, s18 +; CHECK-NEXT: vins.f16 s23, s0 +; CHECK-NEXT: vmovx.f16 s0, s26 +; CHECK-NEXT: vins.f16 s22, s0 +; CHECK-NEXT: vmovx.f16 s31, s7 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmovx.f16 s3, s4 +; CHECK-NEXT: vins.f16 s4, s8 ; CHECK-NEXT: vmovx.f16 s8, s8 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s5, s1 -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vins.f16 s7, s8 -; CHECK-NEXT: vins.f16 s5, s6 -; CHECK-NEXT: vmovx.f16 s6, s13 -; CHECK-NEXT: vmovx.f16 s8, s25 -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmovx.f16 s19, s2 -; CHECK-NEXT: vmovx.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s18, s14 -; CHECK-NEXT: vins.f16 s19, s8 -; CHECK-NEXT: vmovx.f16 s8, s26 -; CHECK-NEXT: vins.f16 s18, s8 -; CHECK-NEXT: vmovx.f16 s23, s3 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vins.f16 s14, s26 -; CHECK-NEXT: vins.f16 s23, s8 -; CHECK-NEXT: vmovx.f16 s22, s15 -; CHECK-NEXT: vins.f16 s15, s27 -; CHECK-NEXT: vmovx.f16 s8, s27 -; CHECK-NEXT: vins.f16 s12, s24 -; CHECK-NEXT: vins.f16 s13, s25 -; CHECK-NEXT: vins.f16 s3, s11 -; CHECK-NEXT: vins.f16 s1, s9 -; CHECK-NEXT: vins.f16 s2, s10 -; CHECK-NEXT: vins.f16 s22, s8 -; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: vmov.f32 s17, s0 -; CHECK-NEXT: vmov.f32 s10, s4 -; CHECK-NEXT: vmov q6, q0 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov.f32 s9, s0 -; CHECK-NEXT: vmov.f32 s17, s2 -; CHECK-NEXT: vmov.f32 s16, s14 -; CHECK-NEXT: vmov.f32 s21, s3 -; CHECK-NEXT: vstrh.16 q4, [r1, #32] -; CHECK-NEXT: vmov.f32 s20, s15 -; CHECK-NEXT: vmov.f32 s7, s5 -; CHECK-NEXT: vstrh.16 q5, [r1, #48] -; CHECK-NEXT: vstrh.16 q2, [r1], #64 -; CHECK-NEXT: vmov.f32 s4, s13 -; CHECK-NEXT: vmov.f32 s5, s25 -; CHECK-NEXT: vstrh.16 q1, [r1, #-48] +; CHECK-NEXT: vins.f16 s6, s10 +; CHECK-NEXT: vins.f16 s18, s26 +; CHECK-NEXT: vins.f16 s7, s11 +; CHECK-NEXT: vins.f16 s31, s0 +; CHECK-NEXT: vmovx.f16 s30, s19 +; CHECK-NEXT: vins.f16 s19, s27 +; CHECK-NEXT: vmovx.f16 s0, s27 +; CHECK-NEXT: vins.f16 s16, s24 +; CHECK-NEXT: vins.f16 s5, s9 +; CHECK-NEXT: vins.f16 s17, s25 +; CHECK-NEXT: vins.f16 s30, s0 +; CHECK-NEXT: vins.f16 s3, s8 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s0, s16 +; CHECK-NEXT: vmov.f32 s21, s6 +; CHECK-NEXT: vmov.f32 s20, s18 +; CHECK-NEXT: vmov.f32 s29, s7 +; CHECK-NEXT: vstrh.16 q5, [r1, #32] +; CHECK-NEXT: vmov.f32 s28, s19 +; CHECK-NEXT: vstrh.16 q7, [r1, #48] +; CHECK-NEXT: vstrh.16 q0, [r1], #64 +; 
CHECK-NEXT: vmov.f32 s12, s17
+; CHECK-NEXT: vmov.f32 s13, s5
+; CHECK-NEXT: vstrh.16 q3, [r1, #-48]
 ; CHECK-NEXT: le lr, .LBB0_2
 ; CHECK-NEXT: .LBB0_3: @ %while.end
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: pop {r7, pc}
 entry:
 %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll
@@ -4,31 +4,39 @@
 define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32* nocapture %z, i32 %n) {
 ; CHECK-LABEL: test32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r5, lr}
+; CHECK-NEXT: push {r5, lr}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: blt .LBB0_2
 ; CHECK-NEXT: .LBB0_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r0], #16
-; CHECK-NEXT: vldrw.u32 q1, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vldrw.u32 q3, [r1], #16
 ; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: vmullb.s32 q2, q1, q0
-; CHECK-NEXT: vmullt.s32 q3, q1, q0
+; CHECK-NEXT: vmullt.s32 q2, q3, q1
+; CHECK-NEXT: vmullb.s32 q4, q3, q1
 ; CHECK-NEXT: vmov r12, r5, d5
 ; CHECK-NEXT: lsrl r12, r5, #31
-; CHECK-NEXT: vmov r4, r5, d4
-; CHECK-NEXT: lsrl r4, r5, #31
-; CHECK-NEXT: vmov q2[2], q2[0], r4, r12
-; CHECK-NEXT: vmov r12, r5, d7
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov r12, r5, d9
 ; CHECK-NEXT: lsrl r12, r5, #31
-; CHECK-NEXT: vmov r4, r5, d6
-; CHECK-NEXT: lsrl r4, r5, #31
-; CHECK-NEXT: vmov q2[3], q2[1], r4, r12
-; CHECK-NEXT: vstrb.8 q2, [r2], #16
+; CHECK-NEXT: vmov.32 q1[2], r12
+; CHECK-NEXT: vmov r12, r5, d4
+; CHECK-NEXT: lsrl r12, r5, #31
+; CHECK-NEXT: vmov.32 q2[0], r12
+; CHECK-NEXT: vmov r12, r5, d8
+; CHECK-NEXT: lsrl r12, r5, #31
+; CHECK-NEXT: vmov.32 q3[0], r12
+; CHECK-NEXT: vmov.f32 s13, s8
+; CHECK-NEXT: vmov.f32 s14, s6
+; CHECK-NEXT: vmov.f32 s15, s2
+; CHECK-NEXT: vstrb.8 q3, [r2], #16
 ; CHECK-NEXT: bne .LBB0_1
 ; CHECK-NEXT: .LBB0_2: @ %for.cond.cleanup
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: pop {r5, pc}
 entry:
 %0 = and i32 %n, 3
 %cmp = icmp eq i32 %0, 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -232,34 +232,33 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, r5, r7, lr}
 ; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: vmov.f32 s4, s1
+; CHECK-NEXT: vmov.f32 s6, s3
 ; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov.f32 s4, s5
-; CHECK-NEXT: vmov.f32 s6, s7
-; CHECK-NEXT: umull lr, r12, r1, r0
+; CHECK-NEXT: vmov r1, s6
 ; CHECK-NEXT: umull r2, r5, r3, r0
-; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT: umull lr, r12, r1, r0
+; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
 ; CHECK-NEXT: asrs r2, r0, #31
 ; CHECK-NEXT: mla r4, r1, r2, r12
 ; CHECK-NEXT: asrs r1, r1, #31
 ; CHECK-NEXT: mla r5, r3, r2, r5
 ; CHECK-NEXT: asrs r3, r3, #31
 ; CHECK-NEXT: mla r1, r1, r0, r4
-; CHECK-NEXT: vmov r4, s4
 ; CHECK-NEXT: mla r3, r3, r0, r5
-; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: umull r5, lr, r4, r0
-; CHECK-NEXT: umull r3, r12, r1, r0
-; CHECK-NEXT: vmov q1[2], q1[0], r5, r3
-; CHECK-NEXT: mla r3, r1, r2, r12
+; CHECK-NEXT: vmov q1[3], q1[1], r3, r1
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: umull r3, r5, r1, r0
+; CHECK-NEXT: mla r5, r1, r2, r5
 ; CHECK-NEXT: asrs r1, r1, #31
-; CHECK-NEXT: mla r2, r4, r2, lr
-; CHECK-NEXT: mla r1, r1, r0, r3
-; CHECK-NEXT: asrs r3, r4, #31
-; CHECK-NEXT: mla r0, r3, r0, r2
-; CHECK-NEXT: vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT: mla r12, r1, r0, r5
+; CHECK-NEXT: vmov r5, s0
+; CHECK-NEXT: umull r4, r1, r5, r0
+; CHECK-NEXT: mla r1, r5, r2, r1
+; CHECK-NEXT: asrs r2, r5, #31
+; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
+; CHECK-NEXT: mla r0, r2, r0, r1
+; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT: pop {r4, r5, r7, pc}
 entry:
 %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32>
@@ -276,34 +275,33 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, r5, r7, lr}
 ; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmov.f32 s4, s1
 ; CHECK-NEXT: asrs r4, r0, #31
-; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: vmov.f32 s6, s3
 ; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov.f32 s4, s5
-; CHECK-NEXT: vmov.f32 s6, s7
-; CHECK-NEXT: umull lr, r12, r0, r1
+; CHECK-NEXT: vmov r1, s6
 ; CHECK-NEXT: umull r2, r5, r0, r3
-; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
+; CHECK-NEXT: umull lr, r12, r0, r1
+; CHECK-NEXT: vmov q1[2], q1[0], r2, lr
 ; CHECK-NEXT: asrs r2, r1, #31
 ; CHECK-NEXT: mla r2, r0, r2, r12
 ; CHECK-NEXT: mla r1, r4, r1, r2
 ; CHECK-NEXT: asrs r2, r3, #31
 ; CHECK-NEXT: mla r2, r0, r2, r5
-; CHECK-NEXT: vmov r5, s4
 ; CHECK-NEXT: mla r2, r4, r3, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: umull r3, lr, r0, r5
-; CHECK-NEXT: umull r2, r12, r0, r1
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT: asrs r2, r1, #31
-; CHECK-NEXT: mla r2, r0, r2, r12
-; CHECK-NEXT: mla r1, r4, r1, r2
-; CHECK-NEXT: asrs r2, r5, #31
-; CHECK-NEXT: mla r0, r0, r2, lr
-; CHECK-NEXT: mla r0, r4, r5, r0
-; CHECK-NEXT: vmov q1[3], q1[1], r0, r1
+; CHECK-NEXT: vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: umull r2, r3, r0, r1
+; CHECK-NEXT: asrs r5, r1, #31
+; CHECK-NEXT: mla r3, r0, r5, r3
+; CHECK-NEXT: mla r12, r4, r1, r3
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: umull r5, r1, r0, r3
+; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
+; CHECK-NEXT: asrs r2, r3, #31
+; CHECK-NEXT: mla r0, r0, r2, r1
+; CHECK-NEXT: mla r0, r4, r3, r0
+; CHECK-NEXT: vmov q0[3], q0[1], r0, r12
 ; CHECK-NEXT: pop {r4, r5, r7, pc}
 entry:
 %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32>
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -6,11 +6,16 @@
 define void @vst2_v2i32(<2 x i32> *%src, <4 x i32> *%dst) {
 ; CHECK-LABEL: vst2_v2i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: ldm.w r0, {r2, r3, r12}
-; CHECK-NEXT: ldr r0, [r0, #12]
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
-; CHECK-NEXT: vmov q0[3], q0[1], r12, r0
-; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: ldrd r2, r12, [r0]
+; CHECK-NEXT: ldrd r3, r0, [r0, #8]
+; CHECK-NEXT: vmov q1[2], q1[0], r2, r12
+; CHECK-NEXT: vmov.32 q3[0], r2
+; CHECK-NEXT: vmov q0[2], q0[0], r3, r0
+; CHECK-NEXT: vmov.32 q2[0], r3
+; CHECK-NEXT: vmov.f32 s13, s8
+; CHECK-NEXT: vmov.f32 s14, s6
+; CHECK-NEXT: vmov.f32 s15, s2
+; CHECK-NEXT: vstrw.32 q3, [r1]
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
@@ -502,14 +507,15 @@
 ; CHECK-LABEL: vst2_v2f16:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: ldrd r2, r0, [r0]
-; CHECK-NEXT: vmov.32 q1[0], r2
 ; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: vmovx.f16 s5, s4
+; CHECK-NEXT: vmov.32 q1[0], r2
+; CHECK-NEXT: vmovx.f16 s1, s4
 ; CHECK-NEXT: vins.f16 s4, s0
 ; CHECK-NEXT: vmovx.f16 s0, s0
-; CHECK-NEXT: vins.f16 s5, s0
-; CHECK-NEXT: vmov r0, r2, d2
-; CHECK-NEXT: str r2, [r1, #4]
+; CHECK-NEXT: vins.f16 s1, s0
+; CHECK-NEXT: vmov r0, s1
+; CHECK-NEXT: str r0, [r1, #4]
+; CHECK-NEXT: vmov r0, s4
 ; CHECK-NEXT: str r0, [r1]
 ; CHECK-NEXT: bx lr
 entry:
@@ -527,23 +533,21 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: ldrd r2, r12, [r0]
 ; CHECK-NEXT: ldrd r3, r0, [r0, #8]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: vmov.32 q1[0], r3
-; CHECK-NEXT: vmov.32 q0[1], r12
-; CHECK-NEXT: vmov.32 q1[1], r0
-; CHECK-NEXT: vmovx.f16 s2, s0
-; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vmovx.f16 s4, s4
-; CHECK-NEXT: vins.f16 s2, s4
-; CHECK-NEXT: vmovx.f16 s4, s1
-; CHECK-NEXT: vins.f16 s1, s5
-; CHECK-NEXT: vmovx.f16 s6, s5
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vins.f16 s4, s6
-; CHECK-NEXT: vmov.f32 s9, s2
-; CHECK-NEXT: vmov.f32 s10, s1
-; CHECK-NEXT: vmov.f32 s11, s4
-; CHECK-NEXT: vstrh.16 q2, [r1]
+; CHECK-NEXT: vmov.32 q1[0], r2
+; CHECK-NEXT: vmov.32 q2[0], r3
+; CHECK-NEXT: vmov.f32 s0, s4
+; CHECK-NEXT: vmovx.f16 s1, s4
+; CHECK-NEXT: vmovx.f16 s2, s8
+; CHECK-NEXT: vins.f16 s0, s8
+; CHECK-NEXT: vmov.32 q1[1], r12
+; CHECK-NEXT: vmov.32 q2[1], r0
+; CHECK-NEXT: vins.f16 s1, s2
+; CHECK-NEXT: vmovx.f16 s3, s5
+; CHECK-NEXT: vmovx.f16 s2, s9
+; CHECK-NEXT: vins.f16 s5, s9
+; CHECK-NEXT: vins.f16 s3, s2
+; CHECK-NEXT: vmov.f32 s2, s5
+; CHECK-NEXT: vstrh.16 q0, [r1]
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -8,21 +8,25 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, lr}
 ; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: ldrd lr, r12, [r0]
-; CHECK-NEXT: ldrd r3, r2, [r0, #8]
+; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr}
 ; CHECK-NEXT: ldrd r4, r0, [r0, #16]
-; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
-; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
+; CHECK-NEXT: vmov.32 q2[0], r2
 ; CHECK-NEXT: vmov.32 q0[0], r4
-; CHECK-NEXT: vmov.f32 s8, s7
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: vmov.f32 s9, s6
+; CHECK-NEXT: vmov q3, q2
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmov.32 q3[1], r3
+; CHECK-NEXT: vmov.32 q1[1], r0
 ; CHECK-NEXT: vmov.f32 s10, s0
-; CHECK-NEXT: vmov.f32 s11, s5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmov.f32 s8, s4
+; CHECK-NEXT: vmov q1[2], q1[0], r2, r12
+; CHECK-NEXT: vmov.f32 s11, s13
+; CHECK-NEXT: vmov q1[3], q1[1], r3, lr
+; CHECK-NEXT: vmov.f32 s4, s7
+; CHECK-NEXT: vmov r4, s4
+; CHECK-NEXT: vmov.32 q1[1], r3
+; CHECK-NEXT: vmov q1[2], q1[0], r2, r12
+; CHECK-NEXT: vmov.f32 s9, s6
 ; CHECK-NEXT: vstrw.32 q2, [r1]
-; CHECK-NEXT: strd r2, r0, [r1, #16]
+; CHECK-NEXT: strd r4, r0, [r1, #16]
 ; CHECK-NEXT: pop {r4, pc}
 entry:
 %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
@@ -266,22 +270,22 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, lr}
 ; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: ldrh r2, [r0, #10]
-; CHECK-NEXT: ldrh r4, [r0, #8]
-; CHECK-NEXT: ldrh.w r12, [r0, #2]
-; CHECK-NEXT: ldrh.w lr, [r0]
-; CHECK-NEXT: vmov q0[2], q0[0], r4, r2
-; CHECK-NEXT: ldrh r3, [r0, #6]
-; CHECK-NEXT: ldrh r0, [r0, #4]
-; CHECK-NEXT: vmov q1[2], q1[0], r0, r3
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vmovnt.i32 q2, q0
-; CHECK-NEXT: vmov q0[2], q0[0], lr, r12
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vmov.f32 s3, s2
-; CHECK-NEXT: vmov.32 q0[2], r4
-; CHECK-NEXT: vstrh.32 q0, [r1]
+; CHECK-NEXT: ldrh r4, [r0, #6]
+; CHECK-NEXT: ldrh.w lr, [r0, #4]
+; CHECK-NEXT: ldrh r3, [r0]
+; CHECK-NEXT: vmov.16 q0[4], r4
+; CHECK-NEXT: ldrh.w r12, [r0, #8]
+; CHECK-NEXT: vmov.32 q1[0], lr
+; CHECK-NEXT: ldrh r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q2[0], r3
+; CHECK-NEXT: ldrh r0, [r0, #10]
+; CHECK-NEXT: vmov.f32 s9, s4
+; CHECK-NEXT: vmov.16 q0[5], r0
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT: vmov.f32 s11, s2
+; CHECK-NEXT: vmov.32 q2[2], r12
+; CHECK-NEXT: vstrh.32 q2, [r1]
 ; CHECK-NEXT: str r0, [r1, #8]
 ; CHECK-NEXT: pop {r4, pc}
 entry:
@@ -353,43 +357,47 @@
 ; CHECK-NEXT: vins.f16 s12, s11
 ; CHECK-NEXT: vmov.f32 s1, s12
 ; CHECK-NEXT: vmov.u16 r2, q2[7]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: vmov q3, q0
 ; CHECK-NEXT: vmov.f32 s2, s7
-; CHECK-NEXT: vmov.u16 r0, q2[3]
+; CHECK-NEXT: vmov.16 q3[6], r2
+; CHECK-NEXT: vmov.f32 s20, s4
+; CHECK-NEXT: vmov.f32 s3, s15
+; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vmov.f32 s22, s6
+; CHECK-NEXT: vmovx.f16 s2, s2
 ; CHECK-NEXT: vmovx.f16 s7, s14
-; CHECK-NEXT: vmov.16 q4[2], r0
+; CHECK-NEXT: vmov.u16 r0, q2[3]
 ; CHECK-NEXT: vins.f16 s0, s7
 ; CHECK-NEXT: vmovx.f16 s7, s15
-; CHECK-NEXT: vins.f16 s3, s7
-; CHECK-NEXT: vmov.f32 s7, s6
-; CHECK-NEXT: vmovx.f16 s2, s2
-; CHECK-NEXT: vins.f16 s7, s10
-; CHECK-NEXT: vmov.f32 s20, s4
 ; CHECK-NEXT: vins.f16 s15, s2
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vins.f16 s20, s8
+; CHECK-NEXT: vins.f16 s3, s7
 ; CHECK-NEXT: vmov.f32 s7, s6
 ; CHECK-NEXT: vmovx.f16 s6, s5
+; CHECK-NEXT: vins.f16 s22, s10
+; CHECK-NEXT: vins.f16 s20, s8
 ; CHECK-NEXT: vmov.f32 s2, s15
+; CHECK-NEXT: vmov.16 q4[2], r0
 ; CHECK-NEXT: vmovx.f16 s15, s13
 ; CHECK-NEXT: vins.f16 s13, s6
 ; CHECK-NEXT: vmovx.f16 s6, s7
+; CHECK-NEXT: vins.f16 s5, s9
 ; CHECK-NEXT: vmov.u16 r0, q2[1]
-; CHECK-NEXT: vmovx.f16 s4, s4
+; CHECK-NEXT: vmov q2, q5
 ; CHECK-NEXT: vins.f16 s14, s6
+; CHECK-NEXT: vmov.16 q2[4], r0
 ; CHECK-NEXT: vmovx.f16 s6, s12
-; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vins.f16 s5, s9
+; CHECK-NEXT: vmovx.f16 s4, s4
 ; CHECK-NEXT: vins.f16 s12, s4
+; CHECK-NEXT: vins.f16 s10, s6
+; CHECK-NEXT: vmov.f32 s18, s22
 ; CHECK-NEXT: vins.f16 s17, s15
 ; CHECK-NEXT: vmov.f32 s16, s13
-; CHECK-NEXT: vins.f16 s22, s6
-; CHECK-NEXT: vmov.f32 s19, s14
 ; CHECK-NEXT: vstrw.32 q0, [r1, #32]
+; CHECK-NEXT: vmov.f32 s19, s14
 ; CHECK-NEXT: vmov.f32 s23, s5
 ; CHECK-NEXT: vstrw.32 q4, [r1, #16]
 ; CHECK-NEXT: vmov.f32 s21, s12
+; CHECK-NEXT: vmov.f32 s22, s10
 ; CHECK-NEXT: vstrw.32 q5, [r1]
 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: bx lr
@@ -412,111 +420,116 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #48
-; CHECK-NEXT: sub sp, #48
-; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: .pad #64
+; CHECK-NEXT: sub sp, #64
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q7, [r0, #80]
-; CHECK-NEXT: vmov.f32 s0, s11
+; CHECK-NEXT: vldrw.u32 q6, [r0]
+; CHECK-NEXT: vldrw.u32 q7, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT: vmov.f32 s0, s27
 ; CHECK-NEXT: vmov.u16 r2, q1[5]
 ; CHECK-NEXT: vmov.16 q3[0], r2
 ; CHECK-NEXT: vins.f16 s0, s7
-; CHECK-NEXT: vmov.f32 s2, s11
-; CHECK-NEXT: vmov.u16 r2, q1[7]
-; CHECK-NEXT: vmov.f64 d12, d4
-; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vmov.f32 s26, s10
-; CHECK-NEXT: vldrw.u32 q2, [r0, #64]
 ; CHECK-NEXT: vmov.f32 s13, s0
-; CHECK-NEXT: vstrw.32 q6, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmov.16 q3[6], r2
-; CHECK-NEXT: vmovx.f16 s0, s10
+; CHECK-NEXT: vmov.u16 r2, q1[7]
+; CHECK-NEXT: vmov q0, q3
+; CHECK-NEXT: vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: vmov.f32 s14, s31
+; CHECK-NEXT: vmovx.f16 s0, s30
+; CHECK-NEXT: vmov.f32 s2, s27
 ; CHECK-NEXT: vins.f16 s12, s0
 ; CHECK-NEXT: vmovx.f16 s0, s2
-; CHECK-NEXT: vmov.f32 s14, s11
 ; CHECK-NEXT: vins.f16 s14, s0
-; CHECK-NEXT: vmov.f32 s20, s7
-; CHECK-NEXT: vmov q0, q3
-; CHECK-NEXT: vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT: vmov.u16 r2, q3[5]
-; CHECK-NEXT: vins.f16 s20, s15
-; CHECK-NEXT: vmov.16 q4[0], r2
-; CHECK-NEXT: vmov.u16 r2, q3[7]
-; CHECK-NEXT: vmov.f32 s17, s20
-; CHECK-NEXT: vmovx.f16 s20, s31
-; CHECK-NEXT: vmov.16 q4[6], r2
-; CHECK-NEXT: vmov.f32 s18, s7
-; CHECK-NEXT: vmovx.f16 s7, s30
-; CHECK-NEXT: vins.f16 s16, s7
-; CHECK-NEXT: vmovx.f16 s7, s18
-; CHECK-NEXT: vins.f16 s31, s7
-; CHECK-NEXT: vmovx.f16 s7, s11
-; CHECK-NEXT: vins.f16 s3, s7
-; CHECK-NEXT: vins.f16 s19, s20
-; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s20, s24
-; CHECK-NEXT: vmovx.f16 s11, s8
-; CHECK-NEXT: vmov.f32 s7, s25
-; CHECK-NEXT: vins.f16 s20, s0
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vins.f16 s7, s1
-; CHECK-NEXT: vmov.16 q5[4], r0
-; CHECK-NEXT: vmov.u16 r0, q3[1]
-; CHECK-NEXT: vmov.f32 s23, s7
-; CHECK-NEXT: vmovx.f16 s7, s24
-; CHECK-NEXT: vmov.f32 s24, s4
-; CHECK-NEXT: vins.f16 s8, s7
-; CHECK-NEXT: vins.f16 s24, s12
-; CHECK-NEXT: vmov.f32 s21, s8
-; CHECK-NEXT: vmov.f32 s8, s5
-; CHECK-NEXT: vmov.16 q6[4], r0
-; CHECK-NEXT: vins.f16 s8, s13
-; CHECK-NEXT: vmovx.f16 s4, s4
-; CHECK-NEXT: vmov.f32 s27, s8
-; CHECK-NEXT: vmovx.f16 s8, s28
-; CHECK-NEXT: vins.f16 s28, s4
-; CHECK-NEXT: vmov.f32 s4, s6
-; CHECK-NEXT: vmov.u16 r0, q3[3]
-; CHECK-NEXT: vins.f16 s4, s14
-; CHECK-NEXT: vmov.16 q0[2], r0
-; CHECK-NEXT: vmov.f32 s18, s31
-; CHECK-NEXT: vmov.f32 s2, s4
-; CHECK-NEXT: vmovx.f16 s4, s29
-; CHECK-NEXT: vmovx.f16 s0, s5
-; CHECK-NEXT: vins.f16 s1, s4
-; CHECK-NEXT: vmovx.f16 s4, s6
+; CHECK-NEXT: vmov.f32 s0, s23
+; CHECK-NEXT: vmov.u16 r2, q2[5]
+; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov.16 q1[0], r2
+; CHECK-NEXT: vins.f16 s0, s11
+; CHECK-NEXT: vmov.f32 s5, s0
+; CHECK-NEXT: vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT: vmov.f32 s15, s3
+; CHECK-NEXT: vmov.u16 r2, q2[7]
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov.u16 r0, q2[3]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s0, s18
+; CHECK-NEXT: vmov.f32 s2, s23
+; CHECK-NEXT: vins.f16 s4, s0
+; CHECK-NEXT: vmovx.f16 s0, s2
+; CHECK-NEXT: vmov.f32 s7, s3
+; CHECK-NEXT: vmovx.f16 s2, s19
+; CHECK-NEXT: vins.f16 s19, s0
+; CHECK-NEXT: vmovx.f16 s0, s31
+; CHECK-NEXT: vins.f16 s15, s0
+; CHECK-NEXT: vins.f16 s7, s2
+; CHECK-NEXT: vmov.f32 s6, s19
+; CHECK-NEXT: vmovx.f16 s0, s21
+; CHECK-NEXT: vmov.f32 s2, s22
+; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s4, s17
+; CHECK-NEXT: vins.f16 s17, s0
+; CHECK-NEXT: vmovx.f16 s0, s22
+; CHECK-NEXT: vins.f16 s2, s10
+; CHECK-NEXT: vmov.16 q2[2], r0
+; CHECK-NEXT: vins.f16 s18, s0
+; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmovx.f16 s0, s25
+; CHECK-NEXT: vmov.f32 s10, s2
+; CHECK-NEXT: vmovx.f16 s2, s29
 ; CHECK-NEXT: vins.f16 s29, s0
-; CHECK-NEXT: vins.f16 s30, s4
-; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s25, s28
-; CHECK-NEXT: vins.f16 s26, s8
-; CHECK-NEXT: vmov.f32 s0, s29
-; CHECK-NEXT: vmov.u16 r0, q1[3]
-; CHECK-NEXT: vmov.f32 s3, s30
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
-; CHECK-NEXT: vins.f16 s22, s11
-; CHECK-NEXT: vstrw.32 q6, [r1, #48]
-; CHECK-NEXT: vmov.f32 s8, s30
-; CHECK-NEXT: vstrw.32 q0, [r1, #64]
-; CHECK-NEXT: vins.f16 s8, s6
-; CHECK-NEXT: vmov.16 q1[2], r0
-; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vmovx.f16 s8, s9
-; CHECK-NEXT: vmovx.f16 s4, s29
-; CHECK-NEXT: vins.f16 s5, s8
-; CHECK-NEXT: vmovx.f16 s8, s30
+; CHECK-NEXT: vmovx.f16 s0, s26
+; CHECK-NEXT: vmov.u16 r0, q5[3]
+; CHECK-NEXT: vins.f16 s30, s0
+; CHECK-NEXT: vmov.f32 s0, s24
 ; CHECK-NEXT: vins.f16 s9, s4
-; CHECK-NEXT: vins.f16 s10, s8
-; CHECK-NEXT: vmov.f32 s4, s9
-; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vstrw.32 q4, [r1, #80]
+; CHECK-NEXT: vmov.16 q1[2], r0
+; CHECK-NEXT: vmov.f32 s11, s18
+; CHECK-NEXT: vins.f16 s0, s20
+; CHECK-NEXT: vmov.f32 s18, s26
+; CHECK-NEXT: vins.f16 s5, s2
+; CHECK-NEXT: vmov.f32 s3, s26
+; CHECK-NEXT: vins.f16 s18, s22
+; CHECK-NEXT: vins.f16 s25, s21
+; CHECK-NEXT: vmov.u16 r0, q5[1]
+; CHECK-NEXT: vmov q5, q0
+; CHECK-NEXT: vmovx.f16 s2, s24
+; CHECK-NEXT: vmovx.f16 s6, s28
+; CHECK-NEXT: vmov.16 q5[4], r0
+; CHECK-NEXT: vins.f16 s28, s2
+; CHECK-NEXT: vins.f16 s22, s6
+; CHECK-NEXT: vmov.f32 s4, s29
+; CHECK-NEXT: vstrw.32 q3, [r1, #32]
+; CHECK-NEXT: vmov.f32 s7, s30
+; CHECK-NEXT: vmov.f32 s1, s28
+; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s2, s22
+; CHECK-NEXT: vldrw.u32 q5, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s24, s28
+; CHECK-NEXT: vins.f16 s24, s20
+; CHECK-NEXT: vins.f16 s29, s21
+; CHECK-NEXT: vmov.u16 r0, q5[1]
+; CHECK-NEXT: vmov q5, q6
+; CHECK-NEXT: vmov.16 q5[4], r0
+; CHECK-NEXT: vmov.f32 s6, s18
+; CHECK-NEXT: vmovx.f16 s20, s16
+; CHECK-NEXT: vmovx.f16 s18, s28
+; CHECK-NEXT: vins.f16 s22, s20
 ; CHECK-NEXT: vstrw.32 q1, [r1, #16]
-; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q5, [r1]
-; CHECK-NEXT: vstrw.32 q1, [r1, #32]
-; CHECK-NEXT: add sp, #48
+; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vins.f16 s16, s18
+; CHECK-NEXT: vmov.f32 s3, s25
+; CHECK-NEXT: vmov.f32 s8, s17
+; CHECK-NEXT: vstrw.32 q1, [r1, #80]
+; CHECK-NEXT: vmov.f32 s27, s29
+; CHECK-NEXT: vstrw.32 q2, [r1, #64]
+; CHECK-NEXT: vmov.f32 s25, s16
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vmov.f32 s26, s22
+; CHECK-NEXT: vstrw.32 q6, [r1, #48]
+; CHECK-NEXT: add sp, #64
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
@@ -543,22 +556,20 @@
 ; CHECK-NEXT: .pad #16
 ; CHECK-NEXT: sub sp, #16
 ; CHECK-NEXT: ldrb r2, [r0]
-; CHECK-NEXT: mov r4, sp
-; CHECK-NEXT: ldrb r3, [r0, #1]
-; CHECK-NEXT: ldrb.w r12, [r0, #2]
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
-; CHECK-NEXT: ldrb.w lr, [r0, #3]
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: ldrb r5, [r0, #5]
+; CHECK-NEXT: mov r5, sp
+; CHECK-NEXT: ldrb r3, [r0, #2]
 ; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #1]
+; CHECK-NEXT: ldrb.w lr, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[1], r3
+; CHECK-NEXT: ldrb r4, [r0, #5]
 ; CHECK-NEXT: ldrb r0, [r0, #4]
-; CHECK-NEXT: vmov.16 q0[1], r12
 ; CHECK-NEXT: vmov.16 q0[2], r0
 ; CHECK-NEXT: add r0, sp, #8
-; CHECK-NEXT: vmov.16 q0[3], r3
+; CHECK-NEXT: vmov.16 q0[3], r12
 ; CHECK-NEXT: vmov.16 q0[4], lr
-; CHECK-NEXT: vmov.16 q0[5], r5
-; CHECK-NEXT: vstrb.16 q0, [r4]
+; CHECK-NEXT: vmov.16 q0[5], r4
+; CHECK-NEXT: vstrb.16 q0, [r5]
 ; CHECK-NEXT: vstrb.16 q0, [r0]
 ; CHECK-NEXT: vldrh.u32 q0, [r0]
 ; CHECK-NEXT: ldr r2, [sp]
@@ -1205,18 +1216,21 @@
 ; CHECK-NEXT: ldrd r2, r3, [r0]
 ; CHECK-NEXT: ldr r0, [r0, #8]
 ; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: vmov.32 q1[0], r0
+; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vmovx.f16 s2, s0
-; CHECK-NEXT: vmovx.f16 s6, s4
-; CHECK-NEXT: vins.f16 s4, s2
-; CHECK-NEXT: vmovx.f16 s2, s1
-; CHECK-NEXT: vins.f16 s0, s1
-; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vins.f16 s2, s6
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, r2, d0
-; CHECK-NEXT: stm r1!, {r0, r2, r3}
+; CHECK-NEXT: vmov.32 q1[1], r3
+; CHECK-NEXT: vmov.32 q2[0], r0
+; CHECK-NEXT: vmovx.f16 s4, s8
+; CHECK-NEXT: vins.f16 s8, s2
+; CHECK-NEXT: vmov.f32 s1, s8
+; CHECK-NEXT: vmovx.f16 s2, s5
+; CHECK-NEXT: vins.f16 s0, s5
+; CHECK-NEXT: vins.f16 s2, s4
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov r2, s1
+; CHECK-NEXT: str r0, [r1, #8]
+; CHECK-NEXT: strd r3, r2, [r1]
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
@@ -1237,32 +1251,36 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, lr}
 ; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: ldrd lr, r12, [r0]
-; CHECK-NEXT: ldrd r3, r2, [r0, #8]
-; CHECK-NEXT: ldrd r4, r0, [r0, #16]
-; CHECK-NEXT: vmov q0[2], q0[0], lr, r3
-; CHECK-NEXT: vmov.32 q1[0], r4
-; CHECK-NEXT: vmov q0[3], q0[1], r12, r2
-; CHECK-NEXT: vmov.32 q1[1], r0
-; CHECK-NEXT: vmovx.f16 s9, s3
-; CHECK-NEXT: vmovx.f16 s6, s0
-; CHECK-NEXT: vins.f16 s0, s2
-; CHECK-NEXT: vmovx.f16 s8, s4
-; CHECK-NEXT: vmovx.f16 s2, s2
+; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr}
+; CHECK-NEXT: vmov.32 q1[1], r3
+; CHECK-NEXT: ldr r4, [r0, #16]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: vmov q1[2], q1[0], r2, r12
+; CHECK-NEXT: ldr r0, [r0, #20]
+; CHECK-NEXT: vmov.32 q2[0], r4
+; CHECK-NEXT: vmov.f32 s4, s0
+; CHECK-NEXT: vmovx.f16 s12, s0
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: vmov.f32 s5, s8
+; CHECK-NEXT: vmovx.f16 s14, s8
+; CHECK-NEXT: vmov.32 q2[1], r0
+; CHECK-NEXT: vmovx.f16 s0, s1
+; CHECK-NEXT: vmovx.f16 s2, s9
+; CHECK-NEXT: vins.f16 s9, s0
 ; CHECK-NEXT: vins.f16 s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vins.f16 s2, s8
-; CHECK-NEXT: vmovx.f16 s8, s5
-; CHECK-NEXT: vins.f16 s5, s6
-; CHECK-NEXT: vins.f16 s9, s8
-; CHECK-NEXT: vmov.f32 s8, s5
-; CHECK-NEXT: vins.f16 s1, s3
-; CHECK-NEXT: vmov r0, r2, d4
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov.f32 s9, s4
-; CHECK-NEXT: vmov.f32 s10, s2
-; CHECK-NEXT: vmov.f32 s11, s1
-; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: vmovx.f16 s6, s6
+; CHECK-NEXT: vmov.f32 s0, s9
+; CHECK-NEXT: vins.f16 s5, s12
+; CHECK-NEXT: vins.f16 s6, s14
+; CHECK-NEXT: vmov q3[2], q3[0], r2, r12
+; CHECK-NEXT: vmov q3[3], q3[1], r3, lr
+; CHECK-NEXT: vmovx.f16 s13, s15
+; CHECK-NEXT: vins.f16 s1, s15
+; CHECK-NEXT: vins.f16 s13, s2
+; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov r2, s13
+; CHECK-NEXT: vstrw.32 q1, [r1]
+; CHECK-NEXT: vmov r0, s0
 ; CHECK-NEXT: strd r0, r2, [r1, #16]
 ; CHECK-NEXT: pop {r4, pc}
 entry:
@@ -1282,58 +1300,62 @@
 define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) {
 ; CHECK-LABEL: vst3_v8f16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
 ; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
 ; CHECK-NEXT: vmovx.f16 s0, s18
-; CHECK-NEXT: vmov.f32 s4, s15
+; CHECK-NEXT: vmov.f32 s4, s11
 ; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmovx.f16 s0, s19
+; CHECK-NEXT: vmov r3, s0
 ; CHECK-NEXT: vins.f16 s4, s19
 ; CHECK-NEXT: vmov.16 q0[0], r2
-; CHECK-NEXT: vmovx.f16 s10, s16
+; CHECK-NEXT: vmov.f32 s22, s10
 ; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vmovx.f16 s4, s19
-; CHECK-NEXT: vmov r2, s4
+; CHECK-NEXT: vins.f16 s22, s18
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmovx.f16 s2, s11
+; CHECK-NEXT: vmov.16 q1[6], r3
+; CHECK-NEXT: vmov.f32 s11, s10
+; CHECK-NEXT: vmov.f32 s3, s7
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT: vmov.16 q0[6], r2
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vmovx.f16 s8, s6
-; CHECK-NEXT: vmovx.f16 s2, s15
-; CHECK-NEXT: vins.f16 s0, s8
-; CHECK-NEXT: vmovx.f16 s8, s7
-; CHECK-NEXT: vins.f16 s3, s8
-; CHECK-NEXT: vmov.f32 s8, s12
-; CHECK-NEXT: vins.f16 s8, s16
+; CHECK-NEXT: vmov.f32 s20, s8
+; CHECK-NEXT: vmovx.f16 s12, s6
+; CHECK-NEXT: vmovx.f16 s10, s5
+; CHECK-NEXT: vins.f16 s0, s12
+; CHECK-NEXT: vmovx.f16 s12, s7
 ; CHECK-NEXT: vins.f16 s7, s2
-; CHECK-NEXT: vmov.f32 s2, s13
-; CHECK-NEXT: vmov.16 q2[4], r0
-; CHECK-NEXT: vins.f16 s2, s17
-; CHECK-NEXT: vmov.f32 s11, s2
-; CHECK-NEXT: vmovx.f16 s2, s12
-; CHECK-NEXT: vmovx.f16 s12, s4
-; CHECK-NEXT: vins.f16 s4, s2
-; CHECK-NEXT: vins.f16 s10, s12
-; CHECK-NEXT: vmovx.f16 s12, s17
-; CHECK-NEXT: vmov.f32 s2, s14
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vins.f16 s2, s18
-; CHECK-NEXT: vmov.16 q4[2], r0
-; CHECK-NEXT: vmovx.f16 s12, s5
-; CHECK-NEXT: vmov.f32 s18, s2
-; CHECK-NEXT: vmovx.f16 s2, s13
+; CHECK-NEXT: vmovx.f16 s2, s17
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmovx.f16 s2, s9
 ; CHECK-NEXT: vins.f16 s5, s2
-; CHECK-NEXT: vmovx.f16 s2, s14
+; CHECK-NEXT: vmovx.f16 s2, s11
 ; CHECK-NEXT: vins.f16 s6, s2
+; CHECK-NEXT: vins.f16 s20, s16
+; CHECK-NEXT: vmovx.f16 s2, s16
+; CHECK-NEXT: vins.f16 s3, s12
+; CHECK-NEXT: vmov.16 q3[2], r0
+; CHECK-NEXT: vins.f16 s9, s17
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov q4, q5
+; CHECK-NEXT: vmovx.f16 s2, s8
+; CHECK-NEXT: vmovx.f16 s8, s4
+; CHECK-NEXT: vmov.16 q4[4], r0
+; CHECK-NEXT: vins.f16 s4, s2
+; CHECK-NEXT: vins.f16 s18, s8
 ; CHECK-NEXT: vmov.f32 s2, s7
-; CHECK-NEXT: vmov.f32 s9, s4
-; CHECK-NEXT: vins.f16 s17, s12
-; CHECK-NEXT: vmov.f32 s16, s5
+; CHECK-NEXT: vmov.f32 s14, s22
+; CHECK-NEXT: vins.f16 s13, s10
+; CHECK-NEXT: vmov.f32 s23, s9
 ; CHECK-NEXT: vstrw.32 q0, [r1, #32]
-; CHECK-NEXT: vmov.f32 s19, s6
-; CHECK-NEXT: vstrw.32 q2, [r1]
-; CHECK-NEXT: vstrw.32 q4, [r1, #16]
-; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: vmov.f32 s12, s5
+; CHECK-NEXT: vmov.f32 s15, s6
+; CHECK-NEXT: vmov.f32 s21, s4
+; CHECK-NEXT: vstrw.32 q3, [r1, #16]
+; CHECK-NEXT: vmov.f32 s22, s18
+; CHECK-NEXT: vstrw.32 q5, [r1]
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
@@ -1354,128 +1376,133 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #48
-; CHECK-NEXT: sub sp, #48
-; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
-; CHECK-NEXT: vmov.f32 s8, s12
-; CHECK-NEXT: vmovx.f16 s2, s4
-; CHECK-NEXT: vmov.f32 s0, s13
-; CHECK-NEXT: vins.f16 s8, s4
+; CHECK-NEXT: .pad #96
+; CHECK-NEXT: sub sp, #96
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q7, [r0]
+; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT: vmovx.f16 s2, s7
+; CHECK-NEXT: vmov.f32 s0, s31
 ; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: vins.f16 s0, s5
-; CHECK-NEXT: vmov.16 q2[4], r2
-; CHECK-NEXT: vmov q4, q3
-; CHECK-NEXT: vmov.f32 s11, s0
-; CHECK-NEXT: vmovx.f16 s0, s16
-; CHECK-NEXT: vmov.f32 s12, s8
-; CHECK-NEXT: vmov.f64 d11, d9
-; CHECK-NEXT: vmov.f32 s21, s17
-; CHECK-NEXT: vmov.f64 d7, d5
-; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
-; CHECK-NEXT: vmovx.f16 s2, s8
+; CHECK-NEXT: vmovx.f16 s2, s6
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: vins.f16 s0, s7
+; CHECK-NEXT: vmov.16 q2[0], r3
+; CHECK-NEXT: vmov.f64 d8, d2
+; CHECK-NEXT: vmov.f32 s9, s0
+; CHECK-NEXT: vmov.f32 s18, s6
+; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: vmov.f32 s2, s31
+; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s0, s6
+; CHECK-NEXT: vmov.f32 s11, s3
 ; CHECK-NEXT: vins.f16 s8, s0
-; CHECK-NEXT: vins.f16 s14, s2
-; CHECK-NEXT: vmovx.f16 s2, s24
-; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vmovx.f16 s0, s2
+; CHECK-NEXT: vmovx.f16 s2, s7
+; CHECK-NEXT: vins.f16 s7, s0
+; CHECK-NEXT: vins.f16 s11, s2
+; CHECK-NEXT: vmov.f32 s0, s27
+; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT: vmovx.f16 s2, s11
+; CHECK-NEXT: vins.f16 s0, s11
 ; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: vmov.f32 s16, s12
-; CHECK-NEXT: vins.f16 s16, s24
-; CHECK-NEXT: vmov.f32 s0, s13
-; CHECK-NEXT: vmov.16 q4[4], r2
-; CHECK-NEXT: vins.f16 s0, s25
-; CHECK-NEXT: vmov.f32 s19, s0
-; CHECK-NEXT: vmovx.f16 s0, s12
-; CHECK-NEXT: vmov.f64 d15, d13
-; CHECK-NEXT: vmov.f32 s17, s13
-; CHECK-NEXT: vmov.f32 s24, s16
-; CHECK-NEXT: vmov.f64 d13, d9
-; CHECK-NEXT: vmov.f64 d9, d7
-; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
-; CHECK-NEXT: vmovx.f16 s2, s12
-; CHECK-NEXT: vins.f16 s12, s0
-; CHECK-NEXT: vins.f16 s26, s2
-; CHECK-NEXT: vmovx.f16 s2, s30
-; CHECK-NEXT: vmov.f32 s0, s19
-; CHECK-NEXT: vstrw.32 q6, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vins.f16 s0, s31
-; CHECK-NEXT: vmov.f32 s29, s25
-; CHECK-NEXT: vmov.16 q6[0], r0
-; CHECK-NEXT: vmov.f32 s25, s0
-; CHECK-NEXT: vmovx.f16 s0, s31
-; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vmovx.f16 s2, s10
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vmov.16 q5[0], r3
+; CHECK-NEXT: vmov.f32 s21, s0
+; CHECK-NEXT: vmov q0, q5
+; CHECK-NEXT: vmov.16 q0[6], r2
 ; CHECK-NEXT: vmovx.f16 s0, s14
-; CHECK-NEXT: vmov.16 q6[6], r0
+; CHECK-NEXT: vmov.f32 s2, s27
+; CHECK-NEXT: vins.f16 s20, s0
+; CHECK-NEXT: vmovx.f16 s0, s2
 ; CHECK-NEXT: vmovx.f16 s2, s15
-; CHECK-NEXT: vins.f16 s24, s0
-; CHECK-NEXT: vmovx.f16 s0, s19
+; CHECK-NEXT: vmov.f32 s23, s3
+; CHECK-NEXT: vins.f16 s23, s2
+; CHECK-NEXT: vmov.f32 s2, s30
+; CHECK-NEXT: vins.f16 s2, s18
 ; CHECK-NEXT: vins.f16 s15, s0
-; CHECK-NEXT: vmovx.f16 s0, s6
-; CHECK-NEXT: vmov.f32 s4, s23
-; CHECK-NEXT: vins.f16 s27, s2
+; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s0, s17
 ; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vins.f16 s4, s7
-; CHECK-NEXT: vmov.16 q0[0], r0
-; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vmovx.f16 s4, s7
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmovx.f16 s4, s10
-; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vmovx.f16 s4, s11
-; CHECK-NEXT: vmovx.f16 s2, s23
-; CHECK-NEXT: vins.f16 s3, s4
-; CHECK-NEXT: vmovx.f16 s4, s5
-; CHECK-NEXT: vins.f16 s11, s2
-; CHECK-NEXT: vmov.f32 s2, s22
-; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmovx.f16 s0, s29
+; CHECK-NEXT: vmovx.f16 s2, s5
+; CHECK-NEXT: vins.f16 s5, s0
+; CHECK-NEXT: vmovx.f16 s0, s30
+; CHECK-NEXT: vmov.16 q4[2], r0
+; CHECK-NEXT: vins.f16 s6, s0
+; CHECK-NEXT: vmovx.f16 s0, s9
+; CHECK-NEXT: vins.f16 s17, s2
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s18, s26
+; CHECK-NEXT: vmov.f32 s2, s7
+; CHECK-NEXT: vins.f16 s18, s10
+; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vmovx.f16 s0, s25
+; CHECK-NEXT: vmov.16 q2[2], r0
+; CHECK-NEXT: vmovx.f16 s2, s13
+; CHECK-NEXT: vins.f16 s13, s0
+; CHECK-NEXT: vmovx.f16 s0, s26
+; CHECK-NEXT: vmov.f32 s19, s6
+; CHECK-NEXT: vins.f16 s14, s0
+; CHECK-NEXT: vmov.f32 s16, s5
+; CHECK-NEXT: vins.f16 s9, s2
+; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s16, s28
+; CHECK-NEXT: vmovx.f16 s6, s4
+; CHECK-NEXT: vins.f16 s16, s0
+; CHECK-NEXT: vmovx.f16 s0, s0
+; CHECK-NEXT: vins.f16 s29, s1
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vmov q0, q4
+; CHECK-NEXT: vmov.f32 s19, s29
+; CHECK-NEXT: vmov.16 q0[4], r0
+; CHECK-NEXT: vmov.f32 s22, s15
+; CHECK-NEXT: vmovx.f16 s0, s28
+; CHECK-NEXT: vmov.f32 s1, s28
+; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vins.f16 s4, s0
+; CHECK-NEXT: vmov.f32 s0, s24
 ; CHECK-NEXT: vins.f16 s2, s6
-; CHECK-NEXT: vmov.16 q1[2], r0
-; CHECK-NEXT: vmov.f32 s29, s12
-; CHECK-NEXT: vmovx.f16 s4, s21
-; CHECK-NEXT: vmovx.f16 s12, s9
-; CHECK-NEXT: vins.f16 s9, s4
-; CHECK-NEXT: vmovx.f16 s4, s22
-; CHECK-NEXT: vins.f16 s10, s4
-; CHECK-NEXT: vmov.f32 s21, s17
-; CHECK-NEXT: vmov.f32 s22, s18
-; CHECK-NEXT: vins.f16 s5, s12
-; CHECK-NEXT: vmov.f32 s4, s18
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vmovx.f16 s12, s17
-; CHECK-NEXT: vins.f16 s4, s18
-; CHECK-NEXT: vmov r0, s12
-; CHECK-NEXT: vmovx.f16 s12, s13
-; CHECK-NEXT: vmov.16 q7[2], r0
-; CHECK-NEXT: vmov.f32 s2, s11
-; CHECK-NEXT: vmov.f32 s30, s4
-; CHECK-NEXT: vmovx.f16 s4, s21
-; CHECK-NEXT: vins.f16 s13, s4
-; CHECK-NEXT: vmovx.f16 s4, s22
-; CHECK-NEXT: vins.f16 s14, s4
-; CHECK-NEXT: vldrw.u32 q5, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q0, [r1, #80]
+; CHECK-NEXT: vmovx.f16 s6, s28
+; CHECK-NEXT: vins.f16 s0, s28
+; CHECK-NEXT: vstrw.32 q5, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vins.f16 s25, s29
+; CHECK-NEXT: vmov r0, s6
+; CHECK-NEXT: vmov q7, q0
+; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vmov.16 q7[4], r0
+; CHECK-NEXT: vmovx.f16 s10, s12
+; CHECK-NEXT: vmovx.f16 s6, s24
+; CHECK-NEXT: vmov.f32 s3, s25
+; CHECK-NEXT: vldrw.u32 q6, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vins.f16 s12, s6
+; CHECK-NEXT: vins.f16 s30, s10
+; CHECK-NEXT: vmov.f64 d12, d10
+; CHECK-NEXT: vmov.f32 s10, s18
+; CHECK-NEXT: vmov.f32 s18, s2
+; CHECK-NEXT: vmov.f32 s1, s12
+; CHECK-NEXT: vmov.f32 s2, s30
+; CHECK-NEXT: vmov.f32 s17, s4
+; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [r1, #48]
 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s26, s15
-; CHECK-NEXT: vins.f16 s29, s12
-; CHECK-NEXT: vmov.f32 s21, s8
-; CHECK-NEXT: vstrw.32 q6, [r1, #32]
-; CHECK-NEXT: vmov.f32 s4, s9
-; CHECK-NEXT: vstrw.32 q5, [r1, #48]
-; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vmov.f32 s28, s13
-; CHECK-NEXT: vstrw.32 q1, [r1, #64]
-; CHECK-NEXT: vmov.f32 s31, s14
-; CHECK-NEXT: vstrw.32 q7, [r1, #16]
-; CHECK-NEXT: add sp, #48
+; CHECK-NEXT: vmov.f32 s27, s23
+; CHECK-NEXT: vstrw.32 q1, [r1, #32]
+; CHECK-NEXT: vmov.f32 s8, s13
+; CHECK-NEXT: vstrw.32 q6, [r1, #16]
+; CHECK-NEXT: vmov.f32 s11, s14
+; CHECK-NEXT: vstrw.32 q0, [r1, #80]
+; CHECK-NEXT: vstrw.32 q2, [r1, #64]
+; CHECK-NEXT: vstrw.32 q4, [r1]
+; CHECK-NEXT: add sp, #96
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -8,25 +8,36 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, r5, r6, lr}
 ; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
 ; CHECK-NEXT: add.w r6, r0, #16
-; CHECK-NEXT: ldrd lr, r12, [r0]
-; CHECK-NEXT: ldrd r3, r2, [r0, #8]
+; CHECK-NEXT: ldrd r3, r2, [r0]
+; CHECK-NEXT: ldrd lr, r12, [r0, #8]
 ; CHECK-NEXT: ldm r6, {r4, r5, r6}
-; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
+; CHECK-NEXT: vmov.32 q4[1], r2
+; CHECK-NEXT: vmov q4[2], q4[0], r3, lr
+; CHECK-NEXT: vmov.32 q3[1], r5
 ; CHECK-NEXT: ldr r0, [r0, #28]
-; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
+; CHECK-NEXT: vmov.32 q5[0], r3
+; CHECK-NEXT: vmov.32 q2[0], r4
 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r6
-; CHECK-NEXT: vmov.f32 s8, s4
+; CHECK-NEXT: vmov q1, q2
+; CHECK-NEXT: vmov.f32 s17, s18
+; CHECK-NEXT: vmov.f32 s18, s8
+; CHECK-NEXT: vmov q2[2], q2[0], r3, lr
+; CHECK-NEXT: vmov q3[2], q3[0], r4, r6
+; CHECK-NEXT: vmov.f32 s16, s20
 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r0
-; CHECK-NEXT: vmov.f32 s9, s6
-; CHECK-NEXT: vmov.f32 s4, s5
-; CHECK-NEXT: vmov.f32 s5, s7
-; CHECK-NEXT: vmov.f32 s10, s0
-; CHECK-NEXT: vmov.f32 s11, s2
-; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vstrw.32 q2, [r1]
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vstrw.32 q1, [r1, #16]
+; CHECK-NEXT: vmov.32 q1[1], r5
+; CHECK-NEXT: vmov q2[3], q2[1], r2, r12
+; CHECK-NEXT: vmov.32 q5[1], r2
+; CHECK-NEXT: vmov.f32 s19, s14
+; CHECK-NEXT: vmov.f32 s0, s21
+; CHECK-NEXT: vstrw.32 q4, [r1]
+; CHECK-NEXT: vmov.f32 s1, s11
+; CHECK-NEXT: vmov.f32 s2, s5
+; CHECK-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: pop {r4, r5, r6, pc}
 entry:
 %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0
@@ -253,27 +264,23 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, r5, r6, lr}
 ; CHECK-NEXT: push {r4, r5, r6, lr}
-; CHECK-NEXT: ldrh r3, [r0, #2]
-; CHECK-NEXT: ldrh r2, [r0]
-; CHECK-NEXT: ldrh.w r12, [r0, #10]
-; CHECK-NEXT: ldrh.w lr, [r0, #4]
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r3
-; CHECK-NEXT: ldrh r4, [r0, #12]
-; CHECK-NEXT: ldrh r5, [r0, #6]
+; CHECK-NEXT: ldrh r2, [r0, #4]
+; CHECK-NEXT: ldrh r3, [r0, #8]
+; CHECK-NEXT: ldrh.w r12, [r0, #12]
+; CHECK-NEXT: ldrh.w lr, [r0, #2]
+; CHECK-NEXT: ldrh r4, [r0, #6]
+; CHECK-NEXT: ldrh r5, [r0, #10]
 ; CHECK-NEXT: ldrh r6, [r0, #14]
-; CHECK-NEXT: ldrh r0, [r0, #8]
-; CHECK-NEXT: vmov q0[2], q0[0], r0, r12
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov.16 q1[0], r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov.16 q1[1], lr
-; CHECK-NEXT: vmov.16 q1[2], r0
-; CHECK-NEXT: vmov.16 q1[3], r4
-; CHECK-NEXT: vmov.16 q1[4], r3
-; CHECK-NEXT: vmov.16 q1[5], r5
-; CHECK-NEXT: vmov.16 q1[6], r12
-; CHECK-NEXT: vmov.16 q1[7], r6
-; CHECK-NEXT: vstrh.16 q1, [r1]
+; CHECK-NEXT: ldrh r0, [r0]
+; CHECK-NEXT: vmov.16 q0[0], r0
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: vmov.16 q0[2], r3
+; CHECK-NEXT: vmov.16 q0[3], r12
+; CHECK-NEXT: vmov.16 q0[4], lr
+; CHECK-NEXT: vmov.16 q0[5], r4
+; CHECK-NEXT: vmov.16 q0[6], r5
+; CHECK-NEXT: vmov.16 q0[7], r6
+; CHECK-NEXT: vstrh.16 q0, [r1]
 ; CHECK-NEXT: pop {r4, r5, r6, pc}
 entry:
 %s1 = getelementptr <2 x i16>, <2 x i16>* %src, i32 0
@@ -420,55 +427,51 @@
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
 ; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT: vmovx.f16 s12, s5
-; CHECK-NEXT: vmovx.f16 s0, s9
-; CHECK-NEXT: vins.f16 s5, s9
-; CHECK-NEXT: vins.f16 s12, s0
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmovx.f16 s27, s4
-; CHECK-NEXT: vins.f16 s4, s8
-; CHECK-NEXT: vmov.f32 s3, s12
 ; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vmov.f32 s5, s4
-; CHECK-NEXT: vmovx.f16 s8, s8
-; CHECK-NEXT: vmovx.f16 s0, s17
-; CHECK-NEXT: vmovx.f16 s2, s13
-; CHECK-NEXT: vins.f16 s27, s8
-; CHECK-NEXT: vmovx.f16 s4, s12
-; CHECK-NEXT: vmovx.f16 s8, s16
-; CHECK-NEXT: vins.f16 s13, s17
-; CHECK-NEXT: vins.f16 s12, s16
-; CHECK-NEXT: vmov q5, q3
+; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT: vmovx.f16 s23, s4
 ; CHECK-NEXT: vins.f16 s4, s8
-; CHECK-NEXT: vmov.f32 s22, s4
-; CHECK-NEXT: vmovx.f16 s4, s11
-; CHECK-NEXT: vmov.f32 s23, s27
+; CHECK-NEXT: vmov.f32 s21, s4
+; CHECK-NEXT: vmovx.f16 s22, s12
+; CHECK-NEXT: vmovx.f16 s4, s16
 ; CHECK-NEXT: vmovx.f16 s27, s7
+; CHECK-NEXT: vins.f16 s22, s4
+; CHECK-NEXT: vmovx.f16 s4, s11
 ; CHECK-NEXT: vins.f16 s7, s11
 ; CHECK-NEXT: vins.f16 s27, s4
 ; CHECK-NEXT: vmovx.f16 s26, s15
 ; CHECK-NEXT: vmovx.f16 s4, s19
+; CHECK-NEXT: vmovx.f16 s3, s5
+; CHECK-NEXT: vins.f16 s5, s9
+; CHECK-NEXT: vmovx.f16 s0, s9
 ; CHECK-NEXT: vmov.f32 s25, s7
 ; CHECK-NEXT: vins.f16 s26, s4
 ; CHECK-NEXT: vmovx.f16 s7, s6
 ; CHECK-NEXT: vmovx.f16 s4, s10
 ; CHECK-NEXT: vins.f16 s6, s10
-; CHECK-NEXT: vmov.f32 s21, s5
+; CHECK-NEXT: vins.f16 s3, s0
+; CHECK-NEXT: vmovx.f16 s2, s13
+; CHECK-NEXT: vmovx.f16 s0, s17
+; CHECK-NEXT: vmov.f32 s1, s5
+; CHECK-NEXT: vmovx.f16 s8, s8
+; CHECK-NEXT: vins.f16 s12, s16
 ; CHECK-NEXT: vins.f16 s15, s19
 ; CHECK-NEXT: vins.f16 s7, s4
 ; CHECK-NEXT: vmov.f32 s5, s6
 ; CHECK-NEXT: vmovx.f16 s6, s14
 ; CHECK-NEXT: vmovx.f16 s4, s18
+; CHECK-NEXT: vins.f16 s13, s17
 ; CHECK-NEXT: vins.f16 s14, s18
 ; CHECK-NEXT: vins.f16 s2, s0
 ; CHECK-NEXT: vmov.f32 s0, s13
-; CHECK-NEXT: vmov.f32 s24, s15
 ; CHECK-NEXT: vins.f16 s6, s4
-; CHECK-NEXT: vmov.f32 s4, s14
+; CHECK-NEXT: vmov.f32 s24, s15
+; CHECK-NEXT: vins.f16 s23, s8
+; CHECK-NEXT: vmov.f32 s20, s12
 ; CHECK-NEXT: vstrb.8 q6, [r1, #48]
-; CHECK-NEXT: vstrb.8 q1, [r1, #32]
+; CHECK-NEXT: vmov.f32 s4, s14
 ; CHECK-NEXT: vstrb.8 q0, [r1, #16]
+; CHECK-NEXT: vstrb.8 q1, [r1, #32]
 ; CHECK-NEXT: vstrb.8 q5, [r1]
 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: bx lr
@@ -495,26 +498,22 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, r5, r6, lr}
 ; CHECK-NEXT: push {r4, r5, r6, lr}
-; CHECK-NEXT: ldrb r4, [r0, #5]
-; CHECK-NEXT: ldrb r5, [r0, #4]
-; CHECK-NEXT: ldrb r2, [r0]
-; CHECK-NEXT: ldrb r3, [r0, #1]
-; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: ldrb.w r12, [r0, #2]
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
-; CHECK-NEXT: ldrb.w lr, [r0, #3]
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: ldrb r6, [r0, #7]
-; CHECK-NEXT: vmov.16 q0[0], r2
-; CHECK-NEXT: ldrb r0, [r0, #6]
-; CHECK-NEXT: vmov.16 q0[1], r12
-; CHECK-NEXT: vmov.16 q0[2], r5
-; CHECK-NEXT: vmov.16 q0[3], r0
-; CHECK-NEXT: vmov.16 q0[4], r3
-; CHECK-NEXT: vmov.16 q0[5], lr
-; CHECK-NEXT: vmov.16 q0[6], r4
-; CHECK-NEXT: vmov.16 q0[7], r6
+; CHECK-NEXT: ldrb r4, [r0]
+; CHECK-NEXT: ldrb r6, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[0], r4
+; CHECK-NEXT: ldrb r2, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[1], r6
+; CHECK-NEXT: ldrb r3, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrb r5, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[3], r3
+; CHECK-NEXT: ldrb.w r12, [r0, #5]
+; CHECK-NEXT: ldrb.w lr, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[4], r5
+; CHECK-NEXT: ldrb r0, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[5], r0
+; CHECK-NEXT: vmov.16 q0[6], r12
+; CHECK-NEXT: vmov.16 q0[7], lr
 ; CHECK-NEXT: vstrb.16 q0, [r1]
 ; CHECK-NEXT: pop {r4, r5, r6, pc}
 entry:
@@ -1052,20 +1051,22 @@
 ; CHECK-LABEL: vst4_v2f16:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: ldm.w r0, {r2, r3, r12}
-; CHECK-NEXT: vmov.32 q1[0], r12
+; CHECK-NEXT: vmov.32 q2[0], r12
+; CHECK-NEXT: vmov q3, q2
 ; CHECK-NEXT: ldr r0, [r0, #12]
 ; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: vmov.32 q1[1], r0
+; CHECK-NEXT: vmov q1, q0
 ; CHECK-NEXT: vmovx.f16 s2, s0
-; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vmovx.f16 s3, s4
-; CHECK-NEXT: vins.f16 s2, s6
-; CHECK-NEXT: vmovx.f16 s6, s5
-; CHECK-NEXT: vins.f16 s4, s5
-; CHECK-NEXT: vins.f16 s0, s1
-; CHECK-NEXT: vins.f16 s3, s6
-; CHECK-NEXT: vmov.f32 s1, s4
+; CHECK-NEXT: vmov.32 q1[1], r3
+; CHECK-NEXT: vmov.32 q3[1], r0
+; CHECK-NEXT: vmovx.f16 s4, s5
+; CHECK-NEXT: vmovx.f16 s3, s8
+; CHECK-NEXT: vins.f16 s8, s13
+; CHECK-NEXT: vins.f16 s2, s4
+; CHECK-NEXT: vmovx.f16 s4, s13
+; CHECK-NEXT: vins.f16 s0, s5
+; CHECK-NEXT: vins.f16 s3, s4
+; CHECK-NEXT: vmov.f32 s1, s8
 ; CHECK-NEXT: vstrh.16 q0, [r1]
 ; CHECK-NEXT: bx lr
 entry:
@@ -1089,38 +1090,44 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, r5, r6, lr}
 ; CHECK-NEXT: push {r4, r5, r6, lr}
-; CHECK-NEXT: add.w r6, r0, #16
-; CHECK-NEXT: ldrd lr, r12, [r0]
-; CHECK-NEXT: ldrd r3, r2, [r0, #8]
-; CHECK-NEXT: ldm r6, {r4, r5, r6}
-; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
+; CHECK-NEXT: ldrd r2, lr, [r0, #16]
+; CHECK-NEXT: ldrd r4, r3, [r0]
+; CHECK-NEXT: ldr.w r12, [r0, #24]
+; CHECK-NEXT: vmov.32 q0[1], lr
+; CHECK-NEXT: ldrd r6, r5, [r0, #8]
+; CHECK-NEXT: vmov.32 q3[1], r3
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r12
+; CHECK-NEXT: vmov.32 q1[0], r2
+; CHECK-NEXT: vmov.f32 s1, s4
+; CHECK-NEXT: vmov q3[2], q3[0], r4, r6
+; CHECK-NEXT: vmovx.f16 s3, s4
+; CHECK-NEXT: vmovx.f16 s0, s2
+; CHECK-NEXT: vmov.32 q2[0], r4
+; CHECK-NEXT: vins.f16 s1, s2
+; CHECK-NEXT: vins.f16 s3, s0
+; CHECK-NEXT: vmov.f32 s0, s8
+; CHECK-NEXT: vmovx.f16 s2, s8
+; CHECK-NEXT: vmovx.f16 s12, s14
 ; CHECK-NEXT: ldr r0, [r0, #28]
-; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
-; CHECK-NEXT: vmov q0[2], q0[0], r4, r6
-; CHECK-NEXT: vmovx.f16 s10, s5
-; CHECK-NEXT: vmov q0[3], q0[1], r5, r0
-; CHECK-NEXT: vins.f16 s5, s7
-; CHECK-NEXT: vmovx.f16 s12, s0
-; CHECK-NEXT: vins.f16 s0, s2
-; CHECK-NEXT: vmovx.f16 s2, s2
-; CHECK-NEXT: vmovx.f16 s11, s1
-; CHECK-NEXT: vins.f16 s12, s2
-; CHECK-NEXT: vmovx.f16 s2, s3
-; CHECK-NEXT: vins.f16 s11, s2
-; CHECK-NEXT: vmovx.f16 s2, s4
-; CHECK-NEXT: vins.f16 s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s6
-; CHECK-NEXT: vins.f16 s1, s3
-; CHECK-NEXT: vins.f16 s2, s6
-; CHECK-NEXT: vmovx.f16 s6, s7
-; CHECK-NEXT: vmov.f32 s8, s5
-; CHECK-NEXT: vins.f16 s10, s6
-; CHECK-NEXT: vmov.f32 s9, s1
-; CHECK-NEXT: vmov.f32 s5, s0
-; CHECK-NEXT: vstrh.16 q2, [r1, #16]
-; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vmov.f32 s7, s12
-; CHECK-NEXT: vstrh.16 q1, [r1]
+; CHECK-NEXT: vins.f16 s0, s14
+; CHECK-NEXT: vins.f16 s2, s12
+; CHECK-NEXT: vmov q3[2], q3[0], r2, r12
+; CHECK-NEXT: vmov.32 q1[1], lr
+; CHECK-NEXT: vmov q3[3], q3[1], lr, r0
+; CHECK-NEXT: vmovx.f16 s7, s5
+; CHECK-NEXT: vmovx.f16 s4, s15
+; CHECK-NEXT: vins.f16 s5, s15
+; CHECK-NEXT: vmov q3[2], q3[0], r4, r6
+; CHECK-NEXT: vmov.32 q2[1], r3
+; CHECK-NEXT: vmov q3[3], q3[1], r3, r5
+; CHECK-NEXT: vins.f16 s7, s4
+; CHECK-NEXT: vmovx.f16 s6, s9
+; CHECK-NEXT: vmovx.f16 s4, s15
+; CHECK-NEXT: vins.f16 s9, s15
+; CHECK-NEXT: vins.f16 s6, s4
+; CHECK-NEXT: vmov.f32 s4, s9
+; CHECK-NEXT: vstrh.16 q1, [r1, #16]
+; CHECK-NEXT: vstrh.16 q0, [r1]
 ; CHECK-NEXT: pop {r4, r5, r6, pc}
 entry:
 %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
@@ -1208,61 +1215,57 @@
 define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) {
 ; CHECK-LABEL: vst4_v8f16_align1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d9, d10, d11, d12, d13}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
 ; CHECK-NEXT: vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q3, [r0]
 ; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vmovx.f16 s0, s5
-; CHECK-NEXT: vmovx.f16 s2, s21
-; CHECK-NEXT: vins.f16 s0, s2
-; CHECK-NEXT: vmovx.f16 s2, s9
-; CHECK-NEXT: vmovx.f16 s12, s25
-; CHECK-NEXT: vmovx.f16 s19, s4
-; CHECK-NEXT: vins.f16 s2, s12
-; CHECK-NEXT: vmovx.f16 s12, s20
-; CHECK-NEXT: vins.f16 s19, s12
-; CHECK-NEXT: vmovx.f16 s12, s8
-; CHECK-NEXT: vmovx.f16 s14, s24
-; CHECK-NEXT: vmovx.f16 s15, s7
-; CHECK-NEXT: vins.f16 s12, s14
-; CHECK-NEXT: vmovx.f16 s14, s23
-; CHECK-NEXT: vins.f16 s15, s14
-; CHECK-NEXT: vmovx.f16 s14, s11
-; CHECK-NEXT: vmovx.f16 s1, s27
+; CHECK-NEXT: vmovx.f16 s3, s5
+; CHECK-NEXT: vmovx.f16 s0, s21
+; CHECK-NEXT: vins.f16 s3, s0
+; CHECK-NEXT: vmovx.f16 s2, s13
+; CHECK-NEXT: vmovx.f16 s0, s25
+; CHECK-NEXT: vmovx.f16 s11, s4
+; CHECK-NEXT: vins.f16 s2, s0
+; CHECK-NEXT: vmovx.f16 s0, s20
+; CHECK-NEXT: vins.f16 s11, s0
+; CHECK-NEXT: vmovx.f16 s10, s12
+; CHECK-NEXT: vmovx.f16 s0, s24
+; CHECK-NEXT: vmovx.f16 s19, s7
+; CHECK-NEXT: vins.f16 s10, s0
+; CHECK-NEXT: vmovx.f16 s0, s23
+; CHECK-NEXT: vins.f16 s19, s0
+; CHECK-NEXT: vmovx.f16 s18, s15
+; CHECK-NEXT: vmovx.f16 s0, s27
 ; CHECK-NEXT: vins.f16 s7, s23
-; CHECK-NEXT: vins.f16 s14, s1
+; CHECK-NEXT: vins.f16 s18, s0
 ; CHECK-NEXT: vmovx.f16 s23, s6
-; CHECK-NEXT: vmovx.f16 s1, s22
-; CHECK-NEXT: vins.f16 s6, s22
+; CHECK-NEXT: vmovx.f16 s0, s22
 ; CHECK-NEXT: vins.f16 s5, s21
+; CHECK-NEXT: vins.f16 s6, s22
 ; CHECK-NEXT: vins.f16 s4, s20
-; CHECK-NEXT: vins.f16 s23, s1
-; CHECK-NEXT: vmovx.f16 s22, s10
-; CHECK-NEXT: vins.f16 s10, s26
-; CHECK-NEXT: vmovx.f16 s1, s26
-; CHECK-NEXT: vins.f16 s9, s25
-; CHECK-NEXT: vins.f16 s8, s24
-; CHECK-NEXT: vins.f16 s11, s27
-; CHECK-NEXT: vmov q6, q1
-; CHECK-NEXT: vins.f16 s22, s1
-; CHECK-NEXT: vmov.f32 s1, s25
-; CHECK-NEXT: vmov q6, q2
-; CHECK-NEXT: vmov.f32 s3, s0
-; CHECK-NEXT: vmov.f32 s0, s9
-; CHECK-NEXT: vmov.f32 s26, s12
+; CHECK-NEXT: vins.f16 s12, s24
+; CHECK-NEXT: vins.f16 s15, s27
+; CHECK-NEXT: vins.f16 s23, s0
+; CHECK-NEXT: vmovx.f16 s22, s14
+; CHECK-NEXT: vins.f16 s14, s26
+; CHECK-NEXT: vmovx.f16 s0, s26
+; CHECK-NEXT: vins.f16 s13, s25
+; CHECK-NEXT: vins.f16 s22, s0
+; CHECK-NEXT: vmov.f32 s0, s13
+; CHECK-NEXT: vmov.f32 s1, s5
+; CHECK-NEXT: vmov.f32 s9, s4
 ; CHECK-NEXT: vstrb.8 q0, [r1, #16]
-; CHECK-NEXT: vmov.f32 s25, s4
-; CHECK-NEXT: vmov.f32 s27, s19
-; CHECK-NEXT: vmov.f32 s13, s7
-; CHECK-NEXT: vstrb.8 q6, [r1]
-; CHECK-NEXT: vmov.f32 s12, s11
+; CHECK-NEXT: vmov.f32 s8, s12
+; CHECK-NEXT: vmov.f32 s17, s7
+; CHECK-NEXT: vstrb.8 q2, [r1]
+; CHECK-NEXT: vmov.f32 s16, s15
 ; CHECK-NEXT: vmov.f32 s21, s6
-; CHECK-NEXT: vstrb.8 q3, [r1, #48]
-; CHECK-NEXT: vmov.f32 s20, s10
+; CHECK-NEXT: vstrb.8 q4, [r1, #48]
+; CHECK-NEXT: vmov.f32 s20, s14
 ; CHECK-NEXT: vstrb.8 q5, [r1, #32]
-; CHECK-NEXT: vpop {d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: bx lr
 entry:
 %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
--- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
@@ -50,41 +50,39 @@
 define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pSrc, i32 %blockSize, <4 x i32> %a) {
 ; CHECK-LABEL: foo_v4i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vmov.i64 q5, #0xffffffff
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: movs r1, #0
 ; CHECK-NEXT: vpt.s32 lt, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q4, [r0]
-; CHECK-NEXT: vmov.f32 s0, s16
-; CHECK-NEXT: vmov.f32 s2, s17
-; CHECK-NEXT: vand q6, q0, q5
-; CHECK-NEXT: vmov r0, r1, d13
-; CHECK-NEXT: bl __aeabi_ul2d
-; CHECK-NEXT: vmov r2, r3, d12
 ; CHECK-NEXT: vmov.f32 s0, s18
-; CHECK-NEXT: vmov.f32 s2, s19
+; CHECK-NEXT: vmov.f32 s18, s19
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: bl __aeabi_ul2d
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: vmov r0, s18
+; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: bl __aeabi_ul2d
+; CHECK-NEXT: vmov.f32 s2, s17
 ; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: vand q5, q0, q5
-; CHECK-NEXT: vmov r4, r5, d11
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vmov r6, s16
+; CHECK-NEXT: vmov d8, r4, r5
+; CHECK-NEXT: vmov r2, s2
 ; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
-; CHECK-NEXT: bl __aeabi_ul2d
-; CHECK-NEXT: vmov d8, r0, r1
-; CHECK-NEXT: mov r0, r4
-; CHECK-NEXT: mov r1, r5
 ; CHECK-NEXT: bl __aeabi_ul2d
-; CHECK-NEXT: vmov r2, r3, d10
 ; CHECK-NEXT: vmov d11, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: movs r1, #0
 ; CHECK-NEXT: bl __aeabi_ul2d
 ; CHECK-NEXT: vmov d10, r0, r1
-; CHECK-NEXT: vmov q0, q4
-; CHECK-NEXT: vmov q1, q5
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vmov q1, q4
+; CHECK-NEXT: vmov q0, q5
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
 entry:
 %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer
 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %pSrc, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
diff --git a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
--- a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -49,14 +49,12 @@
 define void @zero_test() {
 ; X86-LABEL: zero_test:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: xorps %xmm0, %xmm0
-; X86-NEXT: movlps %xmm0, (%eax)
+; X86-NEXT: movl $0, (%eax)
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: zero_test:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: xorps %xmm0, %xmm0
-; X64-NEXT: movlps %xmm0, (%rax)
+; X64-NEXT: movq $0, (%rax)
 ; X64-NEXT: retq
 entry:
 %0 = select <2 x i1> undef, <2 x float> undef, <2 x float> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
--- a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -29,8 +29,8 @@
 ; X86-LABEL: store_64:
 ; X86: # %bb.0: # %BB
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorps %xmm0, %xmm0
-; X86-NEXT: movlps %xmm0, (%eax)
+; X86-NEXT: movl $0, 4(%eax)
+; X86-NEXT: movl $0, (%eax)
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: store_64:
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -507,7 +507,6 @@
 ; SSE2-SSSE3-NEXT: shll $16, %edx
 ; SSE2-SSSE3-NEXT: orl %eax, %edx
 ; SSE2-SSSE3-NEXT: shlq $32, %rdx
-; SSE2-SSSE3-NEXT: orq %rcx, %rdx
 ; SSE2-SSSE3-NEXT: movq %rdx, %xmm0
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; SSE2-SSSE3-NEXT: movd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -773,6 +773,7 @@
 define <4 x i32> @ossfuzz5688(i32 %a0) {
 ; CHECK-LABEL: ossfuzz5688:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: movl $0, (%rax)
 ; CHECK-NEXT: retq
 %1 = insertelement <4 x i32> zeroinitializer, i32 -2147483648, i32 %a0
 %2 = extractelement <4 x i32> %1, i32 %a0
diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll
--- a/llvm/test/CodeGen/X86/combine-concatvectors.ll
+++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll
@@ -48,11 +48,10 @@
 ; AVX1-NEXT: movl $1091567616, 30256(%rax) # imm = 0x41100000
 ; AVX1-NEXT: movabsq $4294967297, %rcx # imm = 0x100000001
 ; AVX1-NEXT: movq %rcx, 46348(%rax)
-; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3,7.812501848093234E-3]
-; AVX1-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4575657222473777152,4575657222473777152,4575657222473777152,4575657222473777152]
 ; AVX1-NEXT: vmovups %ymm0, 48296(%rax)
-; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vmovsd %xmm0, 47372(%rax)
+; AVX1-NEXT: movabsq $4575657222473777152, %rcx # imm = 0x3F8000003F800000
+; AVX1-NEXT: movq %rcx, 47372(%rax)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -62,10 +61,10 @@
 ; AVX2-NEXT: movl $1091567616, 30256(%rax) # imm = 0x41100000
 ; AVX2-NEXT: movabsq $4294967297, %rcx # imm = 0x100000001
 ; AVX2-NEXT: movq %rcx, 46348(%rax)
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX2-NEXT: vbroadcastsd %xmm0, %ymm1
-; AVX2-NEXT: vmovups %ymm1, 48296(%rax)
-; AVX2-NEXT: vmovlps %xmm0, 47372(%rax)
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm0 = [4575657222473777152,4575657222473777152,4575657222473777152,4575657222473777152]
+; AVX2-NEXT: vmovups %ymm0, 48296(%rax)
+; AVX2-NEXT: movabsq $4575657222473777152, %rcx # imm = 0x3F8000003F800000
+; AVX2-NEXT: movq %rcx, 47372(%rax)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 alloca_0:
diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll
--- a/llvm/test/CodeGen/X86/fold-load-vec.ll
+++ b/llvm/test/CodeGen/X86/fold-load-vec.ll
@@ -10,8 +10,8 @@
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq $0, (%rsp)
 ; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: movlps %xmm0, (%rsp)
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT: movlps %xmm0, (%rsp)
 ; CHECK-NEXT: movlps %xmm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1047,12 +1047,11 @@
 ; BWON-F16C-LABEL: main.158:
 ; BWON-F16C: # %bb.0: # %entry
 ; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; BWON-F16C-NEXT: vucomiss %xmm0, %xmm1
-; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; BWON-F16C-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BWON-F16C-NEXT: vucomiss %xmm1, %xmm2
 ; BWON-F16C-NEXT: jae .LBB20_2
 ; BWON-F16C-NEXT: # %bb.1: # %entry
 ; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -1065,8 +1064,7 @@
 ; CHECK-I686-LABEL: main.158:
 ; CHECK-I686: # %bb.0: # %entry
 ; CHECK-I686-NEXT: subl $12, %esp
-; CHECK-I686-NEXT: pxor %xmm0, %xmm0
-; CHECK-I686-NEXT: movd %xmm0, (%esp)
+; CHECK-I686-NEXT: movl $0, (%esp)
 ; CHECK-I686-NEXT: calll __truncsfhf2
 ; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT: movw %ax, (%esp)
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -2061,8 +2061,11 @@
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT: pmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2]
+; SSE2-NEXT: movaps {{.*#+}} xmm3 = [4294967295,0,4294967295,4294967295]
+; SSE2-NEXT: andps %xmm3, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
+; SSE2-NEXT: andps %xmm3, %xmm1
 ; SSE2-NEXT: paddd %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
 ; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/nontemporal-3.ll b/llvm/test/CodeGen/X86/nontemporal-3.ll
--- a/llvm/test/CodeGen/X86/nontemporal-3.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-3.ll
@@ -93,247 +93,66 @@
 }
 define void @test_zero_v8f32_align1(<8 x float>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v8f32_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v8f32_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorl %eax, %eax
-; SSE4A-NEXT: movntiq %rax, 8(%rdi)
-; SSE4A-NEXT: movntiq %rax, 24(%rdi)
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v8f32_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_zero_v8f32_align1:
-; AVX: # %bb.0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movntiq %rax, 8(%rdi)
-; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: movntiq %rax, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_zero_v8f32_align1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movntiq %rax, 8(%rdi)
-; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: retq
+; CHECK-LABEL: test_zero_v8f32_align1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movntiq %rax, 8(%rdi)
+; CHECK-NEXT: movntiq %rax, (%rdi)
+; CHECK-NEXT: movntiq %rax, 24(%rdi)
+; CHECK-NEXT: movntiq %rax, 16(%rdi)
+; CHECK-NEXT: retq
 store <8 x float> zeroinitializer, <8 x float>* %dst, align 1, !nontemporal !1
 ret void
 }
 define void @test_zero_v4i64_align1(<4 x i64>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v4i64_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v4i64_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v4i64_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_zero_v4i64_align1:
-; AVX: # %bb.0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movntiq %rax, 8(%rdi)
-; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: movntiq %rax, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_zero_v4i64_align1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movntiq %rax, 8(%rdi)
-; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: retq
+; CHECK-LABEL: test_zero_v4i64_align1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movntiq %rax, 8(%rdi)
+; CHECK-NEXT: movntiq %rax, (%rdi)
+; CHECK-NEXT: movntiq %rax, 24(%rdi)
+; CHECK-NEXT: movntiq %rax, 16(%rdi)
+; CHECK-NEXT: retq
 store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 1, !nontemporal !1
 ret void
 }
 define void @test_zero_v8i32_align1(<8 x i32>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v8i32_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v8i32_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v8i32_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_zero_v8i32_align1:
-; AVX: # %bb.0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movntiq %rax, 8(%rdi)
-; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: movntiq %rax, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_zero_v8i32_align1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movntiq %rax, 8(%rdi)
-; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: retq
+; CHECK-LABEL: test_zero_v8i32_align1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movntiq %rax, 8(%rdi)
+; CHECK-NEXT: movntiq %rax, (%rdi)
+; CHECK-NEXT: movntiq %rax, 24(%rdi)
+; CHECK-NEXT: movntiq %rax, 16(%rdi)
+; CHECK-NEXT: retq
 store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 1, !nontemporal !1
 ret void
 }
 define void @test_zero_v16i16_align1(<16 x i16>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v16i16_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v16i16_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v16i16_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_zero_v16i16_align1:
-; AVX: # %bb.0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movntiq %rax, 8(%rdi)
-; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: movntiq %rax, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_zero_v16i16_align1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movntiq %rax, 8(%rdi)
-; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: retq
+; CHECK-LABEL: test_zero_v16i16_align1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movntiq %rax, 8(%rdi)
+; CHECK-NEXT: movntiq %rax, (%rdi)
+; CHECK-NEXT: movntiq %rax, 24(%rdi)
+; CHECK-NEXT: movntiq %rax, 16(%rdi)
+; CHECK-NEXT: retq
 store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 1, !nontemporal !1
 ret void
 }
 define void @test_zero_v32i8_align1(<32 x i8>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v32i8_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v32i8_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v32i8_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_zero_v32i8_align1:
-; AVX: # %bb.0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movntiq %rax, 8(%rdi)
-; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: movntiq %rax, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_zero_v32i8_align1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movntiq %rax, 8(%rdi)
-; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: retq
+; CHECK-LABEL: test_zero_v32i8_align1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movntiq %rax, 8(%rdi)
+; CHECK-NEXT: movntiq %rax, (%rdi)
+; CHECK-NEXT: movntiq %rax, 24(%rdi)
+; CHECK-NEXT: movntiq %rax, 16(%rdi)
+; CHECK-NEXT: retq
 store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 1, !nontemporal !1
 ret void
 }
@@ -508,347 +327,86 @@
 }
 define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v16f32_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 56(%rdi)
-; SSE2-NEXT: movntiq %rax, 48(%rdi)
-; SSE2-NEXT: movntiq %rax, 40(%rdi)
-; SSE2-NEXT: movntiq %rax, 32(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v16f32_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorl %eax, %eax
-; SSE4A-NEXT: movntiq %rax, 24(%rdi)
-; SSE4A-NEXT: movntiq %rax, 8(%rdi)
-; SSE4A-NEXT: movntiq %rax, 56(%rdi)
-; SSE4A-NEXT: movntiq %rax, 40(%rdi)
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v16f32_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 56(%rdi)
-; SSE41-NEXT: movntiq %rax, 48(%rdi)
-; SSE41-NEXT: movntiq %rax, 40(%rdi)
-; SSE41-NEXT: movntiq %rax, 32(%rdi)
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_zero_v16f32_align1:
-; AVX: # %bb.0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movntiq %rax, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: movntiq %rax, 8(%rdi)
-; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: movntiq %rax, 56(%rdi)
-; AVX-NEXT: movntiq %rax, 48(%rdi)
-; AVX-NEXT: movntiq %rax, 40(%rdi)
-; AVX-NEXT: movntiq %rax, 32(%rdi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_zero_v16f32_align1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: movntiq %rax, 8(%rdi)
-; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movntiq %rax, 56(%rdi)
-; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: movntiq %rax, 40(%rdi)
-; AVX512-NEXT: movntiq %rax, 32(%rdi)
-; AVX512-NEXT: retq
+; CHECK-LABEL:
test_zero_v16f32_align1: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movntiq %rax, 24(%rdi) +; CHECK-NEXT: movntiq %rax, 16(%rdi) +; CHECK-NEXT: movntiq %rax, 8(%rdi) +; CHECK-NEXT: movntiq %rax, (%rdi) +; CHECK-NEXT: movntiq %rax, 56(%rdi) +; CHECK-NEXT: movntiq %rax, 48(%rdi) +; CHECK-NEXT: movntiq %rax, 40(%rdi) +; CHECK-NEXT: movntiq %rax, 32(%rdi) +; CHECK-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind { -; SSE2-LABEL: test_zero_v8i64_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) -; SSE2-NEXT: movntiq %rax, 40(%rdi) -; SSE2-NEXT: movntiq %rax, 32(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v8i64_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v8i64_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) -; SSE41-NEXT: movntiq %rax, 40(%rdi) -; SSE41-NEXT: movntiq %rax, 32(%rdi) -; SSE41-NEXT: retq -; -; AVX-LABEL: test_zero_v8i64_align1: -; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: movntiq %rax, 8(%rdi) -; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: movntiq %rax, 40(%rdi) -; AVX-NEXT: movntiq %rax, 32(%rdi) -; AVX-NEXT: retq -; -; AVX512-LABEL: test_zero_v8i64_align1: -; AVX512: # %bb.0: -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: movntiq %rax, 8(%rdi) -; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: movntiq %rax, 40(%rdi) -; AVX512-NEXT: movntiq %rax, 32(%rdi) -; AVX512-NEXT: retq +; CHECK-LABEL: test_zero_v8i64_align1: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movntiq %rax, 24(%rdi) +; CHECK-NEXT: movntiq %rax, 16(%rdi) +; CHECK-NEXT: movntiq %rax, 8(%rdi) +; CHECK-NEXT: movntiq %rax, (%rdi) +; CHECK-NEXT: movntiq %rax, 56(%rdi) +; CHECK-NEXT: movntiq %rax, 48(%rdi) +; CHECK-NEXT: movntiq %rax, 40(%rdi) +; CHECK-NEXT: movntiq %rax, 32(%rdi) +; CHECK-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind { -; SSE2-LABEL: test_zero_v16i32_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) -; SSE2-NEXT: movntiq %rax, 40(%rdi) -; SSE2-NEXT: movntiq %rax, 32(%rdi) -; 
SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v16i32_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v16i32_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) -; SSE41-NEXT: movntiq %rax, 40(%rdi) -; SSE41-NEXT: movntiq %rax, 32(%rdi) -; SSE41-NEXT: retq -; -; AVX-LABEL: test_zero_v16i32_align1: -; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: movntiq %rax, 8(%rdi) -; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: movntiq %rax, 40(%rdi) -; AVX-NEXT: movntiq %rax, 32(%rdi) -; AVX-NEXT: retq -; -; AVX512-LABEL: test_zero_v16i32_align1: -; AVX512: # %bb.0: -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: movntiq %rax, 8(%rdi) -; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: movntiq %rax, 40(%rdi) -; AVX512-NEXT: movntiq %rax, 32(%rdi) -; AVX512-NEXT: retq +; CHECK-LABEL: test_zero_v16i32_align1: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movntiq %rax, 24(%rdi) +; CHECK-NEXT: movntiq %rax, 16(%rdi) +; CHECK-NEXT: movntiq %rax, 8(%rdi) +; CHECK-NEXT: movntiq %rax, (%rdi) +; CHECK-NEXT: movntiq %rax, 56(%rdi) +; CHECK-NEXT: movntiq %rax, 48(%rdi) +; CHECK-NEXT: movntiq %rax, 40(%rdi) +; CHECK-NEXT: movntiq %rax, 32(%rdi) +; CHECK-NEXT: retq store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind { -; SSE2-LABEL: test_zero_v32i16_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) -; SSE2-NEXT: movntiq %rax, 40(%rdi) -; SSE2-NEXT: movntiq %rax, 32(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v32i16_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v32i16_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) -; SSE41-NEXT: movntiq %rax, 40(%rdi) -; SSE41-NEXT: movntiq %rax, 32(%rdi) -; SSE41-NEXT: retq -; -; AVX-LABEL: test_zero_v32i16_align1: -; AVX: # %bb.0: -; AVX-NEXT: 
xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: movntiq %rax, 8(%rdi) -; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: movntiq %rax, 40(%rdi) -; AVX-NEXT: movntiq %rax, 32(%rdi) -; AVX-NEXT: retq -; -; AVX512-LABEL: test_zero_v32i16_align1: -; AVX512: # %bb.0: -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: movntiq %rax, 8(%rdi) -; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: movntiq %rax, 40(%rdi) -; AVX512-NEXT: movntiq %rax, 32(%rdi) -; AVX512-NEXT: retq +; CHECK-LABEL: test_zero_v32i16_align1: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movntiq %rax, 24(%rdi) +; CHECK-NEXT: movntiq %rax, 16(%rdi) +; CHECK-NEXT: movntiq %rax, 8(%rdi) +; CHECK-NEXT: movntiq %rax, (%rdi) +; CHECK-NEXT: movntiq %rax, 56(%rdi) +; CHECK-NEXT: movntiq %rax, 48(%rdi) +; CHECK-NEXT: movntiq %rax, 40(%rdi) +; CHECK-NEXT: movntiq %rax, 32(%rdi) +; CHECK-NEXT: retq store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v64i8_align1(<64 x i8>* %dst) nounwind { -; SSE2-LABEL: test_zero_v64i8_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) -; SSE2-NEXT: movntiq %rax, 40(%rdi) -; SSE2-NEXT: movntiq %rax, 32(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v64i8_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v64i8_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) -; SSE41-NEXT: movntiq %rax, 40(%rdi) -; SSE41-NEXT: movntiq %rax, 32(%rdi) -; SSE41-NEXT: retq -; -; AVX-LABEL: test_zero_v64i8_align1: -; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movntiq %rax, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: movntiq %rax, 8(%rdi) -; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: movntiq %rax, 56(%rdi) -; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: movntiq %rax, 40(%rdi) -; AVX-NEXT: movntiq %rax, 32(%rdi) -; AVX-NEXT: retq -; -; AVX512-LABEL: test_zero_v64i8_align1: -; AVX512: # %bb.0: -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: movntiq %rax, 8(%rdi) -; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: movntiq %rax, 40(%rdi) -; AVX512-NEXT: movntiq %rax, 32(%rdi) -; AVX512-NEXT: retq +; CHECK-LABEL: test_zero_v64i8_align1: +; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movntiq %rax, 24(%rdi) +; CHECK-NEXT: movntiq %rax, 16(%rdi) +; CHECK-NEXT: movntiq 
%rax, 8(%rdi) +; CHECK-NEXT: movntiq %rax, (%rdi) +; CHECK-NEXT: movntiq %rax, 56(%rdi) +; CHECK-NEXT: movntiq %rax, 48(%rdi) +; CHECK-NEXT: movntiq %rax, 40(%rdi) +; CHECK-NEXT: movntiq %rax, 32(%rdi) +; CHECK-NEXT: retq store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1 ret void } @@ -1214,3 +772,7 @@ } !1 = !{i32 1} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SSE2: {{.*}} +; SSE41: {{.*}} +; SSE4A: {{.*}} diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll --- a/llvm/test/CodeGen/X86/pr41619.ll +++ b/llvm/test/CodeGen/X86/pr41619.ll @@ -7,10 +7,9 @@ ; CHECK: ## %bb.0: ## %bb ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: vmovd %eax, %xmm0 -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: movl %eax, (%rax) -; CHECK-NEXT: vmovlps %xmm1, (%rax) +; CHECK-NEXT: movq $0, (%rax) ; CHECK-NEXT: retq bb: %tmp = bitcast double %arg to i64 diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll --- a/llvm/test/CodeGen/X86/promote-cmp.ll +++ b/llvm/test/CodeGen/X86/promote-cmp.ll @@ -8,38 +8,36 @@ ; SSE2-LABEL: PR45808: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm3, %xmm9 -; SSE2-NEXT: pxor %xmm4, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm8[0,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm6[1,3] -; SSE2-NEXT: andps %xmm10, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm8[1,3] -; SSE2-NEXT: orps %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: pxor %xmm4, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] +; SSE2-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2-NEXT: pand %xmm8, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,1,3,3] -; SSE2-NEXT: psllq $63, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm6 -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,1,3] -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: psllq $63, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm5 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: psllq $63, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE2-NEXT: pand %xmm3, %xmm0 ; 
SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm3, %xmm0 @@ -48,19 +46,13 @@ ; SSE4-LABEL: PR45808: ; SSE4: # %bb.0: ; SSE4-NEXT: movdqa %xmm0, %xmm4 -; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: pcmpgtq %xmm3, %xmm0 -; SSE4-NEXT: movdqa %xmm4, %xmm5 -; SSE4-NEXT: pcmpgtq %xmm2, %xmm5 -; SSE4-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE4-NEXT: pcmpeqd %xmm6, %xmm6 -; SSE4-NEXT: pxor %xmm5, %xmm6 -; SSE4-NEXT: psllq $63, %xmm0 +; SSE4-NEXT: movdqa %xmm1, %xmm5 +; SSE4-NEXT: pcmpgtq %xmm3, %xmm5 +; SSE4-NEXT: pcmpgtq %xmm2, %xmm0 +; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; SSE4-NEXT: movdqa %xmm5, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero -; SSE4-NEXT: psllq $63, %xmm0 -; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE4-NEXT: movapd %xmm2, %xmm0 +; SSE4-NEXT: movapd %xmm4, %xmm0 ; SSE4-NEXT: movapd %xmm3, %xmm1 ; SSE4-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -185,7 +185,7 @@ ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: leaq (%rax,%rax,8), %rax ; SSE2-NEXT: subq %rax, %rsi -; SSE2-NEXT: movq %rsi, %xmm1 +; SSE2-NEXT: movq %rsi, %xmm0 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: imulq %r8 ; SSE2-NEXT: movq %rdx, %rax @@ -193,10 +193,10 @@ ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: leaq (%rax,%rax,8), %rax ; SSE2-NEXT: subq %rax, %rdi -; SSE2-NEXT: movq %rdi, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: movq %rdi, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [8589934591,8589934591] +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: imulq %rdx @@ -208,19 +208,22 @@ ; SSE2-NEXT: leaq (%rdx,%rdx,8), %rax ; SSE2-NEXT: addq %rcx, %rax ; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3] -; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_srem_vec: @@ -269,7 +272,7 @@ ; SSE41-NEXT: pxor %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: pxor %xmm1, %xmm2 -; SSE41-NEXT: pextrb $0, %xmm0, %eax +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: 
pextrb $8, %xmm0, %edx ; SSE41-NEXT: pextrb $0, %xmm2, %ecx ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -323,7 +326,7 @@ ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vpextrb $8, %xmm0, %edx ; AVX1-NEXT: vpextrb $0, %xmm1, %ecx ; AVX1-NEXT: # kill: def $al killed $al killed $eax @@ -377,7 +380,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax +; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vpextrb $8, %xmm0, %edx ; AVX2-NEXT: vpextrb $0, %xmm1, %ecx ; AVX2-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/vec_setcc.ll b/llvm/test/CodeGen/X86/vec_setcc.ll --- a/llvm/test/CodeGen/X86/vec_setcc.ll +++ b/llvm/test/CodeGen/X86/vec_setcc.ll @@ -206,11 +206,12 @@ ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_setcc_v3i1_v3i16: @@ -218,9 +219,10 @@ ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: pextrb $2, %xmm1, %edx -; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: pextrb $4, %xmm0, %edx +; SSE41-NEXT: pextrb $8, %xmm0, %ecx ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: # kill: def $dl killed $dl killed $edx ; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx @@ -231,9 +233,10 @@ ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: vpextrb $2, %xmm0, %edx -; AVX-NEXT: vpextrb $4, %xmm0, %ecx +; AVX-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpextrb $4, %xmm1, %edx +; AVX-NEXT: vpextrb $8, %xmm1, %ecx ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: # kill: def $dl killed $dl killed $edx ; AVX-NEXT: # kill: def $cl killed $cl killed $ecx diff --git a/llvm/test/CodeGen/X86/vec_zero_cse.ll b/llvm/test/CodeGen/X86/vec_zero_cse.ll --- a/llvm/test/CodeGen/X86/vec_zero_cse.ll +++ b/llvm/test/CodeGen/X86/vec_zero_cse.ll @@ -15,8 +15,8 @@ ; X32: # %bb.0: ; X32-NEXT: movl $0, M1+4 ; X32-NEXT: movl $0, M1 -; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: movlps %xmm0, M2 +; X32-NEXT: movl $0, M2+4 +; X32-NEXT: movl $0, M2 ; X32-NEXT: retl ; ; X64-LABEL: test1: @@ -34,8 +34,8 @@ ; X32: # %bb.0: ; X32-NEXT: movl $-1, M1+4 ; X32-NEXT: movl $-1, M1 -; X32-NEXT: pcmpeqd %xmm0, %xmm0 -; X32-NEXT: movq %xmm0, M2 +; X32-NEXT: movl $-1, M2+4 +; X32-NEXT: movl $-1, M2 ; X32-NEXT: retl ; ; X64-LABEL: test2: diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll --- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll @@ -249,9 +249,9 @@ define void @test_urem_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X64-LABEL: 
test_urem_pow2_v2i32: ; X64: # %bb.0: -; X64-NEXT: movabsq $30064771079, %rax # imm = 0x700000007 -; X64-NEXT: andq (%rdi), %rax -; X64-NEXT: movq %rax, (%rsi) +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movlps %xmm0, (%rsi) ; X64-NEXT: retq ; ; X86-LABEL: test_urem_pow2_v2i32: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3160,48 +3160,18 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) define void @PR43024() { -; SSE2-LABEL: PR43024: -; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSE2-NEXT: movaps %xmm0, (%rax) -; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm0 -; SSE2-NEXT: addss %xmm1, %xmm0 -; SSE2-NEXT: movss %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR43024: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSSE3-NEXT: movaps %xmm0, (%rax) -; SSSE3-NEXT: addss %xmm0, %xmm0 -; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: addss %xmm1, %xmm0 -; SSSE3-NEXT: addss %xmm1, %xmm0 -; SSSE3-NEXT: movss %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: PR43024: -; SSE41: # %bb.0: -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] -; SSE41-NEXT: movaps %xmm0, (%rax) -; SSE41-NEXT: addss %xmm0, %xmm0 -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: addss %xmm1, %xmm0 -; SSE41-NEXT: addss %xmm1, %xmm0 -; SSE41-NEXT: movss %xmm0, (%rax) -; SSE41-NEXT: retq +; SSE-LABEL: PR43024: +; SSE: # %bb.0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; SSE-NEXT: movaps %xmm0, (%rax) +; SSE-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000 +; SSE-NEXT: retq ; ; AVX-LABEL: PR43024: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; AVX-NEXT: vmovaps %xmm0, (%rax) -; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovss %xmm0, (%rax) +; AVX-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000 ; AVX-NEXT: retq store <4 x float> , <4 x float>* undef, align 16 %1 = load <4 x float>, <4 x float>* undef, align 16 diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -2132,11 +2132,9 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: diff --git a/llvm/test/CodeGen/X86/vselect-constants.ll b/llvm/test/CodeGen/X86/vselect-constants.ll --- a/llvm/test/CodeGen/X86/vselect-constants.ll +++ b/llvm/test/CodeGen/X86/vselect-constants.ll @@ -280,26 +280,28 @@ ; SSE-LABEL: wrong_min_signbits: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpeqw 
%xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [1,0,0,0] -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: psllw $15, %xmm1 -; SSE-NEXT: psraw $15, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: pcmpeqw %xmm1, %xmm0 +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: psllw $15, %xmm0 +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: wrong_min_signbits: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,0,0,0] -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX-NEXT: vpsllw $15, %xmm0, %xmm0 ; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,0,0,0] ; AVX-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -569,8 +569,7 @@ ; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 @@ -583,23 +582,10 @@ ; SSE41-NEXT: movd %edi, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; SSE41-NEXT: por %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE41-NEXT: pinsrd $1, %edi, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq -; -; AVX-LABEL: simplify_select: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX-NEXT: vmovd %edi, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 -; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 -; AVX-NEXT: retq %a = insertelement <2 x i32> , i32 %x, i32 1 %b = insertelement <2 x i32> , i32 %x, i32 0 %y = or <2 x i32> %a, %b diff --git a/llvm/test/CodeGen/X86/widen_cast-5.ll b/llvm/test/CodeGen/X86/widen_cast-5.ll --- a/llvm/test/CodeGen/X86/widen_cast-5.ll +++ b/llvm/test/CodeGen/X86/widen_cast-5.ll @@ -15,9 +15,9 @@ ; ; X64-LABEL: convert: ; X64: ## %bb.0: ## %entry -; X64-NEXT: movabsq $140733193388287, %rax ## imm = 0x7FFF000000FF -; X64-NEXT: xorq %rsi, %rax -; X64-NEXT: movq %rax, (%rdi) +; X64-NEXT: movq %rsi, %xmm0 +; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: retq entry: %conv = bitcast i64 %src to <2 x i32> diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll 
b/llvm/test/CodeGen/X86/widen_shuffle-1.ll --- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll +++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll @@ -105,8 +105,8 @@ ; X86-LABEL: shuf5: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movsd %xmm0, (%eax) +; X86-NEXT: movl $555819297, 4(%eax) # imm = 0x21212121 +; X86-NEXT: movl $555819297, (%eax) # imm = 0x21212121 ; X86-NEXT: retl ; ; X64-LABEL: shuf5: