diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18258,6 +18258,11 @@
   unsigned NumElts = VecVT.getVectorNumElements();
   unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
 
+  // Try to simplify the whole operation to a constant, or simplify its
+  // operands.
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
   // TODO: These transforms should not require the 'hasOneUse' restriction, but
   // there are regressions on multiple targets without it. We can end up with a
   // mess of scalar and vector code if we reduce only part of the DAG to scalar.
diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
--- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -57,8 +57,8 @@
 ; CHECK-LABEL: widen_f16_build_vector:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #13294
-; CHECK-NEXT: dup.4h v0, w8
-; CHECK-NEXT: str s0, [x0]
+; CHECK-NEXT: movk w8, #13294, lsl #16
+; CHECK-NEXT: str w8, [x0]
 ; CHECK-NEXT: ret
 %1 = bitcast half* %addr to <2 x half>*
 store <2 x half> , <2 x half>* %1, align 2
diff --git a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
--- a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -24,17 +24,8 @@
 define void @test2(float * %p1, i32 %v1) {
 ; CHECK-LABEL: test2:
 ; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: sub sp, sp, #16 ; =16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: movi.16b v0, #63
-; CHECK-NEXT: and x8, x1, #0x3
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: bfi x9, x8, #2, #2
-; CHECK-NEXT: ldr s0, [x9]
-; CHECK-NEXT: str s0, [x0]
-; CHECK-NEXT: add sp, sp, #16 ; =16
+; CHECK-NEXT: mov w8, #1061109567
+; CHECK-NEXT: str w8, [x0]
 ; CHECK-NEXT: ret
 entry:
 %v2 = extractelement <3 x float> , i32 %v1
diff --git a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
--- a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
@@ -30,10 +30,10 @@
 define [1 x <4 x float>] @test2() {
 ; CHECK-LABEL: .p2align 4 ; -- Begin function test2
 ; CHECK-NEXT: lCPI1_0:
-; CHECK-NEXT: .long 0x00000000 ; float 0
-; CHECK-NEXT: .long 0x00000000 ; float 0
-; CHECK-NEXT: .long 0x00000000 ; float 0
-; CHECK-NEXT: .long 0x3f800000 ; float 1
+; CHECK-NEXT: .long 0x80000000 ; float -0
+; CHECK-NEXT: .long 0x80000000 ; float -0
+; CHECK-NEXT: .long 0x80000000 ; float -0
+; CHECK-NEXT: .long 0xbf800000 ; float -1
 ; CHECK-NEXT: .section __TEXT,__text,regular,pure_instructions
 ; CHECK-NEXT: .globl _test2
 ; CHECK-NEXT: .p2align 2
@@ -43,17 +43,7 @@
 ; CHECK-NEXT: Lloh2:
 ; CHECK-NEXT: adrp x8, lCPI1_0@PAGE
 ; CHECK-NEXT: Lloh3:
-; CHECK-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF]
-; CHECK-NEXT: mov s2, v1[1]
-; CHECK-NEXT: fneg s0, s1
-; CHECK-NEXT: mov s3, v1[2]
-; CHECK-NEXT: fneg s2, s2
-; CHECK-NEXT: mov s1, v1[3]
-; CHECK-NEXT: fneg s3, s3
-; CHECK-NEXT: mov.s v0[1], v2[0]
-; CHECK-NEXT: mov.s v0[2], v3[0]
-; CHECK-NEXT: fneg s1, s1
-; CHECK-NEXT: mov.s v0[3], v1[0]
+; CHECK-NEXT: ldr q0, [x8, lCPI1_0@PAGEOFF]
 ; CHECK-NEXT: ret
 ;
 ret [1 x <4 x float>] [<4 x float>
diff --git a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
--- a/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
+++ b/llvm/test/CodeGen/AArch64/dag-numsignbits.ll
@@ -8,19 +8,14 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, .LCPI0_0
 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
-; CHECK-NEXT: adrp x8, .LCPI0_1
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT: movi v2.4h, #1
 ; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: mov w1, wzr
 ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: movi v1.4h, #1
-; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: shl v0.4h, v0.4h, #15
-; CHECK-NEXT: sshr v0.4h, v0.4h, #15
+; CHECK-NEXT: cmgt v0.4h, v2.4h, v0.4h
 ; CHECK-NEXT: umov w0, v0.h[0]
 ; CHECK-NEXT: umov w3, v0.h[3]
-; CHECK-NEXT: mov w1, wzr
 ; CHECK-NEXT: mov w2, wzr
 ; CHECK-NEXT: b foo
 %tmp3 = shufflevector <4 x i16> %a1, <4 x i16> undef, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -233,54 +233,51 @@
 ; CHECK-NEXT: mov w9, #-822083584
 ; CHECK-NEXT: mov w11, #1325400063
 ; CHECK-NEXT: fmov s6, w9
-; CHECK-NEXT: fcvtzs w8, s5
+; CHECK-NEXT: fcvtzs w8, s0
 ; CHECK-NEXT: mov w10, #-2147483648
-; CHECK-NEXT: fcmp s5, s6
 ; CHECK-NEXT: fmov s7, w11
+; CHECK-NEXT: fcmp s0, s6
 ; CHECK-NEXT: mov w12, #2147483647
 ; CHECK-NEXT: csel w8, w10, w8, lt
-; CHECK-NEXT: fcmp s5, s7
-; CHECK-NEXT: csel w8, w12, w8, gt
-; CHECK-NEXT: fcmp s5, s5
-; CHECK-NEXT: fcvtzs w13, s4
-; CHECK-NEXT: csel w5, wzr, w8, vs
-; CHECK-NEXT: fcmp s4, s6
-; CHECK-NEXT: csel w8, w10, w13, lt
-; CHECK-NEXT: fcmp s4, s7
-; CHECK-NEXT: csel w8, w12, w8, gt
-; CHECK-NEXT: fcmp s4, s4
-; CHECK-NEXT: fcvtzs w14, s0
-; CHECK-NEXT: csel w8, wzr, w8, vs
-; CHECK-NEXT: fcmp s0, s6
-; CHECK-NEXT: csel w13, w10, w14, lt
 ; CHECK-NEXT: fcmp s0, s7
-; CHECK-NEXT: csel w13, w12, w13, gt
+; CHECK-NEXT: csel w8, w12, w8, gt
 ; CHECK-NEXT: fcmp s0, s0
-; CHECK-NEXT: fcvtzs w9, s1
-; CHECK-NEXT: csel w0, wzr, w13, vs
+; CHECK-NEXT: fcvtzs w13, s1
+; CHECK-NEXT: csel w0, wzr, w8, vs
 ; CHECK-NEXT: fcmp s1, s6
-; CHECK-NEXT: csel w9, w10, w9, lt
+; CHECK-NEXT: csel w8, w10, w13, lt
 ; CHECK-NEXT: fcmp s1, s7
-; CHECK-NEXT: csel w9, w12, w9, gt
+; CHECK-NEXT: csel w8, w12, w8, gt
 ; CHECK-NEXT: fcmp s1, s1
-; CHECK-NEXT: fcvtzs w11, s2
-; CHECK-NEXT: csel w1, wzr, w9, vs
+; CHECK-NEXT: fcvtzs w14, s2
+; CHECK-NEXT: csel w1, wzr, w8, vs
 ; CHECK-NEXT: fcmp s2, s6
-; CHECK-NEXT: csel w9, w10, w11, lt
+; CHECK-NEXT: csel w8, w10, w14, lt
 ; CHECK-NEXT: fcmp s2, s7
-; CHECK-NEXT: csel w9, w12, w9, gt
+; CHECK-NEXT: csel w8, w12, w8, gt
 ; CHECK-NEXT: fcmp s2, s2
-; CHECK-NEXT: fmov s4, w8
-; CHECK-NEXT: fcvtzs w8, s3
-; CHECK-NEXT: csel w2, wzr, w9, vs
+; CHECK-NEXT: fcvtzs w15, s3
+; CHECK-NEXT: csel w2, wzr, w8, vs
 ; CHECK-NEXT: fcmp s3, s6
-; CHECK-NEXT: csel w8, w10, w8, lt
+; CHECK-NEXT: csel w8, w10, w15, lt
 ; CHECK-NEXT: fcmp s3, s7
-; CHECK-NEXT: mov v4.s[1], w5
 ; CHECK-NEXT: csel w8, w12, w8, gt
 ; CHECK-NEXT: fcmp s3, s3
+; CHECK-NEXT: fcvtzs w9, s4
 ; CHECK-NEXT: csel w3, wzr, w8, vs
-; CHECK-NEXT: fmov w4, s4
+; CHECK-NEXT: fcmp s4, s6
+; CHECK-NEXT: csel w8, w10, w9, lt
+; CHECK-NEXT: fcmp s4, s7
+; CHECK-NEXT: csel w8, w12, w8, gt
+; CHECK-NEXT: fcmp s4, s4
+; CHECK-NEXT: fcvtzs w11, s5
+; CHECK-NEXT: csel w4, wzr, w8, vs
+; CHECK-NEXT: fcmp s5, s6
+; CHECK-NEXT: csel w8, w10, w11, lt
+; CHECK-NEXT: fcmp s5, s7
+; CHECK-NEXT: csel w8, w12, w8, gt
+; CHECK-NEXT: fcmp s5, s5
+; CHECK-NEXT: csel w5, wzr, w8, vs
 ; CHECK-NEXT: ret
 %x = call <6 x i32> @llvm.fptosi.sat.v6f32.v6i32(<6 x float> %f)
 ret <6 x i32> %x
@@ -292,63 +289,58 @@
 ; CHECK-NEXT: mov w9, #-822083584
 ; CHECK-NEXT: mov w11, #1325400063
 ; CHECK-NEXT: fmov s7, w9
-; CHECK-NEXT: fcvtzs w8, s5
+; CHECK-NEXT: fcvtzs w8, s0
 ; CHECK-NEXT: mov w10, #-2147483648
-; CHECK-NEXT: fcmp s5, s7
 ; CHECK-NEXT: fmov s16, w11
+; CHECK-NEXT: fcmp s0, s7
 ; CHECK-NEXT: mov w12, #2147483647
 ; CHECK-NEXT: csel w8, w10, w8, lt
-; CHECK-NEXT: fcmp s5, s16
-; CHECK-NEXT: csel w8, w12, w8, gt
-; CHECK-NEXT: fcmp s5, s5
-; CHECK-NEXT: fcvtzs w13, s4
-; CHECK-NEXT: csel w8, wzr, w8, vs
-; CHECK-NEXT: fcmp s4, s7
-; CHECK-NEXT: csel w11, w10, w13, lt
-; CHECK-NEXT: fcmp s4, s16
-; CHECK-NEXT: csel w11, w12, w11, gt
-; CHECK-NEXT: fcmp s4, s4
-; CHECK-NEXT: fcvtzs w14, s6
-; CHECK-NEXT: csel w11, wzr, w11, vs
-; CHECK-NEXT: fcmp s6, s7
-; CHECK-NEXT: csel w14, w10, w14, lt
-; CHECK-NEXT: fcmp s6, s16
-; CHECK-NEXT: csel w14, w12, w14, gt
-; CHECK-NEXT: fcmp s6, s6
-; CHECK-NEXT: fcvtzs w9, s0
-; CHECK-NEXT: csel w6, wzr, w14, vs
-; CHECK-NEXT: fcmp s0, s7
-; CHECK-NEXT: csel w9, w10, w9, lt
 ; CHECK-NEXT: fcmp s0, s16
-; CHECK-NEXT: csel w9, w12, w9, gt
+; CHECK-NEXT: csel w8, w12, w8, gt
 ; CHECK-NEXT: fcmp s0, s0
 ; CHECK-NEXT: fcvtzs w13, s1
-; CHECK-NEXT: csel w0, wzr, w9, vs
+; CHECK-NEXT: csel w0, wzr, w8, vs
 ; CHECK-NEXT: fcmp s1, s7
-; CHECK-NEXT: csel w9, w10, w13, lt
+; CHECK-NEXT: csel w8, w10, w13, lt
 ; CHECK-NEXT: fcmp s1, s16
-; CHECK-NEXT: csel w9, w12, w9, gt
+; CHECK-NEXT: csel w8, w12, w8, gt
 ; CHECK-NEXT: fcmp s1, s1
-; CHECK-NEXT: fmov s4, w11
-; CHECK-NEXT: fcvtzs w11, s2
-; CHECK-NEXT: csel w1, wzr, w9, vs
+; CHECK-NEXT: fcvtzs w14, s2
+; CHECK-NEXT: csel w1, wzr, w8, vs
 ; CHECK-NEXT: fcmp s2, s7
-; CHECK-NEXT: csel w9, w10, w11, lt
+; CHECK-NEXT: csel w8, w10, w14, lt
 ; CHECK-NEXT: fcmp s2, s16
-; CHECK-NEXT: csel w9, w12, w9, gt
+; CHECK-NEXT: csel w8, w12, w8, gt
 ; CHECK-NEXT: fcmp s2, s2
-; CHECK-NEXT: mov v4.s[1], w8
-; CHECK-NEXT: fcvtzs w8, s3
-; CHECK-NEXT: csel w2, wzr, w9, vs
+; CHECK-NEXT: fcvtzs w15, s3
+; CHECK-NEXT: csel w2, wzr, w8, vs
 ; CHECK-NEXT: fcmp s3, s7
-; CHECK-NEXT: csel w8, w10, w8, lt
+; CHECK-NEXT: csel w8, w10, w15, lt
 ; CHECK-NEXT: fcmp s3, s16
-; CHECK-NEXT: mov v4.s[2], w6
 ; CHECK-NEXT: csel w8, w12, w8, gt
 ; CHECK-NEXT: fcmp s3, s3
+; CHECK-NEXT: fcvtzs w16, s4
 ; CHECK-NEXT: csel w3, wzr, w8, vs
-; CHECK-NEXT: mov w5, v4.s[1]
-; CHECK-NEXT: fmov w4, s4
+; CHECK-NEXT: fcmp s4, s7
+; CHECK-NEXT: csel w8, w10, w16, lt
+; CHECK-NEXT: fcmp s4, s16
+; CHECK-NEXT: csel w8, w12, w8, gt
+; CHECK-NEXT: fcmp s4, s4
+; CHECK-NEXT: fcvtzs w9, s5
+; CHECK-NEXT: csel w4, wzr, w8, vs
+; CHECK-NEXT: fcmp s5, s7
+; CHECK-NEXT: csel w8, w10, w9, lt
+; CHECK-NEXT: fcmp s5, s16
+; CHECK-NEXT: csel w8, w12, w8, gt
+; CHECK-NEXT: fcmp s5, s5
+; CHECK-NEXT: fcvtzs w11, s6
+; CHECK-NEXT: csel w5, wzr, w8, vs
+; CHECK-NEXT: fcmp s6, s7
+; CHECK-NEXT: csel w8, w10, w11, lt
+; CHECK-NEXT: fcmp s6, s16
+; CHECK-NEXT: csel w8, w12, w8, gt
+; CHECK-NEXT: fcmp s6, s6
+; CHECK-NEXT: csel w6, wzr, w8, vs
 ; CHECK-NEXT: ret
 %x = call <7 x i32> @llvm.fptosi.sat.v7f32.v7i32(<7 x float> %f)
 ret <7 x i32> %x
@@ -1208,68 +1200,65 @@
 define <6 x i32> @test_signed_v6f16_v6i32(<6 x half> %f) {
 ; CHECK-LABEL: test_signed_v6f16_v6i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT: mov w8, #-822083584
-; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: fcvt s1, h0
 ; CHECK-NEXT: mov w10, #1325400063
-; CHECK-NEXT: fmov s3, w8
-; CHECK-NEXT: fcvt s2, h2
+; CHECK-NEXT: fmov s2, w8
 ; CHECK-NEXT: mov w9, #-2147483648
-; CHECK-NEXT: fmov s4, w10
-; CHECK-NEXT: fcvtzs w8, s2
-; CHECK-NEXT: fcmp s2, s3
+; CHECK-NEXT: fcvtzs w12, s1
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: fmov s3, w10
 ; CHECK-NEXT: mov w11, #2147483647
-; CHECK-NEXT: csel w8, w9, w8, lt
-; CHECK-NEXT: fcmp s2, s4
+; CHECK-NEXT: csel w8, w9, w12, lt
+; CHECK-NEXT: fcmp s1, s3
 ; CHECK-NEXT: csel w8, w11, w8, gt
-; CHECK-NEXT: fcmp s2, s2
+; CHECK-NEXT: fcmp s1, s1
+; CHECK-NEXT: mov h1, v0.h[1]
 ; CHECK-NEXT: fcvt s1, h1
 ; CHECK-NEXT: fcvtzs w10, s1
-; CHECK-NEXT: csel w5, wzr, w8, vs
+; CHECK-NEXT: csel w0, wzr, w8, vs
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: csel w8, w9, w10, lt
 ; CHECK-NEXT: fcmp s1, s3
+; CHECK-NEXT: csel w8, w11, w8, gt
+; CHECK-NEXT: fcmp s1, s1
+; CHECK-NEXT: mov h1, v0.h[2]
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvtzs w10, s1
+; CHECK-NEXT: csel w1, wzr, w8, vs
+; CHECK-NEXT: fcmp s1, s2
 ; CHECK-NEXT: csel w8, w9, w10, lt
-; CHECK-NEXT: fcmp s1, s4
-; CHECK-NEXT: fcvt s2, h0
+; CHECK-NEXT: fcmp s1, s3
 ; CHECK-NEXT: csel w8, w11, w8, gt
 ; CHECK-NEXT: fcmp s1, s1
-; CHECK-NEXT: fcvtzs w10, s2
-; CHECK-NEXT: csel w8, wzr, w8, vs
-; CHECK-NEXT: fcmp s2, s3
-; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: csel w10, w9, w10, lt
-; CHECK-NEXT: fcmp s2, s4
-; CHECK-NEXT: csel w10, w11, w10, gt
-; CHECK-NEXT: fcmp s2, s2
+; CHECK-NEXT: mov h1, v0.h[3]
 ; CHECK-NEXT: fcvt s1, h1
-; CHECK-NEXT: fcvtzs w12, s1
-; CHECK-NEXT: csel w0, wzr, w10, vs
+; CHECK-NEXT: fcvtzs w10, s1
+; CHECK-NEXT: csel w2, wzr, w8, vs
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: csel w8, w9, w10, lt
 ; CHECK-NEXT: fcmp s1, s3
-; CHECK-NEXT: mov h2, v0.h[2]
-; CHECK-NEXT: csel w10, w9, w12, lt
-; CHECK-NEXT: fcmp s1, s4
-; CHECK-NEXT: fcvt s2, h2
-; CHECK-NEXT: csel w10, w11, w10, gt
+; CHECK-NEXT: csel w8, w11, w8, gt
 ; CHECK-NEXT: fcmp s1, s1
-; CHECK-NEXT: fcvtzs w13, s2
-; CHECK-NEXT: csel w1, wzr, w10, vs
-; CHECK-NEXT: fcmp s2, s3
-; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: csel w10, w9, w13, lt
-; CHECK-NEXT: fcmp s2, s4
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: fcvtzs w10, s1
+; CHECK-NEXT: csel w3, wzr, w8, vs
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: mov h0, v0.h[1]
+; CHECK-NEXT: csel w8, w9, w10, lt
+; CHECK-NEXT: fcmp s1, s3
 ; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: csel w10, w11, w10, gt
-; CHECK-NEXT: fcmp s2, s2
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: fcvtzs w8, s0
-; CHECK-NEXT: csel w2, wzr, w10, vs
+; CHECK-NEXT: csel w8, w11, w8, gt
+; CHECK-NEXT: fcmp s1, s1
+; CHECK-NEXT: fcvtzs w12, s0
+; CHECK-NEXT: csel w4, wzr, w8, vs
+; CHECK-NEXT: fcmp s0, s2
+; CHECK-NEXT: csel w8, w9, w12, lt
 ; CHECK-NEXT: fcmp s0, s3
-; CHECK-NEXT: csel w8, w9, w8, lt
-; CHECK-NEXT: fcmp s0, s4
-; CHECK-NEXT: mov v1.s[1], w5
 ; CHECK-NEXT: csel w8, w11, w8, gt
 ; CHECK-NEXT: fcmp s0, s0
-; CHECK-NEXT: csel w3, wzr, w8, vs
-; CHECK-NEXT: fmov w4, s1
+; CHECK-NEXT: csel w5, wzr, w8, vs
 ; CHECK-NEXT: ret
 %x = call <6 x i32> @llvm.fptosi.sat.v6f16.v6i32(<6 x half> %f)
 ret <6 x i32> %x
@@ -1278,79 +1267,74 @@
 define <7 x i32> @test_signed_v7f16_v7i32(<7 x half> %f) {
 ; CHECK-LABEL: test_signed_v7f16_v7i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: mov w10, #-822083584
-; CHECK-NEXT: mov h4, v3.h[1]
-; CHECK-NEXT: mov w11, #1325400063
-; CHECK-NEXT: fmov s2, w10
-; CHECK-NEXT: fcvt s4, h4
-; CHECK-NEXT: mov w8, #-2147483648
-; CHECK-NEXT: fmov s1, w11
-; CHECK-NEXT: fcvtzs w10, s4
-; CHECK-NEXT: fcmp s4, s2
-; CHECK-NEXT: mov w9, #2147483647
-; CHECK-NEXT: csel w10, w8, w10, lt
-; CHECK-NEXT: fcmp s4, s1
-; CHECK-NEXT: csel w10, w9, w10, gt
-; CHECK-NEXT: fcmp s4, s4
-; CHECK-NEXT: fcvt s4, h3
-; CHECK-NEXT: fcvtzs w11, s4
-; CHECK-NEXT: csel w10, wzr, w10, vs
-; CHECK-NEXT: fcmp s4, s2
-; CHECK-NEXT: csel w11, w8, w11, lt
-; CHECK-NEXT: fcmp s4, s1
-; CHECK-NEXT: mov h3, v3.h[2]
-; CHECK-NEXT: csel w11, w9, w11, gt
-; CHECK-NEXT: fcmp s4, s4
-; CHECK-NEXT: fcvt s3, h3
-; CHECK-NEXT: fcvtzs w12, s3
-; CHECK-NEXT: csel w11, wzr, w11, vs
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: csel w12, w8, w12, lt
-; CHECK-NEXT: fcmp s3, s1
-; CHECK-NEXT: fcvt s4, h0
-; CHECK-NEXT: csel w12, w9, w12, gt
-; CHECK-NEXT: fcmp s3, s3
-; CHECK-NEXT: fcvtzs w13, s4
-; CHECK-NEXT: csel w6, wzr, w12, vs
-; CHECK-NEXT: fcmp s4, s2
-; CHECK-NEXT: mov h3, v0.h[1]
-; CHECK-NEXT: csel w12, w8, w13, lt
-; CHECK-NEXT: fcmp s4, s1
-; CHECK-NEXT: csel w12, w9, w12, gt
-; CHECK-NEXT: fcmp s4, s4
-; CHECK-NEXT: fcvt s3, h3
-; CHECK-NEXT: fcvtzs w13, s3
-; CHECK-NEXT: csel w0, wzr, w12, vs
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: mov h4, v0.h[2]
-; CHECK-NEXT: csel w12, w8, w13, lt
-; CHECK-NEXT: fcmp s3, s1
-; CHECK-NEXT: fcvt s4, h4
-; CHECK-NEXT: csel w12, w9, w12, gt
-; CHECK-NEXT: fcmp s3, s3
-; CHECK-NEXT: fmov s3, w11
-; CHECK-NEXT: fcvtzs w11, s4
-; CHECK-NEXT: csel w1, wzr, w12, vs
-; CHECK-NEXT: fcmp s4, s2
-; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: csel w11, w8, w11, lt
-; CHECK-NEXT: fcmp s4, s1
+; CHECK-NEXT: mov w8, #-822083584
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: mov w10, #1325400063
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: mov w9, #-2147483648
+; CHECK-NEXT: fcvtzs w12, s1
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: fmov s3, w10
+; CHECK-NEXT: mov w11, #2147483647
+; CHECK-NEXT: csel w8, w9, w12, lt
+; CHECK-NEXT: fcmp s1, s3
+; CHECK-NEXT: csel w8, w11, w8, gt
+; CHECK-NEXT: fcmp s1, s1
+; CHECK-NEXT: mov h1, v0.h[1]
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvtzs w10, s1
+; CHECK-NEXT: csel w0, wzr, w8, vs
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: csel w8, w9, w10, lt
+; CHECK-NEXT: fcmp s1, s3
+; CHECK-NEXT: csel w8, w11, w8, gt
+; CHECK-NEXT: fcmp s1, s1
+; CHECK-NEXT: mov h1, v0.h[2]
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvtzs w10, s1
+; CHECK-NEXT: csel w1, wzr, w8, vs
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: csel w8, w9, w10, lt
+; CHECK-NEXT: fcmp s1, s3
+; CHECK-NEXT: csel w8, w11, w8, gt
+; CHECK-NEXT: fcmp s1, s1
+; CHECK-NEXT: mov h1, v0.h[3]
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvtzs w10, s1
+; CHECK-NEXT: csel w2, wzr, w8, vs
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: csel w8, w9, w10, lt
+; CHECK-NEXT: fcmp s1, s3
+; CHECK-NEXT: csel w8, w11, w8, gt
+; CHECK-NEXT: fcmp s1, s1
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: fcvtzs w10, s1
+; CHECK-NEXT: csel w3, wzr, w8, vs
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: csel w8, w9, w10, lt
+; CHECK-NEXT: fcmp s1, s3
+; CHECK-NEXT: csel w8, w11, w8, gt
+; CHECK-NEXT: fcmp s1, s1
+; CHECK-NEXT: mov h1, v0.h[1]
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvtzs w10, s1
+; CHECK-NEXT: csel w4, wzr, w8, vs
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: mov h0, v0.h[2]
+; CHECK-NEXT: csel w8, w9, w10, lt
+; CHECK-NEXT: fcmp s1, s3
 ; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: csel w11, w9, w11, gt
-; CHECK-NEXT: fcmp s4, s4
-; CHECK-NEXT: mov v3.s[1], w10
-; CHECK-NEXT: fcvtzs w10, s0
-; CHECK-NEXT: csel w2, wzr, w11, vs
+; CHECK-NEXT: csel w8, w11, w8, gt
+; CHECK-NEXT: fcmp s1, s1
+; CHECK-NEXT: fcvtzs w12, s0
+; CHECK-NEXT: csel w5, wzr, w8, vs
 ; CHECK-NEXT: fcmp s0, s2
-; CHECK-NEXT: csel w8, w8, w10, lt
-; CHECK-NEXT: fcmp s0, s1
-; CHECK-NEXT: mov v3.s[2], w6
-; CHECK-NEXT: csel w8, w9, w8, gt
+; CHECK-NEXT: csel w8, w9, w12, lt
+; CHECK-NEXT: fcmp s0, s3
+; CHECK-NEXT: csel w8, w11, w8, gt
 ; CHECK-NEXT: fcmp s0, s0
-; CHECK-NEXT: csel w3, wzr, w8, vs
-; CHECK-NEXT: mov w5, v3.s[1]
-; CHECK-NEXT: fmov w4, s3
+; CHECK-NEXT: csel w6, wzr, w8, vs
 ; CHECK-NEXT: ret
 %x = call <7 x i32> @llvm.fptosi.sat.v7f16.v7i32(<7 x half> %f)
 ret <7 x i32> %x
@@ -1715,48 +1699,44 @@
 ; CHECK-NEXT: .cfi_offset b9, -56
 ; CHECK-NEXT: .cfi_offset b10, -64
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov s8, v0.s[1]
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: mov v0.16b, v8.16b
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT: bl __fixsfti
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: mov w8, #-251658240
 ; CHECK-NEXT: mov w9, #1895825407
 ; CHECK-NEXT: fmov s9, w8
 ; CHECK-NEXT: mov x21, #-34359738368
 ; CHECK-NEXT: fmov s10, w9
-; CHECK-NEXT: fcmp s8, s9
+; CHECK-NEXT: fcmp s0, s9
 ; CHECK-NEXT: mov x22, #34359738367
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, x21, x1, lt
-; CHECK-NEXT: fcmp s8, s10
+; CHECK-NEXT: fcmp s0, s10
+; CHECK-NEXT: mov s8, v0.s[1]
 ; CHECK-NEXT: csel x9, x22, x9, gt
 ; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: fcmp s8, s8
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: fcmp s0, s0
+; CHECK-NEXT: mov v0.16b, v8.16b
 ; CHECK-NEXT: csel x19, xzr, x8, vs
 ; CHECK-NEXT: csel x20, xzr, x9, vs
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov x2, x19
-; CHECK-NEXT: mov x3, x20
+; CHECK-NEXT: fcmp s8, s9
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, x21, x1, lt
+; CHECK-NEXT: fcmp s8, s10
+; CHECK-NEXT: csel x9, x22, x9, gt
+; CHECK-NEXT: csinv x8, x8, xzr, le
+; CHECK-NEXT: fcmp s8, s8
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x1, x20
 ; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: fcmp s0, s9
-; CHECK-NEXT: csel x8, x21, x1, lt
-; CHECK-NEXT: csel x9, xzr, x0, lt
-; CHECK-NEXT: fcmp s0, s10
-; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x8, x22, x8, gt
-; CHECK-NEXT: fcmp s0, s0
-; CHECK-NEXT: csel x9, xzr, x9, vs
 ; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: csel x1, xzr, x8, vs
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csel x2, xzr, x8, vs
+; CHECK-NEXT: csel x3, xzr, x9, vs
 ; CHECK-NEXT: add sp, sp, #80 // =80
 ; CHECK-NEXT: ret
 %x = call <2 x i100> @llvm.fptosi.sat.v2f32.v2i100(<2 x float> %f)
@@ -1782,48 +1762,44 @@
 ; CHECK-NEXT: .cfi_offset b9, -56
 ; CHECK-NEXT: .cfi_offset b10, -64
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov s8, v0.s[1]
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: mov v0.16b, v8.16b
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT: bl __fixsfti
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: mov w8, #-16777216
 ; CHECK-NEXT: mov w9, #2130706431
 ; CHECK-NEXT: fmov s9, w8
 ; CHECK-NEXT: mov x21, #-9223372036854775808
 ; CHECK-NEXT: fmov s10, w9
-; CHECK-NEXT: fcmp s8, s9
+; CHECK-NEXT: fcmp s0, s9
 ; CHECK-NEXT: mov x22, #9223372036854775807
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, x21, x1, lt
-; CHECK-NEXT: fcmp s8, s10
+; CHECK-NEXT: fcmp s0, s10
+; CHECK-NEXT: mov s8, v0.s[1]
 ; CHECK-NEXT: csel x9, x22, x9, gt
 ; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: fcmp s8, s8
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: fcmp s0, s0
+; CHECK-NEXT: mov v0.16b, v8.16b
 ; CHECK-NEXT: csel x19, xzr, x8, vs
 ; CHECK-NEXT: csel x20, xzr, x9, vs
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov x2, x19
-; CHECK-NEXT: mov x3, x20
+; CHECK-NEXT: fcmp s8, s9
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, x21, x1, lt
+; CHECK-NEXT: fcmp s8, s10
+; CHECK-NEXT: csel x9, x22, x9, gt
+; CHECK-NEXT: csinv x8, x8, xzr, le
+; CHECK-NEXT: fcmp s8, s8
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x1, x20
 ; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: fcmp s0, s9
-; CHECK-NEXT: csel x8, x21, x1, lt
-; CHECK-NEXT: csel x9, xzr, x0, lt
-; CHECK-NEXT: fcmp s0, s10
-; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x8, x22, x8, gt
-; CHECK-NEXT: fcmp s0, s0
-; CHECK-NEXT: csel x9, xzr, x9, vs
 ; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: csel x1, xzr, x8, vs
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csel x2, xzr, x8, vs
+; CHECK-NEXT: csel x3, xzr, x9, vs
 ; CHECK-NEXT: add sp, sp, #80 // =80
 ; CHECK-NEXT: ret
 %x = call <2 x i128> @llvm.fptosi.sat.v2f32.v2i128(<2 x float> %f)
@@ -2078,48 +2054,44 @@
 ; CHECK-NEXT: .cfi_offset b8, -48
 ; CHECK-NEXT: .cfi_offset b9, -56
 ; CHECK-NEXT: .cfi_offset b10, -64
-; CHECK-NEXT: mov d8, v0.d[1]
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: mov v0.16b, v8.16b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: bl __fixdfti
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: mov x8, #-4170333254945079296
 ; CHECK-NEXT: mov x9, #5053038781909696511
 ; CHECK-NEXT: fmov d9, x8
 ; CHECK-NEXT: mov x21, #-34359738368
 ; CHECK-NEXT: fmov d10, x9
-; CHECK-NEXT: fcmp d8, d9
+; CHECK-NEXT: fcmp d0, d9
 ; CHECK-NEXT: mov x22, #34359738367
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, x21, x1, lt
-; CHECK-NEXT: fcmp d8, d10
+; CHECK-NEXT: fcmp d0, d10
+; CHECK-NEXT: mov d8, v0.d[1]
 ; CHECK-NEXT: csel x9, x22, x9, gt
 ; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: fcmp d8, d8
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: fcmp d0, d0
+; CHECK-NEXT: mov v0.16b, v8.16b
 ; CHECK-NEXT: csel x19, xzr, x8, vs
 ; CHECK-NEXT: csel x20, xzr, x9, vs
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: bl __fixdfti
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov x2, x19
-; CHECK-NEXT: mov x3, x20
+; CHECK-NEXT: fcmp d8, d9
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, x21, x1, lt
+; CHECK-NEXT: fcmp d8, d10
+; CHECK-NEXT: csel x9, x22, x9, gt
+; CHECK-NEXT: csinv x8, x8, xzr, le
+; CHECK-NEXT: fcmp d8, d8
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x1, x20
 ; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: fcmp d0, d9
-; CHECK-NEXT: csel x8, x21, x1, lt
-; CHECK-NEXT: csel x9, xzr, x0, lt
-; CHECK-NEXT: fcmp d0, d10
-; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x8, x22, x8, gt
-; CHECK-NEXT: fcmp d0, d0
-; CHECK-NEXT: csel x9, xzr, x9, vs
 ; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: csel x1, xzr, x8, vs
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csel x2, xzr, x8, vs
+; CHECK-NEXT: csel x3, xzr, x9, vs
 ; CHECK-NEXT: add sp, sp, #80 // =80
 ; CHECK-NEXT: ret
 %x = call <2 x i100> @llvm.fptosi.sat.v2f64.v2i100(<2 x double> %f)
@@ -2144,48 +2116,44 @@
 ; CHECK-NEXT: .cfi_offset b8, -48
 ; CHECK-NEXT: .cfi_offset b9, -56
 ; CHECK-NEXT: .cfi_offset b10, -64
-; CHECK-NEXT: mov d8, v0.d[1]
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: mov v0.16b, v8.16b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: bl __fixdfti
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: mov x8, #-4044232465378705408
 ; CHECK-NEXT: mov x9, #5179139571476070399
 ; CHECK-NEXT: fmov d9, x8
 ; CHECK-NEXT: mov x21, #-9223372036854775808
 ; CHECK-NEXT: fmov d10, x9
-; CHECK-NEXT: fcmp d8, d9
+; CHECK-NEXT: fcmp d0, d9
 ; CHECK-NEXT: mov x22, #9223372036854775807
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, x21, x1, lt
-; CHECK-NEXT: fcmp d8, d10
+; CHECK-NEXT: fcmp d0, d10
+; CHECK-NEXT: mov d8, v0.d[1]
 ; CHECK-NEXT: csel x9, x22, x9, gt
 ; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: fcmp d8, d8
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: fcmp d0, d0
+; CHECK-NEXT: mov v0.16b, v8.16b
 ; CHECK-NEXT: csel x19, xzr, x8, vs
 ; CHECK-NEXT: csel x20, xzr, x9, vs
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: bl __fixdfti
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov x2, x19
-; CHECK-NEXT: mov x3, x20
+; CHECK-NEXT: fcmp d8, d9
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, x21, x1, lt
+; CHECK-NEXT: fcmp d8, d10
+; CHECK-NEXT: csel x9, x22, x9, gt
+; CHECK-NEXT: csinv x8, x8, xzr, le
+; CHECK-NEXT: fcmp d8, d8
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x1, x20
 ; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: fcmp d0, d9
-; CHECK-NEXT: csel x8, x21, x1, lt
-; CHECK-NEXT: csel x9, xzr, x0, lt
-; CHECK-NEXT: fcmp d0, d10
-; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x8, x22, x8, gt
-; CHECK-NEXT: fcmp d0, d0
-; CHECK-NEXT: csel x9, xzr, x9, vs
 ; CHECK-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: csel x1, xzr, x8, vs
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csel x2, xzr, x8, vs
+; CHECK-NEXT: csel x3, xzr, x9, vs
 ; CHECK-NEXT: add sp, sp, #80 // =80
 ; CHECK-NEXT: ret
 %x = call <2 x i128> @llvm.fptosi.sat.v2f64.v2i128(<2 x double> %f)
@@ -2614,8 +2582,7 @@
 ; CHECK-NEXT: .cfi_offset b9, -88
 ; CHECK-NEXT: .cfi_offset b10, -96
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: fcvt s8, h1
+; CHECK-NEXT: fcvt s8, h0
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: mov v0.16b, v8.16b
 ; CHECK-NEXT: bl __fixsfti
@@ -2630,7 +2597,7 @@
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, x25, x1, lt
 ; CHECK-NEXT: fcmp s8, s10
-; CHECK-NEXT: mov h0, v0.h[2]
+; CHECK-NEXT: mov h0, v0.h[1]
 ; CHECK-NEXT: csel x9, x26, x9, gt
 ; CHECK-NEXT: csinv x8, x8, xzr, le
 ; CHECK-NEXT: fcmp s8, s8
@@ -2643,7 +2610,7 @@
 ; CHECK-NEXT: fcmp s8, s9
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, x25, x1, lt
-; CHECK-NEXT: mov h0, v0.h[3]
+; CHECK-NEXT: mov h0, v0.h[2]
 ; CHECK-NEXT: fcmp s8, s10
 ; CHECK-NEXT: csel x9, x26, x9, gt
 ; CHECK-NEXT: csinv x8, x8, xzr, le
@@ -2657,6 +2624,7 @@
 ; CHECK-NEXT: fcmp s8, s9
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, x25, x1, lt
+; CHECK-NEXT: mov h0, v0.h[3]
 ; CHECK-NEXT: fcmp s8, s10
 ; CHECK-NEXT: csel x9, x26, x9, gt
 ; CHECK-NEXT: csinv x8, x8, xzr, le
@@ -2667,19 +2635,18 @@
 ; CHECK-NEXT: csel x24, xzr, x9, vs
 ; CHECK-NEXT: bl __fixsfti
 ; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: csel x8, x25, x1, lt
-; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, x25, x1, lt
 ; CHECK-NEXT: fcmp s8, s10
-; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x8, x26, x8, gt
+; CHECK-NEXT: csel x9, x26, x9, gt
+; CHECK-NEXT: csinv x8, x8, xzr, le
 ; CHECK-NEXT: fcmp s8, s8
-; CHECK-NEXT: csel x9, xzr, x9, vs
-; CHECK-NEXT: mov x2, x19
-; CHECK-NEXT: mov x3, x20
-; CHECK-NEXT: mov x4, x21
-; CHECK-NEXT: mov x5, x22
-; CHECK-NEXT: mov x6, x23
-; CHECK-NEXT: mov x7, x24
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x1, x20
+; CHECK-NEXT: mov x2, x21
+; CHECK-NEXT: mov x3, x22
+; CHECK-NEXT: mov x4, x23
+; CHECK-NEXT: mov x5, x24
 ; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload
@@ -2687,10 +2654,8 @@
 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: csel x1, xzr, x8, vs
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csel x6, xzr, x8, vs
+; CHECK-NEXT: csel x7, xzr, x9, vs
 ; CHECK-NEXT: add sp, sp, #112 // =112
 ; CHECK-NEXT: ret
 %x = call <4 x i100> @llvm.fptosi.sat.v4f16.v4i100(<4 x half> %f)
@@ -2722,8 +2687,7 @@
 ; CHECK-NEXT: .cfi_offset b9, -88
 ; CHECK-NEXT: .cfi_offset b10, -96
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: fcvt s8, h1
+; CHECK-NEXT: fcvt s8, h0
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: mov v0.16b, v8.16b
 ; CHECK-NEXT: bl __fixsfti
@@ -2738,7 +2702,7 @@
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, x25, x1, lt
 ; CHECK-NEXT: fcmp s8, s10
-; CHECK-NEXT: mov h0, v0.h[2]
+; CHECK-NEXT: mov h0, v0.h[1]
 ; CHECK-NEXT: csel x9, x26, x9, gt
 ; CHECK-NEXT: csinv x8, x8, xzr, le
 ; CHECK-NEXT: fcmp s8, s8
@@ -2751,7 +2715,7 @@
 ; CHECK-NEXT: fcmp s8, s9
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, x25, x1, lt
-; CHECK-NEXT: mov h0, v0.h[3]
+; CHECK-NEXT: mov h0, v0.h[2]
 ; CHECK-NEXT: fcmp s8, s10
 ; CHECK-NEXT: csel x9, x26, x9, gt
 ; CHECK-NEXT: csinv x8, x8, xzr, le
@@ -2765,6 +2729,7 @@
 ; CHECK-NEXT: fcmp s8, s9
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, x25, x1, lt
+; CHECK-NEXT: mov h0, v0.h[3]
 ; CHECK-NEXT: fcmp s8, s10
 ; CHECK-NEXT: csel x9, x26, x9, gt
 ; CHECK-NEXT: csinv x8, x8, xzr, le
@@ -2775,19 +2740,18 @@
 ; CHECK-NEXT: csel x24, xzr, x9, vs
 ; CHECK-NEXT: bl __fixsfti
 ; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: csel x8, x25, x1, lt
-; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: csel x8, xzr, x0, lt
+; CHECK-NEXT: csel x9, x25, x1, lt
 ; CHECK-NEXT: fcmp s8, s10
-; CHECK-NEXT: csinv x9, x9, xzr, le
-; CHECK-NEXT: csel x8, x26, x8, gt
+; CHECK-NEXT: csel x9, x26, x9, gt
+; CHECK-NEXT: csinv x8, x8, xzr, le
 ; CHECK-NEXT: fcmp s8, s8
-; CHECK-NEXT: csel x9, xzr, x9, vs
-; CHECK-NEXT: mov x2, x19
-; CHECK-NEXT: mov x3, x20
-; CHECK-NEXT: mov x4, x21
-; CHECK-NEXT: mov x5, x22
-; CHECK-NEXT: mov x6, x23
-; CHECK-NEXT: mov x7, x24
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x1, x20
+; CHECK-NEXT: mov x2, x21
+; CHECK-NEXT: mov x3, x22
+; CHECK-NEXT: mov x4, x23
+; CHECK-NEXT: mov x5, x24
 ; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x22, x21, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x24, x23, [sp, #64] // 16-byte Folded Reload
@@ -2795,10 +2759,8 @@
 ; CHECK-NEXT: ldr x30, [sp, #40] // 8-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #24] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr d10, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: csel x1, xzr, x8, vs
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csel x6, xzr, x8, vs
+; CHECK-NEXT: csel x7, xzr, x9, vs
 ; CHECK-NEXT: add sp, sp, #112 // =112
 ; CHECK-NEXT: ret
 %x = call <4 x i128> @llvm.fptosi.sat.v4f16.v4i128(<4 x half> %f)
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -177,40 +177,37 @@
 ; CHECK-LABEL: test_unsigned_v6f32_v6i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w9, #1333788671
-; CHECK-NEXT: fcvtzu w8, s5
-; CHECK-NEXT: fcmp s5, #0.0
+; CHECK-NEXT: fcvtzu w8, s0
+; CHECK-NEXT: fcmp s0, #0.0
 ; CHECK-NEXT: fmov s6, w9
 ; CHECK-NEXT: csel w8, wzr, w8, lt
-; CHECK-NEXT: fcmp s5, s6
-; CHECK-NEXT: fcvtzu w10, s4
-; CHECK-NEXT: csinv w5, w8, wzr, le
-; CHECK-NEXT: fcmp s4, #0.0
-; CHECK-NEXT: csel w8, wzr, w10, lt
-; CHECK-NEXT: fcmp s4, s6
-; CHECK-NEXT: fcvtzu w11, s0
-; CHECK-NEXT: csinv w8, w8, wzr, le
-; CHECK-NEXT: fcmp s0, #0.0
-; CHECK-NEXT: fmov s4, w8
-; CHECK-NEXT: csel w8, wzr, w11, lt
 ; CHECK-NEXT: fcmp s0, s6
-; CHECK-NEXT: fcvtzu w12, s1
+; CHECK-NEXT: fcvtzu w10, s1
 ; CHECK-NEXT: csinv w0, w8, wzr, le
 ; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: csel w8, wzr, w12, lt
+; CHECK-NEXT: csel w8, wzr, w10, lt
 ; CHECK-NEXT: fcmp s1, s6
-; CHECK-NEXT: fcvtzu w13, s2
+; CHECK-NEXT: fcvtzu w11, s2
 ; CHECK-NEXT: csinv w1, w8, wzr, le
 ; CHECK-NEXT: fcmp s2, #0.0
-; CHECK-NEXT: csel w8, wzr, w13, lt
+; CHECK-NEXT: csel w8, wzr, w11, lt
 ; CHECK-NEXT: fcmp s2, s6
-; CHECK-NEXT: fcvtzu w9, s3
+; CHECK-NEXT: fcvtzu w12, s3
 ; CHECK-NEXT: csinv w2, w8, wzr, le
 ; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: mov v4.s[1], w5
-; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: csel w8, wzr, w12, lt
 ; CHECK-NEXT: fcmp s3, s6
+; CHECK-NEXT: fcvtzu w13, s4
 ; CHECK-NEXT: csinv w3, w8, wzr, le
-; CHECK-NEXT: fmov w4, s4
+; CHECK-NEXT: fcmp s4, #0.0
+; CHECK-NEXT: csel w8, wzr, w13, lt
+; CHECK-NEXT: fcmp s4, s6
+; CHECK-NEXT: fcvtzu w9, s5
+; CHECK-NEXT: csinv w4, w8, wzr, le
+; CHECK-NEXT: fcmp s5, #0.0
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s5, s6
+; CHECK-NEXT: csinv w5, w8, wzr, le
 ; CHECK-NEXT: ret
 %x = call <6 x i32> @llvm.fptoui.sat.v6f32.v6i32(<6 x float> %f)
 ret <6 x i32> %x
@@ -220,47 +217,42 @@
 ; CHECK-LABEL: test_unsigned_v7f32_v7i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w9, #1333788671
-; CHECK-NEXT: fcvtzu w8, s5
-; CHECK-NEXT: fcmp s5, #0.0
+; CHECK-NEXT: fcvtzu w8, s0
+; CHECK-NEXT: fcmp s0, #0.0
 ; CHECK-NEXT: fmov s7, w9
 ; CHECK-NEXT: csel w8, wzr, w8, lt
-; CHECK-NEXT: fcmp s5, s7
-; CHECK-NEXT: fcvtzu w10, s4
-; CHECK-NEXT: csinv w8, w8, wzr, le
-; CHECK-NEXT: fcmp s4, #0.0
-; CHECK-NEXT: csel w10, wzr, w10, lt
-; CHECK-NEXT: fcmp s4, s7
-; CHECK-NEXT: fcvtzu w11, s6
-; CHECK-NEXT: csinv w10, w10, wzr, le
-; CHECK-NEXT: fcmp s6, #0.0
-; CHECK-NEXT: fmov s4, w10
-; CHECK-NEXT: csel w10, wzr, w11, lt
-; CHECK-NEXT: fcmp s6, s7
-; CHECK-NEXT: fcvtzu w12, s0
-; CHECK-NEXT: csinv w6, w10, wzr, le
-; CHECK-NEXT: fcmp s0, #0.0
-; CHECK-NEXT: mov v4.s[1], w8
-; CHECK-NEXT: csel w8, wzr, w12, lt
 ; CHECK-NEXT: fcmp s0, s7
-; CHECK-NEXT: fcvtzu w13, s1
+; CHECK-NEXT: fcvtzu w10, s1
 ; CHECK-NEXT: csinv w0, w8, wzr, le
 ; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: csel w8, wzr, w13, lt
+; CHECK-NEXT: csel w8, wzr, w10, lt
 ; CHECK-NEXT: fcmp s1, s7
-; CHECK-NEXT: fcvtzu w14, s2
+; CHECK-NEXT: fcvtzu w11, s2
 ; CHECK-NEXT: csinv w1, w8, wzr, le
 ; CHECK-NEXT: fcmp s2, #0.0
-; CHECK-NEXT: csel w8, wzr, w14, lt
+; CHECK-NEXT: csel w8, wzr, w11, lt
 ; CHECK-NEXT: fcmp s2, s7
-; CHECK-NEXT: fcvtzu w9, s3
+; CHECK-NEXT: fcvtzu w12, s3
 ; CHECK-NEXT: csinv w2, w8, wzr, le
 ; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: mov v4.s[2], w6
-; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: csel w8, wzr, w12, lt
 ; CHECK-NEXT: fcmp s3, s7
+; CHECK-NEXT: fcvtzu w13, s4
 ; CHECK-NEXT: csinv w3, w8, wzr, le
-; CHECK-NEXT: mov w5, v4.s[1]
-; CHECK-NEXT: fmov w4, s4
+; CHECK-NEXT: fcmp s4, #0.0
+; CHECK-NEXT: csel w8, wzr, w13, lt
+; CHECK-NEXT: fcmp s4, s7
+; CHECK-NEXT: fcvtzu w14, s5
+; CHECK-NEXT: csinv w4, w8, wzr, le
+; CHECK-NEXT: fcmp s5, #0.0
+; CHECK-NEXT: csel w8, wzr, w14, lt
+; CHECK-NEXT: fcmp s5, s7
+; CHECK-NEXT: fcvtzu w9, s6
+; CHECK-NEXT: csinv w5, w8, wzr, le
+; CHECK-NEXT: fcmp s6, #0.0
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s6, s7
+; CHECK-NEXT: csinv w6, w8, wzr, le
 ; CHECK-NEXT: ret
 %x = call <7 x i32> @llvm.fptoui.sat.v7f32.v7i32(<7 x float> %f)
 ret <7 x i32> %x
@@ -927,52 +919,49 @@
 define <6 x i32> @test_unsigned_v6f16_v6i32(<6 x half> %f) {
 ; CHECK-LABEL: test_unsigned_v6f16_v6i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: fcvt s1, h0
 ; CHECK-NEXT: mov w8, #1333788671
-; CHECK-NEXT: fcvt s2, h2
-; CHECK-NEXT: fmov s3, w8
-; CHECK-NEXT: fcvtzu w8, s2
-; CHECK-NEXT: fcmp s2, #0.0
-; CHECK-NEXT: csel w8, wzr, w8, lt
-; CHECK-NEXT: fcmp s2, s3
-; CHECK-NEXT: fcvt s1, h1
 ; CHECK-NEXT: fcvtzu w9, s1
-; CHECK-NEXT: csinv w5, w8, wzr, le
 ; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: fcvt s2, h0
+; CHECK-NEXT: fmov s2, w8
 ; CHECK-NEXT: csel w8, wzr, w9, lt
-; CHECK-NEXT: fcmp s1, s3
+; CHECK-NEXT: fcmp s1, s2
 ; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: fcvtzu w9, s2
-; CHECK-NEXT: csinv w8, w8, wzr, le
-; CHECK-NEXT: fcmp s2, #0.0
-; CHECK-NEXT: csel w9, wzr, w9, lt
-; CHECK-NEXT: fcmp s2, s3
 ; CHECK-NEXT: fcvt s1, h1
-; CHECK-NEXT: mov h2, v0.h[2]
-; CHECK-NEXT: fcvtzu w10, s1
-; CHECK-NEXT: csinv w0, w9, wzr, le
+; CHECK-NEXT: fcvtzu w9, s1
+; CHECK-NEXT: csinv w0, w8, wzr, le
 ; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: fcvt s2, h2
-; CHECK-NEXT: csel w9, wzr,
-; CHECK-NEXT: csel w9, wzr, w10, lt
-; CHECK-NEXT: fcmp s1, s3
-; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: fcvtzu w11, s2
-; CHECK-NEXT: csinv w1, w9, wzr, le
-; CHECK-NEXT: fcmp s2, #0.0
-; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: csel w8, wzr, w11, lt
-; CHECK-NEXT: fcmp s2, s3
-; CHECK-NEXT: fcvtzu w12, s0
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: mov h1, v0.h[2]
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvtzu w9, s1
+; CHECK-NEXT: csinv w1, w8, wzr, le
+; CHECK-NEXT: fcmp s1, #0.0
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: mov h1, v0.h[3]
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: fcvtzu w9, s1
 ; CHECK-NEXT: csinv w2, w8, wzr, le
-; CHECK-NEXT: fcmp s0, #0.0
-; CHECK-NEXT: mov v1.s[1], w5
-; CHECK-NEXT: csel w8, wzr, w12, lt
-; CHECK-NEXT: fcmp s0, s3
+; CHECK-NEXT: fcmp s1, #0.0
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: mov h0, v0.h[1]
+; CHECK-NEXT: fcvtzu w9, s1
 ; CHECK-NEXT: csinv w3, w8, wzr, le
-; CHECK-NEXT: fmov w4, s1
+; CHECK-NEXT: fcmp s1, #0.0
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: fcvtzu w10, s0
+; CHECK-NEXT: csinv w4, w8, wzr, le
+; CHECK-NEXT: fcmp s0, #0.0
+; CHECK-NEXT: csel w8, wzr, w10, lt
+; CHECK-NEXT: fcmp s0, s2
+; CHECK-NEXT: csinv w5, w8, wzr, le
 ; CHECK-NEXT: ret
 %x = call <6 x i32> @llvm.fptoui.sat.v6f16.v6i32(<6 x half> %f)
 ret <6 x i32> %x
@@ -981,61 +970,56 @@
 define <7 x i32> @test_unsigned_v7f16_v7i32(<7 x half> %f) {
 ; CHECK-LABEL: test_unsigned_v7f16_v7i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: fcvt s1, h0
 ; CHECK-NEXT: mov w8, #1333788671
-; CHECK-NEXT: fcvt s2, h2
-; CHECK-NEXT: fmov s3, w8
-; CHECK-NEXT: fcvtzu w8, s2
-; CHECK-NEXT: fcmp s2, #0.0
-; CHECK-NEXT: csel w8, wzr, w8, lt
-; CHECK-NEXT: fcmp s2, s3
-; CHECK-NEXT: fcvt s2, h1
-; CHECK-NEXT: fcvtzu w9, s2
-; CHECK-NEXT: csinv w8, w8, wzr, le
-; CHECK-NEXT: fcmp s2, #0.0
-; CHECK-NEXT: mov h1, v1.h[2]
-; CHECK-NEXT: csel w9, wzr, w9, lt
-; CHECK-NEXT: fcmp s2, s3
+; CHECK-NEXT: fcvtzu w9, s1
+; CHECK-NEXT: fcmp s1, #0.0
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: mov h1, v0.h[1]
 ; CHECK-NEXT: fcvt s1, h1
-; CHECK-NEXT: fcvtzu w10, s1
-; CHECK-NEXT: csinv w9, w9, wzr, le
+; CHECK-NEXT: fcvtzu w9, s1
+; CHECK-NEXT: csinv w0, w8, wzr, le
 ; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: fcvt s2, h0
-; CHECK-NEXT: csel w10, wzr, w10, lt
-; CHECK-NEXT: fcmp s1, s3
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: mov h1, v0.h[2]
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvtzu w9, s1
+; CHECK-NEXT: csinv w1, w8, wzr, le
+; CHECK-NEXT: fcmp s1, #0.0
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: mov h1, v0.h[3]
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: fcvtzu w9, s1
+; CHECK-NEXT: csinv w2, w8, wzr, le
+; CHECK-NEXT: fcmp s1, #0.0
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: fcvtzu w9, s1
+; CHECK-NEXT: csinv w3, w8, wzr, le
+; CHECK-NEXT: fcmp s1, #0.0
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s1, s2
 ; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: fcvtzu w11, s2
-; CHECK-NEXT: csinv w6, w10, wzr, le
-; CHECK-NEXT: fcmp s2, #0.0
-; CHECK-NEXT: csel w10, wzr,
-; CHECK-NEXT: csel w10, wzr, w11, lt
-; CHECK-NEXT: fcmp s2, s3
 ; CHECK-NEXT: fcvt s1, h1
-; CHECK-NEXT: mov h2, v0.h[2]
-; CHECK-NEXT: fcvtzu w11, s1
-; CHECK-NEXT: csinv w0, w10, wzr, le
+; CHECK-NEXT: mov h0, v0.h[2]
+; CHECK-NEXT: fcvtzu w9, s1
+; CHECK-NEXT: csinv w4, w8, wzr, le
 ; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: fcvt s2, h2
-; CHECK-NEXT: csel w10, wzr, w11, lt
-; CHECK-NEXT: fcmp s1, s3
-; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: fcvtzu w12, s2
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: csinv w1, w10, wzr, le
-; CHECK-NEXT: fcmp s2, #0.0
 ; CHECK-NEXT: fcvt s0, h0
-; CHECK-NEXT: mov v1.s[1], w8
-; CHECK-NEXT: csel w8, wzr, w12, lt
-; CHECK-NEXT: fcmp s2, s3
-; CHECK-NEXT: fcvtzu w13, s0
-; CHECK-NEXT: csinv w2, w8, wzr, le
+; CHECK-NEXT: csel w8, wzr, w9, lt
+; CHECK-NEXT: fcmp s1, s2
+; CHECK-NEXT: fcvtzu w10, s0
+; CHECK-NEXT: csinv w5, w8, wzr, le
 ; CHECK-NEXT: fcmp s0, #0.0
-; CHECK-NEXT: mov v1.s[2], w6
-; CHECK-NEXT: csel w8, wzr, w13, lt
-; CHECK-NEXT: fcmp s0, s3
-; CHECK-NEXT: csinv w3, w8, wzr, le
-; CHECK-NEXT: mov w5, v1.s[1]
-; CHECK-NEXT: fmov w4, s1
+; CHECK-NEXT: csel w8, wzr, w10, lt
+; CHECK-NEXT: fcmp s0, s2
+; CHECK-NEXT: csinv w6, w8, wzr, le
 ; CHECK-NEXT: ret
 %x = call <7 x i32> @llvm.fptoui.sat.v7f16.v7i32(<7 x half> %f)
 ret <7 x i32> %x
@@ -1329,37 +1313,33 @@
 ; CHECK-NEXT: .cfi_offset b8, -40
 ; CHECK-NEXT: .cfi_offset b9, -48
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov s8, v0.s[1]
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: mov v0.16b, v8.16b
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT: bl __fixunssfti
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: mov w8, #1904214015
-; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: fmov s9, w8
 ; CHECK-NEXT: mov x21, #68719476735
+; CHECK-NEXT: fcmp s0, #0.0
+; CHECK-NEXT: mov s8, v0.s[1]
 ; CHECK-NEXT: csel x9, xzr, x0, lt
 ; CHECK-NEXT: csel x10, xzr, x1, lt
-; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: fcmp s0, s9
+; CHECK-NEXT: mov v0.16b, v8.16b
 ; CHECK-NEXT: csel x19, x21, x10, gt
 ; CHECK-NEXT: csinv x20, x9, xzr, le
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov x2, x20
-; CHECK-NEXT: mov x3, x19
-; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: fcmp s0, #0.0
+; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, xzr, x1, lt
-; CHECK-NEXT: fcmp s0, s9
-; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: csel x1, x21, x9, gt
+; CHECK-NEXT: fcmp s8, s9
+; CHECK-NEXT: csel x3, x21, x9, gt
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csinv x2, x8, xzr, le
 ; CHECK-NEXT: add sp, sp, #64 // =64
 ; CHECK-NEXT: ret
 %x = call <2 x i100> @llvm.fptoui.sat.v2f32.v2i100(<2 x float> %f)
@@ -1380,36 +1360,32 @@
 ; CHECK-NEXT: .cfi_offset b8, -40
 ; CHECK-NEXT: .cfi_offset b9, -48
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov s8, v0.s[1]
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: mov v0.16b, v8.16b
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT: bl __fixunssfti
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: mov w8, #2139095039
-; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: fmov s9, w8
+; CHECK-NEXT: fcmp s0, #0.0
+; CHECK-NEXT: mov s8, v0.s[1]
 ; CHECK-NEXT: csel x9, xzr, x1, lt
 ; CHECK-NEXT: csel x10, xzr, x0, lt
-; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: fcmp s0, s9
+; CHECK-NEXT: mov v0.16b, v8.16b
 ; CHECK-NEXT: csinv x19, x10, xzr, le
 ; CHECK-NEXT: csinv x20, x9, xzr, le
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
 ; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov x2, x19
-; CHECK-NEXT: mov x3, x20
+; CHECK-NEXT: fcmp s8, #0.0
+; CHECK-NEXT: csel x8, xzr, x1, lt
+; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: fcmp s8, s9
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x1, x20
 ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: fcmp s0, #0.0
-; CHECK-NEXT: csel x8, xzr, x0, lt
-; CHECK-NEXT: csel x9, xzr, x1, lt
-; CHECK-NEXT: fcmp s0, s9
-; CHECK-NEXT: csinv x8, x8, xzr, le
 ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: csinv x1, x9, xzr, le
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csinv x2, x9, xzr, le
+; CHECK-NEXT: csinv x3, x8, xzr, le
 ; CHECK-NEXT: add sp, sp, #64 // =64
 ; CHECK-NEXT: ret
 %x = call <2 x i128> @llvm.fptoui.sat.v2f32.v2i128(<2 x float> %f)
@@ -1618,37 +1594,33 @@
 ; CHECK-NEXT: .cfi_offset w30, -32
 ; CHECK-NEXT: .cfi_offset b8, -40
 ; CHECK-NEXT: .cfi_offset b9, -48
-; CHECK-NEXT: mov d8, v0.d[1]
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: mov v0.16b, v8.16b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: bl __fixunsdfti
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: mov x8, #5057542381537067007
-; CHECK-NEXT: fcmp d8, #0.0
 ; CHECK-NEXT: fmov d9, x8
 ; CHECK-NEXT: mov x21, #68719476735
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: mov d8, v0.d[1]
 ; CHECK-NEXT: csel x9, xzr, x0, lt
 ; CHECK-NEXT: csel x10, xzr, x1, lt
-; CHECK-NEXT: fcmp d8, d9
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: fcmp d0, d9
+; CHECK-NEXT: mov v0.16b, v8.16b
 ; CHECK-NEXT: csel x19, x21, x10, gt
 ; CHECK-NEXT: csinv x20, x9, xzr, le
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: bl __fixunsdfti
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov x2, x20
-; CHECK-NEXT: mov x3, x19
-; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: fcmp d8, #0.0
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, xzr, x1, lt
-; CHECK-NEXT: fcmp d0, d9
-; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: csel x1, x21, x9, gt
+; CHECK-NEXT: fcmp d8, d9
+; CHECK-NEXT: csel x3, x21, x9, gt
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csinv x2, x8, xzr, le
 ; CHECK-NEXT: add sp, sp, #64 // =64
 ; CHECK-NEXT: ret
 %x = call <2 x i100> @llvm.fptoui.sat.v2f64.v2i100(<2 x double> %f)
@@ -1668,36 +1640,32 @@
 ; CHECK-NEXT: .cfi_offset w30, -32
 ; CHECK-NEXT: .cfi_offset b8, -40
 ; CHECK-NEXT: .cfi_offset b9, -48
-; CHECK-NEXT: mov d8, v0.d[1]
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: mov v0.16b, v8.16b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: bl __fixunsdfti
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: mov x8, #5183643171103440895
-; CHECK-NEXT: fcmp d8, #0.0
 ; CHECK-NEXT: fmov d9, x8
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: mov d8, v0.d[1]
 ; CHECK-NEXT: csel x9, xzr, x1, lt
 ; CHECK-NEXT: csel x10, xzr, x0, lt
-; CHECK-NEXT: fcmp d8, d9
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: fcmp d0, d9
+; CHECK-NEXT: mov v0.16b, v8.16b
 ; CHECK-NEXT: csinv x19, x10, xzr, le
 ; CHECK-NEXT: csinv x20, x9, xzr, le
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: bl __fixunsdfti
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov x2, x19
-; CHECK-NEXT: mov x3, x20
+; CHECK-NEXT: fcmp d8, #0.0
+; CHECK-NEXT: csel x8, xzr, x1, lt
+; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: fcmp d8, d9
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x1, x20
 ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: fcmp d0, #0.0
-; CHECK-NEXT: csel x8, xzr, x0, lt
-; CHECK-NEXT: csel x9, xzr, x1, lt
-; CHECK-NEXT: fcmp d0, d9
-; CHECK-NEXT: csinv x8, x8, xzr, le
 ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: csinv x1, x9, xzr, le
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csinv x2, x9, xzr, le
+; CHECK-NEXT: csinv x3, x8, xzr, le
 ; CHECK-NEXT: add sp, sp, #64 // =64
 ; CHECK-NEXT: ret
 %x = call <2 x i128> @llvm.fptoui.sat.v2f64.v2i128(<2 x double> %f)
@@ -2043,7 +2011,7 @@
 ; CHECK-NEXT: .cfi_offset b8, -72
 ; CHECK-NEXT: .cfi_offset b9, -80
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov h1, v0.h[2]
+; CHECK-NEXT: mov h1, v0.h[1]
 ; CHECK-NEXT: fcvt s8, h1
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: mov v0.16b, v8.16b
@@ -2052,7 +2020,7 @@
 ; CHECK-NEXT: mov w8, #1904214015
 ; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: fmov s9, w8
-; CHECK-NEXT: mov h0, v0.h[1]
+; CHECK-NEXT: mov h0, v0.h[2]
 ; CHECK-NEXT: csel x9, xzr, x0, lt
 ; CHECK-NEXT: csel x10, xzr, x1, lt
 ; CHECK-NEXT: fcmp s8, s9
@@ -2066,7 +2034,6 @@
 ; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, xzr, x1, lt
-; CHECK-NEXT: mov h0, v0.h[3]
 ; CHECK-NEXT: fcmp s8, s9
 ; CHECK-NEXT: fcvt s8, h0
 ; CHECK-NEXT: mov v0.16b, v8.16b
@@ -2077,6 +2044,7 @@
 ; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, xzr, x1, lt
+; CHECK-NEXT: mov h0, v0.h[3]
 ; CHECK-NEXT: fcmp s8, s9
 ; CHECK-NEXT: fcvt s8, h0
 ; CHECK-NEXT: mov v0.16b, v8.16b
@@ -2087,22 +2055,19 @@
 ; CHECK-NEXT: csel x8, xzr, x0, lt
 ; CHECK-NEXT: csel x9, xzr, x1, lt
 ; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: csel x1, x25, x9, gt
-; CHECK-NEXT: mov x2, x22
-; CHECK-NEXT: mov x3, x21
-; CHECK-NEXT: mov x4, x20
-; CHECK-NEXT: mov x5, x19
-; CHECK-NEXT: mov x6, x24
-; CHECK-NEXT: mov x7, x23
+; CHECK-NEXT: csel x7, x25, x9, gt
+; CHECK-NEXT: mov x0, x24
+; CHECK-NEXT: mov x1, x23
+; CHECK-NEXT: mov x2, x20
+; CHECK-NEXT: mov x3, x19
+; CHECK-NEXT: mov x4, x22
+; CHECK-NEXT: mov x5, x21
 ; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x30, x25, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csinv x6, x8, xzr, le
 ; CHECK-NEXT: add sp, sp, #96 // =96
 ; CHECK-NEXT: ret
 %x = call <4 x i100> @llvm.fptoui.sat.v4f16.v4i100(<4 x half> %f)
@@ -2129,8 +2094,7 @@
 ; CHECK-NEXT: .cfi_offset b8, -72
 ; CHECK-NEXT: .cfi_offset b9, -80
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: fcvt s8, h1
+; CHECK-NEXT: fcvt s8, h0
 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT: mov v0.16b, v8.16b
 ; CHECK-NEXT: bl __fixunssfti
@@ -2138,7 +2102,7 @@
 ; CHECK-NEXT: mov w8, #2139095039
 ; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: fmov s9, w8
-; CHECK-NEXT: mov h0, v0.h[2]
+; CHECK-NEXT: mov h0, v0.h[1]
 ; CHECK-NEXT: csel x9, xzr, x1, lt
 ; CHECK-NEXT: csel x10, xzr, x0, lt
 ; CHECK-NEXT: fcmp s8, s9
@@ -2151,7 +2115,7 @@
 ; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: csel x8, xzr, x1, lt
 ; CHECK-NEXT: csel x9, xzr, x0, lt
-; CHECK-NEXT: mov h0, v0.h[3]
+; CHECK-NEXT: mov h0, v0.h[2]
 ; CHECK-NEXT: fcmp s8, s9
 ; CHECK-NEXT: fcvt s8, h0
 ; CHECK-NEXT: mov v0.16b, v8.16b
@@ -2162,6 +2126,7 @@
 ; CHECK-NEXT: fcmp s8, #0.0
 ; CHECK-NEXT: csel x8, xzr, x1, lt
 ; CHECK-NEXT: csel x9, xzr, x0, lt
+; CHECK-NEXT: mov h0, v0.h[3]
 ; CHECK-NEXT: fcmp s8, s9
 ; CHECK-NEXT: fcvt s8, h0
 ; CHECK-NEXT: mov v0.16b, v8.16b
@@ -2169,25 +2134,22 @@
 ; CHECK-NEXT: csinv x24, x8, xzr, le
 ; CHECK-NEXT: bl __fixunssfti
 ; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: csel x8, xzr, x0, lt
-; CHECK-NEXT: csel x9, xzr, x1, lt
+; CHECK-NEXT: csel x8, xzr, x1, lt
+; CHECK-NEXT: csel x9, xzr, x0, lt
 ; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: csinv x8, x8, xzr, le
-; CHECK-NEXT: mov x2, x19
-; CHECK-NEXT: mov x3, x20
-; CHECK-NEXT: mov x4, x21
-; CHECK-NEXT: mov x5, x22
-; CHECK-NEXT: mov x6, x23
-; CHECK-NEXT: mov x7, x24
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x1, x20
+; CHECK-NEXT: mov x2, x21
+; CHECK-NEXT: mov x3, x22
+; CHECK-NEXT: mov x4, x23
+; CHECK-NEXT: mov x5, x24
 ; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: csinv x1, x9, xzr, le
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csinv x6, x9, xzr, le
+; CHECK-NEXT: csinv x7, x8, xzr, le
 ; CHECK-NEXT: add sp, sp, #96 // =96
 ; CHECK-NEXT: ret
 %x = call <4 x i128> @llvm.fptoui.sat.v4f16.v4i128(<4 x half> %f)
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -74,8 +74,7 @@
 ; CHECK-NEXT: mov v0.16b, v1.16b
 ; CHECK-NEXT: fmov w0, s0
 ; CHECK-NEXT: // kill: def $x0 killed $w0
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: mov x1, v0.d[1]
+; CHECK-NEXT: mov x1, xzr
 ; CHECK-NEXT: ret
 Entry:
 %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -373,16 +373,16 @@
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x7, #0 // =0
+; CHECK-NEXT: cmp x5, #0 // =0
 ; CHECK-NEXT: cset w9, ge
 ; CHECK-NEXT: csinc w9, w9, wzr, ne
-; CHECK-NEXT: cmp x3, #0 // =0
+; CHECK-NEXT: cmp x1, #0 // =0
 ; CHECK-NEXT: cset w10, ge
 ; CHECK-NEXT: csinc w10, w10, wzr, ne
 ; CHECK-NEXT: cmp w10, w9
 ; CHECK-NEXT: cset w9, eq
-; CHECK-NEXT: adds x11, x2, x6
-; CHECK-NEXT: adcs x12, x3, x7
+; CHECK-NEXT: adds x11, x0, x4
+; CHECK-NEXT: adcs x12, x1, x5
 ; CHECK-NEXT: cmp x12, #0 // =0
 ; CHECK-NEXT: cset w13, ge
 ; CHECK-NEXT: mov x8, #9223372036854775807
@@ -392,31 +392,28 @@
 ; CHECK-NEXT: cset w13, ne
 ; CHECK-NEXT: asr x10, x12, #63
 ; CHECK-NEXT: tst w9, w13
-; CHECK-NEXT: csel x3, x14, x12, ne
-; CHECK-NEXT: csel x2, x10, x11, ne
-; CHECK-NEXT: cmp x5, #0 // =0
+; CHECK-NEXT: csel x1, x14, x12, ne
+; CHECK-NEXT: csel x0, x10, x11, ne
+; CHECK-NEXT: cmp x7, #0 // =0
 ; CHECK-NEXT: cset w9, ge
 ; CHECK-NEXT: csinc w9, w9, wzr, ne
-; CHECK-NEXT: cmp x1, #0 // =0
+; CHECK-NEXT: cmp x3, #0 // =0
 ; CHECK-NEXT: cset w10, ge
 ; CHECK-NEXT: csinc w10, w10, wzr, ne
 ; CHECK-NEXT: cmp w10, w9
 ; CHECK-NEXT: cset w9, eq
-; CHECK-NEXT: adds x11, x0, x4
-; CHECK-NEXT: adcs x12, x1, x5
+; CHECK-NEXT: adds x11, x2, x6
+; CHECK-NEXT: adcs x12, x3, x7
 ; CHECK-NEXT: cmp x12, #0 // =0
-; CHECK-NEXT: cset w13, ge
-; CHECK-NEXT: csinc w13, w13, wzr, ne
+; CHECK-NEXT: cset w14, ge
+; CHECK-NEXT: csinc w14, w14, wzr, ne
 ; CHECK-NEXT: cinv x8, x8, ge
-; CHECK-NEXT: cmp w10, w13
+; CHECK-NEXT: cmp w10, w14
 ; CHECK-NEXT: cset w10, ne
+; CHECK-NEXT: asr x13, x12, #63
 ; CHECK-NEXT: tst w9, w10
-; CHECK-NEXT: asr x9, x12, #63
-; CHECK-NEXT: csel x9, x9, x11, ne
-; CHECK-NEXT: csel x1, x8, x12, ne
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: csel x2, x13, x11, ne
+; CHECK-NEXT: csel x3, x8, x12, ne
 ; CHECK-NEXT: ret
 %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y)
 ret <2 x i128> %z
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -374,16 +374,16 @@
 define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind {
 ; CHECK-LABEL: v2i128:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x7, #0 // =0
+; CHECK-NEXT: cmp x5, #0 // =0
 ; CHECK-NEXT: cset w9, ge
 ; CHECK-NEXT: csinc w9, w9, wzr, ne
-; CHECK-NEXT: cmp x3, #0 // =0
+; CHECK-NEXT: cmp x1, #0 // =0
 ; CHECK-NEXT: cset w10, ge
 ; CHECK-NEXT: csinc w10, w10, wzr, ne
 ; CHECK-NEXT: cmp w10, w9
 ; CHECK-NEXT: cset w9, ne
-; CHECK-NEXT: subs x11, x2, x6
-; CHECK-NEXT: sbcs x12, x3, x7
+; CHECK-NEXT: subs x11, x0, x4
+; CHECK-NEXT: sbcs x12, x1, x5
 ; CHECK-NEXT: cmp x12, #0 // =0
 ; CHECK-NEXT: cset w13, ge
 ; CHECK-NEXT: mov x8, #9223372036854775807
@@ -393,31 +393,28 @@
 ; CHECK-NEXT: cset w13, ne
 ; CHECK-NEXT: asr x10, x12, #63
 ; CHECK-NEXT: tst w9, w13
-; CHECK-NEXT: csel x3, x14, x12, ne
-; CHECK-NEXT: csel x2, x10, x11, ne
-; CHECK-NEXT: cmp x5, #0 // =0
+; CHECK-NEXT: csel x1, x14, x12, ne
+; CHECK-NEXT: csel x0, x10, x11, ne
+; CHECK-NEXT: cmp x7, #0 // =0
 ; CHECK-NEXT: cset w9, ge
 ; CHECK-NEXT: csinc w9, w9, wzr, ne
-; CHECK-NEXT: cmp x1, #0 // =0
+; CHECK-NEXT: cmp x3, #0 // =0
 ; CHECK-NEXT: cset w10, ge
 ; CHECK-NEXT: csinc w10, w10, wzr, ne
 ; CHECK-NEXT: cmp w10, w9
 ; CHECK-NEXT: cset w9, ne
-; CHECK-NEXT: subs x11, x0, x4
-; CHECK-NEXT: sbcs x12, x1, x5
+; CHECK-NEXT: subs x11, x2, x6
+; CHECK-NEXT: sbcs x12, x3, x7
 ; CHECK-NEXT: cmp x12, #0 // =0
-; CHECK-NEXT: cset w13, ge
-; CHECK-NEXT: csinc w13, w13, wzr, ne
+; CHECK-NEXT: cset w14, ge
CHECK-NEXT: csinc w14, w14, wzr, ne ; CHECK-NEXT: cinv x8, x8, ge -; CHECK-NEXT: cmp w10, w13 +; CHECK-NEXT: cmp w10, w14 ; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: asr x13, x12, #63 ; CHECK-NEXT: tst w9, w10 -; CHECK-NEXT: asr x9, x12, #63 -; CHECK-NEXT: csel x9, x9, x11, ne -; CHECK-NEXT: csel x1, x8, x12, ne -; CHECK-NEXT: fmov d0, x9 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: csel x2, x13, x11, ne +; CHECK-NEXT: csel x3, x8, x12, ne ; CHECK-NEXT: ret %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y) ret <2 x i128> %z diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -371,16 +371,6 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; CHECK-LABEL: v2i128: ; CHECK: // %bb.0: -; CHECK-NEXT: adds x8, x2, x6 -; CHECK-NEXT: adcs x9, x3, x7 -; CHECK-NEXT: cmp x8, x2 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: cmp x9, x3 -; CHECK-NEXT: cset w11, lo -; CHECK-NEXT: csel w10, w10, w11, eq -; CHECK-NEXT: cmp w10, #0 // =0 -; CHECK-NEXT: csinv x3, x9, xzr, eq -; CHECK-NEXT: csinv x2, x8, xzr, eq ; CHECK-NEXT: adds x8, x0, x4 ; CHECK-NEXT: adcs x9, x1, x5 ; CHECK-NEXT: cmp x8, x0 @@ -389,11 +379,18 @@ ; CHECK-NEXT: cset w11, lo ; CHECK-NEXT: csel w10, w10, w11, eq ; CHECK-NEXT: cmp w10, #0 // =0 -; CHECK-NEXT: csinv x8, x8, xzr, eq ; CHECK-NEXT: csinv x1, x9, xzr, eq -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: csinv x0, x8, xzr, eq +; CHECK-NEXT: adds x8, x2, x6 +; CHECK-NEXT: adcs x9, x3, x7 +; CHECK-NEXT: cmp x8, x2 +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: cmp x9, x3 +; CHECK-NEXT: cset w11, lo +; CHECK-NEXT: csel w10, w10, w11, eq +; CHECK-NEXT: cmp w10, #0 // =0 +; CHECK-NEXT: csinv x2, x8, xzr, eq +; CHECK-NEXT: csinv x3, x9, xzr, eq ; CHECK-NEXT: ret %z = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y) ret <2 x i128> %z diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll --- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -372,16 +372,6 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; CHECK-LABEL: v2i128: ; CHECK: // %bb.0: -; CHECK-NEXT: subs x8, x2, x6 -; CHECK-NEXT: sbcs x9, x3, x7 -; CHECK-NEXT: cmp x8, x2 -; CHECK-NEXT: cset w10, hi -; CHECK-NEXT: cmp x9, x3 -; CHECK-NEXT: cset w11, hi -; CHECK-NEXT: csel w10, w10, w11, eq -; CHECK-NEXT: cmp w10, #0 // =0 -; CHECK-NEXT: csel x3, xzr, x9, ne -; CHECK-NEXT: csel x2, xzr, x8, ne ; CHECK-NEXT: subs x8, x0, x4 ; CHECK-NEXT: sbcs x9, x1, x5 ; CHECK-NEXT: cmp x8, x0 @@ -390,11 +380,18 @@ ; CHECK-NEXT: cset w11, hi ; CHECK-NEXT: csel w10, w10, w11, eq ; CHECK-NEXT: cmp w10, #0 // =0 -; CHECK-NEXT: csel x8, xzr, x8, ne ; CHECK-NEXT: csel x1, xzr, x9, ne -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: csel x0, xzr, x8, ne +; CHECK-NEXT: subs x8, x2, x6 +; CHECK-NEXT: sbcs x9, x3, x7 +; CHECK-NEXT: cmp x8, x2 +; CHECK-NEXT: cset w10, hi +; CHECK-NEXT: cmp x9, x3 +; CHECK-NEXT: cset w11, hi +; CHECK-NEXT: csel w10, w10, w11, eq +; CHECK-NEXT: cmp w10, #0 // =0 +; CHECK-NEXT: csel x2, xzr, x8, ne +; CHECK-NEXT: csel x3, xzr, x9, ne ; CHECK-NEXT: ret %z = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %x, <2 x i128> %y) ret <2 x i128> %z diff --git 
a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -96,24 +96,18 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK-LABEL: test_v9i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-1 -; CHECK-NEXT: mov v0.b[9], w8 -; CHECK-NEXT: mov v0.b[10], w8 -; CHECK-NEXT: mov v0.b[11], w8 -; CHECK-NEXT: mov v0.b[12], w8 -; CHECK-NEXT: mov v0.b[13], w8 ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v1.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v1.b[1] +; CHECK-NEXT: umov w8, v0.b[1] ; CHECK-NEXT: umov w9, v1.b[0] ; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: umov w9, v1.b[2] +; CHECK-NEXT: umov w9, v0.b[2] ; CHECK-NEXT: and w8, w8, w9 -; CHECK-NEXT: umov w9, v1.b[3] +; CHECK-NEXT: umov w9, v0.b[3] ; CHECK-NEXT: and w8, w8, w9 ; CHECK-NEXT: umov w9, v0.b[4] ; CHECK-NEXT: and w8, w8, w9 -; CHECK-NEXT: umov w9, v1.b[5] +; CHECK-NEXT: umov w9, v0.b[5] ; CHECK-NEXT: and w8, w8, w9 ; CHECK-NEXT: umov w9, v0.b[6] ; CHECK-NEXT: and w8, w8, w9 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -537,17 +537,16 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 -; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 +; VI-NEXT: v_lshrrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1 ; VI-NEXT: s_endpgm %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -108,7 +108,6 @@ ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64 ; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]] -; GCN: v_mov_b32_e32 v[[R_I64_1_High:[0-9]+]], 0 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] @@ -119,6 +118,7 @@ ; VI: v_cvt_u32_f32_e32 v[[R_I64_1_Low:[0-9]+]], v[[A_F32_1]] ; VI: v_cvt_u32_f32_e32 v[[R_I64_0_Low:[0-9]+]], v[[A_F32_0]] ; GCN: v_mov_b32_e32 v[[R_I64_0_High:[0-9]+]], 0 +; GCN: v_mov_b32_e32 v[[R_I64_1_High:[0-9]+]], v[[R_I64_0_High]] ; GCN: buffer_store_dwordx4 v{{\[}}[[R_I64_0_Low]]{{\:}}[[R_I64_1_High]]{{\]}} ; GCN: s_endpgm define amdgpu_kernel void @fptoui_v2f16_to_v2i64( diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -4735,10 
+4735,10 @@ ; GCN-NOHSA-VI-NEXT: s_load_dword s4, s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s4, 0xffff -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -4867,16 +4867,16 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s0, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s0, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s3, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, s8 ; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -4889,20 +4889,20 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s5, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s4, s2, 16 ; GCN-HSA-NEXT: s_and_b32 s7, s2, s6 ; GCN-HSA-NEXT: s_and_b32 s2, s3, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -4920,16 +4920,16 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s5, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s4, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -5113,28 +5113,28 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; 
GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s0, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s0, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s9, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s3, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s11, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s2 ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s9, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s10, s2 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -5147,38 +5147,38 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s9, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s9, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s10, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s11, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s11, s4, 16 ; GCN-HSA-NEXT: s_and_b32 s3, s7, s8 ; GCN-HSA-NEXT: s_and_b32 s4, s4, s8 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s8 ; GCN-HSA-NEXT: s_and_b32 s5, s5, s8 +; GCN-HSA-NEXT: s_and_b32 s6, s6, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; 
GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -5196,26 +5196,26 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s4, s8 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s5, s8 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s6, s8 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s7, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -5486,52 +5486,52 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s12 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s13 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s9, 16 ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s14 ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s14 +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s14 ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s14 +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s14 ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s14 +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s14 +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -5544,74 +5544,74 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s13, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s13, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s15, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s16, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s17, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s19, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s19, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s9, 16 ; GCN-HSA-NEXT: s_and_b32 s3, s9, s12 ; GCN-HSA-NEXT: s_and_b32 s4, s4, s12 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s12 -; GCN-HSA-NEXT: s_and_b32 s10, s10, s12 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s12 ; GCN-HSA-NEXT: s_and_b32 s5, s5, s12 +; GCN-HSA-NEXT: s_and_b32 s6, s6, s12 ; GCN-HSA-NEXT: s_and_b32 s7, s7, s12 +; GCN-HSA-NEXT: s_and_b32 s10, s10, s12 ; GCN-HSA-NEXT: s_and_b32 s11, s11, s12 +; GCN-HSA-NEXT: s_and_b32 s8, s8, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -5629,46 +5629,46 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s4, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s5, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s6, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s7, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s8, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s9, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s10, s12 -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s11, s12 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, 
s11, s12 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s9, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -6124,104 +6124,104 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s17, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s19, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s4, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s6, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s8, s2 -; GCN-NOHSA-SI-NEXT: 
s_and_b32 s31, s10, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s12, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s14, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s16, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s18, s2 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s11, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s2 ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s2 ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s2 ; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s2 ; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s17, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, s2 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s12, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s13, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s14, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s15, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s16, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s17, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s18, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s19, s2 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s16, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s17, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s18, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s19, 16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], 
off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s33 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; 
GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -6235,140 +6235,140 @@ ; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_and_b32 s21, s4, s20 -; GCN-HSA-NEXT: s_and_b32 s22, s6, s20 -; GCN-HSA-NEXT: s_and_b32 s23, s8, s20 -; GCN-HSA-NEXT: s_and_b32 s24, s10, s20 -; GCN-HSA-NEXT: s_and_b32 s25, s12, s20 -; GCN-HSA-NEXT: s_and_b32 s26, s14, s20 -; GCN-HSA-NEXT: s_and_b32 s27, s16, s20 -; GCN-HSA-NEXT: s_and_b32 s28, s18, s20 -; GCN-HSA-NEXT: s_and_b32 s29, s5, s20 -; GCN-HSA-NEXT: s_and_b32 s30, s7, s20 -; GCN-HSA-NEXT: s_and_b32 s31, s9, s20 -; GCN-HSA-NEXT: s_and_b32 s33, s11, s20 -; GCN-HSA-NEXT: s_and_b32 s34, s13, s20 -; GCN-HSA-NEXT: s_and_b32 s35, s15, s20 -; GCN-HSA-NEXT: s_and_b32 s36, s17, s20 +; GCN-HSA-NEXT: s_and_b32 s22, s5, s20 +; GCN-HSA-NEXT: s_and_b32 s23, s6, s20 +; GCN-HSA-NEXT: s_and_b32 s24, s7, s20 +; GCN-HSA-NEXT: s_and_b32 s25, s8, s20 +; GCN-HSA-NEXT: s_and_b32 s26, s9, s20 +; GCN-HSA-NEXT: s_and_b32 s27, s10, s20 +; GCN-HSA-NEXT: s_and_b32 s28, s11, s20 +; GCN-HSA-NEXT: s_and_b32 s29, s12, s20 +; GCN-HSA-NEXT: s_and_b32 s30, s13, s20 +; GCN-HSA-NEXT: s_and_b32 s31, s14, s20 +; GCN-HSA-NEXT: s_and_b32 s33, s15, s20 +; GCN-HSA-NEXT: s_and_b32 s34, s16, s20 +; GCN-HSA-NEXT: s_and_b32 s35, s17, s20 +; GCN-HSA-NEXT: s_and_b32 s36, s18, s20 ; GCN-HSA-NEXT: s_and_b32 s20, s19, s20 ; GCN-HSA-NEXT: s_lshr_b32 s19, s19, 16 +; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s10, s10, 16 ; GCN-HSA-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 ; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 16 ; GCN-HSA-NEXT: s_lshr_b32 s17, s17, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: 
s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: 
v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6391,86 +6391,86 @@ ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[4:19], s[6:7], 0x0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s4, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s5, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s6, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s7, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s8, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s9, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s10, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s11, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s12, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s13, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s14, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s15, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s16, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s17, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s18, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s19, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s19, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s37, s19, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s18, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s37 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s17, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s17, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 
offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s13, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s12, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s9, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -5552,14 +5552,14 @@ ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -5745,8 +5745,8 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 ; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s2, v8 ; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 @@ -5791,20 +5791,20 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v9 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v8 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v9 +; 
GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v4i16_to_v4i64: @@ -6045,81 +6045,81 @@ ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s12, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s12, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s12, v1 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s12, v3 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, s2, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, s2, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, s2, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, s2, v3 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[4:7], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 ; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, v4 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 
s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s4, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v3, s4, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v3, s4, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[3:6] +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v4 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v14, s4, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s4, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s4, v2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[6:9] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[10:13] +; GCN-HSA-NEXT: flat_store_dwordx4 v[3:4], v[14:17] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64: @@ -6133,30 +6133,30 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, v4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s6, v3 
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s6, v1 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s6, v2 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, s6, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v15, s6, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, s6, v1 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, s6, v2 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_zextload_v8i16_to_v8i64: @@ -6502,53 +6502,51 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[6:9], off, s[8:11], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v0 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s0, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v1 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, 0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s0, v0 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, s0, v1 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v3 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, s0, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, s0, v5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v20 -; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v20 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, s0, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v6 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, s0, v7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64: @@ -6556,8 +6554,8 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: s_mov_b32 s6, 0xffff -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6569,64 +6567,66 @@ ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v15, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v5 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[11:14] +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[11:14] ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[7:10] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v14 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s6, v0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9] -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v10, s6, v3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v3, s6, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v14, s6, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[3:6] +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[14:17] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: @@ -6642,48 +6642,48 @@ ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v0 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v1 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, s0, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, s0, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v2 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v3 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v5 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm @@ -7275,7 +7275,7 @@ ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) @@ -7286,136 +7286,121 @@ ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v3 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, s0, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, s0, v2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, v19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v21 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v21, s0, v5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v23 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, s0, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v6 +; 
GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v9 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, s0, v9 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, s0, v8 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, s0, v11 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, s0, v10 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v13 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, s0, v13 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, s0, v12 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v15 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, s0, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, s0, v14 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v17 +; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v57, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v51, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v27 +; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v21, s0, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, s0, v5 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v6 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v8 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, s0, v7 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, s0, v9 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, s0, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; GCN-NOHSA-SI-NEXT: 
buffer_store_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v12 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, s0, v11 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, s0, v13 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, s0, v14 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v15 -; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v57, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v51, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, v21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1 -; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v1 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v27 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, v12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, 0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload -; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: s_mov_b32 s16, 0xffff +; GCN-HSA-NEXT: s_mov_b32 s18, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) @@ -7427,9 +7412,9 @@ ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[6:7] ; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[10:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 @@ -7437,126 +7422,124 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[14:15] ; GCN-HSA-NEXT: flat_load_dwordx4 v[18:21], v[18:19] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xf0 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xb0 +; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x90 +; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50 +; GCN-HSA-NEXT: s_add_u32 s14, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v9 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 +; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s16 +; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v12 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11 -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) +; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v13 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s13 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v11 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12 -; GCN-HSA-NEXT: flat_store_dwordx4 
v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v19 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v17 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s17 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: s_waitcnt vmcnt(2) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v21 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v18 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[2:5] -; GCN-HSA-NEXT: v_and_b32_e32 v0, s16, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v18 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v19 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v16 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v17 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v14 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v12 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[2:5] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v14 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v13, s13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[2:5] +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s15 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v10 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[2:5] +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v10 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v11 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s18, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s18, v20 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s18, v7 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v11, s18, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s18, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, v1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_and_b32_e32 v9, s16, v6 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s16, v20 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s16, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, v7 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[11:14] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, v1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: @@ 
-7571,92 +7554,94 @@ ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v1 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s0, v36 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s0, v38 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v38 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s0, v37 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, s0, v3 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v0 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v2 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v5 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v6 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v7 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, s0, v32 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v32 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s0, v34 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s0, v31 -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, s0, v33 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s0, v5 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s0, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, s0, v7 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v29, s0, v6 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v1 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, s0, v0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s0, v3 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, s0, v2 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v4 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, s0, v4 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, s0, v6 +; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v33 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v33 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, s0, v35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, v37 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, s0, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v34, 0 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s0, v5 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 
v42, s0, v7 +; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s0, v32 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, v34 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, 0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, 0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v33 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v31 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, v34 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v34 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v7 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v37 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v37 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v37 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v34 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:176 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v34 +; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v34 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v34 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v34 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v34 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -236,35 +236,35 @@ ; GFX6-LABEL: v_saddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v6, 0, 16 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_bfe_i32 v6, v7, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: s_movk_i32 s4, 0x7fff +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v8, v5 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_i32_e32 v3, s4, v3 ; GFX6-NEXT: s_movk_i32 s5, 0x8000 +; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_i32_e32 v3, s5, v3 +; GFX6-NEXT: v_max_i32_e32 v2, s5, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 ; GFX6-NEXT: s_mov_b32 s6, 0xffff -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: 
v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 -; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v2, s5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s6, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -499,7 +499,7 @@ ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -529,7 +529,7 @@ ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -671,7 +671,7 @@ ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -701,7 +701,7 @@ ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -738,7 +738,7 @@ ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -768,7 +768,7 @@ ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -805,7 +805,7 @@ ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -835,7 +835,7 @@ ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -125,18 +125,18 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v1, vcc, 64, v0 -; GCN-NEXT: v_lshr_b64 v[2:3], 17, v1 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, 64, v0 -; GCN-NEXT: v_lshl_b64 v[4:5], 17, v1 +; GCN-NEXT: v_lshr_b64 v[1:2], 17, v1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, 64, v0 +; GCN-NEXT: v_lshl_b64 v[2:3], 17, v2 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GCN-NEXT: v_lshl_b64 v[4:5], 17, v0 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v1, s[4:5] -; GCN-NEXT: v_lshl_b64 v[0:1], 17, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] %shl = shl i128 17, %rhs ret i128 %shl @@ -146,16 +146,15 @@ ; GCN-LABEL: v_lshr_i128_kv: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_movk_i32 s4, 0x41 ; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_movk_i32 s4, 0x41 ; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -169,11 +168,10 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshr_b64 v[1:2], 33, v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 33, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -480,7 +480,7 @@ ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -512,7 +512,7 @@ ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -551,7 +551,7 @@ ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; 
GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -583,7 +583,7 @@ ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -676,7 +676,7 @@ ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -708,7 +708,7 @@ ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) @@ -747,7 +747,7 @@ ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -779,7 +779,7 @@ ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -237,35 +237,35 @@ ; GFX6-LABEL: v_ssubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_bfe_i32 v8, v1, 0, 16 +; GFX6-NEXT: v_bfe_i32 v1, v6, 0, 16 +; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: v_bfe_i32 v6, v7, 0, 16 +; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: s_movk_i32 s4, 0x7fff +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v8, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_i32_e32 v3, s4, v3 ; GFX6-NEXT: s_movk_i32 s5, 0x8000 +; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 ; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: v_max_i32_e32 v3, s5, v3 +; GFX6-NEXT: v_max_i32_e32 v2, s5, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 ; GFX6-NEXT: s_mov_b32 s6, 0xffff -; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v0, s6, v0 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v7 -; GFX6-NEXT: v_min_i32_e32 v1, s4, v1 -; GFX6-NEXT: v_min_i32_e32 v2, s4, v2 -; GFX6-NEXT: 
v_max_i32_e32 v1, s5, v1 -; GFX6-NEXT: v_max_i32_e32 v2, s5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s6, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -447,9 +447,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_pk_sub_i16 v0, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -167,26 +167,26 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v7 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX6-NEXT: v_min_u32_e32 v2, s4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v6 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v8, v5 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_min_u32_e32 v2, s4, v2 -; GFX6-NEXT: v_min_u32_e32 v1, s4, v1 +; GFX6-NEXT: v_min_u32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uaddsat_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -168,28 +168,28 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v10, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v10, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v8, s4, v4 ; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_and_b32_e32 v11, s4, v5 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_max_u32_e32 v1, v1, v11 -; GFX6-NEXT: v_max_u32_e32 v0, v0, v10 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_and_b32_e32 v8, s4, v6 +; GFX6-NEXT: v_and_b32_e32 v9, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v6 ; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: 
v_and_b32_e32 v9, s4, v7 +; GFX6-NEXT: v_and_b32_e32 v11, s4, v7 ; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_max_u32_e32 v2, v2, v8 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_max_u32_e32 v1, v3, v9 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_max_u32_e32 v1, v2, v1 +; GFX6-NEXT: v_max_u32_e32 v3, v3, v11 +; GFX6-NEXT: v_max_u32_e32 v2, v10, v9 +; GFX6-NEXT: v_max_u32_e32 v0, v0, v8 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v4i16: diff --git a/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll b/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll --- a/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll +++ b/llvm/test/CodeGen/ARM/dagcombine-anyexttozeroext.ll @@ -43,7 +43,18 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] ; CHECK-NEXT: vmovl.u8 q8, d16 -; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vmov.u16 r0, d16[0] +; CHECK-NEXT: vmov.u16 r1, d16[1] +; CHECK-NEXT: vmov.u16 r2, d16[2] +; CHECK-NEXT: vmov.u16 r3, d16[3] +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: vmov.32 d16[0], r0 +; CHECK-NEXT: uxtb r0, r1 +; CHECK-NEXT: vmov.32 d16[1], r0 +; CHECK-NEXT: uxtb r0, r2 +; CHECK-NEXT: vmov.32 d17[0], r0 +; CHECK-NEXT: uxtb r0, r3 +; CHECK-NEXT: vmov.32 d17[1], r0 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll --- a/llvm/test/CodeGen/ARM/fp16-insert-extract.ll +++ b/llvm/test/CodeGen/ARM/fp16-insert-extract.ll @@ -101,12 +101,10 @@ ; ; CHECKSOFT-LABEL: test_vset_laneq_f16_1: ; CHECKSOFT: @ %bb.0: @ %entry -; CHECKSOFT-NEXT: vmov d1, r2, r3 -; CHECKSOFT-NEXT: vldr s4, [sp] -; CHECKSOFT-NEXT: vmov d0, r0, r1 -; CHECKSOFT-NEXT: vcvtt.f16.f32 s0, s4 -; CHECKSOFT-NEXT: vmov r2, r3, d1 -; CHECKSOFT-NEXT: vmov r0, r1, d0 +; CHECKSOFT-NEXT: vldr s0, [sp] +; CHECKSOFT-NEXT: vmov d2, r0, r1 +; CHECKSOFT-NEXT: vcvtt.f16.f32 s4, s0 +; CHECKSOFT-NEXT: vmov r0, r1, d2 ; CHECKSOFT-NEXT: bx lr entry: %b = fptrunc float %fb to half @@ -126,7 +124,6 @@ ; CHECKSOFT-NEXT: vldr s4, [sp] ; CHECKSOFT-NEXT: vmov d0, r0, r1 ; CHECKSOFT-NEXT: vcvtt.f16.f32 s3, s4 -; CHECKSOFT-NEXT: vmov r0, r1, d0 ; CHECKSOFT-NEXT: vmov r2, r3, d1 ; CHECKSOFT-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/ARM/func-argpassing-endian.ll b/llvm/test/CodeGen/ARM/func-argpassing-endian.ll --- a/llvm/test/CodeGen/ARM/func-argpassing-endian.ll +++ b/llvm/test/CodeGen/ARM/func-argpassing-endian.ll @@ -102,31 +102,33 @@ define <4 x i32> @return_v4i32() { ; CHECK-LE-LABEL: return_v4i32: ; CHECK-LE: @ %bb.0: -; CHECK-LE-NEXT: adr r0, .LCPI6_0 -; CHECK-LE-NEXT: vld1.64 {d16, d17}, [r0:128] +; CHECK-LE-NEXT: vldr d16, .LCPI6_0 +; CHECK-LE-NEXT: vldr d17, .LCPI6_1 ; CHECK-LE-NEXT: vmov r0, r1, d16 ; CHECK-LE-NEXT: vmov r2, r3, d17 ; CHECK-LE-NEXT: bx lr -; CHECK-LE-NEXT: .p2align 4 +; 
CHECK-LE-NEXT: .p2align 3 ; CHECK-LE-NEXT: @ %bb.1: ; CHECK-LE-NEXT: .LCPI6_0: ; CHECK-LE-NEXT: .long 42 @ double 9.1245819032257467E-313 ; CHECK-LE-NEXT: .long 43 +; CHECK-LE-NEXT: .LCPI6_1: ; CHECK-LE-NEXT: .long 44 @ double 9.5489810615176143E-313 ; CHECK-LE-NEXT: .long 45 ; ; CHECK-BE-LABEL: return_v4i32: ; CHECK-BE: @ %bb.0: -; CHECK-BE-NEXT: adr r0, .LCPI6_0 -; CHECK-BE-NEXT: vld1.64 {d16, d17}, [r0:128] +; CHECK-BE-NEXT: vldr d16, .LCPI6_0 +; CHECK-BE-NEXT: vldr d17, .LCPI6_1 ; CHECK-BE-NEXT: vmov r1, r0, d16 ; CHECK-BE-NEXT: vmov r3, r2, d17 ; CHECK-BE-NEXT: bx lr -; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .p2align 3 ; CHECK-BE-NEXT: @ %bb.1: ; CHECK-BE-NEXT: .LCPI6_0: ; CHECK-BE-NEXT: .long 42 @ double 8.912382324178626E-313 ; CHECK-BE-NEXT: .long 43 +; CHECK-BE-NEXT: .LCPI6_1: ; CHECK-BE-NEXT: .long 44 @ double 9.3367814824704935E-313 ; CHECK-BE-NEXT: .long 45 ret < 4 x i32> < i32 42, i32 43, i32 44, i32 45 > diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll --- a/llvm/test/CodeGen/ARM/vdup.ll +++ b/llvm/test/CodeGen/ARM/vdup.ll @@ -56,7 +56,16 @@ define <16 x i8> @v_dupQ8(i8 %A) nounwind { ; CHECK-LABEL: v_dupQ8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vdup.8 q8, r0 +; CHECK-NEXT: vmov.i32 d17, #0x0 +; CHECK-NEXT: vdup.8 d16, r0 +; CHECK-NEXT: vmov.8 d17[0], r0 +; CHECK-NEXT: vmov.8 d17[1], r0 +; CHECK-NEXT: vmov.8 d17[2], r0 +; CHECK-NEXT: vmov.8 d17[3], r0 +; CHECK-NEXT: vmov.8 d17[4], r0 +; CHECK-NEXT: vmov.8 d17[5], r0 +; CHECK-NEXT: vmov.8 d17[6], r0 +; CHECK-NEXT: vmov.8 d17[7], r0 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -82,7 +91,12 @@ define <8 x i16> @v_dupQ16(i16 %A) nounwind { ; CHECK-LABEL: v_dupQ16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vdup.16 q8, r0 +; CHECK-NEXT: vmov.i32 d17, #0x0 +; CHECK-NEXT: vdup.16 d16, r0 +; CHECK-NEXT: vmov.16 d17[0], r0 +; CHECK-NEXT: vmov.16 d17[1], r0 +; CHECK-NEXT: vmov.16 d17[2], r0 +; CHECK-NEXT: vmov.16 d17[3], r0 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -100,7 +114,9 @@ define <4 x i32> @v_dupQ32(i32 %A) nounwind { ; CHECK-LABEL: v_dupQ32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vdup.32 q8, r0 +; CHECK-NEXT: vdup.32 d16, r0 +; CHECK-NEXT: vmov.32 d17[0], r0 +; CHECK-NEXT: vmov.32 d17[1], r0 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -114,9 +130,12 @@ define <4 x float> @v_dupQfloat(float %A) nounwind { ; CHECK-LABEL: v_dupQfloat: ; CHECK: @ %bb.0: -; CHECK-NEXT: vdup.32 q8, r0 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vmov.f32 s1, s0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov.f32 s2, s0 +; CHECK-NEXT: vmov.f32 s3, s0 +; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: mov pc, lr %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0 %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1 @@ -379,7 +398,8 @@ define <4 x i32> @tdupi(i32 %x, i32 %y) { ; CHECK-LABEL: tdupi: ; CHECK: @ %bb.0: -; CHECK-NEXT: vdup.32 q8, r0 +; CHECK-NEXT: vdup.32 d16, r0 +; CHECK-NEXT: vmov.32 d17[0], r0 ; CHECK-NEXT: vmov.32 d17[1], r1 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 @@ -394,9 +414,11 @@ define <4 x float> @tdupf(float %x, float %y) { ; CHECK-LABEL: tdupf: ; CHECK: @ %bb.0: -; CHECK-NEXT: vdup.32 q0, r0 +; CHECK-NEXT: vmov s0, r0 ; CHECK-NEXT: vmov s3, r1 +; CHECK-NEXT: vmov.f32 s1, s0 ; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov.f32 s2, s0 ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: mov 
pc, lr %1 = insertelement <4 x float> undef, float %x, i32 0 @@ -412,8 +434,11 @@ ; CHECK-LABEL: tduplane: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov d16, r0, r1 -; CHECK-NEXT: mov r0, #255 +; CHECK-NEXT: vmov.32 r0, d16[1] +; CHECK-NEXT: vmov d17, r2, r3 ; CHECK-NEXT: vdup.32 q8, d16[1] +; CHECK-NEXT: vmov.32 d17[0], r0 +; CHECK-NEXT: mov r0, #255 ; CHECK-NEXT: vmov.32 d17[1], r0 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 diff --git a/llvm/test/CodeGen/ARM/vldlane.ll b/llvm/test/CodeGen/ARM/vldlane.ll --- a/llvm/test/CodeGen/ARM/vldlane.ll +++ b/llvm/test/CodeGen/ARM/vldlane.ll @@ -72,13 +72,23 @@ } define <16 x i8> @vld1laneQi8(i8* %A, <16 x i8>* %B) nounwind { -; CHECK-LABEL: vld1laneQi8: -; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: vld1.8 {d17[1]}, [r0] -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; DEFAULT-LABEL: vld1laneQi8: +; DEFAULT: @ %bb.0: +; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1] +; DEFAULT-NEXT: vorr q9, q8, q8 +; DEFAULT-NEXT: vld1.8 {d19[1]}, [r0] +; DEFAULT-NEXT: vmov r0, r1, d16 +; DEFAULT-NEXT: vmov r2, r3, d19 +; DEFAULT-NEXT: mov pc, lr +; +; BASIC-LABEL: vld1laneQi8: +; BASIC: @ %bb.0: +; BASIC-NEXT: vld1.64 {d18, d19}, [r1] +; BASIC-NEXT: vorr q8, q9, q9 +; BASIC-NEXT: vld1.8 {d17[1]}, [r0] +; BASIC-NEXT: vmov r0, r1, d18 +; BASIC-NEXT: vmov r2, r3, d17 +; BASIC-NEXT: mov pc, lr %tmp1 = load <16 x i8>, <16 x i8>* %B %tmp2 = load i8, i8* %A, align 8 %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9 @@ -86,13 +96,23 @@ } define <8 x i16> @vld1laneQi16(i16* %A, <8 x i16>* %B) nounwind { -; CHECK-LABEL: vld1laneQi16: -; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: vld1.16 {d17[1]}, [r0:16] -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; DEFAULT-LABEL: vld1laneQi16: +; DEFAULT: @ %bb.0: +; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1] +; DEFAULT-NEXT: vorr q9, q8, q8 +; DEFAULT-NEXT: vld1.16 {d19[1]}, [r0:16] +; DEFAULT-NEXT: vmov r0, r1, d16 +; DEFAULT-NEXT: vmov r2, r3, d19 +; DEFAULT-NEXT: mov pc, lr +; +; BASIC-LABEL: vld1laneQi16: +; BASIC: @ %bb.0: +; BASIC-NEXT: vld1.64 {d18, d19}, [r1] +; BASIC-NEXT: vorr q8, q9, q9 +; BASIC-NEXT: vld1.16 {d17[1]}, [r0:16] +; BASIC-NEXT: vmov r0, r1, d18 +; BASIC-NEXT: vmov r2, r3, d17 +; BASIC-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %B %tmp2 = load i16, i16* %A, align 8 %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5 @@ -100,13 +120,23 @@ } define <4 x i32> @vld1laneQi32(i32* %A, <4 x i32>* %B) nounwind { -; CHECK-LABEL: vld1laneQi32: -; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: vld1.32 {d17[1]}, [r0:32] -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov pc, lr +; DEFAULT-LABEL: vld1laneQi32: +; DEFAULT: @ %bb.0: +; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1] +; DEFAULT-NEXT: vorr q9, q8, q8 +; DEFAULT-NEXT: vld1.32 {d19[1]}, [r0:32] +; DEFAULT-NEXT: vmov r0, r1, d16 +; DEFAULT-NEXT: vmov r2, r3, d19 +; DEFAULT-NEXT: mov pc, lr +; +; BASIC-LABEL: vld1laneQi32: +; BASIC: @ %bb.0: +; BASIC-NEXT: vld1.64 {d18, d19}, [r1] +; BASIC-NEXT: vorr q8, q9, q9 +; BASIC-NEXT: vld1.32 {d17[1]}, [r0:32] +; BASIC-NEXT: vmov r0, r1, d18 +; BASIC-NEXT: vmov r2, r3, d17 +; BASIC-NEXT: mov pc, lr %tmp1 = load <4 x i32>, <4 x i32>* %B %tmp2 = load i32, i32* %A, align 8 %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3 @@ -114,13 +144,23 @@ } define <4 x float> @vld1laneQf(float* %A, <4 x float>* %B) nounwind { -; 
CHECK-LABEL: vld1laneQf: -; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] -; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: mov pc, lr +; DEFAULT-LABEL: vld1laneQf: +; DEFAULT: @ %bb.0: +; DEFAULT-NEXT: vld1.64 {d16, d17}, [r1] +; DEFAULT-NEXT: vorr q9, q8, q8 +; DEFAULT-NEXT: vmov r2, r3, d17 +; DEFAULT-NEXT: vld1.32 {d18[0]}, [r0:32] +; DEFAULT-NEXT: vmov r0, r1, d18 +; DEFAULT-NEXT: mov pc, lr +; +; BASIC-LABEL: vld1laneQf: +; BASIC: @ %bb.0: +; BASIC-NEXT: vld1.64 {d18, d19}, [r1] +; BASIC-NEXT: vorr q8, q9, q9 +; BASIC-NEXT: vmov r2, r3, d19 +; BASIC-NEXT: vld1.32 {d16[0]}, [r0:32] +; BASIC-NEXT: vmov r0, r1, d16 +; BASIC-NEXT: mov pc, lr %tmp1 = load <4 x float>, <4 x float>* %B %tmp2 = load float, float* %A %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0 diff --git a/llvm/test/CodeGen/ARM/vzip.ll b/llvm/test/CodeGen/ARM/vzip.ll --- a/llvm/test/CodeGen/ARM/vzip.ll +++ b/llvm/test/CodeGen/ARM/vzip.ll @@ -291,7 +291,7 @@ ; CHECK-NEXT: vldr d16, [r1] ; CHECK-NEXT: vldr d19, [r0] ; CHECK-NEXT: vtrn.16 d19, d16 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d19 ; CHECK-NEXT: mov pc, lr entry: diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll --- a/llvm/test/CodeGen/Mips/cconv/vector.ll +++ b/llvm/test/CodeGen/Mips/cconv/vector.ll @@ -960,84 +960,68 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5EB-LABEL: i8_8: -; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EB-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 31, -4 -; MIPS32R5EB-NEXT: .cfi_offset 30, -8 -; MIPS32R5EB-NEXT: move $fp, $sp -; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EB-NEXT: addiu $1, $zero, -16 -; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: sw $6, 24($sp) -; MIPS32R5EB-NEXT: lbu $1, 25($sp) -; MIPS32R5EB-NEXT: lbu $2, 24($sp) -; MIPS32R5EB-NEXT: sw $7, 28($sp) -; MIPS32R5EB-NEXT: insert.h $w0[0], $2 -; MIPS32R5EB-NEXT: insert.h $w0[1], $1 -; MIPS32R5EB-NEXT: lbu $1, 26($sp) -; MIPS32R5EB-NEXT: sw $4, 32($sp) -; MIPS32R5EB-NEXT: insert.h $w0[2], $1 -; MIPS32R5EB-NEXT: lbu $1, 27($sp) -; MIPS32R5EB-NEXT: insert.h $w0[3], $1 -; MIPS32R5EB-NEXT: lbu $1, 28($sp) -; MIPS32R5EB-NEXT: sw $5, 36($sp) -; MIPS32R5EB-NEXT: insert.h $w0[4], $1 -; MIPS32R5EB-NEXT: lbu $1, 33($sp) -; MIPS32R5EB-NEXT: lbu $2, 32($sp) -; MIPS32R5EB-NEXT: insert.h $w1[0], $2 -; MIPS32R5EB-NEXT: insert.h $w1[1], $1 -; MIPS32R5EB-NEXT: lbu $1, 29($sp) -; MIPS32R5EB-NEXT: lbu $2, 34($sp) -; MIPS32R5EB-NEXT: insert.h $w1[2], $2 -; MIPS32R5EB-NEXT: insert.h $w0[5], $1 -; MIPS32R5EB-NEXT: lbu $1, 35($sp) -; MIPS32R5EB-NEXT: lbu $2, 31($sp) -; MIPS32R5EB-NEXT: lbu $3, 30($sp) -; MIPS32R5EB-NEXT: lbu $4, 39($sp) -; MIPS32R5EB-NEXT: insert.h $w0[6], $3 -; MIPS32R5EB-NEXT: insert.h $w0[7], $2 -; MIPS32R5EB-NEXT: insert.h $w1[3], $1 -; MIPS32R5EB-NEXT: lbu $1, 36($sp) -; MIPS32R5EB-NEXT: insert.h $w1[4], $1 -; MIPS32R5EB-NEXT: lbu $1, 37($sp) -; MIPS32R5EB-NEXT: insert.h $w1[5], $1 -; MIPS32R5EB-NEXT: lbu $1, 38($sp) -; MIPS32R5EB-NEXT: insert.h $w1[6], $1 -; MIPS32R5EB-NEXT: insert.h $w1[7], $4 -; MIPS32R5EB-NEXT: addv.h $w0, $w1, $w0 -; MIPS32R5EB-NEXT: copy_s.h $1, $w0[0] -; MIPS32R5EB-NEXT: copy_s.h $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.h $3, $w0[2] -; MIPS32R5EB-NEXT: copy_s.h $4, $w0[3] -; 
MIPS32R5EB-NEXT: copy_s.h $5, $w0[4] -; MIPS32R5EB-NEXT: copy_s.h $6, $w0[5] -; MIPS32R5EB-NEXT: copy_s.h $7, $w0[6] -; MIPS32R5EB-NEXT: copy_s.h $8, $w0[7] -; MIPS32R5EB-NEXT: sb $8, 23($sp) -; MIPS32R5EB-NEXT: sb $7, 22($sp) -; MIPS32R5EB-NEXT: sb $6, 21($sp) -; MIPS32R5EB-NEXT: sb $5, 20($sp) -; MIPS32R5EB-NEXT: sb $4, 19($sp) -; MIPS32R5EB-NEXT: sb $3, 18($sp) -; MIPS32R5EB-NEXT: sb $2, 17($sp) -; MIPS32R5EB-NEXT: sb $1, 16($sp) -; MIPS32R5EB-NEXT: lw $1, 20($sp) -; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: lw $1, 16($sp) -; MIPS32R5EB-NEXT: sw $1, 4($sp) -; MIPS32R5EB-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] -; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 48 -; MIPS32R5EB-NEXT: jr $ra -; MIPS32R5EB-NEXT: nop +; MIPS32R5-LABEL: i8_8: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: addiu $sp, $sp, -24 +; MIPS32R5-NEXT: .cfi_def_cfa_offset 24 +; MIPS32R5-NEXT: sw $6, 8($sp) +; MIPS32R5-NEXT: lbu $1, 9($sp) +; MIPS32R5-NEXT: lbu $2, 8($sp) +; MIPS32R5-NEXT: sw $7, 12($sp) +; MIPS32R5-NEXT: insert.h $w0[0], $2 +; MIPS32R5-NEXT: insert.h $w0[1], $1 +; MIPS32R5-NEXT: lbu $1, 10($sp) +; MIPS32R5-NEXT: sw $4, 16($sp) +; MIPS32R5-NEXT: insert.h $w0[2], $1 +; MIPS32R5-NEXT: lbu $1, 11($sp) +; MIPS32R5-NEXT: insert.h $w0[3], $1 +; MIPS32R5-NEXT: lbu $1, 12($sp) +; MIPS32R5-NEXT: sw $5, 20($sp) +; MIPS32R5-NEXT: insert.h $w0[4], $1 +; MIPS32R5-NEXT: lbu $1, 17($sp) +; MIPS32R5-NEXT: lbu $2, 16($sp) +; MIPS32R5-NEXT: insert.h $w1[0], $2 +; MIPS32R5-NEXT: insert.h $w1[1], $1 +; MIPS32R5-NEXT: lbu $1, 13($sp) +; MIPS32R5-NEXT: lbu $2, 18($sp) +; MIPS32R5-NEXT: insert.h $w1[2], $2 +; MIPS32R5-NEXT: insert.h $w0[5], $1 +; MIPS32R5-NEXT: lbu $1, 19($sp) +; MIPS32R5-NEXT: lbu $2, 15($sp) +; MIPS32R5-NEXT: lbu $3, 14($sp) +; MIPS32R5-NEXT: lbu $4, 23($sp) +; MIPS32R5-NEXT: insert.h $w0[6], $3 +; MIPS32R5-NEXT: insert.h $w0[7], $2 +; MIPS32R5-NEXT: insert.h $w1[3], $1 +; MIPS32R5-NEXT: lbu $1, 20($sp) +; MIPS32R5-NEXT: insert.h $w1[4], $1 +; MIPS32R5-NEXT: lbu $1, 21($sp) +; MIPS32R5-NEXT: insert.h $w1[5], $1 +; MIPS32R5-NEXT: lbu $1, 22($sp) +; MIPS32R5-NEXT: insert.h $w1[6], $1 +; MIPS32R5-NEXT: insert.h $w1[7], $4 +; MIPS32R5-NEXT: addv.h $w0, $w1, $w0 +; MIPS32R5-NEXT: copy_s.h $1, $w0[4] +; MIPS32R5-NEXT: copy_s.h $2, $w0[5] +; MIPS32R5-NEXT: copy_s.h $3, $w0[6] +; MIPS32R5-NEXT: copy_s.h $4, $w0[7] +; MIPS32R5-NEXT: copy_s.h $5, $w0[0] +; MIPS32R5-NEXT: copy_s.h $6, $w0[1] +; MIPS32R5-NEXT: copy_s.h $7, $w0[2] +; MIPS32R5-NEXT: copy_s.h $8, $w0[3] +; MIPS32R5-NEXT: sb $8, 3($sp) +; MIPS32R5-NEXT: sb $7, 2($sp) +; MIPS32R5-NEXT: sb $6, 1($sp) +; MIPS32R5-NEXT: sb $5, 0($sp) +; MIPS32R5-NEXT: sb $4, 7($sp) +; MIPS32R5-NEXT: sb $3, 6($sp) +; MIPS32R5-NEXT: sb $2, 5($sp) +; MIPS32R5-NEXT: sb $1, 4($sp) +; MIPS32R5-NEXT: lw $2, 0($sp) +; MIPS32R5-NEXT: lw $3, 4($sp) +; MIPS32R5-NEXT: addiu $sp, $sp, 24 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: nop ; ; MIPS64R5-LABEL: i8_8: ; MIPS64R5: # %bb.0: @@ -1098,85 +1082,6 @@ ; MIPS64R5-NEXT: daddiu $sp, $sp, 32 ; MIPS64R5-NEXT: jr $ra ; MIPS64R5-NEXT: nop -; -; MIPS32R5EL-LABEL: i8_8: -; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 31, -4 -; MIPS32R5EL-NEXT: 
.cfi_offset 30, -8 -; MIPS32R5EL-NEXT: move $fp, $sp -; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EL-NEXT: addiu $1, $zero, -16 -; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: sw $6, 24($sp) -; MIPS32R5EL-NEXT: lbu $1, 25($sp) -; MIPS32R5EL-NEXT: lbu $2, 24($sp) -; MIPS32R5EL-NEXT: sw $7, 28($sp) -; MIPS32R5EL-NEXT: insert.h $w0[0], $2 -; MIPS32R5EL-NEXT: insert.h $w0[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 26($sp) -; MIPS32R5EL-NEXT: sw $4, 32($sp) -; MIPS32R5EL-NEXT: insert.h $w0[2], $1 -; MIPS32R5EL-NEXT: lbu $1, 27($sp) -; MIPS32R5EL-NEXT: insert.h $w0[3], $1 -; MIPS32R5EL-NEXT: lbu $1, 28($sp) -; MIPS32R5EL-NEXT: sw $5, 36($sp) -; MIPS32R5EL-NEXT: insert.h $w0[4], $1 -; MIPS32R5EL-NEXT: lbu $1, 33($sp) -; MIPS32R5EL-NEXT: lbu $2, 32($sp) -; MIPS32R5EL-NEXT: insert.h $w1[0], $2 -; MIPS32R5EL-NEXT: insert.h $w1[1], $1 -; MIPS32R5EL-NEXT: lbu $1, 29($sp) -; MIPS32R5EL-NEXT: lbu $2, 34($sp) -; MIPS32R5EL-NEXT: insert.h $w1[2], $2 -; MIPS32R5EL-NEXT: insert.h $w0[5], $1 -; MIPS32R5EL-NEXT: lbu $1, 35($sp) -; MIPS32R5EL-NEXT: lbu $2, 31($sp) -; MIPS32R5EL-NEXT: lbu $3, 30($sp) -; MIPS32R5EL-NEXT: lbu $4, 39($sp) -; MIPS32R5EL-NEXT: insert.h $w0[6], $3 -; MIPS32R5EL-NEXT: insert.h $w0[7], $2 -; MIPS32R5EL-NEXT: insert.h $w1[3], $1 -; MIPS32R5EL-NEXT: lbu $1, 36($sp) -; MIPS32R5EL-NEXT: insert.h $w1[4], $1 -; MIPS32R5EL-NEXT: lbu $1, 37($sp) -; MIPS32R5EL-NEXT: insert.h $w1[5], $1 -; MIPS32R5EL-NEXT: lbu $1, 38($sp) -; MIPS32R5EL-NEXT: insert.h $w1[6], $1 -; MIPS32R5EL-NEXT: insert.h $w1[7], $4 -; MIPS32R5EL-NEXT: addv.h $w0, $w1, $w0 -; MIPS32R5EL-NEXT: copy_s.h $1, $w0[0] -; MIPS32R5EL-NEXT: copy_s.h $2, $w0[1] -; MIPS32R5EL-NEXT: copy_s.h $3, $w0[2] -; MIPS32R5EL-NEXT: copy_s.h $4, $w0[3] -; MIPS32R5EL-NEXT: copy_s.h $5, $w0[4] -; MIPS32R5EL-NEXT: copy_s.h $6, $w0[5] -; MIPS32R5EL-NEXT: copy_s.h $7, $w0[6] -; MIPS32R5EL-NEXT: copy_s.h $8, $w0[7] -; MIPS32R5EL-NEXT: sb $8, 23($sp) -; MIPS32R5EL-NEXT: sb $7, 22($sp) -; MIPS32R5EL-NEXT: sb $6, 21($sp) -; MIPS32R5EL-NEXT: sb $5, 20($sp) -; MIPS32R5EL-NEXT: sb $4, 19($sp) -; MIPS32R5EL-NEXT: sb $3, 18($sp) -; MIPS32R5EL-NEXT: sb $2, 17($sp) -; MIPS32R5EL-NEXT: sb $1, 16($sp) -; MIPS32R5EL-NEXT: lw $1, 20($sp) -; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: lw $1, 16($sp) -; MIPS32R5EL-NEXT: sw $1, 0($sp) -; MIPS32R5EL-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 -; MIPS32R5EL-NEXT: jr $ra -; MIPS32R5EL-NEXT: nop %1 = add <8 x i8> %a, %b ret <8 x i8> %1 } @@ -1642,60 +1547,44 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5EB-LABEL: i16_4: -; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EB-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 31, -4 -; MIPS32R5EB-NEXT: .cfi_offset 30, -8 -; MIPS32R5EB-NEXT: move $fp, $sp -; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EB-NEXT: addiu $1, $zero, -16 -; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: sw $6, 24($sp) -; MIPS32R5EB-NEXT: sw $7, 28($sp) -; MIPS32R5EB-NEXT: lhu $1, 26($sp) -; MIPS32R5EB-NEXT: lhu $2, 24($sp) -; MIPS32R5EB-NEXT: sw $4, 32($sp) -; MIPS32R5EB-NEXT: insert.w $w0[0], $2 -; MIPS32R5EB-NEXT: insert.w $w0[1], $1 -; MIPS32R5EB-NEXT: lhu $1, 
28($sp) -; MIPS32R5EB-NEXT: sw $5, 36($sp) -; MIPS32R5EB-NEXT: insert.w $w0[2], $1 -; MIPS32R5EB-NEXT: lhu $1, 30($sp) -; MIPS32R5EB-NEXT: insert.w $w0[3], $1 -; MIPS32R5EB-NEXT: lhu $1, 34($sp) -; MIPS32R5EB-NEXT: lhu $2, 32($sp) -; MIPS32R5EB-NEXT: insert.w $w1[0], $2 -; MIPS32R5EB-NEXT: insert.w $w1[1], $1 -; MIPS32R5EB-NEXT: lhu $1, 36($sp) -; MIPS32R5EB-NEXT: insert.w $w1[2], $1 -; MIPS32R5EB-NEXT: lhu $1, 38($sp) -; MIPS32R5EB-NEXT: insert.w $w1[3], $1 -; MIPS32R5EB-NEXT: addv.w $w0, $w1, $w0 -; MIPS32R5EB-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EB-NEXT: copy_s.w $4, $w0[3] -; MIPS32R5EB-NEXT: sh $4, 22($sp) -; MIPS32R5EB-NEXT: sh $3, 20($sp) -; MIPS32R5EB-NEXT: sh $2, 18($sp) -; MIPS32R5EB-NEXT: sh $1, 16($sp) -; MIPS32R5EB-NEXT: lw $1, 20($sp) -; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: lw $1, 16($sp) -; MIPS32R5EB-NEXT: sw $1, 4($sp) -; MIPS32R5EB-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] -; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 48 -; MIPS32R5EB-NEXT: jr $ra -; MIPS32R5EB-NEXT: nop +; MIPS32R5-LABEL: i16_4: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: addiu $sp, $sp, -24 +; MIPS32R5-NEXT: .cfi_def_cfa_offset 24 +; MIPS32R5-NEXT: sw $6, 8($sp) +; MIPS32R5-NEXT: sw $7, 12($sp) +; MIPS32R5-NEXT: lhu $1, 10($sp) +; MIPS32R5-NEXT: lhu $2, 8($sp) +; MIPS32R5-NEXT: sw $4, 16($sp) +; MIPS32R5-NEXT: insert.w $w0[0], $2 +; MIPS32R5-NEXT: insert.w $w0[1], $1 +; MIPS32R5-NEXT: lhu $1, 12($sp) +; MIPS32R5-NEXT: sw $5, 20($sp) +; MIPS32R5-NEXT: insert.w $w0[2], $1 +; MIPS32R5-NEXT: lhu $1, 14($sp) +; MIPS32R5-NEXT: insert.w $w0[3], $1 +; MIPS32R5-NEXT: lhu $1, 18($sp) +; MIPS32R5-NEXT: lhu $2, 16($sp) +; MIPS32R5-NEXT: insert.w $w1[0], $2 +; MIPS32R5-NEXT: insert.w $w1[1], $1 +; MIPS32R5-NEXT: lhu $1, 20($sp) +; MIPS32R5-NEXT: insert.w $w1[2], $1 +; MIPS32R5-NEXT: lhu $1, 22($sp) +; MIPS32R5-NEXT: insert.w $w1[3], $1 +; MIPS32R5-NEXT: addv.w $w0, $w1, $w0 +; MIPS32R5-NEXT: copy_s.w $1, $w0[2] +; MIPS32R5-NEXT: copy_s.w $2, $w0[3] +; MIPS32R5-NEXT: copy_s.w $3, $w0[0] +; MIPS32R5-NEXT: copy_s.w $4, $w0[1] +; MIPS32R5-NEXT: sh $4, 2($sp) +; MIPS32R5-NEXT: sh $3, 0($sp) +; MIPS32R5-NEXT: sh $2, 6($sp) +; MIPS32R5-NEXT: sh $1, 4($sp) +; MIPS32R5-NEXT: lw $2, 0($sp) +; MIPS32R5-NEXT: lw $3, 4($sp) +; MIPS32R5-NEXT: addiu $sp, $sp, 24 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: nop ; ; MIPS64R5-LABEL: i16_4: ; MIPS64R5: # %bb.0: @@ -1732,61 +1621,6 @@ ; MIPS64R5-NEXT: daddiu $sp, $sp, 32 ; MIPS64R5-NEXT: jr $ra ; MIPS64R5-NEXT: nop -; -; MIPS32R5EL-LABEL: i16_4: -; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -48 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48 -; MIPS32R5EL-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: sw $fp, 40($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 31, -4 -; MIPS32R5EL-NEXT: .cfi_offset 30, -8 -; MIPS32R5EL-NEXT: move $fp, $sp -; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EL-NEXT: addiu $1, $zero, -16 -; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: sw $6, 24($sp) -; MIPS32R5EL-NEXT: sw $7, 28($sp) -; MIPS32R5EL-NEXT: lhu $1, 26($sp) -; MIPS32R5EL-NEXT: lhu $2, 24($sp) -; MIPS32R5EL-NEXT: sw $4, 32($sp) -; MIPS32R5EL-NEXT: insert.w $w0[0], $2 -; MIPS32R5EL-NEXT: insert.w $w0[1], $1 -; MIPS32R5EL-NEXT: lhu $1, 28($sp) -; 
MIPS32R5EL-NEXT: sw $5, 36($sp) -; MIPS32R5EL-NEXT: insert.w $w0[2], $1 -; MIPS32R5EL-NEXT: lhu $1, 30($sp) -; MIPS32R5EL-NEXT: insert.w $w0[3], $1 -; MIPS32R5EL-NEXT: lhu $1, 34($sp) -; MIPS32R5EL-NEXT: lhu $2, 32($sp) -; MIPS32R5EL-NEXT: insert.w $w1[0], $2 -; MIPS32R5EL-NEXT: insert.w $w1[1], $1 -; MIPS32R5EL-NEXT: lhu $1, 36($sp) -; MIPS32R5EL-NEXT: insert.w $w1[2], $1 -; MIPS32R5EL-NEXT: lhu $1, 38($sp) -; MIPS32R5EL-NEXT: insert.w $w1[3], $1 -; MIPS32R5EL-NEXT: addv.w $w0, $w1, $w0 -; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: copy_s.w $4, $w0[3] -; MIPS32R5EL-NEXT: sh $4, 22($sp) -; MIPS32R5EL-NEXT: sh $3, 20($sp) -; MIPS32R5EL-NEXT: sh $2, 18($sp) -; MIPS32R5EL-NEXT: sh $1, 16($sp) -; MIPS32R5EL-NEXT: lw $1, 20($sp) -; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: lw $1, 16($sp) -; MIPS32R5EL-NEXT: sw $1, 0($sp) -; MIPS32R5EL-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 40($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: lw $ra, 44($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 -; MIPS32R5EL-NEXT: jr $ra -; MIPS32R5EL-NEXT: nop %1 = add <4 x i16> %a, %b ret <4 x i16> %1 } @@ -2829,33 +2663,14 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5EB-LABEL: ret_8_i8: -; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 31, -4 -; MIPS32R5EB-NEXT: .cfi_offset 30, -8 -; MIPS32R5EB-NEXT: move $fp, $sp -; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EB-NEXT: addiu $1, $zero, -16 -; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: lui $1, %hi(gv8i8) -; MIPS32R5EB-NEXT: lw $2, %lo(gv8i8)($1) -; MIPS32R5EB-NEXT: sw $2, 4($sp) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv8i8) -; MIPS32R5EB-NEXT: lw $1, 4($1) -; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] -; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EB-NEXT: jr $ra -; MIPS32R5EB-NEXT: nop +; MIPS32R5-LABEL: ret_8_i8: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: lui $1, %hi(gv8i8) +; MIPS32R5-NEXT: lw $2, %lo(gv8i8)($1) +; MIPS32R5-NEXT: addiu $1, $1, %lo(gv8i8) +; MIPS32R5-NEXT: lw $3, 4($1) +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: nop ; ; MIPS64R5-LABEL: ret_8_i8: ; MIPS64R5: # %bb.0: @@ -2866,34 +2681,6 @@ ; MIPS64R5-NEXT: ld $2, 0($1) ; MIPS64R5-NEXT: jr $ra ; MIPS64R5-NEXT: nop -; -; MIPS32R5EL-LABEL: ret_8_i8: -; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 31, -4 -; MIPS32R5EL-NEXT: .cfi_offset 30, -8 -; MIPS32R5EL-NEXT: move $fp, $sp -; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EL-NEXT: addiu $1, $zero, -16 -; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: lui $1, %hi(gv8i8) -; MIPS32R5EL-NEXT: lw $2, %lo(gv8i8)($1) -; MIPS32R5EL-NEXT: sw $2, 0($sp) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv8i8) -; MIPS32R5EL-NEXT: lw $1, 
4($1) -; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EL-NEXT: jr $ra -; MIPS32R5EL-NEXT: nop %1 = load <8 x i8>, <8 x i8> * @gv8i8 ret <8 x i8> %1 } @@ -3006,33 +2793,14 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5EB-LABEL: ret_4_i16: -; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 31, -4 -; MIPS32R5EB-NEXT: .cfi_offset 30, -8 -; MIPS32R5EB-NEXT: move $fp, $sp -; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EB-NEXT: addiu $1, $zero, -16 -; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: lui $1, %hi(gv4i16) -; MIPS32R5EB-NEXT: lw $2, %lo(gv4i16)($1) -; MIPS32R5EB-NEXT: sw $2, 4($sp) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv4i16) -; MIPS32R5EB-NEXT: lw $1, 4($1) -; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] -; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EB-NEXT: jr $ra -; MIPS32R5EB-NEXT: nop +; MIPS32R5-LABEL: ret_4_i16: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: lui $1, %hi(gv4i16) +; MIPS32R5-NEXT: lw $2, %lo(gv4i16)($1) +; MIPS32R5-NEXT: addiu $1, $1, %lo(gv4i16) +; MIPS32R5-NEXT: lw $3, 4($1) +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: nop ; ; MIPS64R5-LABEL: ret_4_i16: ; MIPS64R5: # %bb.0: @@ -3043,34 +2811,6 @@ ; MIPS64R5-NEXT: ld $2, 0($1) ; MIPS64R5-NEXT: jr $ra ; MIPS64R5-NEXT: nop -; -; MIPS32R5EL-LABEL: ret_4_i16: -; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 31, -4 -; MIPS32R5EL-NEXT: .cfi_offset 30, -8 -; MIPS32R5EL-NEXT: move $fp, $sp -; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EL-NEXT: addiu $1, $zero, -16 -; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: lui $1, %hi(gv4i16) -; MIPS32R5EL-NEXT: lw $2, %lo(gv4i16)($1) -; MIPS32R5EL-NEXT: sw $2, 0($sp) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv4i16) -; MIPS32R5EL-NEXT: lw $1, 4($1) -; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EL-NEXT: jr $ra -; MIPS32R5EL-NEXT: nop %1 = load <4 x i16>, <4 x i16> * @gv4i16 ret <4 x i16> %1 } @@ -3145,33 +2885,14 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5EB-LABEL: ret_2_i32: -; MIPS32R5EB: # %bb.0: -; MIPS32R5EB-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; MIPS32R5EB-NEXT: .cfi_offset 31, -4 -; MIPS32R5EB-NEXT: .cfi_offset 30, -8 -; 
MIPS32R5EB-NEXT: move $fp, $sp -; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EB-NEXT: addiu $1, $zero, -16 -; MIPS32R5EB-NEXT: and $sp, $sp, $1 -; MIPS32R5EB-NEXT: lui $1, %hi(gv2i32) -; MIPS32R5EB-NEXT: lw $2, %lo(gv2i32)($1) -; MIPS32R5EB-NEXT: sw $2, 4($sp) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv2i32) -; MIPS32R5EB-NEXT: lw $1, 4($1) -; MIPS32R5EB-NEXT: sw $1, 12($sp) -; MIPS32R5EB-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[3] -; MIPS32R5EB-NEXT: move $sp, $fp -; MIPS32R5EB-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EB-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EB-NEXT: jr $ra -; MIPS32R5EB-NEXT: nop +; MIPS32R5-LABEL: ret_2_i32: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: lui $1, %hi(gv2i32) +; MIPS32R5-NEXT: lw $2, %lo(gv2i32)($1) +; MIPS32R5-NEXT: addiu $1, $1, %lo(gv2i32) +; MIPS32R5-NEXT: lw $3, 4($1) +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: nop ; ; MIPS64R5-LABEL: ret_2_i32: ; MIPS64R5: # %bb.0: @@ -3182,34 +2903,6 @@ ; MIPS64R5-NEXT: ld $2, 0($1) ; MIPS64R5-NEXT: jr $ra ; MIPS64R5-NEXT: nop -; -; MIPS32R5EL-LABEL: ret_2_i32: -; MIPS32R5EL: # %bb.0: -; MIPS32R5EL-NEXT: addiu $sp, $sp, -32 -; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 -; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill -; MIPS32R5EL-NEXT: .cfi_offset 31, -4 -; MIPS32R5EL-NEXT: .cfi_offset 30, -8 -; MIPS32R5EL-NEXT: move $fp, $sp -; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5EL-NEXT: addiu $1, $zero, -16 -; MIPS32R5EL-NEXT: and $sp, $sp, $1 -; MIPS32R5EL-NEXT: lui $1, %hi(gv2i32) -; MIPS32R5EL-NEXT: lw $2, %lo(gv2i32)($1) -; MIPS32R5EL-NEXT: sw $2, 0($sp) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv2i32) -; MIPS32R5EL-NEXT: lw $1, 4($1) -; MIPS32R5EL-NEXT: sw $1, 8($sp) -; MIPS32R5EL-NEXT: ld.w $w0, 0($sp) -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: move $sp, $fp -; MIPS32R5EL-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload -; MIPS32R5EL-NEXT: addiu $sp, $sp, 32 -; MIPS32R5EL-NEXT: jr $ra -; MIPS32R5EL-NEXT: nop %1 = load <2 x i32>, <2 x i32> * @gv2i32 ret <2 x i32> %1 } @@ -4170,77 +3863,81 @@ ; MIPS64EB-NEXT: jr $ra ; MIPS64EB-NEXT: nop ; -; MIPS32R5-LABEL: calli8_16: -; MIPS32R5: # %bb.0: # %entry -; MIPS32R5-NEXT: addiu $sp, $sp, -40 -; MIPS32R5-NEXT: .cfi_def_cfa_offset 40 -; MIPS32R5-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 31, -4 -; MIPS32R5-NEXT: lui $1, %hi($CPI30_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI30_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5-NEXT: lui $1, %hi($CPI30_1) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI30_1) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) -; MIPS32R5-NEXT: sw $1, 16($sp) -; MIPS32R5-NEXT: jal i8_16 -; MIPS32R5-NEXT: nop -; MIPS32R5-NEXT: lui $1, %hi(gv16i8) -; MIPS32R5-NEXT: insert.w $w0[0], $2 -; MIPS32R5-NEXT: insert.w $w0[1], $3 -; MIPS32R5-NEXT: addiu $1, $1, %lo(gv16i8) -; MIPS32R5-NEXT: insert.w $w0[2], $4 -; MIPS32R5-NEXT: insert.w $w0[3], $5 
-; MIPS32R5-NEXT: st.w $w0, 0($1) -; MIPS32R5-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: addiu $sp, $sp, 40 -; MIPS32R5-NEXT: jr $ra -; MIPS32R5-NEXT: nop +; MIPS32R5EB-LABEL: calli8_16: +; MIPS32R5EB: # %bb.0: # %entry +; MIPS32R5EB-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: lui $1, 3080 +; MIPS32R5EB-NEXT: ori $1, $1, 2314 +; MIPS32R5EB-NEXT: lui $2, 1801 +; MIPS32R5EB-NEXT: sw $1, 28($sp) +; MIPS32R5EB-NEXT: ori $1, $2, 1801 +; MIPS32R5EB-NEXT: sw $1, 24($sp) +; MIPS32R5EB-NEXT: sw $1, 20($sp) +; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: lui $1, 1543 +; MIPS32R5EB-NEXT: ori $4, $1, 1543 +; MIPS32R5EB-NEXT: ori $7, $1, 2314 +; MIPS32R5EB-NEXT: move $5, $4 +; MIPS32R5EB-NEXT: move $6, $4 +; MIPS32R5EB-NEXT: jal i8_16 +; MIPS32R5EB-NEXT: nop +; MIPS32R5EB-NEXT: insert.w $w0[0], $2 +; MIPS32R5EB-NEXT: insert.w $w0[1], $3 +; MIPS32R5EB-NEXT: insert.w $w0[2], $4 +; MIPS32R5EB-NEXT: lui $1, %hi(gv16i8) +; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv16i8) +; MIPS32R5EB-NEXT: st.w $w0, 0($1) +; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EB-NEXT: jr $ra +; MIPS32R5EB-NEXT: nop ; -; MIPS64R5-LABEL: calli8_16: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI30_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI30_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI30_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI30_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] -; MIPS64R5-NEXT: ld $25, %call16(i8_16)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: insert.d $w0[0], $2 -; MIPS64R5-NEXT: insert.d $w0[1], $3 -; MIPS64R5-NEXT: ld $1, %got_disp(gv16i8)($gp) -; MIPS64R5-NEXT: st.d $w0, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: calli8_16: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16))) +; MIPS64R5EB-NEXT: lui $1, 1801 +; MIPS64R5EB-NEXT: daddiu $1, $1, 1801 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $1, $1, 1801 +; MIPS64R5EB-NEXT: lui $2, 1543 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $2, $2, 1543 +; MIPS64R5EB-NEXT: dsll $2, $2, 16 +; MIPS64R5EB-NEXT: daddiu $2, $2, 
1543 +; MIPS64R5EB-NEXT: dsll $2, $2, 16 +; MIPS64R5EB-NEXT: daddiu $4, $2, 1543 +; MIPS64R5EB-NEXT: daddiu $5, $2, 2314 +; MIPS64R5EB-NEXT: daddiu $6, $1, 1801 +; MIPS64R5EB-NEXT: lui $1, 225 +; MIPS64R5EB-NEXT: daddiu $1, $1, 8417 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $1, $1, 8577 +; MIPS64R5EB-NEXT: dsll $1, $1, 19 +; MIPS64R5EB-NEXT: daddiu $7, $1, 2314 +; MIPS64R5EB-NEXT: ld $25, %call16(i8_16)($gp) +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv16i8)($gp) +; MIPS64R5EB-NEXT: insert.d $w0[0], $2 +; MIPS64R5EB-NEXT: insert.d $w0[1], $3 +; MIPS64R5EB-NEXT: st.d $w0, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS32EL-LABEL: calli8_16: ; MIPS32EL: # %bb.0: # %entry @@ -4320,6 +4017,87 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS32R5EL-LABEL: calli8_16: +; MIPS32R5EL: # %bb.0: # %entry +; MIPS32R5EL-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: lui $1, 2569 +; MIPS32R5EL-NEXT: ori $2, $1, 2060 +; MIPS32R5EL-NEXT: lui $3, 2311 +; MIPS32R5EL-NEXT: sw $2, 28($sp) +; MIPS32R5EL-NEXT: ori $2, $3, 2311 +; MIPS32R5EL-NEXT: sw $2, 24($sp) +; MIPS32R5EL-NEXT: sw $2, 20($sp) +; MIPS32R5EL-NEXT: sw $2, 16($sp) +; MIPS32R5EL-NEXT: lui $2, 1798 +; MIPS32R5EL-NEXT: ori $4, $2, 1798 +; MIPS32R5EL-NEXT: ori $7, $1, 1798 +; MIPS32R5EL-NEXT: move $5, $4 +; MIPS32R5EL-NEXT: move $6, $4 +; MIPS32R5EL-NEXT: jal i8_16 +; MIPS32R5EL-NEXT: nop +; MIPS32R5EL-NEXT: insert.w $w0[0], $2 +; MIPS32R5EL-NEXT: insert.w $w0[1], $3 +; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv16i8) +; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv16i8) +; MIPS32R5EL-NEXT: st.w $w0, 0($1) +; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop +; +; MIPS64R5EL-LABEL: calli8_16: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli8_16))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16))) +; MIPS64R5EL-NEXT: lui $1, 1285 +; MIPS64R5EL-NEXT: daddiu $1, $1, -31869 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 899 +; MIPS64R5EL-NEXT: lui $2, 2311 +; MIPS64R5EL-NEXT: daddiu $2, $2, 2311 +; MIPS64R5EL-NEXT: dsll $2, $2, 16 +; MIPS64R5EL-NEXT: daddiu $2, $2, 2311 +; MIPS64R5EL-NEXT: dsll $2, $2, 16 +; MIPS64R5EL-NEXT: dsll $1, $1, 17 +; MIPS64R5EL-NEXT: lui $3, 899 +; MIPS64R5EL-NEXT: daddiu $3, $3, 899 +; MIPS64R5EL-NEXT: dsll $3, $3, 16 +; MIPS64R5EL-NEXT: daddiu $3, $3, 899 +; MIPS64R5EL-NEXT: dsll $3, $3, 17 +; MIPS64R5EL-NEXT: daddiu $4, $3, 1798 +; MIPS64R5EL-NEXT: daddiu $5, $1, 1798 +; MIPS64R5EL-NEXT: daddiu $6, $2, 2311 +; MIPS64R5EL-NEXT: lui $1, 642 +; MIPS64R5EL-NEXT: daddiu $1, $1, 16899 +; MIPS64R5EL-NEXT: dsll $1, $1, 18 +; MIPS64R5EL-NEXT: daddiu $1, $1, 2311 +; 
MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $7, $1, 2311 +; MIPS64R5EL-NEXT: ld $25, %call16(i8_16)($gp) +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv16i8)($gp) +; MIPS64R5EL-NEXT: insert.d $w0[0], $2 +; MIPS64R5EL-NEXT: insert.d $w0[1], $3 +; MIPS64R5EL-NEXT: st.d $w0, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <16 x i8> @i8_16(<16 x i8> , <16 x i8> ) store <16 x i8> %0, <16 x i8> * @gv16i8 @@ -4825,36 +4603,26 @@ ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40 ; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill ; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: lui $1, 9 +; MIPS32R5EB-NEXT: ori $5, $1, 10 +; MIPS32R5EB-NEXT: sw $5, 28($sp) +; MIPS32R5EB-NEXT: lui $1, 12 +; MIPS32R5EB-NEXT: ori $1, $1, 8 +; MIPS32R5EB-NEXT: sw $1, 24($sp) +; MIPS32R5EB-NEXT: sw $5, 20($sp) ; MIPS32R5EB-NEXT: lui $1, 6 -; MIPS32R5EB-NEXT: ori $1, $1, 7 -; MIPS32R5EB-NEXT: lui $2, 9 -; MIPS32R5EB-NEXT: ori $2, $2, 10 -; MIPS32R5EB-NEXT: fill.w $w0, $2 -; MIPS32R5EB-NEXT: insert.w $w0[1], $1 -; MIPS32R5EB-NEXT: splati.d $w0, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5EB-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5EB-NEXT: lui $1, %hi($CPI33_0) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo($CPI33_0) -; MIPS32R5EB-NEXT: ld.w $w0, 0($1) -; MIPS32R5EB-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EB-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EB-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EB-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5EB-NEXT: sw $8, 28($sp) -; MIPS32R5EB-NEXT: sw $3, 24($sp) -; MIPS32R5EB-NEXT: sw $2, 20($sp) -; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: ori $4, $1, 7 +; MIPS32R5EB-NEXT: sw $4, 16($sp) +; MIPS32R5EB-NEXT: move $6, $4 +; MIPS32R5EB-NEXT: move $7, $5 ; MIPS32R5EB-NEXT: jal i16_8 ; MIPS32R5EB-NEXT: nop -; MIPS32R5EB-NEXT: lui $1, %hi(gv8i16) -; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EB-NEXT: insert.w $w0[0], $2 ; MIPS32R5EB-NEXT: insert.w $w0[1], $3 ; MIPS32R5EB-NEXT: insert.w $w0[2], $4 +; MIPS32R5EB-NEXT: lui $1, %hi(gv8i16) ; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EB-NEXT: st.w $w0, 0($1) ; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload ; MIPS32R5EB-NEXT: addiu $sp, $sp, 40 @@ -4872,20 +4640,21 @@ ; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_8))) ; MIPS64R5EB-NEXT: daddu $1, $1, $25 ; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli16_8))) -; MIPS64R5EB-NEXT: lui $1, 9 -; MIPS64R5EB-NEXT: ori $1, $1, 10 -; MIPS64R5EB-NEXT: lui $2, 6 -; MIPS64R5EB-NEXT: ori $2, $2, 7 -; MIPS64R5EB-NEXT: dinsu $1, $2, 32, 32 -; MIPS64R5EB-NEXT: fill.d $w0, $1 -; MIPS64R5EB-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5EB-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5EB-NEXT: ld $1, %got_page(.LCPI33_0)($gp) -; MIPS64R5EB-NEXT: daddiu $1, $1, %got_ofst(.LCPI33_0) -; MIPS64R5EB-NEXT: ld.d $w0, 0($1) -; MIPS64R5EB-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5EB-NEXT: copy_s.d $7, $w0[1] +; MIPS64R5EB-NEXT: lui $1, 6 +; MIPS64R5EB-NEXT: daddiu $1, $1, 7 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $1, $1, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $4, $1, 10 +; MIPS64R5EB-NEXT: lui $1, 2 +; MIPS64R5EB-NEXT: daddiu $1, $1, -32767 +; MIPS64R5EB-NEXT: dsll $1, $1, 19 +; MIPS64R5EB-NEXT: 
daddiu $1, $1, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 16 +; MIPS64R5EB-NEXT: daddiu $7, $1, 10 ; MIPS64R5EB-NEXT: ld $25, %call16(i16_8)($gp) +; MIPS64R5EB-NEXT: move $5, $4 +; MIPS64R5EB-NEXT: move $6, $4 ; MIPS64R5EB-NEXT: jalr $25 ; MIPS64R5EB-NEXT: nop ; MIPS64R5EB-NEXT: ld $1, %got_disp(gv8i16)($gp) @@ -4973,35 +4742,25 @@ ; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill ; MIPS32R5EL-NEXT: .cfi_offset 31, -4 ; MIPS32R5EL-NEXT: lui $1, 10 -; MIPS32R5EL-NEXT: ori $1, $1, 9 -; MIPS32R5EL-NEXT: lui $2, 7 -; MIPS32R5EL-NEXT: ori $2, $2, 6 -; MIPS32R5EL-NEXT: fill.w $w0, $2 -; MIPS32R5EL-NEXT: insert.w $w0[1], $1 -; MIPS32R5EL-NEXT: splati.d $w0, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5EL-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5EL-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5EL-NEXT: lui $1, %hi($CPI33_0) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo($CPI33_0) -; MIPS32R5EL-NEXT: ld.w $w0, 0($1) -; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5EL-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5EL-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5EL-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5EL-NEXT: sw $8, 28($sp) -; MIPS32R5EL-NEXT: sw $3, 24($sp) -; MIPS32R5EL-NEXT: sw $2, 20($sp) -; MIPS32R5EL-NEXT: sw $1, 16($sp) +; MIPS32R5EL-NEXT: ori $5, $1, 9 +; MIPS32R5EL-NEXT: sw $5, 28($sp) +; MIPS32R5EL-NEXT: lui $1, 8 +; MIPS32R5EL-NEXT: ori $1, $1, 12 +; MIPS32R5EL-NEXT: sw $1, 24($sp) +; MIPS32R5EL-NEXT: sw $5, 20($sp) +; MIPS32R5EL-NEXT: lui $1, 7 +; MIPS32R5EL-NEXT: ori $4, $1, 6 +; MIPS32R5EL-NEXT: sw $4, 16($sp) +; MIPS32R5EL-NEXT: move $6, $4 +; MIPS32R5EL-NEXT: move $7, $5 ; MIPS32R5EL-NEXT: jal i16_8 ; MIPS32R5EL-NEXT: nop -; MIPS32R5EL-NEXT: lui $1, %hi(gv8i16) -; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EL-NEXT: insert.w $w0[0], $2 ; MIPS32R5EL-NEXT: insert.w $w0[1], $3 ; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv8i16) ; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv8i16) ; MIPS32R5EL-NEXT: st.w $w0, 0($1) ; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload ; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 @@ -5019,20 +4778,21 @@ ; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli16_8))) ; MIPS64R5EL-NEXT: daddu $1, $1, $25 ; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli16_8))) -; MIPS64R5EL-NEXT: lui $1, 7 -; MIPS64R5EL-NEXT: ori $1, $1, 6 -; MIPS64R5EL-NEXT: lui $2, 10 -; MIPS64R5EL-NEXT: ori $2, $2, 9 -; MIPS64R5EL-NEXT: dinsu $1, $2, 32, 32 -; MIPS64R5EL-NEXT: fill.d $w0, $1 -; MIPS64R5EL-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5EL-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5EL-NEXT: ld $1, %got_page(.LCPI33_0)($gp) -; MIPS64R5EL-NEXT: daddiu $1, $1, %got_ofst(.LCPI33_0) -; MIPS64R5EL-NEXT: ld.d $w0, 0($1) -; MIPS64R5EL-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5EL-NEXT: copy_s.d $7, $w0[1] +; MIPS64R5EL-NEXT: lui $1, 10 +; MIPS64R5EL-NEXT: daddiu $1, $1, 9 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 7 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $4, $1, 6 +; MIPS64R5EL-NEXT: lui $1, 1 +; MIPS64R5EL-NEXT: daddiu $1, $1, 16385 +; MIPS64R5EL-NEXT: dsll $1, $1, 16 +; MIPS64R5EL-NEXT: daddiu $1, $1, 8193 +; MIPS64R5EL-NEXT: dsll $1, $1, 19 +; MIPS64R5EL-NEXT: daddiu $7, $1, 12 ; MIPS64R5EL-NEXT: ld $25, %call16(i16_8)($gp) +; MIPS64R5EL-NEXT: move $5, $4 +; MIPS64R5EL-NEXT: move $6, $4 ; MIPS64R5EL-NEXT: jalr $25 ; MIPS64R5EL-NEXT: nop ; MIPS64R5EL-NEXT: ld $1, %got_disp(gv8i16)($gp) @@ -5304,39 +5064,38 @@ ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; 
MIPS64R5-LABEL: calli32_4: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_4))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI35_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI35_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI35_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI35_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] -; MIPS64R5-NEXT: ld $25, %call16(i32_4)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: insert.d $w0[0], $2 -; MIPS64R5-NEXT: insert.d $w0[1], $3 -; MIPS64R5-NEXT: ld $1, %got_disp(gv4i32)($gp) -; MIPS64R5-NEXT: st.d $w0, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: calli32_4: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_4))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) +; MIPS64R5EB-NEXT: daddiu $1, $zero, 3 +; MIPS64R5EB-NEXT: dsll $2, $1, 33 +; MIPS64R5EB-NEXT: daddiu $4, $2, 7 +; MIPS64R5EB-NEXT: dsll $1, $1, 34 +; MIPS64R5EB-NEXT: daddiu $6, $1, 8 +; MIPS64R5EB-NEXT: daddiu $1, $zero, 9 +; MIPS64R5EB-NEXT: dsll $1, $1, 32 +; MIPS64R5EB-NEXT: daddiu $5, $1, 10 +; MIPS64R5EB-NEXT: ld $25, %call16(i32_4)($gp) +; MIPS64R5EB-NEXT: move $7, $5 +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: insert.d $w0[0], $2 +; MIPS64R5EB-NEXT: insert.d $w0[1], $3 +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv4i32)($gp) +; MIPS64R5EB-NEXT: st.d $w0, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS64EL-LABEL: calli32_4: ; MIPS64EL: # %bb.0: # %entry @@ -5370,6 +5129,40 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS64R5EL-LABEL: calli32_4: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(calli32_4))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4))) +; MIPS64R5EL-NEXT: daddiu $1, $zero, 7 +; MIPS64R5EL-NEXT: dsll $1, $1, 32 +; MIPS64R5EL-NEXT: daddiu $4, $1, 6 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 1 +; MIPS64R5EL-NEXT: dsll $1, $1, 35 +; MIPS64R5EL-NEXT: 
daddiu $6, $1, 12 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 5 +; MIPS64R5EL-NEXT: dsll $1, $1, 33 +; MIPS64R5EL-NEXT: daddiu $5, $1, 9 +; MIPS64R5EL-NEXT: ld $25, %call16(i32_4)($gp) +; MIPS64R5EL-NEXT: move $7, $5 +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: insert.d $w0[0], $2 +; MIPS64R5EL-NEXT: insert.d $w0[1], $3 +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4i32)($gp) +; MIPS64R5EL-NEXT: st.d $w0, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <4 x i32> @i32_4(<4 x i32> , <4 x i32> ) store <4 x i32> %0, <4 x i32> * @gv4i32 @@ -5433,43 +5226,35 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5-LABEL: calli64_2: -; MIPS32R5: # %bb.0: # %entry -; MIPS32R5-NEXT: addiu $sp, $sp, -40 -; MIPS32R5-NEXT: .cfi_def_cfa_offset 40 -; MIPS32R5-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 31, -4 -; MIPS32R5-NEXT: lui $1, %hi($CPI36_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI36_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $4, $w0[0] -; MIPS32R5-NEXT: copy_s.w $5, $w0[1] -; MIPS32R5-NEXT: copy_s.w $6, $w0[2] -; MIPS32R5-NEXT: copy_s.w $7, $w0[3] -; MIPS32R5-NEXT: lui $1, %hi($CPI36_1) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI36_1) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $1, $w0[0] -; MIPS32R5-NEXT: copy_s.w $2, $w0[1] -; MIPS32R5-NEXT: copy_s.w $3, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) -; MIPS32R5-NEXT: sw $1, 16($sp) -; MIPS32R5-NEXT: jal i64_2 -; MIPS32R5-NEXT: nop -; MIPS32R5-NEXT: lui $1, %hi(gv2i64) -; MIPS32R5-NEXT: insert.w $w0[0], $2 -; MIPS32R5-NEXT: insert.w $w0[1], $3 -; MIPS32R5-NEXT: addiu $1, $1, %lo(gv2i64) -; MIPS32R5-NEXT: insert.w $w0[2], $4 -; MIPS32R5-NEXT: insert.w $w0[3], $5 -; MIPS32R5-NEXT: st.w $w0, 0($1) -; MIPS32R5-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: addiu $sp, $sp, 40 -; MIPS32R5-NEXT: jr $ra -; MIPS32R5-NEXT: nop +; MIPS32R5EB-LABEL: calli64_2: +; MIPS32R5EB: # %bb.0: # %entry +; MIPS32R5EB-NEXT: addiu $sp, $sp, -40 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EB-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: addiu $1, $zero, 8 +; MIPS32R5EB-NEXT: sw $1, 28($sp) +; MIPS32R5EB-NEXT: addiu $1, $zero, 12 +; MIPS32R5EB-NEXT: sw $1, 20($sp) +; MIPS32R5EB-NEXT: sw $zero, 24($sp) +; MIPS32R5EB-NEXT: sw $zero, 16($sp) +; MIPS32R5EB-NEXT: addiu $4, $zero, 0 +; MIPS32R5EB-NEXT: addiu $5, $zero, 6 +; MIPS32R5EB-NEXT: addiu $6, $zero, 0 +; MIPS32R5EB-NEXT: addiu $7, $zero, 7 +; MIPS32R5EB-NEXT: jal i64_2 +; MIPS32R5EB-NEXT: nop +; MIPS32R5EB-NEXT: insert.w $w0[0], $2 +; MIPS32R5EB-NEXT: insert.w $w0[1], $3 +; MIPS32R5EB-NEXT: insert.w $w0[2], $4 +; MIPS32R5EB-NEXT: lui $1, %hi(gv2i64) +; MIPS32R5EB-NEXT: insert.w $w0[3], $5 +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv2i64) +; MIPS32R5EB-NEXT: st.w $w0, 0($1) +; MIPS32R5EB-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EB-NEXT: jr $ra +; MIPS32R5EB-NEXT: nop ; ; MIPS64R5-LABEL: calli64_2: ; MIPS64R5: # %bb.0: # %entry @@ -5527,6 +5312,36 @@ ; MIPS32EL-NEXT: addiu $sp, $sp, 40 ; MIPS32EL-NEXT: jr $ra ; MIPS32EL-NEXT: nop +; +; MIPS32R5EL-LABEL: calli64_2: +; MIPS32R5EL: # %bb.0: # %entry +; MIPS32R5EL-NEXT: addiu $sp, $sp, -40 
+; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 40 +; MIPS32R5EL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: addiu $1, $zero, 8 +; MIPS32R5EL-NEXT: sw $1, 24($sp) +; MIPS32R5EL-NEXT: addiu $1, $zero, 12 +; MIPS32R5EL-NEXT: sw $1, 16($sp) +; MIPS32R5EL-NEXT: sw $zero, 28($sp) +; MIPS32R5EL-NEXT: sw $zero, 20($sp) +; MIPS32R5EL-NEXT: addiu $4, $zero, 6 +; MIPS32R5EL-NEXT: addiu $5, $zero, 0 +; MIPS32R5EL-NEXT: addiu $6, $zero, 7 +; MIPS32R5EL-NEXT: addiu $7, $zero, 0 +; MIPS32R5EL-NEXT: jal i64_2 +; MIPS32R5EL-NEXT: nop +; MIPS32R5EL-NEXT: insert.w $w0[0], $2 +; MIPS32R5EL-NEXT: insert.w $w0[1], $3 +; MIPS32R5EL-NEXT: insert.w $w0[2], $4 +; MIPS32R5EL-NEXT: lui $1, %hi(gv2i64) +; MIPS32R5EL-NEXT: insert.w $w0[3], $5 +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv2i64) +; MIPS32R5EL-NEXT: st.w $w0, 0($1) +; MIPS32R5EL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 40 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop entry: %0 = call <2 x i64> @i64_2(<2 x i64> , <2 x i64> ) store <2 x i64> %0, <2 x i64> * @gv2i64 @@ -5618,35 +5433,33 @@ ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; MIPS64R5-LABEL: callfloat_2: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI37_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI37_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI37_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI37_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $5, $w0[0] -; MIPS64R5-NEXT: ld $25, %call16(float2_extern)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: ld $1, %got_disp(gv2f32)($gp) -; MIPS64R5-NEXT: sd $2, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: callfloat_2: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EB-NEXT: daddiu $1, $zero, 383 +; MIPS64R5EB-NEXT: dsll $4, $1, 23 +; MIPS64R5EB-NEXT: daddiu $1, $zero, 261 +; MIPS64R5EB-NEXT: dsll $1, $1, 33 +; MIPS64R5EB-NEXT: daddiu $1, $1, 523 +; MIPS64R5EB-NEXT: dsll $5, $1, 21 +; MIPS64R5EB-NEXT: ld $25, %call16(float2_extern)($gp) +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv2f32)($gp) +; MIPS64R5EB-NEXT: sd $2, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; 
MIPS64EL-LABEL: callfloat_2: ; MIPS64EL: # %bb.0: # %entry @@ -5675,6 +5488,34 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS64R5EL-LABEL: callfloat_2: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_2))) +; MIPS64R5EL-NEXT: daddiu $1, $zero, 383 +; MIPS64R5EL-NEXT: dsll $4, $1, 55 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 523 +; MIPS64R5EL-NEXT: dsll $1, $1, 31 +; MIPS64R5EL-NEXT: daddiu $1, $1, 261 +; MIPS64R5EL-NEXT: dsll $5, $1, 22 +; MIPS64R5EL-NEXT: ld $25, %call16(float2_extern)($gp) +; MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv2f32)($gp) +; MIPS64R5EL-NEXT: sd $2, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <2 x float> @float2_extern(<2 x float> , <2 x float> ) store <2 x float> %0, <2 x float> * @gv2f32 @@ -5777,27 +5618,21 @@ ; MIPS32R5-NEXT: .cfi_def_cfa_register 30 ; MIPS32R5-NEXT: addiu $1, $zero, -16 ; MIPS32R5-NEXT: and $sp, $sp, $1 -; MIPS32R5-NEXT: lui $1, %hi($CPI38_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI38_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $6, $w0[0] -; MIPS32R5-NEXT: copy_s.w $7, $w0[1] -; MIPS32R5-NEXT: copy_s.w $1, $w0[2] -; MIPS32R5-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5-NEXT: lui $3, %hi($CPI38_1) -; MIPS32R5-NEXT: addiu $3, $3, %lo($CPI38_1) -; MIPS32R5-NEXT: ld.w $w0, 0($3) -; MIPS32R5-NEXT: copy_s.w $3, $w0[0] -; MIPS32R5-NEXT: copy_s.w $4, $w0[1] -; MIPS32R5-NEXT: copy_s.w $5, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 36($sp) -; MIPS32R5-NEXT: sw $5, 32($sp) -; MIPS32R5-NEXT: sw $4, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) +; MIPS32R5-NEXT: lui $1, 16704 +; MIPS32R5-NEXT: lui $2, 16736 +; MIPS32R5-NEXT: lui $3, 16752 +; MIPS32R5-NEXT: lui $4, 16768 +; MIPS32R5-NEXT: sw $4, 36($sp) +; MIPS32R5-NEXT: sw $3, 32($sp) +; MIPS32R5-NEXT: sw $2, 28($sp) +; MIPS32R5-NEXT: sw $1, 24($sp) +; MIPS32R5-NEXT: lui $1, 16512 +; MIPS32R5-NEXT: sw $1, 20($sp) +; MIPS32R5-NEXT: lui $1, 16384 ; MIPS32R5-NEXT: sw $1, 16($sp) ; MIPS32R5-NEXT: addiu $4, $sp, 48 +; MIPS32R5-NEXT: addiu $6, $zero, 0 +; MIPS32R5-NEXT: lui $7, 49024 ; MIPS32R5-NEXT: jal float4_extern ; MIPS32R5-NEXT: nop ; MIPS32R5-NEXT: lui $1, %hi(gv4f32) @@ -5811,39 +5646,43 @@ ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; MIPS64R5-LABEL: callfloat_4: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -16 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R5-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill -; MIPS64R5-NEXT: .cfi_offset 31, -8 -; MIPS64R5-NEXT: .cfi_offset 28, -16 -; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4))) -; MIPS64R5-NEXT: daddu $1, $1, $25 -; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI38_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI38_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) 
-; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI38_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI38_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] -; MIPS64R5-NEXT: ld $25, %call16(float4_extern)($gp) -; MIPS64R5-NEXT: jalr $25 -; MIPS64R5-NEXT: nop -; MIPS64R5-NEXT: insert.d $w0[0], $2 -; MIPS64R5-NEXT: insert.d $w0[1], $3 -; MIPS64R5-NEXT: ld $1, %got_disp(gv4f32)($gp) -; MIPS64R5-NEXT: st.d $w0, 0($1) -; MIPS64R5-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload -; MIPS64R5-NEXT: daddiu $sp, $sp, 16 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: callfloat_4: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EB-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EB-NEXT: .cfi_offset 31, -8 +; MIPS64R5EB-NEXT: .cfi_offset 28, -16 +; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EB-NEXT: daddu $1, $1, $25 +; MIPS64R5EB-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EB-NEXT: daddiu $1, $zero, 1 +; MIPS64R5EB-NEXT: dsll $1, $1, 39 +; MIPS64R5EB-NEXT: daddiu $1, $1, 129 +; MIPS64R5EB-NEXT: daddiu $2, $zero, 261 +; MIPS64R5EB-NEXT: dsll $2, $2, 33 +; MIPS64R5EB-NEXT: daddiu $3, $zero, 383 +; MIPS64R5EB-NEXT: dsll $4, $3, 23 +; MIPS64R5EB-NEXT: dsll $5, $1, 23 +; MIPS64R5EB-NEXT: daddiu $1, $2, 523 +; MIPS64R5EB-NEXT: dsll $6, $1, 21 +; MIPS64R5EB-NEXT: daddiu $1, $zero, 1047 +; MIPS64R5EB-NEXT: dsll $1, $1, 29 +; MIPS64R5EB-NEXT: daddiu $1, $1, 131 +; MIPS64R5EB-NEXT: dsll $7, $1, 23 +; MIPS64R5EB-NEXT: ld $25, %call16(float4_extern)($gp) +; MIPS64R5EB-NEXT: jalr $25 +; MIPS64R5EB-NEXT: nop +; MIPS64R5EB-NEXT: insert.d $w0[0], $2 +; MIPS64R5EB-NEXT: insert.d $w0[1], $3 +; MIPS64R5EB-NEXT: ld $1, %got_disp(gv4f32)($gp) +; MIPS64R5EB-NEXT: st.d $w0, 0($1) +; MIPS64R5EB-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS64EL-LABEL: callfloat_4: ; MIPS64EL: # %bb.0: # %entry @@ -5881,6 +5720,44 @@ ; MIPS64EL-NEXT: daddiu $sp, $sp, 16 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS64R5EL-LABEL: callfloat_4: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -16 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 16 +; MIPS64R5EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: sd $gp, 0($sp) # 8-byte Folded Spill +; MIPS64R5EL-NEXT: .cfi_offset 31, -8 +; MIPS64R5EL-NEXT: .cfi_offset 28, -16 +; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EL-NEXT: daddu $1, $1, $25 +; MIPS64R5EL-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4))) +; MIPS64R5EL-NEXT: daddiu $1, $zero, 129 +; MIPS64R5EL-NEXT: dsll $1, $1, 25 +; MIPS64R5EL-NEXT: daddiu $1, $1, 1 +; MIPS64R5EL-NEXT: daddiu $2, $zero, 523 +; MIPS64R5EL-NEXT: dsll $2, $2, 31 +; MIPS64R5EL-NEXT: daddiu $3, $zero, 383 +; MIPS64R5EL-NEXT: dsll $4, $3, 55 +; MIPS64R5EL-NEXT: dsll $5, $1, 30 +; MIPS64R5EL-NEXT: daddiu $1, $2, 261 +; MIPS64R5EL-NEXT: dsll $6, $1, 22 +; MIPS64R5EL-NEXT: daddiu $1, $zero, 131 +; MIPS64R5EL-NEXT: dsll $1, $1, 35 +; MIPS64R5EL-NEXT: daddiu $1, $1, 1047 +; MIPS64R5EL-NEXT: dsll $7, $1, 20 +; MIPS64R5EL-NEXT: ld $25, %call16(float4_extern)($gp) +; 
MIPS64R5EL-NEXT: jalr $25 +; MIPS64R5EL-NEXT: nop +; MIPS64R5EL-NEXT: insert.d $w0[0], $2 +; MIPS64R5EL-NEXT: insert.d $w0[1], $3 +; MIPS64R5EL-NEXT: ld $1, %got_disp(gv4f32)($gp) +; MIPS64R5EL-NEXT: st.d $w0, 0($1) +; MIPS64R5EL-NEXT: ld $gp, 0($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 16 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = call <4 x float> @float4_extern(<4 x float> , <4 x float> ) store <4 x float> %0, <4 x float> * @gv4f32 @@ -5957,51 +5834,42 @@ ; MIPS64-NEXT: jr $ra ; MIPS64-NEXT: nop ; -; MIPS32R5-LABEL: calldouble_2: -; MIPS32R5: # %bb.0: # %entry -; MIPS32R5-NEXT: addiu $sp, $sp, -80 -; MIPS32R5-NEXT: .cfi_def_cfa_offset 80 -; MIPS32R5-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill -; MIPS32R5-NEXT: .cfi_offset 31, -4 -; MIPS32R5-NEXT: .cfi_offset 30, -8 -; MIPS32R5-NEXT: move $fp, $sp -; MIPS32R5-NEXT: .cfi_def_cfa_register 30 -; MIPS32R5-NEXT: addiu $1, $zero, -16 -; MIPS32R5-NEXT: and $sp, $sp, $1 -; MIPS32R5-NEXT: lui $1, %hi($CPI39_0) -; MIPS32R5-NEXT: addiu $1, $1, %lo($CPI39_0) -; MIPS32R5-NEXT: ld.w $w0, 0($1) -; MIPS32R5-NEXT: copy_s.w $6, $w0[0] -; MIPS32R5-NEXT: copy_s.w $7, $w0[1] -; MIPS32R5-NEXT: copy_s.w $1, $w0[2] -; MIPS32R5-NEXT: copy_s.w $2, $w0[3] -; MIPS32R5-NEXT: lui $3, %hi($CPI39_1) -; MIPS32R5-NEXT: addiu $3, $3, %lo($CPI39_1) -; MIPS32R5-NEXT: ld.w $w0, 0($3) -; MIPS32R5-NEXT: copy_s.w $3, $w0[0] -; MIPS32R5-NEXT: copy_s.w $4, $w0[1] -; MIPS32R5-NEXT: copy_s.w $5, $w0[2] -; MIPS32R5-NEXT: copy_s.w $8, $w0[3] -; MIPS32R5-NEXT: sw $8, 36($sp) -; MIPS32R5-NEXT: sw $5, 32($sp) -; MIPS32R5-NEXT: sw $4, 28($sp) -; MIPS32R5-NEXT: sw $3, 24($sp) -; MIPS32R5-NEXT: sw $2, 20($sp) -; MIPS32R5-NEXT: sw $1, 16($sp) -; MIPS32R5-NEXT: addiu $4, $sp, 48 -; MIPS32R5-NEXT: jal double2_extern -; MIPS32R5-NEXT: nop -; MIPS32R5-NEXT: lui $1, %hi(gv2f64) -; MIPS32R5-NEXT: addiu $1, $1, %lo(gv2f64) -; MIPS32R5-NEXT: ld.d $w0, 48($sp) -; MIPS32R5-NEXT: st.d $w0, 0($1) -; MIPS32R5-NEXT: move $sp, $fp -; MIPS32R5-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload -; MIPS32R5-NEXT: addiu $sp, $sp, 80 -; MIPS32R5-NEXT: jr $ra -; MIPS32R5-NEXT: nop +; MIPS32R5EB-LABEL: calldouble_2: +; MIPS32R5EB: # %bb.0: # %entry +; MIPS32R5EB-NEXT: addiu $sp, $sp, -80 +; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 80 +; MIPS32R5EB-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill +; MIPS32R5EB-NEXT: .cfi_offset 31, -4 +; MIPS32R5EB-NEXT: .cfi_offset 30, -8 +; MIPS32R5EB-NEXT: move $fp, $sp +; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30 +; MIPS32R5EB-NEXT: addiu $1, $zero, -16 +; MIPS32R5EB-NEXT: and $sp, $sp, $1 +; MIPS32R5EB-NEXT: lui $1, 16424 +; MIPS32R5EB-NEXT: lui $2, 16428 +; MIPS32R5EB-NEXT: sw $2, 32($sp) +; MIPS32R5EB-NEXT: sw $1, 24($sp) +; MIPS32R5EB-NEXT: lui $1, 49136 +; MIPS32R5EB-NEXT: sw $1, 16($sp) +; MIPS32R5EB-NEXT: sw $zero, 36($sp) +; MIPS32R5EB-NEXT: sw $zero, 28($sp) +; MIPS32R5EB-NEXT: sw $zero, 20($sp) +; MIPS32R5EB-NEXT: addiu $4, $sp, 48 +; MIPS32R5EB-NEXT: addiu $6, $zero, 0 +; MIPS32R5EB-NEXT: addiu $7, $zero, 0 +; MIPS32R5EB-NEXT: jal double2_extern +; MIPS32R5EB-NEXT: nop +; MIPS32R5EB-NEXT: lui $1, %hi(gv2f64) +; MIPS32R5EB-NEXT: addiu $1, $1, %lo(gv2f64) +; MIPS32R5EB-NEXT: ld.d $w0, 48($sp) +; MIPS32R5EB-NEXT: st.d $w0, 0($1) +; MIPS32R5EB-NEXT: move $sp, $fp +; MIPS32R5EB-NEXT: lw $fp, 72($sp) # 4-byte Folded 
Reload +; MIPS32R5EB-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload +; MIPS32R5EB-NEXT: addiu $sp, $sp, 80 +; MIPS32R5EB-NEXT: jr $ra +; MIPS32R5EB-NEXT: nop ; ; MIPS64R5-LABEL: calldouble_2: ; MIPS64R5: # %bb.0: # %entry @@ -6014,17 +5882,14 @@ ; MIPS64R5-NEXT: lui $1, %hi(%neg(%gp_rel(calldouble_2))) ; MIPS64R5-NEXT: daddu $1, $1, $25 ; MIPS64R5-NEXT: daddiu $gp, $1, %lo(%neg(%gp_rel(calldouble_2))) -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI39_0)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI39_0) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $4, $w0[0] -; MIPS64R5-NEXT: copy_s.d $5, $w0[1] -; MIPS64R5-NEXT: ld $1, %got_page(.LCPI39_1)($gp) -; MIPS64R5-NEXT: daddiu $1, $1, %got_ofst(.LCPI39_1) -; MIPS64R5-NEXT: ld.d $w0, 0($1) -; MIPS64R5-NEXT: copy_s.d $6, $w0[0] -; MIPS64R5-NEXT: copy_s.d $7, $w0[1] +; MIPS64R5-NEXT: daddiu $1, $zero, 3071 +; MIPS64R5-NEXT: dsll $5, $1, 52 +; MIPS64R5-NEXT: daddiu $1, $zero, 2053 +; MIPS64R5-NEXT: dsll $6, $1, 51 +; MIPS64R5-NEXT: daddiu $1, $zero, 4107 +; MIPS64R5-NEXT: dsll $7, $1, 50 ; MIPS64R5-NEXT: ld $25, %call16(double2_extern)($gp) +; MIPS64R5-NEXT: daddiu $4, $zero, 0 ; MIPS64R5-NEXT: jalr $25 ; MIPS64R5-NEXT: nop ; MIPS64R5-NEXT: insert.d $w0[0], $2 @@ -6075,6 +5940,43 @@ ; MIPS32EL-NEXT: addiu $sp, $sp, 80 ; MIPS32EL-NEXT: jr $ra ; MIPS32EL-NEXT: nop +; +; MIPS32R5EL-LABEL: calldouble_2: +; MIPS32R5EL: # %bb.0: # %entry +; MIPS32R5EL-NEXT: addiu $sp, $sp, -80 +; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 80 +; MIPS32R5EL-NEXT: sw $ra, 76($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: sw $fp, 72($sp) # 4-byte Folded Spill +; MIPS32R5EL-NEXT: .cfi_offset 31, -4 +; MIPS32R5EL-NEXT: .cfi_offset 30, -8 +; MIPS32R5EL-NEXT: move $fp, $sp +; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30 +; MIPS32R5EL-NEXT: addiu $1, $zero, -16 +; MIPS32R5EL-NEXT: and $sp, $sp, $1 +; MIPS32R5EL-NEXT: lui $1, 16424 +; MIPS32R5EL-NEXT: lui $2, 16428 +; MIPS32R5EL-NEXT: sw $2, 36($sp) +; MIPS32R5EL-NEXT: sw $1, 28($sp) +; MIPS32R5EL-NEXT: lui $1, 49136 +; MIPS32R5EL-NEXT: sw $1, 20($sp) +; MIPS32R5EL-NEXT: sw $zero, 32($sp) +; MIPS32R5EL-NEXT: sw $zero, 24($sp) +; MIPS32R5EL-NEXT: sw $zero, 16($sp) +; MIPS32R5EL-NEXT: addiu $4, $sp, 48 +; MIPS32R5EL-NEXT: addiu $6, $zero, 0 +; MIPS32R5EL-NEXT: addiu $7, $zero, 0 +; MIPS32R5EL-NEXT: jal double2_extern +; MIPS32R5EL-NEXT: nop +; MIPS32R5EL-NEXT: lui $1, %hi(gv2f64) +; MIPS32R5EL-NEXT: addiu $1, $1, %lo(gv2f64) +; MIPS32R5EL-NEXT: ld.d $w0, 48($sp) +; MIPS32R5EL-NEXT: st.d $w0, 0($1) +; MIPS32R5EL-NEXT: move $sp, $fp +; MIPS32R5EL-NEXT: lw $fp, 72($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: lw $ra, 76($sp) # 4-byte Folded Reload +; MIPS32R5EL-NEXT: addiu $sp, $sp, 80 +; MIPS32R5EL-NEXT: jr $ra +; MIPS32R5EL-NEXT: nop entry: %0 = call <2 x double> @double2_extern(<2 x double> , <2 x double> ) store <2 x double> %0, <2 x double> * @gv2f64 diff --git a/llvm/test/CodeGen/PowerPC/pr45709.ll b/llvm/test/CodeGen/PowerPC/pr45709.ll --- a/llvm/test/CodeGen/PowerPC/pr45709.ll +++ b/llvm/test/CodeGen/PowerPC/pr45709.ll @@ -10,7 +10,7 @@ define dso_local void @_ZN1a1bEv(<4 x float> %in) local_unnamed_addr #0 align 2 { ; CHECK-LABEL: _ZN1a1bEv: ; CHECK: # %bb.0: -; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB0_6 +; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB0_4 ; CHECK-NEXT: b .LBB0_1 ; CHECK-NEXT: .LBB0_1: # %.preheader ; CHECK-NEXT: b .LBB0_2 @@ -21,26 +21,18 @@ ; CHECK-NEXT: addi r3, r3, .LCPI0_0@toc@l ; CHECK-NEXT: lvx v3, 0, r3 ; CHECK-NEXT: vperm v2, v2, v2, v3 -; CHECK-NEXT: vxor v3, v3, v3 -; CHECK-NEXT: addi r3, r1, -48 -; CHECK-NEXT: 
stvx v3, 0, r3 ; CHECK-NEXT: addi r3, r1, -32 ; CHECK-NEXT: stvx v2, 0, r3 -; CHECK-NEXT: lwz r3, -48(r1) -; CHECK-NEXT: lwz r4, -32(r1) -; CHECK-NEXT: cmpw r4, r3 -; CHECK-NEXT: bc 12, gt, .LBB0_4 -; CHECK-NEXT: b .LBB0_5 -; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: addi r3, r4, 0 -; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: lwz r3, -32(r1) +; CHECK-NEXT: srawi r4, r3, 31 +; CHECK-NEXT: andc r3, r3, r4 ; CHECK-NEXT: cmpw r3, r3 -; CHECK-NEXT: stw r3, -64(r1) -; CHECK-NEXT: addi r3, r1, -64 +; CHECK-NEXT: stw r3, -48(r1) +; CHECK-NEXT: addi r3, r1, -48 ; CHECK-NEXT: lvx v2, 0, r3 ; CHECK-NEXT: addi r3, r1, -16 ; CHECK-NEXT: stvx v2, 0, r3 -; CHECK-NEXT: .LBB0_6: +; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: blr br i1 undef, label %7, label %1 diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll --- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -80,12 +80,12 @@ ; CHECK-NEXT: ldr r1, [sp, #24] ; CHECK-NEXT: vmov q1[2], q1[0], r2, r1 ; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: strd r3, r2, [r0, #16] -; CHECK-NEXT: str r1, [r0, #24] +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: strd r1, r3, [r0, #16] +; CHECK-NEXT: str r2, [r0, #24] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: @@ -407,116 +407,94 @@ define void @test_width2(i32* nocapture readnone %x, i32* nocapture %y, i8 zeroext %m) { ; CHECK-LABEL: test_width2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: sub sp, #8 ; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: beq.w .LBB4_3 +; CHECK-NEXT: beq .LBB4_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: adds r0, r2, #1 -; CHECK-NEXT: vmov q1[2], q1[0], r2, r2 +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: bic r0, r0, #1 -; CHECK-NEXT: adr r2, .LCPI4_0 +; CHECK-NEXT: vldr s0, .LCPI4_0 ; CHECK-NEXT: subs r0, #2 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff -; CHECK-NEXT: vldrw.u32 q2, [r2] +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: add.w lr, r3, r0, lsr #1 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: adr r0, .LCPI4_1 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vand q1, q1, q0 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3[2], q3[0], r8, r8 -; CHECK-NEXT: vmov r7, s6 -; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: vmov r6, s7 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: add.w r8, r8, #2 -; CHECK-NEXT: vmov r9, s12 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r3, #1 -; CHECK-NEXT: vmov q3[2], q3[0], r9, r3 -; CHECK-NEXT: vand q3, q3, q0 -; CHECK-NEXT: adc r12, r2, #0 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov r4, s15 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: subs r7, r5, r7 -; CHECK-NEXT: vmov r7, s12 -; CHECK-NEXT: sbcs r4, r6 -; CHECK-NEXT: vmov r6, s13 -; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: adds r0, r3, #1 +; CHECK-NEXT: adc r4, r12, #0 +; CHECK-NEXT: subs r5, r3, r2 +; CHECK-NEXT: sbcs r5, r12, #0 +; CHECK-NEXT: add.w r3, r3, #2 +; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r4, #1 -; CHECK-NEXT: cmp r4, #0 -; CHECK-NEXT: csetm r4, ne -; CHECK-NEXT: subs r2, r7, r2 -; CHECK-NEXT: sbcs.w r0, r6, 
r0 +; CHECK-NEXT: movlo r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: subs r0, r0, r2 +; CHECK-NEXT: sbcs r0, r12, #0 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: vmov q3[2], q3[0], r0, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r0, r4 -; CHECK-NEXT: eor.w r0, r5, r3 -; CHECK-NEXT: orrs.w r0, r0, r12 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: vmov q2[2], q2[0], r5, r0 +; CHECK-NEXT: vmov q2[3], q2[1], r5, r0 ; CHECK-NEXT: cset r0, ne ; CHECK-NEXT: tst.w r0, #1 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: teq.w r7, r9 -; CHECK-NEXT: cset r2, ne -; CHECK-NEXT: tst.w r2, #1 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 -; CHECK-NEXT: vmov q4[3], q4[1], r2, r0 -; CHECK-NEXT: veor q4, q4, q2 -; CHECK-NEXT: vand q4, q4, q3 -; CHECK-NEXT: @ implicit-def: $q3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: and r2, r2, #1 -; CHECK-NEXT: orr.w r3, r2, r0, lsl #1 -; CHECK-NEXT: sub.w r2, r1, #8 -; CHECK-NEXT: lsls r0, r3, #31 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vmov.f32 s1, s0 +; CHECK-NEXT: vmov.f32 s3, s2 +; CHECK-NEXT: veor q3, q0, q1 +; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: @ implicit-def: $q2 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: and r4, r4, #1 +; CHECK-NEXT: orr.w r4, r4, r0, lsl #1 +; CHECK-NEXT: lsls r0, r4, #31 +; CHECK-NEXT: and r4, r4, #3 +; CHECK-NEXT: sub.w r0, r1, #8 ; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrne r0, [r2] -; CHECK-NEXT: vmovne.32 q3[0], r0 -; CHECK-NEXT: and r0, r3, #3 -; CHECK-NEXT: lsls r0, r0, #30 +; CHECK-NEXT: ldrne r5, [r0] +; CHECK-NEXT: vmovne.32 q2[0], r5 +; CHECK-NEXT: lsls r4, r4, #30 ; CHECK-NEXT: itt mi -; CHECK-NEXT: ldrmi r0, [r2, #4] -; CHECK-NEXT: vmovmi.32 q3[2], r0 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: and r2, r2, #1 -; CHECK-NEXT: orr.w r2, r2, r0, lsl #1 -; CHECK-NEXT: lsls r0, r2, #31 +; CHECK-NEXT: ldrmi r0, [r0, #4] +; CHECK-NEXT: vmovmi.32 q2[2], r0 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: and r4, r4, #1 +; CHECK-NEXT: orr.w r0, r4, r0, lsl #1 +; CHECK-NEXT: lsls r4, r0, #31 +; CHECK-NEXT: and r0, r0, #3 ; CHECK-NEXT: itt ne -; CHECK-NEXT: vmovne r0, s12 -; CHECK-NEXT: strne r0, [r1] -; CHECK-NEXT: and r0, r2, #3 +; CHECK-NEXT: vmovne r4, s8 +; CHECK-NEXT: strne r4, [r1] ; CHECK-NEXT: lsls r0, r0, #30 ; CHECK-NEXT: itt mi -; CHECK-NEXT: vmovmi r0, s14 +; CHECK-NEXT: vmovmi r0, s10 ; CHECK-NEXT: strmi r0, [r1, #4] ; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: le lr, .LBB4_2 ; CHECK-NEXT: .LBB4_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: -; CHECK-NEXT: .LCPI4_0: +; CHECK-NEXT: .LCPI4_1: ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .LCPI4_0: +; CHECK-NEXT: .long 0x00000000 @ float 0 entry: %cmp9.not = icmp eq i8 %m, 0 br i1 %cmp9.not, label %for.cond.cleanup, label %for.body.preheader diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll @@ -52,12 +52,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave 
{d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vpt.s32 lt, q0, zr ; CHECK-NEXT: vldrwt.u32 q5, [r0] -; CHECK-NEXT: vmov.f64 d8, d10 -; CHECK-NEXT: vmov.f32 s18, s21 +; CHECK-NEXT: vmov.f64 d8, d11 +; CHECK-NEXT: vmov.f32 s18, s23 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: bl __aeabi_l2d @@ -67,24 +67,23 @@ ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov.f64 d12, d11 -; CHECK-NEXT: vmov.f32 s26, s23 +; CHECK-NEXT: vmov.f32 s22, s21 ; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov r2, s22 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov r2, s20 ; CHECK-NEXT: vmov d11, r0, r1 ; CHECK-NEXT: asrs r3, r2, #31 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_l2d ; CHECK-NEXT: vmov d10, r0, r1 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r7, pc} entry: %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -103,14 +103,18 @@ ; CHECK-NEXT: vmov.u16 r0, q1[6] ; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.16 q2[4], r0 ; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.f32 s2, s10 +; CHECK-NEXT: vmov.16 q3[6], r0 ; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.f32 s3, s15 ; CHECK-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> @@ -387,19 +391,23 @@ ; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vmov r0, s3 ; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s8, s0 ; CHECK-NEXT: vmov.16 q1[2], r1 ; CHECK-NEXT: vmov.16 q1[3], r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.16 q2[4], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov.16 q2[5], r0 ; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q3[6], r1 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.f32 s7, s15 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll --- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll +++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll @@ -71,10 +71,12 @@ ; CHECK-FP-NEXT: vmov r0, s5 ; CHECK-FP-NEXT: adds r1, r1, r3 ; 
CHECK-FP-NEXT: vmov q0[2], q0[0], r1, lr +; CHECK-FP-NEXT: vmov.32 q1[0], r1 ; CHECK-FP-NEXT: adcs r0, r2 ; CHECK-FP-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-FP-NEXT: vmov r0, r1, d0 +; CHECK-FP-NEXT: vmov.32 q1[1], r0 ; CHECK-FP-NEXT: vmov r2, r3, d1 +; CHECK-FP-NEXT: vmov r0, r1, d2 ; CHECK-FP-NEXT: pop {r7, pc} entry: %sum = add <2 x i64> %lhs, %rhs diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -189,20 +189,20 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vmov.f32 s16, s6 +; CHECK-NEXT: vmov.f32 s14, s3 ; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov.f32 s18, s7 ; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: asrs r1, r0, #31 ; CHECK-NEXT: subs r0, r0, r2 ; CHECK-NEXT: sbc.w r1, r1, r2, asr #31 ; CHECK-NEXT: add.w r0, r0, r1, asr #31 ; CHECK-NEXT: eor.w r0, r0, r1, asr #31 ; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: asrs r2, r1, #31 ; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: sbc.w r2, r2, r3, asr #31 @@ -424,51 +424,40 @@ ; CHECK: @ %bb.0: ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.i64 q4, #0xffffffff -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vand q2, q2, q4 -; CHECK-NEXT: vand q3, q3, q4 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r1, s13 -; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: vmov.f32 s16, s2 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vand q1, q1, q4 -; CHECK-NEXT: vand q4, q0, q4 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: sbc r2, r0, #0 +; CHECK-NEXT: add.w r1, r1, r2, asr #31 +; CHECK-NEXT: eor.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: sbc.w r0, r1, r0 -; CHECK-NEXT: add.w r1, r2, r0, asr #31 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: eor.w r12, r1, r0, asr #31 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: subs r0, r0, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: sbc.w r1, r2, r1 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: add.w r0, r0, r1, asr #31 -; CHECK-NEXT: eor.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r12 -; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: sbc r3, r0, #0 +; CHECK-NEXT: add.w r2, r2, r3, asr #31 +; CHECK-NEXT: eor.w r2, r2, r3, asr #31 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: sbc r2, r0, #0 +; CHECK-NEXT: add.w r1, r1, r2, asr #31 +; CHECK-NEXT: eor.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: subs r2, r3, r2 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: sbc.w r0, r1, r0 -; CHECK-NEXT: add.w r1, r2, r0, asr #31 -; CHECK-NEXT: vmov r2, s19 -; CHECK-NEXT: eor.w r12, r1, r0, asr #31 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: subs r0, r0, r3 -; CHECK-NEXT: 
sbc.w r1, r2, r1 -; CHECK-NEXT: add.w r0, r0, r1, asr #31 -; CHECK-NEXT: eor.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-NEXT: sbc r0, r0, #0 +; CHECK-NEXT: add.w r2, r2, r0, asr #31 +; CHECK-NEXT: eor.w r0, r2, r0, asr #31 +; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr %zextsrc1 = zext <4 x i32> %src1 to <4 x i64> @@ -596,75 +585,65 @@ define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vabd_loop_s32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: mov.w r12, #1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov.f32 s12, s4 -; CHECK-NEXT: vmov.f32 s16, s8 -; CHECK-NEXT: vmov.f32 s14, s5 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov.f32 s18, s9 -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov r7, s18 -; CHECK-NEXT: asrs r4, r3, #31 -; CHECK-NEXT: subs.w r8, r3, r5 -; CHECK-NEXT: sbc.w r4, r4, r5, asr #31 -; CHECK-NEXT: asrs r5, r4, #31 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: bfi r4, r5, #0, #4 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: vmov.f32 s12, s6 -; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: vldrw.u32 q4, [r1], #16 ; CHECK-NEXT: vmov.f32 s4, s10 ; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s14, s19 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov.f32 s10, s9 +; CHECK-NEXT: vmov.f32 s18, s17 +; CHECK-NEXT: vmov r7, s8 +; CHECK-NEXT: vmov r6, s18 +; CHECK-NEXT: asr.w r12, r3, #31 +; CHECK-NEXT: subs.w r8, r3, r4 +; CHECK-NEXT: sbc.w r12, r12, r4, asr #31 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: asrs r3, r4, #31 +; CHECK-NEXT: subs r4, r4, r6 +; CHECK-NEXT: sbc.w r9, r3, r6, asr #31 +; CHECK-NEXT: vmov r6, s16 ; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: subs.w r9, r5, r7 -; CHECK-NEXT: asr.w r6, r5, #31 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: sbc.w r6, r6, r7, asr #31 -; CHECK-NEXT: and.w r6, r12, r6, asr #31 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r4, r6, #4, #4 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: subs.w r10, r6, r3 -; CHECK-NEXT: asr.w r7, r6, #31 -; CHECK-NEXT: sbc.w r3, r7, r3, asr #31 -; CHECK-NEXT: vmov r7, s4 -; CHECK-NEXT: asrs r6, r5, #31 -; CHECK-NEXT: asr.w r11, r3, #31 -; CHECK-NEXT: and.w r3, r12, r3, asr #31 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: subs r5, r5, r7 -; CHECK-NEXT: sbc.w r6, r6, r7, asr #31 +; CHECK-NEXT: subs r5, r7, r6 +; CHECK-NEXT: asr.w r7, r7, #31 +; CHECK-NEXT: vmov q2[2], q2[0], r5, r8 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: sbc.w r6, r7, r6, asr #31 ; CHECK-NEXT: asrs r6, r6, #31 -; CHECK-NEXT: vmov q1[2], q1[0], r6, r11 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov q1[2], q1[0], r8, r5 -; CHECK-NEXT: vmov q1[3], q1[1], r9, r10 -; CHECK-NEXT: and r6, r6, #1 -; CHECK-NEXT: rsbs r6, r6, #0 -; CHECK-NEXT: bfi r4, r6, #8, #4 -; CHECK-NEXT: bfi r4, r3, #12, #4 -; CHECK-NEXT: vmsr 
p0, r4 +; CHECK-NEXT: subs r7, r3, r5 +; CHECK-NEXT: asr.w r3, r3, #31 +; CHECK-NEXT: vmov q2[3], q2[1], r4, r7 +; CHECK-NEXT: mov.w r7, #0 +; CHECK-NEXT: sbc.w r3, r3, r5, asr #31 +; CHECK-NEXT: bfi r7, r6, #0, #4 +; CHECK-NEXT: asr.w r4, r9, #31 +; CHECK-NEXT: asr.w r6, r12, #31 +; CHECK-NEXT: bfi r7, r4, #4, #4 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: bfi r7, r6, #8, #4 +; CHECK-NEXT: bfi r7, r3, #12, #4 +; CHECK-NEXT: vmsr p0, r7 ; CHECK-NEXT: vpst -; CHECK-NEXT: vsubt.i32 q1, q0, q1 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vsubt.i32 q2, q0, q2 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} entry: br label %vector.body @@ -809,85 +788,59 @@ define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) { ; CHECK-LABEL: vabd_loop_u32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: mov.w lr, #256 -; CHECK-NEXT: vmov.i64 q0, #0xffffffff +; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: .LBB11_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov.f32 s12, s8 -; CHECK-NEXT: vmov.f32 s14, s9 -; CHECK-NEXT: vand q4, q3, q0 -; CHECK-NEXT: vldrw.u32 q3, [r0], #16 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov.f32 s20, s12 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vand q5, q5, q0 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmov r6, s20 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: vmov r7, s23 -; CHECK-NEXT: subs.w r8, r6, r3 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: sbc.w r4, r5, r4 -; CHECK-NEXT: vmov r6, s19 -; CHECK-NEXT: asrs r5, r4, #31 -; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: bfi r4, r5, #0, #4 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov.f32 s16, s10 -; CHECK-NEXT: vmov.f32 s18, s11 -; CHECK-NEXT: vand q2, q4, q0 -; CHECK-NEXT: vmov.f32 s16, s14 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vand q3, q4, q0 -; CHECK-NEXT: vmov r12, s12 -; CHECK-NEXT: subs.w r9, r3, r5 -; CHECK-NEXT: vmov r5, s14 -; CHECK-NEXT: sbc.w r3, r7, r6 -; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: vmov r6, s15 -; CHECK-NEXT: and.w r3, r7, r3, asr #31 -; CHECK-NEXT: vmov r7, s10 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r4, r3, #4, #4 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: subs.w r10, r5, r7 -; CHECK-NEXT: vmov r7, s9 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: sbc.w r3, r6, r3 -; CHECK-NEXT: vmov r6, s8 -; CHECK-NEXT: asr.w r11, r3, #31 -; CHECK-NEXT: subs.w r6, r12, r6 -; CHECK-NEXT: sbc.w r7, r5, r7 -; CHECK-NEXT: asrs r7, r7, #31 -; CHECK-NEXT: vmov q2[2], q2[0], r7, r11 +; CHECK-NEXT: vldrw.u32 q4, [r0], #16 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmov.f32 s14, s19 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov.f32 s10, s9 +; CHECK-NEXT: vmov.f32 s18, s17 ; 
CHECK-NEXT: vmov r7, s8 -; CHECK-NEXT: vmov q2[2], q2[0], r8, r6 -; CHECK-NEXT: vmov q2[3], q2[1], r9, r10 -; CHECK-NEXT: and r7, r7, #1 -; CHECK-NEXT: rsbs r7, r7, #0 -; CHECK-NEXT: bfi r4, r7, #8, #4 -; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: and.w r3, r7, r3, asr #31 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r4, r3, #12, #4 -; CHECK-NEXT: vmsr p0, r4 +; CHECK-NEXT: vmov r5, s10 +; CHECK-NEXT: vmov r6, s18 +; CHECK-NEXT: subs r4, r4, r3 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: sbc r8, r12, #0 +; CHECK-NEXT: subs r5, r6, r5 +; CHECK-NEXT: sbc r6, r12, #0 +; CHECK-NEXT: subs r3, r3, r7 +; CHECK-NEXT: vmov q2[2], q2[0], r3, r4 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: sbc r7, r12, #0 +; CHECK-NEXT: subs r3, r4, r3 +; CHECK-NEXT: asr.w r4, r7, #31 +; CHECK-NEXT: vmov q2[3], q2[1], r5, r3 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: asr.w r3, r6, #31 +; CHECK-NEXT: bfi r5, r4, #0, #4 +; CHECK-NEXT: bfi r5, r3, #4, #4 +; CHECK-NEXT: asr.w r3, r8, #31 +; CHECK-NEXT: bfi r5, r3, #8, #4 +; CHECK-NEXT: sbc r3, r12, #0 +; CHECK-NEXT: asrs r3, r3, #31 +; CHECK-NEXT: bfi r5, r3, #12, #4 +; CHECK-NEXT: vmsr p0, r5 ; CHECK-NEXT: vpst -; CHECK-NEXT: vsubt.i32 q2, q1, q2 +; CHECK-NEXT: vsubt.i32 q2, q0, q2 ; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB11_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: br label %vector.body diff --git a/llvm/test/CodeGen/Thumb2/mve-vcreate.ll b/llvm/test/CodeGen/Thumb2/mve-vcreate.ll --- a/llvm/test/CodeGen/Thumb2/mve-vcreate.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcreate.ll @@ -236,19 +236,21 @@ ; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: ldr r5, [sp, #28] -; CHECK-NEXT: lsll r2, r9, #16 ; CHECK-NEXT: lsll r12, r7, #16 -; CHECK-NEXT: orr.w r5, r5, r12 -; CHECK-NEXT: orrs r2, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r5 +; CHECK-NEXT: lsll r2, r9, #16 +; CHECK-NEXT: orr.w r12, r12, r5 +; CHECK-NEXT: orr.w r5, r2, r3 ; CHECK-NEXT: ldrd r2, r3, [sp, #16] ; CHECK-NEXT: orr.w r0, r1, r0, lsl #16 +; CHECK-NEXT: vmov q0[2], q0[0], r5, r12 ; CHECK-NEXT: orr.w r0, r0, r9 ; CHECK-NEXT: orr.w r2, r3, r2, lsl #16 ; CHECK-NEXT: orrs r2, r7 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r2 -; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov.32 q0[0], r5 +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov r0, r1, d0 ; CHECK-NEXT: pop.w {r5, r7, r9, pc} entry: %conv = zext i16 %a to i64 @@ -308,59 +310,59 @@ define hidden <16 x i8> @create_i8(i8 zeroext %a1, i8 zeroext %b1, i8 zeroext %c1, i8 zeroext %d1, i8 zeroext %a2, i8 zeroext %b2, i8 zeroext %c2, i8 zeroext %d2, i8 zeroext %a3, i8 zeroext %b3, i8 zeroext %c3, i8 zeroext %d3, i8 zeroext %a4, i8 zeroext %b4, i8 zeroext %c4, i8 zeroext %d4) local_unnamed_addr #0 { ; CHECK-LABEL: create_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r11, lr} -; CHECK-NEXT: ldr r4, [sp, #68] +; CHECK-NEXT: .save {r4, r5, r6, r7, r9, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r9, r11, lr} +; CHECK-NEXT: ldr.w r12, [sp, #32] ; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: ldr r6, [sp, #64] +; CHECK-NEXT: ldr r4, [sp, #28] ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: lsll r4, r11, #16 -; CHECK-NEXT: mov lr, r1 -; CHECK-NEXT: lsll r6, r5, #24 +; CHECK-NEXT: lsll r12, r11, #16 +; 
CHECK-NEXT: lsls r1, r1, #16 +; CHECK-NEXT: lsll r4, r5, #24 +; CHECK-NEXT: orr.w r0, r1, r0, lsl #22 +; CHECK-NEXT: orr.w r12, r12, r4 +; CHECK-NEXT: ldr r4, [sp, #36] ; CHECK-NEXT: movs r7, #0 -; CHECK-NEXT: orr.w r1, r6, r4 -; CHECK-NEXT: ldr r4, [sp, #72] -; CHECK-NEXT: mov r12, r3 -; CHECK-NEXT: ldr r3, [sp, #76] +; CHECK-NEXT: orr.w r0, r0, r2, lsl #8 ; CHECK-NEXT: lsll r4, r7, #8 -; CHECK-NEXT: ldr r6, [sp, #36] -; CHECK-NEXT: orrs r1, r4 -; CHECK-NEXT: ldr r4, [sp, #32] -; CHECK-NEXT: orr.w r8, r1, r3 +; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: orr.w r12, r12, r4 +; CHECK-NEXT: ldr r4, [sp, #40] +; CHECK-NEXT: orrs r0, r5 +; CHECK-NEXT: ldr r6, [sp, #60] +; CHECK-NEXT: orr.w r12, r12, r4 +; CHECK-NEXT: orr.w r0, r0, r11 +; CHECK-NEXT: ldr r4, [sp, #64] ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: lsll r6, r3, #16 -; CHECK-NEXT: lsll r4, r1, #24 +; CHECK-NEXT: orr.w r2, r0, r7 +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: lsll r4, r3, #16 +; CHECK-NEXT: lsll r6, r7, #24 +; CHECK-NEXT: orrs r6, r4 +; CHECK-NEXT: ldr r4, [sp, #68] ; CHECK-NEXT: mov.w r9, #0 -; CHECK-NEXT: orrs r4, r6 -; CHECK-NEXT: ldr r6, [sp, #40] -; CHECK-NEXT: lsll r6, r9, #8 -; CHECK-NEXT: orrs r4, r6 -; CHECK-NEXT: ldr r6, [sp, #44] -; CHECK-NEXT: orrs r4, r6 -; CHECK-NEXT: ldr r6, [sp, #48] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r8 -; CHECK-NEXT: ldr r4, [sp, #52] -; CHECK-NEXT: lsls r4, r4, #16 -; CHECK-NEXT: orr.w r4, r4, r6, lsl #22 -; CHECK-NEXT: ldr r6, [sp, #56] -; CHECK-NEXT: orr.w r4, r4, r6, lsl #8 -; CHECK-NEXT: ldr r6, [sp, #60] -; CHECK-NEXT: add r4, r6 -; CHECK-NEXT: orrs r4, r5 -; CHECK-NEXT: orr.w r4, r4, r11 -; CHECK-NEXT: orrs r4, r7 -; CHECK-NEXT: lsl.w r7, lr, #16 -; CHECK-NEXT: orr.w r0, r7, r0, lsl #22 -; CHECK-NEXT: orr.w r0, r0, r2, lsl #8 -; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: orrs r0, r3 -; CHECK-NEXT: orr.w r0, r0, r9 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r4 +; CHECK-NEXT: ldr r5, [sp, #72] +; CHECK-NEXT: lsll r4, r9, #8 +; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: orrs r6, r4 +; CHECK-NEXT: vmov.32 q0[1], r2 +; CHECK-NEXT: orrs r6, r5 ; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov q0[2], q0[0], r12, r6 +; CHECK-NEXT: ldr r6, [sp, #48] +; CHECK-NEXT: ldr r5, [sp, #44] +; CHECK-NEXT: lsls r6, r6, #16 +; CHECK-NEXT: orr.w r6, r6, r5, lsl #22 +; CHECK-NEXT: ldr r5, [sp, #52] +; CHECK-NEXT: orr.w r6, r6, r5, lsl #8 +; CHECK-NEXT: ldr r5, [sp, #56] +; CHECK-NEXT: add r6, r5 +; CHECK-NEXT: orrs r7, r6 +; CHECK-NEXT: orrs r3, r7 +; CHECK-NEXT: orr.w r3, r3, r9 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 ; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r9, r11, pc} entry: %conv = zext i8 %a1 to i64 %shl = shl nuw nsw i64 %conv, 54 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll @@ -36,14 +36,11 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x) { ; CHECK-LABEL: add_v2i32_v2i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i64 q1, #0xffffffff -; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x 
i32> %x to <2 x i64> @@ -55,11 +52,8 @@ ; CHECK-LABEL: add_v2i32_v2i64_sext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr @@ -136,38 +130,35 @@ ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u16 r3, q0[6] ; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 ; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: vmov.u16 r2, q0[7] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <8 x i16> %x to <8 x i64> @@ -178,57 +169,37 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x) { ; CHECK-LABEL: add_v8i16_v8i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.u16 r2, q0[1] ; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: sxth r1, r1 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: asrs r2, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: adc.w r12, r1, r0, asr #31 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: sxth r1, r1 -; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: sxth r1, r1 -; CHECK-NEXT: sxth r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov 
r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: sxth r1, r1 -; CHECK-NEXT: adds r2, r2, r1 -; CHECK-NEXT: adc.w r1, r0, r1, asr #31 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: sxth r3, r0 -; CHECK-NEXT: adds r0, r2, r3 -; CHECK-NEXT: adc.w r1, r1, r3, asr #31 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr entry: %xx = sext <8 x i16> %x to <8 x i64> @@ -263,12 +234,12 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x) { ; CHECK-LABEL: add_v2i16_v2i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.i64 q1, #0xffff +; CHECK-NEXT: vmov.i32 q1, #0xffff ; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -282,11 +253,8 @@ ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: sxth r0, r0 -; CHECK-NEXT: vmov.32 q1[0], r0 ; CHECK-NEXT: asrs r1, r0, #31 -; CHECK-NEXT: vmov.32 q1[1], r1 ; CHECK-NEXT: sxth r2, r2 -; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adc.w r1, r1, r2, asr #31 ; CHECK-NEXT: bx lr @@ -455,86 +423,75 @@ ; CHECK-NEXT: vand q2, q2, q1 ; CHECK-NEXT: vmov.u8 r2, q0[2] ; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u8 r3, q0[6] ; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 ; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r1, s10 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vand q3, q3, q1 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: vmov.u8 r2, q0[7] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 -; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: 
adds r0, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vand q2, q2, q1 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[9] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[11] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[13] ; CHECK-NEXT: vmov q2[2], q2[0], r3, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q2, q2, q1 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: vmov.u8 r2, q0[15] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i64> @@ -545,117 +502,69 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x) { ; CHECK-LABEL: add_v16i8_v16i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.u8 r2, q0[1] ; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: vmov q1[2], q1[0], r1, r0 -; CHECK-NEXT: asrs r2, r0, #31 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r1, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: adc.w r12, r1, r0, asr #31 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 -; 
CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: adc.w r12, r0, r1, asr #31 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: vmov q1[2], q1[0], r3, r1 -; CHECK-NEXT: asrs r0, r1, #31 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r0, r0, r1, asr #31 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: adds r2, r2, r1 -; CHECK-NEXT: adc.w r1, r0, r1, asr #31 -; CHECK-NEXT: vmov.u8 r0, q0[15] -; CHECK-NEXT: sxtb r3, r0 -; CHECK-NEXT: adds r0, r2, r3 -; CHECK-NEXT: adc.w r1, r1, r3, asr #31 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: asrs r1, r0, #31 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u8 r2, q0[2] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u8 r2, q0[3] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u8 r2, q0[4] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u8 r2, q0[6] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc.w r1, r1, r2, asr #31 +; CHECK-NEXT: vmov.u8 r2, q0[7] +; 
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u8 r2, q0[8]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u8 r2, q0[9]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u8 r2, q0[10]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u8 r2, q0[11]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u8 r2, q0[12]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u8 r2, q0[13]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u8 r2, q0[14]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i64>
@@ -673,39 +582,36 @@
; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
; CHECK-NEXT: vmov.u16 r2, q0[2]
; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
; CHECK-NEXT: vmov.u16 r2, q0[4]
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r1, s12
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: vmov r1, s10
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT: vmov r2, s11
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s15
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov.u16 r3, q0[6]
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i64>
@@ -716,57 +622,37 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.u16 r0, q0[1]
-; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vmov.u16 r0, q0[0]
+; CHECK-NEXT: vmov.u16 r2, q0[1]
; CHECK-NEXT: sxtb r0, r0
-; CHECK-NEXT: sxtb r1, r1
-; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT: asrs r2, r0, #31
-; CHECK-NEXT: asrs r1, r1, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r1, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: adc.w r12, r1, r0, asr #31
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: sxtb r1, r1
-; CHECK-NEXT: sxtb r3, r3
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT: asrs r0, r1, #31
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r0, s5
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adc.w r0, r0, r12
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov.u16 r3, q0[4]
-; CHECK-NEXT: adc.w r12, r0, r1, asr #31
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: sxtb r1, r1
-; CHECK-NEXT: sxtb r3, r3
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r1
-; CHECK-NEXT: asrs r0, r1, #31
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r0
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r0, s5
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adc.w r0, r0, r12
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r0, r0, r1, asr #31
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: sxtb r1, r1
-; CHECK-NEXT: adds r2, r2, r1
-; CHECK-NEXT: adc.w r1, r0, r1, asr #31
-; CHECK-NEXT: vmov.u16 r0, q0[7]
-; CHECK-NEXT: sxtb r3, r0
-; CHECK-NEXT: adds r0, r2, r3
-; CHECK-NEXT: adc.w r1, r1, r3, asr #31
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: asrs r1, r0, #31
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u16 r2, q0[3]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u16 r2, q0[4]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u16 r2, q0[5]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u16 r2, q0[6]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
+; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i64>
@@ -803,12 +689,12 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xff
+; CHECK-NEXT: vmov.i32 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s3
+; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i8> %x to <2 x i64>
@@ -822,11 +708,8 @@
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: sxtb r0, r0
-; CHECK-NEXT: vmov.32 q1[0], r0
; CHECK-NEXT: asrs r1, r0, #31
-; CHECK-NEXT: vmov.32 q1[1], r1
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
@@ -889,19 +772,14 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov.i64 q1, #0xffffffff
-; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r12, s3
-; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, lr, r12
+; CHECK-NEXT: adc r3, r12, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@@ -913,15 +791,12 @@
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov.32 q1[0], r2
; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: vmov r12, s4
-; CHECK-NEXT: adds.w r12, r12, r3
-; CHECK-NEXT: adc.w r2, r2, r3, asr #31
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: asr.w r12, r2, #31
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc.w r3, r12, r3, asr #31
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i32> %x to <2 x i64>
@@ -995,8 +870,8 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.u16 r2, q0[1]
; CHECK-NEXT: vmov.u16 r3, q0[0]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
@@ -1007,38 +882,35 @@
; CHECK-NEXT: add.w r12, r3, r2
; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: add r2, r12
; CHECK-NEXT: add.w r12, r2, r3
; CHECK-NEXT: vmov.u16 r3, q0[5]
; CHECK-NEXT: vmov.u16 r2, q0[4]
-; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: vmov lr, s15
-; CHECK-NEXT: add r12, r2
-; CHECK-NEXT: vmov r2, s11
-; CHECK-NEXT: adds.w r4, r12, r3
-; CHECK-NEXT: vmov.u16 r3, q0[6]
-; CHECK-NEXT: adc.w r12, r2, lr
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: add r2, r12
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r3, q0[6]
+; CHECK-NEXT: adc r12, r12, #0
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r4, s3
-; CHECK-NEXT: adc.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, r12, r4
+; CHECK-NEXT: adc r3, r12, #0
+; CHECK-NEXT: adds.w r2, r2, lr
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1049,62 +921,42 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: vmov.u16 r2, q0[1]
-; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: vmov.u16 r3, q0[1]
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: sxth r3, r3
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asr.w r12, r2, #31
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
-; CHECK-NEXT: vmov lr, s6
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r12, s5
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: adc.w r12, r12, r2, asr #31
-; CHECK-NEXT: vmov.u16 r2, q0[3]
+; CHECK-NEXT: adds.w lr, r2, r3
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: sxth r2, r2
+; CHECK-NEXT: adds.w r12, lr, r2
+; CHECK-NEXT: adc.w r2, r3, r2, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: sxth r3, r3
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT: asrs r4, r2, #31
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r4
-; CHECK-NEXT: vmov r4, s4
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: adds.w r4, r4, lr
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds.w lr, r4, r3
-; CHECK-NEXT: vmov.u16 r4, q0[5]
-; CHECK-NEXT: adc.w r12, r12, r2, asr #31
-; CHECK-NEXT: vmov.u16 r2, q0[4]
-; CHECK-NEXT: sxth r4, r4
-; CHECK-NEXT: sxth r2, r2
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT: asrs r3, r4, #31
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, r12, r4, asr #31
-; CHECK-NEXT: vmov.u16 r4, q0[6]
-; CHECK-NEXT: sxth r4, r4
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: adc.w r3, r3, r4, asr #31
-; CHECK-NEXT: vmov.u16 r4, q0[7]
-; CHECK-NEXT: sxth r4, r4
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: adc.w r3, r3, r4, asr #31
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: sxth r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[5]
+; CHECK-NEXT: sxth r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[6]
+; CHECK-NEXT: sxth r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w lr, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[7]
+; CHECK-NEXT: sxth r3, r3
+; CHECK-NEXT: adds.w r2, r12, r3
+; CHECK-NEXT: adc.w r3, lr, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1141,14 +993,13 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xffff
+; CHECK-NEXT: vmov.i32 q1, #0xffff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i16> %x to <2 x i64>
@@ -1163,15 +1014,12 @@
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxth r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
+; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: sxth r3, r3
-; CHECK-NEXT: vmov r12, s4
-; CHECK-NEXT: adds.w r12, r12, r3
-; CHECK-NEXT: adc.w r2, r2, r3, asr #31
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc.w r3, r12, r3, asr #31
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i16> %x to <2 x i64>
@@ -1343,8 +1191,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.u8 r2, q0[1]
; CHECK-NEXT: vmov.u8 r3, q0[0]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
@@ -1355,86 +1203,75 @@
; CHECK-NEXT: add.w r12, r3, r2
; CHECK-NEXT: vmov.u8 r3, q0[3]
; CHECK-NEXT: vmov.u8 r2, q0[2]
-; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: add r2, r12
; CHECK-NEXT: add.w r12, r2, r3
; CHECK-NEXT: vmov.u8 r3, q0[5]
; CHECK-NEXT: vmov.u8 r2, q0[4]
-; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: vmov lr, s15
-; CHECK-NEXT: add r12, r2
-; CHECK-NEXT: vmov r2, s11
-; CHECK-NEXT: adds.w r4, r12, r3
-; CHECK-NEXT: vmov.u8 r3, q0[6]
-; CHECK-NEXT: adc.w r12, r2, lr
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: add r2, r12
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.u8 r2, q0[7]
+; CHECK-NEXT: vmov.u8 r3, q0[6]
+; CHECK-NEXT: adc r12, r12, #0
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov r2, s9
-; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r4, s11
-; CHECK-NEXT: adc.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: adds.w lr, r3, r2
-; CHECK-NEXT: vmov.u8 r2, q0[8]
-; CHECK-NEXT: adc.w r3, r12, r4
-; CHECK-NEXT: vmov.u8 r4, q0[9]
-; CHECK-NEXT: vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT: adc r3, r12, #0
+; CHECK-NEXT: adds.w r12, lr, r2
+; CHECK-NEXT: vmov.u8 r2, q0[9]
+; CHECK-NEXT: adc lr, r3, #0
+; CHECK-NEXT: vmov.u8 r3, q0[8]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: vmov r2, s9
-; CHECK-NEXT: adds.w r4, r4, lr
-; CHECK-NEXT: adc.w r12, r3, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: vmov r3, s11
-; CHECK-NEXT: adds.w lr, r4, r2
-; CHECK-NEXT: vmov.u8 r4, q0[11]
-; CHECK-NEXT: vmov.u8 r2, q0[10]
-; CHECK-NEXT: adc.w r3, r3, r12
-; CHECK-NEXT: vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT: adc r3, lr, #0
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov.u8 r2, q0[11]
+; CHECK-NEXT: adc lr, r3, #0
+; CHECK-NEXT: vmov.u8 r3, q0[10]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: vmov r2, s9
-; CHECK-NEXT: adds.w r4, r4, lr
-; CHECK-NEXT: adc.w r12, r3, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: vmov r3, s11
-; CHECK-NEXT: adds.w lr, r4, r2
-; CHECK-NEXT: vmov.u8 r4, q0[13]
-; CHECK-NEXT: vmov.u8 r2, q0[12]
-; CHECK-NEXT: adc.w r3, r3, r12
-; CHECK-NEXT: vmov q2[2], q2[0], r2, r4
+; CHECK-NEXT: adc r3, lr, #0
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov.u8 r2, q0[13]
+; CHECK-NEXT: adc lr, r3, #0
+; CHECK-NEXT: vmov.u8 r3, q0[12]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: vmov r2, s9
-; CHECK-NEXT: adds.w r4, r4, lr
-; CHECK-NEXT: adc.w r12, r3, r2
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: vmov r3, s11
-; CHECK-NEXT: adds.w lr, r4, r2
-; CHECK-NEXT: vmov.u8 r4, q0[15]
-; CHECK-NEXT: vmov.u8 r2, q0[14]
-; CHECK-NEXT: adc.w r3, r3, r12
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r4
+; CHECK-NEXT: adc r3, lr, #0
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov.u8 r2, q0[15]
+; CHECK-NEXT: adc lr, r3, #0
+; CHECK-NEXT: vmov.u8 r3, q0[14]
+; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r4, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds.w r4, r4, lr
-; CHECK-NEXT: adc.w r12, r3, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: vmov r3, s3
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: adc.w r3, r3, r12
+; CHECK-NEXT: adc r3, lr, #0
+; CHECK-NEXT: adds.w r2, r2, r12
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1445,122 +1282,74 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: vmov.u8 r2, q0[1]
-; CHECK-NEXT: vmov.u8 r3, q0[0]
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vmov.u8 r2, q0[0]
+; CHECK-NEXT: vmov.u8 r3, q0[1]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asr.w r12, r2, #31
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
-; CHECK-NEXT: vmov lr, s6
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r12, s5
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: vmov.u8 r3, q0[2]
-; CHECK-NEXT: adc.w r12, r12, r2, asr #31
-; CHECK-NEXT: vmov.u8 r2, q0[3]
+; CHECK-NEXT: adds.w lr, r2, r3
+; CHECK-NEXT: vmov.u8 r2, q0[2]
+; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds.w r12, lr, r2
+; CHECK-NEXT: adc.w r2, r3, r2, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[3]
; CHECK-NEXT: sxtb r3, r3
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT: asrs r4, r2, #31
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r4
-; CHECK-NEXT: vmov r4, s4
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: adds.w r4, r4, lr
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds.w lr, r4, r3
-; CHECK-NEXT: vmov.u8 r4, q0[5]
-; CHECK-NEXT: adc.w r12, r12, r2, asr #31
-; CHECK-NEXT: vmov.u8 r2, q0[4]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT: asrs r3, r4, #31
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds.w lr, r3, r2
-; CHECK-NEXT: vmov.u8 r2, q0[6]
-; CHECK-NEXT: adc.w r12, r12, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[7]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT: asrs r3, r4, #31
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds.w lr, r3, r2
-; CHECK-NEXT: vmov.u8 r2, q0[8]
-; CHECK-NEXT: adc.w r12, r12, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[9]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT: asrs r3, r4, #31
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds.w lr, r3, r2
-; CHECK-NEXT: vmov.u8 r2, q0[10]
-; CHECK-NEXT: adc.w r12, r12, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[11]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT: asrs r3, r4, #31
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds.w lr, r3, r2
-; CHECK-NEXT: vmov.u8 r2, q0[12]
-; CHECK-NEXT: adc.w r12, r12, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[13]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT: asrs r3, r4, #31
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, r12, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[14]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: adc.w r3, r3, r4, asr #31
-; CHECK-NEXT: vmov.u8 r4, q0[15]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: adc.w r3, r3, r4, asr #31
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[4]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[5]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[6]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[7]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[8]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[9]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[10]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[11]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[12]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[13]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[14]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w lr, r2, r3, asr #31
+; CHECK-NEXT: vmov.u8 r3, q0[15]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r2, r12, r3
+; CHECK-NEXT: adc.w r3, lr, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@@ -1571,8 +1360,8 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vmov.u16 r2, q0[1]
@@ -1584,38 +1373,35 @@
; CHECK-NEXT: add.w r12, r3, r2
; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: add r2, r12
; CHECK-NEXT: add.w r12, r2, r3
; CHECK-NEXT: vmov.u16 r3, q0[5]
; CHECK-NEXT: vmov.u16 r2, q0[4]
-; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT: vand q3, q3, q1
-; CHECK-NEXT: vmov r2, s12
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: vmov lr, s15
-; CHECK-NEXT: add r12, r2
-; CHECK-NEXT: vmov r2, s11
-; CHECK-NEXT: adds.w r4, r12, r3
-; CHECK-NEXT: vmov.u16 r3, q0[6]
-; CHECK-NEXT: adc.w r12, r2, lr
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r2, s8
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: add r2, r12
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[7]
+; CHECK-NEXT: vmov.u16 r3, q0[6]
+; CHECK-NEXT: adc r12, r12, #0
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r4, s3
-; CHECK-NEXT: adc.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, r12, r4
+; CHECK-NEXT: adc r3, r12, #0
+; CHECK-NEXT: adds.w r2, r2, lr
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1626,62 +1412,42 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: vmov.u16 r2, q0[1]
-; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vmov.u16 r2, q0[0]
+; CHECK-NEXT: vmov.u16 r3, q0[1]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asr.w r12, r2, #31
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r12
-; CHECK-NEXT: vmov lr, s6
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r12, s5
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: adc.w r12, r12, r2, asr #31
-; CHECK-NEXT: vmov.u16 r2, q0[3]
+; CHECK-NEXT: adds.w lr, r2, r3
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: sxtb r2, r2
+; CHECK-NEXT: adds.w r12, lr, r2
+; CHECK-NEXT: adc.w r2, r3, r2, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: sxtb r3, r3
-; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT: asrs r4, r2, #31
-; CHECK-NEXT: asrs r3, r3, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r4
-; CHECK-NEXT: vmov r4, s4
-; CHECK-NEXT: vmov r3, s5
-; CHECK-NEXT: adds.w r4, r4, lr
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: adds.w lr, r4, r3
-; CHECK-NEXT: vmov.u16 r4, q0[5]
-; CHECK-NEXT: adc.w r12, r12, r2, asr #31
-; CHECK-NEXT: vmov.u16 r2, q0[4]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov q1[2], q1[0], r2, r4
-; CHECK-NEXT: asrs r3, r4, #31
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov q1[3], q1[1], r2, r3
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: adds.w r3, r3, lr
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, r12, r4, asr #31
-; CHECK-NEXT: vmov.u16 r4, q0[6]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: adc.w r3, r3, r4, asr #31
-; CHECK-NEXT: vmov.u16 r4, q0[7]
-; CHECK-NEXT: sxtb r4, r4
-; CHECK-NEXT: adds r2, r2, r4
-; CHECK-NEXT: adc.w r3, r3, r4, asr #31
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[5]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w r2, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[6]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: adc.w lr, r2, r3, asr #31
+; CHECK-NEXT: vmov.u16 r3, q0[7]
+; CHECK-NEXT: sxtb r3, r3
+; CHECK-NEXT: adds.w r2, r12, r3
+; CHECK-NEXT: adc.w r3, lr, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@@ -1720,14 +1486,13 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.i64 q1, #0xff
+; CHECK-NEXT: vmov.i32 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adcs r1, r3
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i8> %x to <2 x i64>
@@ -1742,15 +1507,12 @@
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxtb r2, r2
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: asrs r2, r2, #31
-; CHECK-NEXT: vmov.32 q1[1], r2
+; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: sxtb r3, r3
-; CHECK-NEXT: vmov r12, s4
-; CHECK-NEXT: adds.w r12, r12, r3
-; CHECK-NEXT: adc.w r2, r2, r3, asr #31
-; CHECK-NEXT: adds.w r0, r0, r12
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: adds r2, r2, r3
+; CHECK-NEXT: adc.w r3, r12, r3, asr #31
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i8> %x to <2 x i64>
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -46,9 +46,8 @@
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov.i64 q2, #0xffffffff
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: tst.w r0, #1
@@ -58,14 +57,11 @@
; CHECK-NEXT: tst.w r1, #1
; CHECK-NEXT: csetm r1, ne
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i32> %b, zeroinitializer
@@ -200,6 +196,7 @@
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q2[3]
; CHECK-NEXT: vmov.u16 r1, q2[1]
+; CHECK-NEXT: vmov.u16 r3, q2[4]
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vcmp.i32 ne, q1, zr
; CHECK-NEXT: vmov.i64 q1, #0xffff
@@ -209,83 +206,70 @@
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
; CHECK-NEXT: vmov.u16 r1, q0[1]
; CHECK-NEXT: vmov.u16 r2, q0[0]
; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r1, s15
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: ubfx r3, r0, #12, #1
+; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: add r1, r2
+; CHECK-NEXT: ubfx r2, r0, #12, #1
; CHECK-NEXT: ubfx r0, r0, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: vmov q3[2], q3[0], r0, r3
-; CHECK-NEXT: vmov q3[3], q3[1], r0, r3
+; CHECK-NEXT: vmov q3[2], q3[0], r0, r2
; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: vmov q4[2], q4[0], r3, r0
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov q4[2], q4[0], r2, r0
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov r0, s13
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: adcs r0, r1
-; CHECK-NEXT: vmov r1, s15
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov.u16 r3, q2[4]
-; CHECK-NEXT: adc.w r12, r0, r1
-; CHECK-NEXT: vmov.u16 r1, q2[6]
-; CHECK-NEXT: vmov q3[2], q3[0], r3, r1
-; CHECK-NEXT: vmov.u16 r1, q2[7]
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: adds.w r12, r0, r2
+; CHECK-NEXT: vmov.u16 r2, q2[6]
+; CHECK-NEXT: vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT: vmov.u16 r2, q2[7]
; CHECK-NEXT: vmov.u16 r3, q2[5]
-; CHECK-NEXT: vmov q3[3], q3[1], r3, r1
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: vmov q3[3], q3[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q3, zr
-; CHECK-NEXT: vmrs r1, p0
-; CHECK-NEXT: and r0, r1, #1
-; CHECK-NEXT: ubfx r3, r1, #4, #1
+; CHECK-NEXT: vmrs r2, p0
+; CHECK-NEXT: and r0, r2, #1
+; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q2[2], q2[0], r0, r3
-; CHECK-NEXT: vmov q2[3], q2[1], r0, r3
; CHECK-NEXT: vmov.u16 r0, q0[5]
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: vmov q3[2], q3[0], r3, r0
; CHECK-NEXT: vand q3, q3, q1
; CHECK-NEXT: vand q2, q3, q2
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov r0, s9
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov r3, s11
-; CHECK-NEXT: adc.w r12, r12, r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r2, r12, r3
-; CHECK-NEXT: ubfx r3, r1, #12, #1
-; CHECK-NEXT: ubfx r1, r1, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r3
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: adds.w r0, r0, r12
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: ubfx r3, r2, #12, #1
+; CHECK-NEXT: ubfx r2, r2, #8, #1
+; CHECK-NEXT: rsb.w r3, r3, #0
+; CHECK-NEXT: rsb.w r2, r2, #0
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: vmov.u16 r3, q0[6]
-; CHECK-NEXT: vmov q0[2], q0[0], r3, r1
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vand q0, q0, q2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
@@ -467,14 +451,11 @@
; CHECK-NEXT: tst.w r1, #1
; CHECK-NEXT: csetm r1, ne
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i16> %b, zeroinitializer
@@ -803,6 +784,7 @@
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q5[3]
; CHECK-NEXT: vmov.u16 r1, q5[1]
+; CHECK-NEXT: vmov.u16 r3, q5[4]
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vcmp.i32 ne, q1, zr
; CHECK-NEXT: vmov.i64 q1, #0xff
@@ -812,83 +794,69 @@
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov q6[2], q6[0], r2, r1
-; CHECK-NEXT: vmov q6[3], q6[1], r2, r1
; CHECK-NEXT: vmov.u8 r1, q0[1]
; CHECK-NEXT: vmov.u8 r2, q0[0]
; CHECK-NEXT: vmov q7[2], q7[0], r2, r1
; CHECK-NEXT: vand q7, q7, q1
; CHECK-NEXT: vand q6, q7, q6
-; CHECK-NEXT: vmov r1, s27
-; CHECK-NEXT: vmov r2, s25
-; CHECK-NEXT: vmov r3, s24
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov r2, s26
-; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: ubfx r3, r0, #12, #1
+; CHECK-NEXT: vmov r1, s26
+; CHECK-NEXT: vmov r2, s24
+; CHECK-NEXT: add r1, r2
+; CHECK-NEXT: ubfx r2, r0, #12, #1
; CHECK-NEXT: ubfx r0, r0, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: vmov q6[2], q6[0], r0, r3
-; CHECK-NEXT: vmov q6[3], q6[1], r0, r3
+; CHECK-NEXT: vmov q6[2], q6[0], r0, r2
; CHECK-NEXT: vmov.u8 r0, q0[3]
-; CHECK-NEXT: vmov.u8 r3, q0[2]
-; CHECK-NEXT: vmov q7[2], q7[0], r3, r0
+; CHECK-NEXT: vmov.u8 r2, q0[2]
+; CHECK-NEXT: vmov q7[2], q7[0], r2, r0
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vand q7, q7, q1
; CHECK-NEXT: vand q6, q7, q6
-; CHECK-NEXT: vmov r3, s24
-; CHECK-NEXT: vmov r0, s25
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov r3, s26
-; CHECK-NEXT: adcs r0, r1
-; CHECK-NEXT: vmov r1, s27
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov.u16 r3, q5[4]
-; CHECK-NEXT: adc.w r12, r0, r1
-; CHECK-NEXT: vmov.u16 r1, q5[6]
-; CHECK-NEXT: vmov q6[2], q6[0], r3, r1
-; CHECK-NEXT: vmov.u16 r1, q5[7]
+; CHECK-NEXT: vmov r0, s24
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: vmov r2, s26
+; CHECK-NEXT: adds.w r12, r0, r2
+; CHECK-NEXT: vmov.u16 r2, q5[6]
+; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT: vmov.u16 r2, q5[7]
; CHECK-NEXT: vmov.u16 r3, q5[5]
-; CHECK-NEXT: vmov q6[3], q6[1], r3, r1
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q6, zr
-; CHECK-NEXT: vmrs r1, p0
-; CHECK-NEXT: and r0, r1, #1
-; CHECK-NEXT: ubfx r3, r1, #4, #1
+; CHECK-NEXT: vmrs r2, p0
+; CHECK-NEXT: and r0, r2, #1
+; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q5[2], q5[0], r0, r3
-; CHECK-NEXT: vmov q5[3], q5[1], r0, r3
; CHECK-NEXT: vmov.u8 r0, q0[5]
; CHECK-NEXT: vmov.u8 r3, q0[4]
; CHECK-NEXT: vmov q6[2], q6[0], r3, r0
; CHECK-NEXT: vand q6, q6, q1
; CHECK-NEXT: vand q5, q6, q5
-; CHECK-NEXT: vmov r3, s20
-; CHECK-NEXT: vmov r0, s21
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov r3, s23
-; CHECK-NEXT: adc.w r12, r12, r0
-; CHECK-NEXT: vmov r0, s22
-; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r2, r12, r3
-; CHECK-NEXT: ubfx r3, r1, #12, #1
-; CHECK-NEXT: ubfx r1, r1, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: vmov q5[2], q5[0], r1, r3
-; CHECK-NEXT: vmov q5[3], q5[1], r1, r3
-; CHECK-NEXT: vmov.u8 r1, q0[7]
+; CHECK-NEXT: vmov r0, s20
+; CHECK-NEXT: vmov r3, s22
+; CHECK-NEXT: adds.w r0, r0, r12
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: ubfx r3, r2, #12, #1
+; CHECK-NEXT: ubfx r2, r2, #8, #1
+; CHECK-NEXT: rsb.w r3, r3, #0
+; CHECK-NEXT: rsb.w r2, r2, #0
+; CHECK-NEXT: vmov q5[2], q5[0], r2, r3
+; CHECK-NEXT: vmov.u8 r2, q0[7]
; CHECK-NEXT: vmov.u8 r3, q0[6]
-; CHECK-NEXT: vmov q6[2], q6[0], r3, r1
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
; CHECK-NEXT: vand q6, q6, q1
; CHECK-NEXT: vand q5, q6, q5
-; CHECK-NEXT: vmov r3, s20
-; CHECK-NEXT: vmov r1, s21
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s22
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s23
-; CHECK-NEXT: adds.w r12, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s20
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s22
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q4[8]
; CHECK-NEXT: vmov.16 q5[0], r2
; CHECK-NEXT: vmov.u8 r2, q4[9]
@@ -905,6 +873,7 @@
; CHECK-NEXT: vmov.16 q5[6], r2
; CHECK-NEXT: vmov.u8 r2, q4[15]
; CHECK-NEXT: vmov.16 q5[7], r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vpsel q2, q3, q2
; CHECK-NEXT: vmov.u16 r2, q2[2]
@@ -920,44 +889,38 @@
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q3[2], q3[0], r0, r3
-; CHECK-NEXT: vmov q3[3], q3[1], r0, r3
; CHECK-NEXT: vmov.u8 r0, q0[9]
; CHECK-NEXT: vmov.u8 r3, q0[8]
; CHECK-NEXT: vmov q4[2], q4[0], r3, r0
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov r0, s13
-; CHECK-NEXT: adds.w r3, r3, r12
-; CHECK-NEXT: adc.w r12, r1, r0
-; CHECK-NEXT: vmov r0, s14
-; CHECK-NEXT: vmov r1, s15
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: vmov r3, s14
+; CHECK-NEXT: adds.w r0, r0, r12
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: rsb.w r2, r2, #0
-; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT: vmov q3[3], q3[1], r2, r3
; CHECK-NEXT: vmov.u8 r2, q0[11]
; CHECK-NEXT: vmov.u8 r3, q0[10]
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
+; CHECK-NEXT: vmov.u16 r3, q2[4]
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s15
-; CHECK-NEXT: adds.w r12, r0, r3
-; CHECK-NEXT: vmov.u16 r3, q2[4]
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u16 r2, q2[6]
; CHECK-NEXT: vmov q3[2], q3[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q2[7]
; CHECK-NEXT: vmov.u16 r3, q2[5]
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vmov q3[3], q3[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q3, zr
; CHECK-NEXT: vmrs r2, p0
@@ -966,39 +929,33 @@
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q2[2], q2[0], r0, r3
-; CHECK-NEXT: vmov q2[3], q2[1], r0, r3
; CHECK-NEXT: vmov.u8 r0, q0[13]
; CHECK-NEXT: vmov.u8 r3, q0[12]
; CHECK-NEXT: vmov q3[2], q3[0], r3, r0
; CHECK-NEXT: vand q3, q3, q1
; CHECK-NEXT: vand q2, q3, q2
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov r0, s9
-; CHECK-NEXT: adds.w r3, r3, r12
-; CHECK-NEXT: adc.w r12, r1, r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: vmov r1, s11
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: adds.w r0, r0, r12
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: rsb.w r2, r2, #0
-; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT: vmov q2[3], q2[1], r2, r3
; CHECK-NEXT: vmov.u8 r2, q0[15]
; CHECK-NEXT: vmov.u8 r3, q0[14]
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vand q0, q0, q2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
@@ -1294,6 +1251,7 @@
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q2[3]
; CHECK-NEXT: vmov.u16 r1, q2[1]
+; CHECK-NEXT: vmov.u16 r3, q2[4]
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vcmp.i32 ne, q1, zr
; CHECK-NEXT: vmov.i64 q1, #0xffff
@@ -1303,83 +1261,70 @@
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
-; CHECK-NEXT: vmov q3[3], q3[1], r2, r1
; CHECK-NEXT: vmov.u16 r1, q0[1]
; CHECK-NEXT: vmov.u16 r2, q0[0]
; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r1, s15
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: add r2, r3
-; CHECK-NEXT: ubfx r3, r0, #12, #1
+; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: add r1, r2
+; CHECK-NEXT: ubfx r2, r0, #12, #1
; CHECK-NEXT: ubfx r0, r0, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
+; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: vmov q3[2], q3[0], r0, r3
-; CHECK-NEXT: vmov q3[3], q3[1], r0, r3
+; CHECK-NEXT: vmov q3[2], q3[0], r0, r2
; CHECK-NEXT: vmov.u16 r0, q0[3]
-; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: vmov q4[2], q4[0], r3, r0
+; CHECK-NEXT: vmov.u16 r2, q0[2]
+; CHECK-NEXT: vmov q4[2], q4[0], r2, r0
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov r0, s13
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: adcs r0, r1
-; CHECK-NEXT: vmov r1, s15
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov.u16 r3, q2[4]
-; CHECK-NEXT: adc.w r12, r0, r1
-; CHECK-NEXT: vmov.u16 r1, q2[6]
-; CHECK-NEXT: vmov q3[2], q3[0], r3, r1
-; CHECK-NEXT: vmov.u16 r1, q2[7]
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: adds.w r12, r0, r2
+; CHECK-NEXT: vmov.u16 r2, q2[6]
+; CHECK-NEXT: vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT: vmov.u16 r2, q2[7]
; CHECK-NEXT: vmov.u16 r3, q2[5]
-; CHECK-NEXT: vmov q3[3], q3[1], r3, r1
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: vmov q3[3], q3[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q3, zr
-; CHECK-NEXT: vmrs r1, p0
-; CHECK-NEXT: and r0, r1, #1
-; CHECK-NEXT: ubfx r3, r1, #4, #1
+; CHECK-NEXT: vmrs r2, p0
+; CHECK-NEXT: and r0, r2, #1
+; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q2[2], q2[0], r0, r3
-; CHECK-NEXT: vmov q2[3], q2[1], r0, r3
; CHECK-NEXT: vmov.u16 r0, q0[5]
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: vmov q3[2], q3[0], r3, r0
; CHECK-NEXT: vand q3, q3, q1
; CHECK-NEXT: vand q2, q3, q2
-; CHECK-NEXT: vmov r3, s8
-; CHECK-NEXT: vmov r0, s9
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: vmov r3, s11
-; CHECK-NEXT: adc.w r12, r12, r0
-; CHECK-NEXT: vmov r0, s10
-; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r2, r12, r3
-; CHECK-NEXT: ubfx r3, r1, #12, #1
-; CHECK-NEXT: ubfx r1, r1, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: vmov q2[2], q2[0], r1, r3
-; CHECK-NEXT: vmov q2[3], q2[1], r1, r3
-; CHECK-NEXT: vmov.u16 r1, q0[7]
+; CHECK-NEXT: vmov r0, s8
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: adds.w r0, r0, r12
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: ubfx r3, r2, #12, #1
+; CHECK-NEXT: ubfx r2, r2, #8, #1
+; CHECK-NEXT: rsb.w r3, r3, #0
+; CHECK-NEXT: rsb.w r2, r2, #0
+; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
+; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: vmov.u16 r3, q0[6]
-; CHECK-NEXT: vmov q0[2], q0[0], r3, r1
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vand q0, q0, q2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: adcs r1, r2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: adds r0, r0, r3
-; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: adc r1, r1, #0
+; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
@@ -1565,14 +1510,11 @@
; CHECK-NEXT: tst.w r1, #1
; CHECK-NEXT: csetm r1, ne
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: add r0, r1
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: orrs r1, r2
+; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i8> %b, zeroinitializer
@@ -1702,12 -1644,9 @@ define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %b, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.i64 q2, #0xffffffff
+; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: tst.w r2, #1
@@ -1717,17 +1656,14 @@
; CHECK-NEXT: tst.w r3, #1
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r12, s3
-; CHECK-NEXT: vmov lr, s1
; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, lr, r12
+; CHECK-NEXT: adc r3, r12, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i32> %b, zeroinitializer
%xx = zext <2 x i32> %x to <2 x i64>
@@ -1883,39 +1819,33 @@
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q3[2], q3[0], r3, r12
-; CHECK-NEXT: vmov q3[3], q3[1], r3, r12
; CHECK-NEXT: vmov.u16 r12, q0[1]
; CHECK-NEXT: vmov.u16 r3, q0[0]
; CHECK-NEXT: vmov q4[2], q4[0], r3, r12
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r12, s15
-; CHECK-NEXT: vmov r3, s13
-; CHECK-NEXT: vmov lr, s14
-; CHECK-NEXT: orr.w r12, r12, r3
+; CHECK-NEXT: vmov r12, s14
; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: add lr, r3
+; CHECK-NEXT: add r12, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
-; CHECK-NEXT: vmov q3[3], q3[1], r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[3]
; CHECK-NEXT: vmov.u16 r3, q0[2]
; CHECK-NEXT: vmov q4[2], q4[0], r3, r2
+; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vand q4, q4, q1
; CHECK-NEXT: vand q3, q4, q3
-; CHECK-NEXT: vmov r3, s12
-; CHECK-NEXT: vmov r2, s13
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: vmov r3, s14
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s15
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: vmov.u16 r3, q2[4]
-; CHECK-NEXT: adc.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s14
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov.u16 r2, q2[6]
+; CHECK-NEXT: adc lr, r3, #0
+; CHECK-NEXT: vmov.u16 r3, q2[4]
; CHECK-NEXT: vmov q3[2], q3[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q2[7]
; CHECK-NEXT: vmov.u16 r3, q2[5]
@@ -1927,39 +1857,33 @@
; CHECK-NEXT: rsbs r4, r4, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q2[2], q2[0], r4, r3
-; CHECK-NEXT: vmov q2[3], q2[1], r4, r3
; CHECK-NEXT: vmov.u16 r3, q0[5]
; CHECK-NEXT: vmov.u16 r4, q0[4]
; CHECK-NEXT: vmov q3[2], q3[0], r4, r3
; CHECK-NEXT: vand q3, q3, q1
; CHECK-NEXT: vand q2, q3, q2
-; CHECK-NEXT: vmov r4, s8
-; CHECK-NEXT: vmov r3, s9
-; CHECK-NEXT: adds.w lr, lr, r4
-; CHECK-NEXT: vmov r4, s10
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r3, s11
-; CHECK-NEXT: adds.w r4, r4, lr
-; CHECK-NEXT: adc.w r12, r12, r3
+; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: vmov r3, s10
+; CHECK-NEXT: adc r4, lr, #0
+; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: rsbs r2, r2, #0
+; CHECK-NEXT: rsb.w r3, r3, #0
+; CHECK-NEXT: rsb.w r2, r2, #0
+; CHECK-NEXT: adc r4, r4, #0
; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
-; CHECK-NEXT: vmov q2[3], q2[1], r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vand q0, q0, q2
-; CHECK-NEXT: vmov r3, s0
-; CHECK-NEXT: vmov r2, s1
-; CHECK-NEXT: adds r3, r3, r4
-; CHECK-NEXT: vmov r4, s3
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: adds r2, r2, r3
-; CHECK-NEXT: adc.w r3, r12, r4
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: adds.w r2, r2, r12
+; CHECK-NEXT: adc r3, r4, #0
+; CHECK-NEXT: vmov r4, s2
+; CHECK-NEXT: adds r2, r2, r4
+; CHECK-NEXT: adc r3, r3, #0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vpop {d8, d9}
@@ -2117,16 +2041,12 @@
; CHECK-NEXT: tst.w r3, #1
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: orr.w r12, r3, r2
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r0, r0, r2
-; CHECK-NEXT: adc.w r1, r1, r12
+; CHECK-NEXT: adc r1, r1, #0
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i16> %b, zeroinitializer
@@ -2336,8 +2256,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vcmp.i8 eq, q1, zr
@@ -2376,39 +2296,33 @@
; CHECK-NEXT: and r3, r2, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q6[2], q6[0], r3, r12
-; CHECK-NEXT: vmov q6[3], q6[1], r3, r12
; CHECK-NEXT: vmov.u8 r12, q0[1]
; CHECK-NEXT: vmov.u8 r3, q0[0]
; CHECK-NEXT: vmov q7[2], q7[0], r3, r12
; CHECK-NEXT: vand q7, q7, q1
; CHECK-NEXT: vand q6, q7, q6
-; CHECK-NEXT: vmov r12, s27
-; CHECK-NEXT: vmov r3, s25
-; CHECK-NEXT: vmov lr, s26
-; CHECK-NEXT: orr.w r12, r12, r3
+; CHECK-NEXT: vmov r12, s26
; CHECK-NEXT: vmov r3, s24
-; CHECK-NEXT: add lr, r3
+; CHECK-NEXT: add r12, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: vmov q6[2], q6[0], r2, r3
-; CHECK-NEXT: vmov q6[3], q6[1], r2, r3
; CHECK-NEXT: vmov.u8 r2, q0[3]
; CHECK-NEXT: vmov.u8 r3, q0[2]
; CHECK-NEXT: vmov q7[2], q7[0], r3, r2
+; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: vand q7, q7, q1
; CHECK-NEXT: vand q6, q7, q6
-; CHECK-NEXT: vmov r3, s24
-; CHECK-NEXT: vmov r2, s25
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: vmov r3, s26
-; CHECK-NEXT: adc.w r12, r12, r2
-; CHECK-NEXT: vmov r2, s27
-; CHECK-NEXT: adds.w lr, lr, r3
-; CHECK-NEXT: vmov.u16 r3, q5[4]
-; CHECK-NEXT: adc.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s24
+; CHECK-NEXT: adds.w r12, r12, r2
+; CHECK-NEXT: vmov r2, s26
+; CHECK-NEXT: adc r3, r3, #0
+; CHECK-NEXT: adds.w r12, r12, r2
; CHECK-NEXT: vmov.u16 r2, q5[6]
+; CHECK-NEXT: adc lr, r3, #0
+; CHECK-NEXT: vmov.u16 r3, q5[4]
; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q5[7]
; CHECK-NEXT: vmov.u16 r3, q5[5]
@@ -2420,60 +2334,54 @@
; CHECK-NEXT: rsbs r4, r4, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q5[2], q5[0], r4, r3
-; CHECK-NEXT: vmov q5[3], q5[1], r4, r3
; CHECK-NEXT: vmov.u8 r3, q0[5]
; CHECK-NEXT: vmov.u8 r4, q0[4]
; CHECK-NEXT: vmov q6[2], q6[0], r4, r3
; CHECK-NEXT: vand q6, q6, q1
; CHECK-NEXT: vand q5, q6, q5
-; CHECK-NEXT: vmov r4, s20
-; CHECK-NEXT: vmov r3, s21
-; CHECK-NEXT: adds.w lr, lr, r4
-; CHECK-NEXT: vmov r4, s22
-; CHECK-NEXT: adc.w r12, r12, r3
-; CHECK-NEXT: vmov r3, s23
-; CHECK-NEXT: adds.w r4, r4, lr
-; CHECK-NEXT: adc.w r12, r12, r3
+; CHECK-NEXT: vmov r3, s20
+; CHECK-NEXT: adds.w r12, r12, r3
+; CHECK-NEXT: vmov r3, s22
+; CHECK-NEXT: adc r4, lr, #0
+; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
-; CHECK-NEXT: rsbs r3, r3, #0
-; CHECK-NEXT: rsbs r2, r2, #0
+; CHECK-NEXT: rsb.w r3, r3, #0
+; CHECK-NEXT: rsb.w r2, r2, #0
+; CHECK-NEXT: adc r4, r4, #0
; CHECK-NEXT: vmov q5[2], q5[0], r2, r3
-; CHECK-NEXT: vmov q5[3], q5[1], r2, r3
; CHECK-NEXT: vmov.u8 r2, q0[7]
; CHECK-NEXT: vmov.u8 r3, q0[6]
; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
; CHECK-NEXT: vand q6, q6, q1
; CHECK-NEXT: vand q5, q6, q5
-; CHECK-NEXT: vmov r3, s20
-; CHECK-NEXT: vmov r2, s21
-; CHECK-NEXT: adds.w lr, r4, r3
-; CHECK-NEXT: vmov r3, s22
-; CHECK-NEXT: adc.w r4, r12, r2
-; CHECK-NEXT: vmov r2, s23
-; CHECK-NEXT: adds.w r12, lr, r3
-; CHECK-NEXT: adc.w lr, r4, r2
-; CHECK-NEXT: vmov.u8 r2, q4[8]
-; CHECK-NEXT: vmov.16 q5[0], r2
-; CHECK-NEXT: vmov.u8 r2, q4[9]
-; CHECK-NEXT: vmov.16 q5[1], r2
-; CHECK-NEXT: vmov.u8 r2, q4[10]
-; CHECK-NEXT: vmov.16 q5[2], r2
-; CHECK-NEXT: vmov.u8 r2, q4[11]
-; CHECK-NEXT: vmov.16 q5[3], r2
-; CHECK-NEXT: vmov.u8 r2, q4[12]
-; CHECK-NEXT: vmov.16 q5[4], r2
-; CHECK-NEXT: vmov.u8 r2, q4[13]
-; CHECK-NEXT: vmov.16 q5[5], r2
-; CHECK-NEXT: vmov.u8 r2, q4[14]
-; CHECK-NEXT: vmov.16 q5[6], r2
-; CHECK-NEXT: vmov.u8 r2, q4[15]
-; CHECK-NEXT: vmov.16 q5[7], r2
+; CHECK-NEXT: vmov r2, s20
+; CHECK-NEXT: adds.w r2, r2, r12
+; CHECK-NEXT: adc r3, r4, #0
+; CHECK-NEXT: vmov r4, s22
+; CHECK-NEXT: adds.w r12, r2, r4
+; CHECK-NEXT: vmov.u8 r4, q4[8]
+; CHECK-NEXT: vmov.16 q5[0], r4
+; CHECK-NEXT: vmov.u8 r4, q4[9]
+; CHECK-NEXT: vmov.16 q5[1], r4
+; CHECK-NEXT: vmov.u8 r4, q4[10]
+; CHECK-NEXT: vmov.16 q5[2], r4
+; CHECK-NEXT: vmov.u8 r4, q4[11]
+; CHECK-NEXT: vmov.16 q5[3], r4
+; CHECK-NEXT: vmov.u8 r4, q4[12]
+; CHECK-NEXT: vmov.16 q5[4], r4
+; CHECK-NEXT: vmov.u8 r4, q4[13]
+; CHECK-NEXT: vmov.16 q5[5], r4
+; CHECK-NEXT: vmov.u8 r4, q4[14]
+; CHECK-NEXT: vmov.16 q5[6], r4
+; CHECK-NEXT: vmov.u8 r4, q4[15]
+; CHECK-NEXT: vmov.16 q5[7], r4
+; CHECK-NEXT: adc lr, r3, #0
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vpsel q2, q3, q2
-; CHECK-NEXT: vmov.u16 r2, q2[2]
-; CHECK-NEXT: vmov.u16 r4, q2[0]
-; CHECK-NEXT: vmov q3[2], q3[0], r4, r2
+; CHECK-NEXT: vmov.u16 r4, q2[2]
+; CHECK-NEXT: vmov.u16 r2, q2[0]
+; CHECK-NEXT: vmov q3[2], q3[0], r2, r4
; CHECK-NEXT: vmov.u16 r2, q2[3]
; CHECK-NEXT: vmov.u16 r4, q2[1]
; CHECK-NEXT: vmov q3[3], q3[1], r4, r2
@@ -2484,89 +2392,77 @@
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: rsbs r4, r4, #0
; CHECK-NEXT: vmov q3[2], q3[0], r3, r4
-; CHECK-NEXT: vmov q3[3], q3[1], r3, r4
; CHECK-NEXT: vmov.u8 r3, q0[9]
; CHECK-NEXT: vmov.u8 r4, q0[8]
; CHECK-NEXT: vmov q4[2], q4[0], r4, r3
CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: adds.w r5, r12, r4 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: adc.w r12, lr, r3 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: ubfx r4, r2, #12, #1 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: adc r4, lr, #0 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: ubfx r3, r2, #12, #1 ; CHECK-NEXT: ubfx r2, r2, #8, #1 -; CHECK-NEXT: rsb.w r4, r4, #0 +; CHECK-NEXT: rsb.w r3, r3, #0 ; CHECK-NEXT: rsb.w r2, r2, #0 -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r4 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r4 +; CHECK-NEXT: adc r4, r4, #0 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q0[11] -; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r2 +; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 ; CHECK-NEXT: vand q4, q4, q1 ; CHECK-NEXT: vand q3, q4, q3 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r5, r5, r4 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc r3, r4, #0 ; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov.u16 r4, q2[4] -; CHECK-NEXT: adc.w r12, r2, r3 -; CHECK-NEXT: vmov.u16 r3, q2[6] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vmov.u16 r3, q2[7] +; CHECK-NEXT: adds.w r12, r2, r4 +; CHECK-NEXT: vmov.u16 r4, q2[6] +; CHECK-NEXT: vmov.u16 r2, q2[4] +; CHECK-NEXT: adc lr, r3, #0 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r4 +; CHECK-NEXT: vmov.u16 r2, q2[7] ; CHECK-NEXT: vmov.u16 r4, q2[5] -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 +; CHECK-NEXT: vmov q3[3], q3[1], r4, r2 ; CHECK-NEXT: vcmp.i32 ne, q3, zr -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: ubfx r4, r3, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 +; CHECK-NEXT: vmrs r2, p0 +; CHECK-NEXT: and r3, r2, #1 +; CHECK-NEXT: ubfx r4, r2, #4, #1 +; CHECK-NEXT: rsbs r3, r3, #0 ; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r4 -; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: vmov q2[2], q2[0], r3, r4 +; CHECK-NEXT: vmov.u8 r3, q0[13] ; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r2 +; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 ; CHECK-NEXT: vand q3, q3, q1 ; CHECK-NEXT: vand q2, q3, q2 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: adc.w r5, r12, r4 -; CHECK-NEXT: ubfx r4, r3, #12, #1 -; CHECK-NEXT: ubfx r3, r3, #8, #1 -; CHECK-NEXT: rsbs r4, r4, #0 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: vmov q2[2], q2[0], r3, r4 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: adc r4, lr, #0 +; CHECK-NEXT: adds.w r12, r12, r3 +; CHECK-NEXT: ubfx r3, r2, #12, #1 +; CHECK-NEXT: ubfx r2, r2, #8, #1 +; CHECK-NEXT: rsb.w r3, r3, #0 +; CHECK-NEXT: rsb.w r2, r2, #0 +; CHECK-NEXT: adc r4, r4, #0 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: vmov.u8 r3, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; 
CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc r3, r4, #0 ; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: adcs r3, r5 -; CHECK-NEXT: vmov r5, s3 ; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %c = icmp eq <16 x i8> %b, zeroinitializer %xx = zext <16 x i8> %x to <16 x i64> @@ -2867,16 +2763,12 @@ ; CHECK-NEXT: tst.w r3, #1 ; CHECK-NEXT: csetm r3, ne ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: orr.w r12, r3, r2 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: bx lr entry: %c = icmp eq <2 x i8> %b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -245,14 +245,17 @@ ; CHECK-LABEL: add_v2i16_v2i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q2, #0xffff -; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umlal r0, r1, r3, r2 +; CHECK-NEXT: vand q1, q1, q2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> @@ -265,10 +268,10 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) { ; CHECK-LABEL: add_v2i16_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxth r0, r0 ; CHECK-NEXT: sxth r1, r1 ; CHECK-NEXT: smull r0, r1, r1, r0 @@ -549,8 +552,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_v16i8_v16i64_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.u8 r0, q1[1] @@ -562,168 +563,137 @@ ; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r12, r1, r1, r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: orr.w lr, r3, r1 -; CHECK-NEXT: vmov.u8 r3, q1[3] -; CHECK-NEXT: vmov q3[2], q3[0], r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vmov.u8 r3, q0[2] +; 
CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: smlabb r0, r2, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q1[3] +; CHECK-NEXT: vmov.u8 r2, q1[2] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.u8 r2, q0[3] +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 ; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov q4[2], q4[0], r1, r3 -; CHECK-NEXT: vmov r0, s14 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r1 -; CHECK-NEXT: vmov.u8 r4, q0[4] -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adc.w r0, r0, lr -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: vmov.u8 r3, q1[4] -; CHECK-NEXT: adc.w r12, r0, r1 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: smlabb r0, r2, r1, r0 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: smlabb r0, r2, r1, r0 ; CHECK-NEXT: vmov.u8 r1, q1[5] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r1 -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.u8 r2, q1[4] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r1 +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: umull r1, r3, r3, r1 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r1 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[6] ; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: muls r1, r2, r1 +; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: adds r0, r0, r1 +; CHECK-NEXT: adc r1, r2, #0 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: adc.w r12, r0, r3 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov.u8 r3, q1[6] +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.u8 r0, q0[6] ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: muls r0, r2, r0 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov.u8 r2, q1[9] -; CHECK-NEXT: adc.w r12, r0, r3 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov.u8 r3, q1[8] +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u8 r2, q1[9] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: 
vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.u8 r0, q0[8] ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: muls r0, r2, r0 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov.u8 r2, q1[11] -; CHECK-NEXT: adc.w r12, r0, r3 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov.u8 r3, q1[10] +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u8 r2, q1[11] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.u8 r0, q0[10] ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: muls r0, r2, r0 ; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov.u8 r2, q1[13] -; CHECK-NEXT: adc.w r12, r0, r3 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov.u8 r3, q1[12] +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u8 r2, q1[13] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 +; CHECK-NEXT: vmov.u8 r0, q0[12] ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: adc.w r2, r12, r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 -; CHECK-NEXT: vmov.u8 r2, q1[15] +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: muls r0, r2, r0 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: vmov.u8 r3, q1[14] +; CHECK-NEXT: adds.w r12, r0, r2 +; CHECK-NEXT: vmov.u8 r2, q1[15] ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 +; CHECK-NEXT: vmov.u8 r0, q0[14] ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vmov q0[2], q0[0], r0, r3 ; CHECK-NEXT: 
vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umlal r0, r1, r3, r2 -; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umlal r0, r1, r3, r2 +; CHECK-NEXT: muls r0, r2, r0 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: adds.w r0, r0, r12 +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adc r1, r1, #0 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: bx lr entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -735,139 +705,76 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_v16i8_v16i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.u8 r0, q1[1] ; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.u8 r2, q1[0] -; CHECK-NEXT: vmov.u8 r3, q0[0] ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.u8 r2, q1[0] +; CHECK-NEXT: vmov.u8 r3, q0[0] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r1 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q1[2] +; CHECK-NEXT: vmov.u8 r3, q0[2] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q1[3] ; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.u8 r1, q1[3] -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.u8 r2, q0[2] -; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q1[4] +; CHECK-NEXT: vmov.u8 r3, q0[4] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: smull r1, r3, r3, r1 -; CHECK-NEXT: smull r0, r2, r2, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r1 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[5] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.u8 r1, q0[4] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q1[6] +; CHECK-NEXT: vmov.u8 r3, q0[6] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[7] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.u8 r1, q0[6] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r1, r1 -; 
CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q1[8] +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[9] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.u8 r1, q0[8] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q1[10] +; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[11] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.u8 r1, q0[10] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r0, r0, r12 -; CHECK-NEXT: adds.w lr, r1, r2 +; CHECK-NEXT: smlal r0, r1, r3, r2 +; CHECK-NEXT: vmov.u8 r2, q1[12] +; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[13] -; CHECK-NEXT: adc.w r12, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.u8 r1, q0[12] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r0, r0 -; CHECK-NEXT: sxtb r1, r1 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: smull r0, r1, r1, r0 -; CHECK-NEXT: vmov q2[2], q2[0], r0, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r1, r3 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: adds.w r1, r1, lr -; CHECK-NEXT: adc.w r2, r12, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: adds r0, r0, r1 -; CHECK-NEXT: adc.w r1, r2, r3 +; CHECK-NEXT: smlal r0, r1, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[14] ; CHECK-NEXT: vmov.u8 r3, q0[14] ; CHECK-NEXT: sxtb r2, r2 @@ -878,7 +785,7 @@ ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: smlal r0, r1, r3, r2 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: bx lr entry: %xx = sext <16 x i8> %x to <16 x i64> %yy = sext <16 x i8> %y to <16 x i64> @@ -954,16 +861,15 @@ ; CHECK-LABEL: add_v2i8_v2i64_zext: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i64 q2, #0xff -; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 +; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: add r0, r2 -; 
CHECK-NEXT: orrs r1, r3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: mla r0, r2, r1, r0 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> @@ -976,10 +882,10 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: add_v2i8_v2i64_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxtb r0, r0 ; CHECK-NEXT: sxtb r1, r1 ; CHECK-NEXT: smull r0, r1, r1, r0 @@ -1000,25 +906,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r4, s5 ; CHECK-NEXT: umull r12, r3, r1, r0 ; CHECK-NEXT: mla r1, r1, r2, r3 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: mla lr, r2, r0, r1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: umull r3, r1, r2, r0 +; CHECK-NEXT: mla r1, r2, r4, r1 ; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov.32 q2[0], r12 ; CHECK-NEXT: mla r1, r2, r0, r1 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov r12, s8 -; CHECK-NEXT: umull lr, r0, r3, r2 -; CHECK-NEXT: mla r0, r3, r4, r0 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: mla r2, r3, r2, r0 -; CHECK-NEXT: adds.w r0, r12, lr -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: adds.w r0, r3, r12 +; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: pop {r4, pc} entry: %m = mul <2 x i64> %x, %y @@ -1262,20 +1165,21 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) { ; CHECK-LABEL: add_v2i16_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xffff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r12, s6 -; CHECK-NEXT: umull r2, lr, r3, r2 +; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umlal r2, lr, r3, r12 +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc r3, r3, #0 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r1, r1, lr -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adcs r1, r3 +; CHECK-NEXT: bx lr entry: %xx = zext <2 x i16> %x to <2 x i64> %yy = zext <2 x i16> %y to <2 x i64> @@ -1290,14 +1194,14 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxth r2, r2 ; CHECK-NEXT: sxth r3, r3 ; CHECK-NEXT: smull r2, r12, r3, r2 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: sxth.w lr, r3 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxth r3, r3 ; CHECK-NEXT: smlal r2, r12, r3, lr ; CHECK-NEXT: adds r0, r0, r2 @@ -1499,8 +1403,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, 
r6, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.u8 r2, q1[1] @@ -1512,170 +1416,139 @@ ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r12, s14 +; CHECK-NEXT: vmov r12, s12 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.u8 r4, q0[6] +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: mul r12, r2, r12 ; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov.u8 r4, q1[2] -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov.u8 r5, q0[2] -; CHECK-NEXT: umull r12, lr, r2, r12 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: orr.w lr, lr, r3 +; CHECK-NEXT: smlabb r12, r2, r3, r12 ; CHECK-NEXT: vmov.u8 r3, q1[3] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[3] -; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 +; CHECK-NEXT: vmov.u8 r2, q1[2] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: vmov.u8 r2, q0[2] ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov lr, s12 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: umull r5, r6, r6, r5 -; CHECK-NEXT: vmov q3[2], q3[0], r5, r3 -; CHECK-NEXT: vmov.u8 r5, q1[4] -; CHECK-NEXT: vmov q3[3], q3[1], r6, r4 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: adds r2, r2, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r6 -; CHECK-NEXT: vmov.u8 r6, q1[5] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov.u8 r5, q0[5] -; CHECK-NEXT: vmov.u8 r4, q0[4] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: smlabb r12, r2, lr, r12 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: smlabb r12, r2, r3, r12 +; CHECK-NEXT: vmov.u8 r3, q1[5] +; CHECK-NEXT: vmov.u8 r2, q1[4] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r3 +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: vmov.u8 r2, q0[4] ; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 +; CHECK-NEXT: vmov lr, s12 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[6] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds.w r6, r6, r12 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[7] -; CHECK-NEXT: adc.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[6] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[7] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: mul r2, r2, lr +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: adc lr, r3, #0 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: vmov.u8 r3, q1[6] +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 +; CHECK-NEXT: vmov.u8 r3, q0[7] +; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r5, s18 -; 
CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[8] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[9] -; CHECK-NEXT: adc.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[8] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[9] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: adc lr, lr, #0 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: adc r3, lr, #0 +; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: vmov.u8 r4, q1[9] +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q1[8] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r4 +; CHECK-NEXT: adc lr, r3, #0 +; CHECK-NEXT: vmov.u8 r4, q0[9] +; CHECK-NEXT: vmov.u8 r3, q0[8] +; CHECK-NEXT: vmov q4[2], q4[0], r3, r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[10] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[11] -; CHECK-NEXT: adc.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[10] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[11] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: adc r3, lr, #0 +; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: vmov.u8 r4, q1[11] +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q1[10] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r4 +; CHECK-NEXT: adc lr, r3, #0 +; CHECK-NEXT: vmov.u8 r4, q0[11] +; CHECK-NEXT: vmov.u8 r3, q0[10] +; CHECK-NEXT: vmov q4[2], q4[0], r3, r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[12] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[13] -; CHECK-NEXT: adc.w r12, r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[12] -; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[13] -; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: adc r3, lr, #0 +; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: vmov.u8 r4, q1[13] +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, 
q1[12] +; CHECK-NEXT: vmov q3[2], q3[0], r2, r4 +; CHECK-NEXT: adc lr, r3, #0 +; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: vmov.u8 r3, q0[12] +; CHECK-NEXT: vmov q4[2], q4[0], r3, r4 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r5, s18 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: umull r6, r5, r5, r6 -; CHECK-NEXT: umull r2, r4, r2, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r6 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r5 -; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: vmov.u8 r6, q1[15] -; CHECK-NEXT: adcs r2, r5 -; CHECK-NEXT: vmov.u8 r5, q1[14] -; CHECK-NEXT: vmov q1[2], q1[0], r5, r6 -; CHECK-NEXT: vmov.u8 r5, q0[15] -; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: adc r3, lr, #0 +; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: vmov.u8 r4, q1[15] +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov.u8 r2, q1[14] +; CHECK-NEXT: vmov q1[2], q1[0], r2, r4 +; CHECK-NEXT: adc lr, r3, #0 +; CHECK-NEXT: vmov.u8 r4, q0[15] +; CHECK-NEXT: vmov.u8 r3, q0[14] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r4 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: umlal r3, r2, r5, r6 -; CHECK-NEXT: vmov r6, s6 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: umlal r3, r2, r5, r6 -; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: muls r2, r3, r2 +; CHECK-NEXT: adds.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: adc r3, lr, #0 +; CHECK-NEXT: muls r2, r4, r2 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adc r3, r3, #0 +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> %yy = zext <16 x i8> %y to <16 x i64> @@ -1688,152 +1561,91 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v16i8_v16i64_acc_sext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.u8 r2, q1[1] ; CHECK-NEXT: vmov.u8 r3, q0[1] ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r12, r3, r3, r2 +; CHECK-NEXT: smull r3, r12, r3, r2 ; CHECK-NEXT: vmov.u8 r2, q1[0] ; CHECK-NEXT: sxtb.w lr, r2 ; CHECK-NEXT: vmov.u8 r2, q0[0] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: vmov.u8 r4, q1[2] -; CHECK-NEXT: smull r2, lr, r2, lr -; CHECK-NEXT: vmov.u8 r5, q0[2] -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r12 -; CHECK-NEXT: smull r4, r5, r5, r4 -; CHECK-NEXT: vmov q2[3], q2[1], lr, r3 -; CHECK-NEXT: vmov lr, s10 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r12, s9 -; CHECK-NEXT: adds.w lr, lr, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[2] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[2] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[3] -; CHECK-NEXT: 
adc.w r12, r12, r3 -; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[3] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r2 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r3 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: adds.w r5, lr, r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r5, r4 -; CHECK-NEXT: vmov.u8 r4, q0[5] -; CHECK-NEXT: adc.w r12, r2, r3 -; CHECK-NEXT: vmov.u8 r3, q1[5] +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[4] -; CHECK-NEXT: vmov.u8 r5, q0[4] -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[4] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: smull r3, r4, r4, r3 -; CHECK-NEXT: smull r2, r5, r5, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r5, r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r3, r5 -; CHECK-NEXT: vmov.u8 r5, q1[7] -; CHECK-NEXT: adc.w r12, r2, r4 -; CHECK-NEXT: vmov.u8 r4, q0[7] +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[5] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[6] -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[6] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r3, r5 -; CHECK-NEXT: vmov.u8 r5, q1[9] -; CHECK-NEXT: adc.w r12, r2, r4 -; CHECK-NEXT: vmov.u8 r4, q0[9] +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[7] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[8] -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r3, r5 -; CHECK-NEXT: vmov.u8 r5, q1[11] -; CHECK-NEXT: adc.w r12, r2, r4 -; CHECK-NEXT: vmov.u8 r4, q0[11] +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[9] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[9] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[10] -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[10] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; 
CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, r3, r5 -; CHECK-NEXT: vmov.u8 r5, q1[13] -; CHECK-NEXT: adc.w r12, r2, r4 -; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[11] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: vmov.u8 r2, q1[12] -; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[12] ; CHECK-NEXT: sxtb r2, r2 -; CHECK-NEXT: sxtb r3, r3 -; CHECK-NEXT: smull r5, r4, r4, r5 -; CHECK-NEXT: smull r2, r3, r3, r2 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r3, r4 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: vmov.u8 r5, q1[14] -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov.u8 r4, q0[14] -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smlal r3, r2, r4, r5 -; CHECK-NEXT: vmov.u8 r5, q1[15] -; CHECK-NEXT: vmov.u8 r4, q0[15] -; CHECK-NEXT: sxtb r5, r5 -; CHECK-NEXT: sxtb r4, r4 -; CHECK-NEXT: smlal r3, r2, r4, r5 +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[13] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[14] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[14] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[15] +; CHECK-NEXT: sxtb.w lr, r2 +; CHECK-NEXT: vmov.u8 r2, q0[15] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: smlal r3, r12, r2, lr ; CHECK-NEXT: adds r0, r0, r3 -; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: adc.w r1, r1, r12 +; CHECK-NEXT: pop {r7, pc} entry: %xx = sext <16 x i8> %x to <16 x i64> %yy = sext <16 x i8> %y to <16 x i64> @@ -1846,22 +1658,18 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, i64 %a) { ; CHECK-LABEL: add_v2i8_v2i64_acc_zext: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: umull r12, lr, r3, r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: orr.w r3, r3, lr +; CHECK-NEXT: mul r12, r3, r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: mla r2, r2, r3, r12 ; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: adc r1, r1, #0 +; CHECK-NEXT: bx lr entry: %xx = zext <2 x i8> %x to <2 x i64> %yy = zext <2 x i8> %y to <2 x i64> @@ -1876,14 +1684,14 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: sxtb r2, r2 ; CHECK-NEXT: sxtb r3, r3 ; CHECK-NEXT: smull r2, r12, r3, r2 -; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: sxtb.w lr, r3 -; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: sxtb r3, 
r3 ; CHECK-NEXT: smlal r2, r12, r3, lr ; CHECK-NEXT: adds r0, r0, r2 @@ -1903,27 +1711,24 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vmov r6, s7 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: vmov r6, s5 ; CHECK-NEXT: umull r12, lr, r3, r2 ; CHECK-NEXT: mla r3, r3, r4, lr -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: vmov.32 q2[0], r12 -; CHECK-NEXT: mla r2, r4, r2, r3 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov.32 q2[1], r2 -; CHECK-NEXT: vmov r12, s8 -; CHECK-NEXT: umull lr, r5, r3, r4 -; CHECK-NEXT: mla r3, r3, r6, r5 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: adds.w r6, r12, lr -; CHECK-NEXT: mla r3, r5, r4, r3 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: adds r0, r0, r6 -; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: mla lr, r4, r2, r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: umull r2, r5, r4, r3 +; CHECK-NEXT: mla r4, r4, r6, r5 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: mla r3, r5, r3, r4 +; CHECK-NEXT: adc.w r3, r3, lr +; CHECK-NEXT: adds r0, r0, r2 +; CHECK-NEXT: adcs r1, r3 ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %m = mul <2 x i64> %x, %y diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll @@ -66,32 +66,32 @@ define <4 x i64> *@vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) { ; CHECK-LABEL: vld2_v2i64: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] ; CHECK-NEXT: vldrw.u32 q0, [r0], #32 -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: vmov.f64 d4, d0 +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov lr, s10 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r2, s7 ; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r12, s7 -; CHECK-NEXT: vmov lr, s3 -; CHECK-NEXT: adds r6, r3, r2 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adc.w r12, r12, lr -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r6 -; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov r12, s11 +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-NEXT: adcs r2, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %l1 = load <4 x i64>, <4 x i64>* %src, align 4 %s1 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -262,27 +262,27 @@ ; 
CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f64 d4, d0 +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r3, s7 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r12, s7 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: vmov r12, s11 +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: adds.w lr, r2, r0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: adcs r0, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: pop {r4, pc} entry: @@ -302,51 +302,51 @@ ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q4, [r0, #48] -; CHECK-NEXT: vmov.f64 d4, d1 -; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s10, s22 -; CHECK-NEXT: vmov.f32 s2, s20 -; CHECK-NEXT: vmov.f32 s11, s23 -; CHECK-NEXT: vmov.f32 s3, s21 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov.f64 d6, d3 -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov.f32 s7, s17 -; CHECK-NEXT: vmov.f32 s15, s19 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vmov.f64 d10, d0 +; CHECK-NEXT: vmov.f32 s21, s1 +; CHECK-NEXT: vmov.f32 s22, s16 +; CHECK-NEXT: vmov.f32 s23, s17 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vmov r12, s23 +; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vmov.f32 s15, s9 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov.f64 d10, d3 +; CHECK-NEXT: vmov.f32 s21, s7 ; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: adcs r0, r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds r5, r5, r6 +; CHECK-NEXT: vmov r5, s5 +; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: adds.w lr, r2, r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s15 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: vmov r3, s21 +; CHECK-NEXT: adds r4, r4, r6 ; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov q3[2], q3[0], r5, r3 -; 
CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r0 -; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov q3[2], q3[0], r4, r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vstrw.32 q3, [r1, #16] -; CHECK-NEXT: adds r4, r4, r6 -; CHECK-NEXT: vmov q1[2], q1[0], r4, lr +; CHECK-NEXT: adds r5, r5, r6 +; CHECK-NEXT: vmov q2[2], q2[0], r5, lr ; CHECK-NEXT: adcs r0, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r0, r12 -; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov q2[3], q2[1], r0, r12 +; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: @@ -459,18 +459,19 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrd r2, r0, [r0] ; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmovx.f16 s4, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmovx.f16 s8, s5 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q2[0], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov.16 q2[1], r0 +; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vadd.f16 q0, q0, q1 +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -328,78 +328,92 @@ define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) { ; CHECK-LABEL: vld3_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: vldrw.u32 q2, [r0] ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vmov.u16 r2, q2[0] +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov.u16 r0, q0[2] ; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.f32 s22, s8 +; CHECK-NEXT: vmov.u16 r2, q1[1] ; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov q3, q5 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmovnb.i32 q3, q4 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov q3[2], q3[0], r2, r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: vmov q3[3], q3[1], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q4[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: 
vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q4[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vmov.16 q6[6], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.u16 r2, q1[4] +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.16 q5[4], r2 ; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.f32 s13, s17 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.u16 r0, q2[1] +; CHECK-NEXT: vmov.f32 s14, s22 ; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov.f32 s15, s27 ; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q5[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q6[2], r0 ; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q5[3], r0 +; CHECK-NEXT: vmov.16 q6[3], r0 ; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.16 q7[4], r0 +; CHECK-NEXT: vmov.u16 r0, q4[6] +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmovnb.i32 q4, q7 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r2, s25 +; CHECK-NEXT: vmov q4[3], q4[1], r2, r0 +; CHECK-NEXT: vmov.u16 r0, q2[2] +; CHECK-NEXT: vadd.i16 q3, q3, q4 +; CHECK-NEXT: vmov.16 q4[0], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.u16 r0, q1[4] ; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[6] +; CHECK-NEXT: vmov.u16 r0, q1[7] ; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovnb.i32 q1, q5 +; CHECK-NEXT: vmovnb.i32 q1, q4 ; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov q1[2], q1[0], r2, r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov q1[3], q1[1], r2, r0 -; CHECK-NEXT: vadd.i16 q0, q4, q1 -; CHECK-NEXT: vadd.i16 q0, q0, q3 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r2 +; CHECK-NEXT: vadd.i16 q0, q3, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %l1 = load <24 x i16>, <24 x i16>* %src, align 4 @@ -417,144 +431,170 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; 
CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov.u16 r2, q1[3] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.u16 r2, q2[4] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s8 -; CHECK-NEXT: vmov q3, q5 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmovnb.i32 q3, q4 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] +; CHECK-NEXT: vldrw.u32 q0, [r0, #80] +; CHECK-NEXT: vmov.u16 r2, q2[0] +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[3] +; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vmov.16 q4[2], r2 ; CHECK-NEXT: vmov.u16 r2, q1[1] ; CHECK-NEXT: vmov.16 q4[3], r2 ; CHECK-NEXT: vmov.u16 r2, q1[4] -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[2] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[5] -; CHECK-NEXT: vmov.16 q5[7], r2 +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmov.f32 s13, s17 +; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov.16 q6[6], r2 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: vmov.16 q6[7], r2 ; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.16 q5[5], r2 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: vmov.f32 s14, s22 +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov.u16 r2, q4[3] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.u16 r2, q2[1] ; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov.u16 r2, q2[4] ; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.16 q5[2], r2 +; CHECK-NEXT: vmov.f32 s15, s27 +; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.16 q6[2], r2 ; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov.16 q6[3], r2 ; CHECK-NEXT: vmov.u16 r2, q1[5] -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q7[4], r2 +; CHECK-NEXT: vmov.u16 r2, q4[6] ; CHECK-NEXT: vmov.16 q0[7], r2 ; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmovnb.i32 q4, q7 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov q4[2], q4[0], r3, r2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s25 +; CHECK-NEXT: vmov q4[3], q4[1], r3, r2 +; CHECK-NEXT: vmov.u16 r2, q2[2] +; CHECK-NEXT: vadd.i16 q3, q3, q4 +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[5] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmov.u16 r2, q1[0] +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.16 q0[2], r2 +; 
CHECK-NEXT: vmov.u16 r2, q1[3] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov.u16 r2, q1[4] +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmovnb.i32 q1, q5 +; CHECK-NEXT: vmovnb.i32 q1, q4 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov q1[2], q1[0], r3, r2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r2 -; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vadd.i16 q0, q4, q1 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vadd.i16 q0, q0, q3 -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov.u16 r2, q2[6] -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.u16 r2, q3[1] -; CHECK-NEXT: vmov.u16 r0, q4[2] -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 +; CHECK-NEXT: vadd.i16 q0, q3, q1 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vmov.u16 r2, q3[0] +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u16 r2, q3[3] +; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmov.u16 r2, q3[6] +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.16 q5[2], r2 +; CHECK-NEXT: vmov.u16 r2, q2[1] +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov.16 q7[6], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.u16 r2, q2[4] +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.16 q7[7], r0 +; CHECK-NEXT: vmov.16 q6[4], r2 +; CHECK-NEXT: vmov.u16 r0, q2[7] +; CHECK-NEXT: vmov.16 q6[5], r0 +; CHECK-NEXT: vmov.u16 r0, q3[1] +; CHECK-NEXT: vmov.f32 s17, s21 +; CHECK-NEXT: vmov.16 q5[0], r0 +; CHECK-NEXT: vmov.u16 r0, q3[4] +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov.u16 r0, q3[7] -; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.f32 s19, s31 +; CHECK-NEXT: vmov.16 q6[2], r0 ; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.f32 s7, s23 -; CHECK-NEXT: vmov.16 q5[0], r0 +; CHECK-NEXT: vmov.16 q6[3], r0 ; CHECK-NEXT: vmov.u16 r0, q2[5] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vmov.16 q5[2], r0 -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.16 q7[7], r0 -; CHECK-NEXT: vmov.f32 s22, s15 -; CHECK-NEXT: vmov.f32 s30, s16 -; CHECK-NEXT: vmov q6, q7 +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q7[4], r0 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.16 q1[7], r0 ; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovnb.i32 q6, q5 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r0 -; CHECK-NEXT: vmov r0, s31 -; 
CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r0 -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q2[1] -; CHECK-NEXT: vmov.16 q7[0], r0 -; CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.16 q7[1], r0 -; CHECK-NEXT: vmov.u16 r0, q2[7] -; CHECK-NEXT: vmov.16 q7[2], r0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmovnb.i32 q0, q7 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov r2, s25 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 ; CHECK-NEXT: vmov.u16 r0, q3[2] -; CHECK-NEXT: vmov.16 q7[3], r0 +; CHECK-NEXT: vmov.16 q5[0], r0 ; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.16 q7[4], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov q2, q5 -; CHECK-NEXT: vmovnb.i32 q2, q7 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov r2, s29 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r0 -; CHECK-NEXT: vadd.i16 q1, q1, q2 -; CHECK-NEXT: vadd.i16 q1, q1, q6 -; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.16 q5[1], r0 +; CHECK-NEXT: vadd.i16 q4, q4, q0 +; CHECK-NEXT: vmov.u16 r0, q2[0] +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.u16 r0, q2[3] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: vmov.f32 s21, s1 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmovnb.i32 q1, q5 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r0 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r2 +; CHECK-NEXT: vadd.i16 q0, q4, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -723,125 +763,125 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vmov.u8 r2, q1[0] ; CHECK-NEXT: vmov.8 q3[0], r2 -; CHECK-NEXT: vmov.u8 r2, q0[3] +; CHECK-NEXT: vmov.u8 r2, q1[3] ; CHECK-NEXT: vmov.8 q3[1], r2 -; CHECK-NEXT: vmov.u8 r2, q0[6] +; CHECK-NEXT: vmov.u8 r2, q1[6] ; CHECK-NEXT: vmov.8 q3[2], r2 -; CHECK-NEXT: vmov.u8 r2, q0[9] -; CHECK-NEXT: vmov.8 q3[3], r2 -; CHECK-NEXT: vmov.u8 r2, q0[12] -; CHECK-NEXT: vmov.8 q3[4], r2 -; CHECK-NEXT: vmov.u8 r2, q0[15] -; CHECK-NEXT: vmov.8 q3[5], r2 -; CHECK-NEXT: vmov.u8 r2, q2[2] -; CHECK-NEXT: vmov.8 q3[6], r2 -; CHECK-NEXT: vmov.u8 r2, q2[8] +; CHECK-NEXT: vmov.u8 r2, q0[8] ; CHECK-NEXT: vmov.8 q4[8], r2 -; CHECK-NEXT: vmov.u8 r2, q2[11] +; CHECK-NEXT: vmov.u8 r2, q0[11] ; CHECK-NEXT: vmov.8 q4[9], r2 -; CHECK-NEXT: vmov.u8 r2, q2[14] +; CHECK-NEXT: vmov.u8 r2, q0[14] ; CHECK-NEXT: vmov.8 q4[10], r2 -; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.u8 r0, 
q2[1] ; CHECK-NEXT: vmov.8 q4[11], r0 -; CHECK-NEXT: vmov.u8 r0, q2[5] -; CHECK-NEXT: vmov.8 q3[7], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.8 q3[3], r0 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r0 -; CHECK-NEXT: vmov.u8 r0, q2[9] +; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.u8 r2, q1[1] ; CHECK-NEXT: vmov.8 q5[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[12] +; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.8 q6[0], r2 +; CHECK-NEXT: vmov.u8 r2, q1[4] ; CHECK-NEXT: vmov.8 q5[9], r0 -; CHECK-NEXT: vmov.u8 r0, q2[15] +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.8 q6[1], r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] ; CHECK-NEXT: vmov.8 q5[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vmov.8 q6[2], r2 +; CHECK-NEXT: vmov.u8 r2, q1[10] ; CHECK-NEXT: vmov.8 q5[11], r0 -; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: vmov.8 q6[3], r2 ; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.8 q5[0], r2 -; CHECK-NEXT: vmov.u8 r2, q0[4] -; CHECK-NEXT: vmov.8 q5[1], r2 -; CHECK-NEXT: vmov.u8 r2, q0[7] -; CHECK-NEXT: vmov.8 q5[2], r2 -; CHECK-NEXT: vmov.u8 r2, q0[10] -; CHECK-NEXT: vmov.8 q5[3], r2 -; CHECK-NEXT: vmov.u8 r2, q0[13] -; CHECK-NEXT: vmov.8 q5[4], r2 -; CHECK-NEXT: vmov.u8 r2, q2[0] -; CHECK-NEXT: vmov.8 q5[5], r2 -; CHECK-NEXT: vmov.u8 r2, q2[3] -; CHECK-NEXT: vmov.8 q5[6], r2 -; CHECK-NEXT: vmov.u8 r2, q2[6] -; CHECK-NEXT: vmov.8 q5[7], r2 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov q6[2], q6[0], r2, r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.8 q6[4], r0 +; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.8 q6[5], r0 +; CHECK-NEXT: vmov.u8 r0, q0[3] +; CHECK-NEXT: vmov.8 q6[6], r0 +; CHECK-NEXT: vmov.u8 r0, q2[5] ; CHECK-NEXT: vmov.8 q7[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.u8 r0, q2[8] ; CHECK-NEXT: vmov.8 q7[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: vmov.u8 r0, q2[11] ; CHECK-NEXT: vmov.8 q7[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.u8 r0, q2[14] ; CHECK-NEXT: vmov.8 q7[15], r0 -; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov.u8 r0, q0[6] +; CHECK-NEXT: vmov.8 q6[7], r0 ; CHECK-NEXT: vmov r0, s31 -; CHECK-NEXT: vmov q6[3], q6[1], r2, r0 -; CHECK-NEXT: vmov.u8 r0, q1[4] -; CHECK-NEXT: vmov.8 q5[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q5[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q5[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q5[15], r0 +; CHECK-NEXT: vmov r2, s25 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r0 +; CHECK-NEXT: vmov.u8 r0, q2[4] +; CHECK-NEXT: vmov.u8 r2, q1[12] +; CHECK-NEXT: vmov.8 q6[12], r0 +; CHECK-NEXT: vmov.u8 r0, q2[7] +; CHECK-NEXT: vmov.8 q3[4], r2 +; CHECK-NEXT: vmov.u8 r2, q1[15] +; CHECK-NEXT: vmov.8 q6[13], r0 +; CHECK-NEXT: vmov.u8 r0, q2[10] +; CHECK-NEXT: vmov.8 q3[5], r2 +; CHECK-NEXT: vmov.u8 r2, q0[2] +; CHECK-NEXT: vmov.8 q6[14], r0 +; CHECK-NEXT: vmov.u8 r0, q2[13] +; CHECK-NEXT: vmov.8 q3[6], r2 +; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov.8 q6[15], r0 +; CHECK-NEXT: vmov.8 q3[7], r2 +; CHECK-NEXT: vmov r0, s27 ; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov r0, s23 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r0 -; CHECK-NEXT: vmov.u8 r0, q2[10] -; CHECK-NEXT: vadd.i8 q3, q4, q6 +; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vadd.i8 q3, q4, q5 ; CHECK-NEXT: vmov.8 q4[8], r0 -; CHECK-NEXT: vmov.u8 r0, q2[13] -; 
CHECK-NEXT: vmov.u8 r2, q0[2] +; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.u8 r2, q1[2] ; CHECK-NEXT: vmov.8 q4[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vmov.u8 r0, q2[0] ; CHECK-NEXT: vmov.8 q4[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.u8 r0, q2[3] ; CHECK-NEXT: vmov.8 q4[11], r0 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov.8 q4[0], r2 -; CHECK-NEXT: vmov.u8 r2, q0[5] +; CHECK-NEXT: vmov.u8 r2, q1[5] ; CHECK-NEXT: vmov.8 q4[1], r2 -; CHECK-NEXT: vmov.u8 r2, q0[8] +; CHECK-NEXT: vmov.u8 r2, q1[8] ; CHECK-NEXT: vmov.8 q4[2], r2 -; CHECK-NEXT: vmov.u8 r2, q0[11] +; CHECK-NEXT: vmov.u8 r2, q1[11] ; CHECK-NEXT: vmov.8 q4[3], r2 -; CHECK-NEXT: vmov.u8 r2, q0[14] -; CHECK-NEXT: vmov.8 q4[4], r2 -; CHECK-NEXT: vmov.u8 r2, q2[1] -; CHECK-NEXT: vmov.8 q4[5], r2 -; CHECK-NEXT: vmov.u8 r2, q2[4] -; CHECK-NEXT: vmov.8 q4[6], r2 -; CHECK-NEXT: vmov.u8 r2, q2[7] -; CHECK-NEXT: vmov.8 q4[7], r2 ; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q2[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q2[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q2[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q2[15], r0 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r0 +; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.8 q4[4], r0 +; CHECK-NEXT: vmov.u8 r0, q0[1] +; CHECK-NEXT: vmov.8 q4[5], r0 +; CHECK-NEXT: vmov.u8 r0, q0[4] +; CHECK-NEXT: vmov.8 q4[6], r0 +; CHECK-NEXT: vmov.u8 r0, q2[6] +; CHECK-NEXT: vmov.8 q1[12], r0 +; CHECK-NEXT: vmov.u8 r0, q2[9] +; CHECK-NEXT: vmov.8 q1[13], r0 +; CHECK-NEXT: vmov.u8 r0, q2[12] +; CHECK-NEXT: vmov.8 q1[14], r0 +; CHECK-NEXT: vmov.u8 r0, q2[15] +; CHECK-NEXT: vmov.8 q1[15], r0 +; CHECK-NEXT: vmov.u8 r0, q0[7] +; CHECK-NEXT: vmov.8 q4[7], r0 +; CHECK-NEXT: vmov r0, s7 ; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r0 -; CHECK-NEXT: vadd.i8 q0, q3, q0 +; CHECK-NEXT: vmov q5[3], q5[1], r2, r0 +; CHECK-NEXT: vadd.i8 q0, q3, q5 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr @@ -863,45 +903,36 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov.f64 d6, d3 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s14, s16 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov.f32 s15, s17 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov.f64 d4, d1 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r12, s11 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: adc.w r12, r12, r0 ; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vmov.f32 s2, s18 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov r12, s15 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adc.w r2, r2, r12 -; CHECK-NEXT: adds.w lr, lr, r0 -; CHECK-NEXT: 
vmov r0, s12 -; CHECK-NEXT: adc.w r12, r2, r3 +; CHECK-NEXT: adds.w lr, r2, r0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: adcs r0, r3 ; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 +; CHECK-NEXT: adds r2, r2, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: adcs r0, r3 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <6 x i64>, <6 x i64>* %src, align 4 @@ -919,86 +950,69 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #80] -; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q5, [r0, #32] ; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vmov.f64 d0, d4 -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f64 d8, d5 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vmov.f32 s19, s5 -; CHECK-NEXT: vmov.f64 d12, d11 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.f32 s25, s23 -; CHECK-NEXT: vmov.f32 s26, s4 -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmov.f32 s27, s5 -; CHECK-NEXT: vmov.f32 s23, s15 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.f64 d14, d6 +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vmov.f64 d12, d9 +; CHECK-NEXT: vldrw.u32 q1, [r0, #80] +; CHECK-NEXT: vmov.f32 s25, s19 +; CHECK-NEXT: vmov.f32 s26, s20 +; CHECK-NEXT: vmov.f32 s27, s21 +; CHECK-NEXT: vmov lr, s26 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vldrw.u32 q4, [r0, #64] +; CHECK-NEXT: vmov.f64 d4, d1 ; CHECK-NEXT: vmov r12, s27 +; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov r6, s18 +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov r7, s1 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r3, s22 +; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vmov.f32 s29, s13 -; CHECK-NEXT: vmov.f32 s30, s6 -; CHECK-NEXT: vmov.f32 s31, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov r6, s8 -; CHECK-NEXT: vmov r7, s24 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r0, s30 -; CHECK-NEXT: adc.w r3, r2, r12 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: adds.w lr, lr, r0 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: adc.w r12, r3, r2 -; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: adds r0, r0, r4 -; 
CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adds r0, r0, r4 -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: adc.w r8, r2, r3 -; CHECK-NEXT: vmov r3, s17 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s25 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: vmov r5, s10 +; CHECK-NEXT: adc.w r8, r4, r2 +; CHECK-NEXT: vmov r4, s11 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: vmov r6, s6 +; CHECK-NEXT: adcs r0, r4 +; CHECK-NEXT: vmov r4, s7 ; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s21 -; CHECK-NEXT: vmov q1[2], q1[0], r5, r0 -; CHECK-NEXT: vmov r0, s29 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmov r6, s8 +; CHECK-NEXT: adcs r0, r4 +; CHECK-NEXT: vmov r4, s9 +; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: vmov r6, s17 +; CHECK-NEXT: adcs r4, r7 +; CHECK-NEXT: vmov r7, s16 ; CHECK-NEXT: adds r2, r2, r7 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r8 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: vmov r6, s28 -; CHECK-NEXT: adds r2, r2, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: adcs r0, r4 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r5 +; CHECK-NEXT: vmov r6, s12 +; CHECK-NEXT: vmov q1[3], q1[1], r4, r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-NEXT: adc.w r0, r0, r8 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <12 x i64>, <12 x i64>* %src, align 4 @@ -1223,30 +1237,35 @@ define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) { ; CHECK-LABEL: vld3_v2f16: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: ldrd r2, r3, [r0] ; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmov.32 q0[0], r2 -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: vmov.32 q0[2], r0 -; CHECK-NEXT: vmovx.f16 s4, s0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmovx.f16 s0, s4 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmovx.f16 s16, s1 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.16 q1[0], r2 ; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vadd.f16 q1, q2, q1 -; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmovx.f16 s8, s10 +; CHECK-NEXT: vmov.16 q1[1], r0 ; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vadd.f16 q1, q1, q3 ; CHECK-NEXT: vmov.16 q0[1], r0 ; CHECK-NEXT: vadd.f16 q0, q1, q0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: str r0, [r1] +; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: bx lr entry: %l1 = load <6 x half>, <6 x half>* %src, align 4 @@ -1262,48 +1281,49 @@ define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) { ; CHECK-LABEL: 
vld3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: .vsave {d8, d9, d10} +; CHECK-NEXT: vpush {d8, d9, d10} ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmovx.f16 s4, s0 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmovx.f16 s4, s3 ; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmovx.f16 s12, s1 +; CHECK-NEXT: vmovx.f16 s16, s1 ; CHECK-NEXT: vmov.16 q2[1], r2 ; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmov.16 q2[2], r2 ; CHECK-NEXT: ldrd r2, r0, [r0, #16] -; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q3[0], r2 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: vmovx.f16 s20, s12 ; CHECK-NEXT: vmov.32 q1[1], r0 ; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmovx.f16 s16, s4 +; CHECK-NEXT: vmovx.f16 s4, s5 ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q3[0], r2 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov.16 q4[0], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov.16 q4[1], r0 ; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov.16 q4[2], r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov.16 q4[3], r0 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vadd.f16 q2, q4, q2 ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vadd.f16 q2, q3, q2 ; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vadd.f16 q0, q2, q0 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: strd r0, r2, [r1] -; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: vpop {d8, d9, d10} ; CHECK-NEXT: bx lr entry: %l1 = load <12 x half>, <12 x half>* %src, align 4 @@ -1319,93 +1339,105 @@ define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vld3_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmov r3, s13 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmovx.f16 s12, s19 -; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.16 q0[2], r3 ; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov.f32 s2, s7 -; CHECK-NEXT: vmovx.f16 s20, s16 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.f32 s5, s1 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r0 +; 
CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov lr, s3 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vmov.16 q5[0], r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q5[1], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vmov.16 q0[2], r3 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q6[6], r0 +; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: vmovx.f16 s0, s13 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.16 q7[0], r2 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vmov.16 q7[1], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov q0, q7 +; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q3[6], r3 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmovx.f16 s24, s11 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: vmov.f32 s14, s16 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov r12, s22 -; CHECK-NEXT: vmovx.f16 s20, s17 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q5[6], r2 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmovx.f16 s8, s10 +; CHECK-NEXT: vmov.16 q3[4], r2 ; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov lr, s23 -; CHECK-NEXT: vmovx.f16 s20, s6 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q5[4], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov.f32 s29, s1 +; CHECK-NEXT: vmov.16 q2[5], r2 ; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov.16 q5[2], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov q2[2], q2[0], r0, r12 ; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovx.f16 s20, s9 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vmov.16 q5[0], r5 -; CHECK-NEXT: vmov.16 q5[1], r4 -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: vmovx.f16 s8, s4 -; CHECK-NEXT: vmov.16 q5[2], r4 -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: vmovx.f16 s8, s18 -; CHECK-NEXT: vmov.16 q5[3], r4 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov.16 q5[4], r4 -; CHECK-NEXT: vmov r4, s17 -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vmov r5, s8 -; CHECK-NEXT: vmov.16 q2[6], r4 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov.16 q5[5], r4 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov.16 q2[7], r5 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r12 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: vmov q0[3], 
q0[1], r0, lr -; CHECK-NEXT: vmov.f32 s23, s11 -; CHECK-NEXT: vmov q1[3], q1[1], r4, r5 -; CHECK-NEXT: vadd.f16 q0, q5, q0 -; CHECK-NEXT: vadd.f16 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s30, s14 +; CHECK-NEXT: vmov.f32 s31, s27 +; CHECK-NEXT: vmov q0[3], q0[1], r3, lr +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vadd.f16 q0, q7, q0 +; CHECK-NEXT: vmov q2[3], q2[1], r0, r2 +; CHECK-NEXT: vadd.f16 q0, q0, q2 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r7, pc} entry: %l1 = load <24 x half>, <24 x half>* %src, align 4 %s1 = shufflevector <24 x half> %l1, <24 x half> undef, <8 x i32> @@ -1420,174 +1452,203 @@ define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) { ; CHECK-LABEL: vld3_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q2, [r0, #80] -; CHECK-NEXT: vldrw.u32 q3, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmov r2, s10 -; CHECK-NEXT: vmovx.f16 s20, s17 +; CHECK-NEXT: .pad #56 +; CHECK-NEXT: sub sp, #56 +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] +; CHECK-NEXT: vldrw.u32 q6, [r0, #80] +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r12, s4 -; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.16 q0[7], r3 +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.16 q4[1], r3 +; CHECK-NEXT: vmovx.f16 s0, s13 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[3], r3 +; CHECK-NEXT: vmov r3, s26 +; CHECK-NEXT: vmov.f32 s17, s1 +; CHECK-NEXT: vmovx.f16 s0, s27 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov r3, s25 +; CHECK-NEXT: vmov.f32 s2, s24 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r12, s18 +; CHECK-NEXT: vmov.16 q0[4], r12 +; CHECK-NEXT: vmov.16 q0[5], r2 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmovx.f16 s0, s26 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r2 ; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmovx.f16 s24, s13 -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmovx.f16 s28, s16 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q5[4], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q5[5], r12 -; CHECK-NEXT: vmov r12, s22 -; CHECK-NEXT: vmovx.f16 s20, s10 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.16 q5[7], r3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q6[0], r3 -; CHECK-NEXT: vmov.16 q6[1], r2 -; 
CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmovx.f16 s28, s19 -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmovx.f16 s28, s18 -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmovx.f16 s8, s9 -; CHECK-NEXT: vmov.16 q7[5], r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q2[6], r3 -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov lr, s30 -; CHECK-NEXT: vmov r6, s11 -; CHECK-NEXT: vmovx.f16 s8, s12 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmovx.f16 s12, s15 -; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q5[0], r3 +; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov q7, q5 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q7[2], r2 +; CHECK-NEXT: vmov.16 q7[3], r3 +; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov q2, q7 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q2[4], r3 +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmov.16 q2[5], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov r2, s27 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s25 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov.16 q6[6], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov.16 q6[7], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov r2, s13 +; CHECK-NEXT: vmovx.f16 s12, s7 +; CHECK-NEXT: vmov.16 q0[1], r3 ; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov.16 q2[1], r4 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov.16 q2[2], r3 -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: vmov.16 q2[3], r3 -; CHECK-NEXT: vmov.f32 s27, s23 -; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q3[2], r3 +; CHECK-NEXT: vmov.f32 s21, s29 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.f32 s23, s3 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov lr, s27 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r12 +; CHECK-NEXT: vmov r4, s13 +; CHECK-NEXT: vmov q0[3], q0[1], r4, lr +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vadd.f16 q0, q5, q0 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r3 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-NEXT: vmov r4, s9 -; CHECK-NEXT: vmov q2[2], q2[0], r5, r12 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: vmov q0[2], q0[0], r3, lr -; CHECK-NEXT: vmov q0[3], q0[1], r4, r6 -; CHECK-NEXT: vmov q2[3], q2[1], r2, r5 -; CHECK-NEXT: vadd.f16 q0, q6, q0 -; CHECK-NEXT: vmovx.f16 s12, s16 -; CHECK-NEXT: vadd.f16 q1, q0, q2 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: vmovx.f16 s4, s19 -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s10 
-; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.16 q3[1], r3 +; CHECK-NEXT: vadd.f16 q0, q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q5, [r0, #16] +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s20, s1 -; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov q0, q3 ; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovx.f16 s24, s9 -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmovx.f16 s28, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.f32 s13, s1 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmov.f32 s14, s23 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r3 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r2 ; CHECK-NEXT: vmov r2, s17 -; CHECK-NEXT: vmov.16 q5[5], r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmovx.f16 s20, s18 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.16 q5[7], r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov r12, s2 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q6[0], r3 ; CHECK-NEXT: vmov.16 q6[1], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov.16 q6[3], r2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmovx.f16 s28, s3 -; CHECK-NEXT: vmov.16 q6[4], r2 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmovx.f16 s28, s2 -; CHECK-NEXT: vmov.16 q6[5], r2 -; CHECK-NEXT: vmov r2, s28 +; CHECK-NEXT: vmovx.f16 s0, s20 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov q7, q6 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q7[2], r2 +; CHECK-NEXT: vmov.16 q7[3], r3 +; CHECK-NEXT: vmovx.f16 s0, s23 +; CHECK-NEXT: vmov r3, s22 +; CHECK-NEXT: vmov q2, q7 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q2[4], r3 +; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vmov.16 q2[5], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[4], r2 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmovx.f16 s16, s17 -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: vmov.16 q7[5], r3 -; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: vmov.16 q4[6], r6 -; CHECK-NEXT: vmov r2, s30 -; CHECK-NEXT: vmov.16 q4[7], r3 -; CHECK-NEXT: vmov.f32 s27, s23 -; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: vmovx.f16 s16, s8 -; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmov.16 q4[0], r6 -; CHECK-NEXT: vmov r6, s8 -; CHECK-NEXT: vmov.16 q4[1], r5 -; CHECK-NEXT: vmov.16 q4[2], r6 -; CHECK-NEXT: vmov r6, s1 -; CHECK-NEXT: vmov.16 q4[3], r6 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r0 -; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: 
vmov q1[2], q1[0], r6, r2 -; CHECK-NEXT: vmov r5, s17 -; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 -; CHECK-NEXT: vmov r4, s13 -; CHECK-NEXT: vmov q0[3], q0[1], r4, r0 -; CHECK-NEXT: vadd.f16 q1, q6, q1 -; CHECK-NEXT: vadd.f16 q0, q1, q0 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov.16 q4[6], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov.16 q4[7], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmovx.f16 s20, s7 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q5[2], r3 +; CHECK-NEXT: vmov.f32 s25, s29 +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s26, s10 +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.f32 s27, s3 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov q0[3], q0[1], r0, r4 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov q1[2], q1[0], r0, r12 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vadd.f16 q0, q6, q0 +; CHECK-NEXT: vmov q1[3], q1[1], r0, r2 +; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: add sp, #56 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, pc} entry: %l1 = load <48 x half>, <48 x half>* %src, align 4 %s1 = shufflevector <48 x half> %l1, <48 x half> undef, <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4-post.ll @@ -106,56 +106,56 @@ ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q0, [r0], #64 -; CHECK-NEXT: vldrw.u32 q3, [r0, #-48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #-48] ; CHECK-NEXT: vldrw.u32 q5, [r0, #-16] -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov.f64 d8, d7 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.f32 s15, s21 -; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f64 d6, d0 +; CHECK-NEXT: vmov.f64 d8, d4 +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vmov.f32 s19, s21 +; CHECK-NEXT: vmov lr, s18 +; CHECK-NEXT: vmov r3, s22 +; CHECK-NEXT: vmov r2, s23 +; CHECK-NEXT: vmov.f32 s13, s1 +; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vmov.f32 s15, s5 ; CHECK-NEXT: vmov r12, s19 -; CHECK-NEXT: vmov lr, s15 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: vmov.f64 d12, d5 +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f64 
d8, d1 +; CHECK-NEXT: vmov.f32 s17, s3 ; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: adds r6, r3, r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: adc.w r12, r12, lr -; CHECK-NEXT: adds r5, r5, r4 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: adcs r2, r3 -; CHECK-NEXT: adds.w lr, r5, r6 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r6, s1 +; CHECK-NEXT: adc.w r12, r12, r2 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: vmov r5, s25 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: adds.w lr, lr, r3 +; CHECK-NEXT: vmov r4, s24 ; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r6, s17 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: adcs r6, r5 -; CHECK-NEXT: vmov r5, s5 -; CHECK-NEXT: adds r3, r3, r7 -; CHECK-NEXT: adcs r4, r5 -; CHECK-NEXT: adds r2, r2, r3 -; CHECK-NEXT: adc.w r3, r4, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: vmov q0[3], q0[1], r3, r12 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: adcs r2, r5 +; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: adds r5, r5, r7 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: adds r3, r3, r5 +; CHECK-NEXT: adcs r2, r4 +; CHECK-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -533,56 +533,56 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] ; CHECK-NEXT: vldrw.u32 q5, [r0, #48] ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov.f64 d8, d7 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s18, s22 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s19, s23 -; CHECK-NEXT: vmov.f32 s15, s21 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.f64 d2, d1 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vmov.f64 d8, d4 +; CHECK-NEXT: vmov.f32 s17, s9 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vmov.f32 s19, s21 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: vmov.f64 d6, d0 ; CHECK-NEXT: vmov r12, s19 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov.f32 s13, s1 +; CHECK-NEXT: vmov.f32 s14, s4 +; CHECK-NEXT: vmov.f32 s15, s5 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov.f64 d12, d5 +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f64 d8, d1 +; CHECK-NEXT: adds.w lr, r2, r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov.f32 s17, s3 ; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r3, s7 -; 
CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s15 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: adcs r2, r3 +; CHECK-NEXT: adds.w lr, lr, r0 +; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: vmov r4, s13 -; CHECK-NEXT: adcs r0, r3 -; CHECK-NEXT: adds.w lr, lr, r2 -; CHECK-NEXT: adc.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s25 +; CHECK-NEXT: vmov r2, s9 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: adcs r2, r3 ; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r4, r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: adcs r3, r4 -; CHECK-NEXT: adds r0, r0, r5 +; CHECK-NEXT: adds r4, r4, r6 +; CHECK-NEXT: adcs r3, r5 +; CHECK-NEXT: adds r0, r0, r4 ; CHECK-NEXT: adcs r2, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r0, lr ; CHECK-NEXT: vmov q0[3], q0[1], r2, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %l1 = load <8 x i64>, <8 x i64>* %src, align 4 @@ -604,112 +604,107 @@ ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 -; CHECK-NEXT: vldrw.u32 q1, [r0, #64] +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: vldrw.u32 q3, [r0, #64] ; CHECK-NEXT: vldrw.u32 q0, [r0, #96] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #16] -; CHECK-NEXT: vmov.f64 d4, d3 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vldrw.u32 q0, [r0, #112] -; CHECK-NEXT: vmov.f64 d14, d9 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vmov.f64 d10, d6 ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s29, s19 -; CHECK-NEXT: vmov.f32 s30, s2 -; CHECK-NEXT: vmov.f64 d4, d13 -; CHECK-NEXT: vmov.f32 s31, s3 -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vmov.f32 s9, s27 -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s26, s0 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s27, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov.f64 d10, d7 -; CHECK-NEXT: vmov r12, s11 -; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: vmov.f32 s21, s15 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s14, s4 -; CHECK-NEXT: vmov.f32 s15, s5 -; CHECK-NEXT: vmov.f32 s23, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov r4, s14 -; CHECK-NEXT: vmov.f32 s18, s0 -; CHECK-NEXT: vmov.f32 s19, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: adc.w r12, r12, r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov r4, s28 -; CHECK-NEXT: adcs r0, r2 -; CHECK-NEXT: adds.w lr, lr, r3 -; 
CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: adc.w r12, r12, r0 -; CHECK-NEXT: vmov r0, s29 -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r3, r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [r0, #80] +; CHECK-NEXT: vldrw.u32 q6, [r0, #112] +; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s21, s13 +; CHECK-NEXT: vmov.f32 s22, s0 +; CHECK-NEXT: vmov.f32 s23, s1 +; CHECK-NEXT: vmov.f64 d0, d8 +; CHECK-NEXT: vmov.f32 s1, s17 +; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov.f64 d14, d4 +; CHECK-NEXT: vmov r12, s3 +; CHECK-NEXT: vldrw.u32 q0, [r0, #32] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.f64 d2, d8 +; CHECK-NEXT: vmov.f32 s5, s17 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.f32 s29, s9 +; CHECK-NEXT: vmov.f32 s30, s24 +; CHECK-NEXT: vmov.f32 s31, s25 +; CHECK-NEXT: vmov r7, s13 +; CHECK-NEXT: adds.w lr, r3, r2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov r6, s2 +; CHECK-NEXT: vmov r5, s3 +; CHECK-NEXT: vmov.f64 d0, d5 +; CHECK-NEXT: adc.w r12, r12, r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s1, s11 +; CHECK-NEXT: adds r0, r0, r4 +; CHECK-NEXT: vmov r4, s27 +; CHECK-NEXT: adcs r3, r2 +; CHECK-NEXT: adds.w lr, lr, r0 +; CHECK-NEXT: adc.w r12, r12, r3 +; CHECK-NEXT: vmov r3, s30 +; CHECK-NEXT: vmov r2, s26 +; CHECK-NEXT: vmov r0, s31 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r0, r4 +; CHECK-NEXT: vmov r4, s22 +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: adds r4, r4, r6 +; CHECK-NEXT: vmov r6, s8 +; CHECK-NEXT: adcs r5, r0 +; CHECK-NEXT: adds.w r9, r4, r2 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: adc.w r8, r5, r3 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r5, s9 +; CHECK-NEXT: vmov.f64 d0, d7 +; CHECK-NEXT: vmov.f32 s1, s15 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: adds r4, r4, r6 ; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: adcs r2, r5 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov.f64 d0, d9 +; CHECK-NEXT: vmov.f32 s1, s19 +; CHECK-NEXT: adds r3, r3, r6 +; CHECK-NEXT: vmov r6, s17 +; CHECK-NEXT: adcs r5, r7 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adc.w r10, r5, r2 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r7, s16 ; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: vmov r7, s6 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s18 -; CHECK-NEXT: adcs r4, r0 -; CHECK-NEXT: adds.w r9, r5, r2 -; CHECK-NEXT: vmov r5, s30 -; CHECK-NEXT: adc.w r8, r4, r3 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: vmov r4, s19 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: adds r5, r5, r6 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: adcs r2, r4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: adds r3, r3, r7 -; CHECK-NEXT: vmov r7, s12 -; CHECK-NEXT: adcs r4, r6 -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: vmov r6, s20 -; CHECK-NEXT: adc.w r10, r4, r2 -; CHECK-NEXT: vmov r4, s21 -; CHECK-NEXT: vmov q1[2], q1[0], r9, r3 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: vmov q1[3], q1[1], r8, r10 -; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmov.f64 d0, d3 +; CHECK-NEXT: vmov.f32 s1, s7 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov q1[2], 
q1[0], r3, r9 +; CHECK-NEXT: vmov q1[3], q1[1], r10, r8 ; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: adds r6, r6, r7 -; CHECK-NEXT: vmov r7, s25 -; CHECK-NEXT: adcs r4, r5 -; CHECK-NEXT: vmov r5, s9 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r7, r5 -; CHECK-NEXT: adds r0, r0, r6 -; CHECK-NEXT: vmov q0[2], q0[0], r0, lr -; CHECK-NEXT: adc.w r0, r4, r2 +; CHECK-NEXT: adds r5, r5, r7 +; CHECK-NEXT: vmov r7, s0 +; CHECK-NEXT: adcs r4, r6 +; CHECK-NEXT: vmov r6, s1 +; CHECK-NEXT: adds r2, r2, r7 +; CHECK-NEXT: adcs r0, r6 +; CHECK-NEXT: adds r2, r2, r5 +; CHECK-NEXT: vmov q0[2], q0[0], r2, lr +; CHECK-NEXT: adcs r0, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -8,15 +8,15 @@ ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #80 -; CHECK-NEXT: sub sp, #80 +; CHECK-NEXT: .pad #192 +; CHECK-NEXT: sub sp, #192 ; CHECK-NEXT: mul r12, r3, r2 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: cmp.w r2, r12, lsr #2 ; CHECK-NEXT: beq.w .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: mvn r3, #7 -; CHECK-NEXT: ldr r5, [sp, #160] +; CHECK-NEXT: ldr r5, [sp, #272] ; CHECK-NEXT: and.w r3, r3, r12, lsr #2 ; CHECK-NEXT: sub.w r12, r3, #8 ; CHECK-NEXT: movs r3, #1 @@ -24,213 +24,251 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q5, [r0, #32] -; CHECK-NEXT: vldrh.u16 q3, [r0, #48] -; CHECK-NEXT: vldrh.u16 q7, [r0], #64 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmovx.f16 s8, s12 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: vmov.16 q0[5], r3 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vldrh.u16 q4, [r0], #64 +; CHECK-NEXT: vldrh.u16 q7, [r0, #-32] +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vldrh.u16 q6, [r0, #-48] -; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r3 ; CHECK-NEXT: vmov r3, s30 -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vldrh.u16 q6, [r0, #-16] +; CHECK-NEXT: vmov.16 q1[5], r3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vldrh.u16 q5, [r0, #-48] +; CHECK-NEXT: vmov r3, s24 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov.16 q2[6], r3 ; CHECK-NEXT: vmov r2, s26 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 -; CHECK-NEXT: vmul.f16 q0, q1, r5 -; CHECK-NEXT: vmovx.f16 s4, s24 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s30 +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov r3, s22 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.16 q3[3], r3 +; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vmovx.f16 s12, s22 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmovx.f16 s4, s26 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vmovx.f16 
s8, s18 +; CHECK-NEXT: vmul.f16 q0, q0, r5 +; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill ; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmovx.f16 s0, s30 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s28 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s22 -; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[4], r4 +; CHECK-NEXT: vmov.16 q0[5], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s20 +; CHECK-NEXT: vmovx.f16 s4, s24 ; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vmov.16 q1[4], r4 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s14 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s26 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.16 q1[6], r4 ; CHECK-NEXT: vmov.16 q1[7], r2 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vmovx.f16 s8, s13 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmul.f16 q0, q0, r5 -; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: vmovx.f16 s8, s16 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov.16 q2[0], r4 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmovx.f16 s12, s20 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.16 q3[2], r4 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov.f32 s9, s13 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmul.f16 q0, q2, r5 +; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill ; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vstrw.32 q0, [sp, #160] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov r2, s29 +; CHECK-NEXT: vmov.16 q0[1], r3 ; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.16 q0[5], r3 -; CHECK-NEXT: vmov r3, s29 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r2, s31 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmov r3, s31 +; CHECK-NEXT: vmov.16 q1[5], r3 ; CHECK-NEXT: vmov r2, s25 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r2, s27 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov.f32 s6, s2 -; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov r3, s27 +; CHECK-NEXT: vmov.16 q2[6], r2 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov.16 q2[7], r3 +; CHECK-NEXT: vmov r3, s23 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.16 q3[3], r3 +; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vmovx.f16 s12, s23 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmovx.f16 s4, s27 +; CHECK-NEXT: vmov.f32 s3, s11 +; CHECK-NEXT: vmovx.f16 s8, s19 +; CHECK-NEXT: vmul.f16 q0, q0, r5 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmovx.f16 s0, s31 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s29 ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmul.f16 q4, q1, r5 -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: vmovx.f16 s4, s25 -; 
CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s23 -; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[4], r4 +; CHECK-NEXT: vmov.16 q0[5], r2 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s21 +; CHECK-NEXT: vmovx.f16 s4, s25 +; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload ; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q1[4], r4 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov.16 q1[5], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s15 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s8, s27 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.16 q1[6], r4 ; CHECK-NEXT: vmov.16 q1[7], r2 ; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov.16 q2[0], r4 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmovx.f16 s12, s21 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.16 q3[2], r4 +; CHECK-NEXT: vmov.16 q3[3], r2 +; CHECK-NEXT: vmov.f32 s9, s13 +; CHECK-NEXT: vldrw.u32 q3, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmovx.f16 s0, s24 +; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmul.f16 q6, q0, r5 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vmul.f16 q2, q2, r5 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov.16 q1[3], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s24 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.16 q1[7], r3 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #176] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s4 ; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vmov.16 q5[1], r3 -; CHECK-NEXT: vmov r3, s25 -; CHECK-NEXT: vmov.16 q5[4], r2 +; CHECK-NEXT: vmov.16 q7[0], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q5[5], r2 -; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.16 q7[1], r3 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q0, q7 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vmov q1, q6 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s5 ; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov q1, q4 ; CHECK-NEXT: vmov.16 q3[3], r3 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s25 -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmov.16 q3[7], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q7[0], r2 +; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vmov.16 q3[7], 
r3 +; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vmov.16 q4[0], r2 ; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov.16 q7[1], r3 +; CHECK-NEXT: vmov q5, q6 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q4[1], r3 ; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmov.16 q7[4], r2 +; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov r2, s22 +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q7[5], r2 -; CHECK-NEXT: vmov r3, s26 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmov.16 q1[3], r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[6], r2 ; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov.16 q1[7], r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov.16 q6[0], r2 ; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q2[3], r3 +; CHECK-NEXT: vmov.16 q6[1], r3 +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s26 +; CHECK-NEXT: vmov q1, q6 +; CHECK-NEXT: vmovx.f16 s0, s14 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov r2, s23 +; CHECK-NEXT: vmovx.f16 s0, s23 +; CHECK-NEXT: vldrw.u32 q5, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[5], r3 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vmov.f32 s29, s21 +; CHECK-NEXT: vldrw.u32 q5, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vmov.f32 s30, s22 +; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q3[3], r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.f32 s31, s23 +; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov.f32 s17, s21 +; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s0, s3 ; CHECK-NEXT: vmov.16 q2[6], r2 +; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vldrw.u32 q5, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmovx.f16 s4, s6 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmovx.f16 s4, s18 -; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vmov.16 q2[7], r3 +; CHECK-NEXT: vmov.f32 s25, s21 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #160] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s26, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q0[5], r2 -; CHECK-NEXT: vmov r2, s19 -; CHECK-NEXT: vmov r3, s27 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmovx.f16 s16, s19 -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov r2, s16 -; 
CHECK-NEXT: vmovx.f16 s16, s27 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vmov.f32 s1, s9 -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload -; CHECK-NEXT: vmovx.f16 s16, s19 -; CHECK-NEXT: vmov.f32 s3, s11 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s21, s25 -; CHECK-NEXT: vstrh.16 q0, [r1, #32] -; CHECK-NEXT: vmov.16 q4[5], r2 -; CHECK-NEXT: vmov.f32 s29, s13 -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmov.f32 s23, s27 -; CHECK-NEXT: vmov.f32 s9, s5 -; CHECK-NEXT: vmov.f32 s11, s7 -; CHECK-NEXT: vstrh.16 q2, [r1, #48] -; CHECK-NEXT: vstrh.16 q5, [r1], #64 -; CHECK-NEXT: vmov.f32 s31, s15 -; CHECK-NEXT: vstrh.16 q7, [r1, #-48] +; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #160] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q0[5], r3 +; CHECK-NEXT: vmov.f32 s5, s13 +; CHECK-NEXT: vmov.f32 s6, s2 +; CHECK-NEXT: vstrh.16 q6, [r1, #32] +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vstrh.16 q1, [r1, #48] +; CHECK-NEXT: vstrh.16 q7, [r1], #64 +; CHECK-NEXT: vstrh.16 q4, [r1, #-48] ; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: .LBB0_3: @ %while.end -; CHECK-NEXT: add sp, #80 +; CHECK-NEXT: add sp, #192 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll @@ -46,28 +46,27 @@ define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) { ; CHECK-LABEL: vmulhs_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s12, s0 -; CHECK-NEXT: vmov.f32 s10, s5 -; CHECK-NEXT: vmov.f32 s14, s1 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov.f32 s16, s6 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmullb.s32 q5, q1, q4 -; CHECK-NEXT: smmul r0, r1, r0 -; CHECK-NEXT: vmov r1, s21 -; CHECK-NEXT: vmov q0[2], q0[0], r0, r1 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s16, s2 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmullb.s32 q3, q4, q2 +; CHECK-NEXT: vmov.f32 s2, s1 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: smmul r1, r2, r1 +; CHECK-NEXT: vmov q2[2], q2[0], r1, r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: smmul r0, r1, r0 -; CHECK-NEXT: vmov r1, s23 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vmov r1, s15 +; CHECK-NEXT: vmov q2[3], q2[1], r0, r1 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %s0s = sext <4 x i32> %s0 to <4 x i64> diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-loop.ll @@ -4,8 +4,8 @@ define arm_aapcs_vfpcc void @test32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32* nocapture %z, i32 %n) { ; CHECK-LABEL: test32: ; CHECK: @ %bb.0: @ 
%entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: cmp r3, #1 ; CHECK-NEXT: blt .LBB0_2 ; CHECK-NEXT: .LBB0_1: @ %vector.body @@ -16,34 +16,31 @@ ; CHECK-NEXT: vmullt.s32 q0, q2, q1 ; CHECK-NEXT: vmullb.s32 q3, q2, q1 ; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: vmov r12, s2 -; CHECK-NEXT: vmov r7, s1 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: lsrl r4, r5, #31 +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vmov r5, s1 ; CHECK-NEXT: lsrl r12, r5, #31 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: lsrl r4, r7, #31 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r12 -; CHECK-NEXT: vmov r12, s14 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r5 +; CHECK-NEXT: vmov r6, s12 +; CHECK-NEXT: vmov.32 q0[1], r5 ; CHECK-NEXT: vmov r5, s15 -; CHECK-NEXT: vmov r7, s13 -; CHECK-NEXT: lsrl r12, r5, #31 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: lsrl r4, r7, #31 -; CHECK-NEXT: vmov q1[2], q1[0], r4, r12 -; CHECK-NEXT: vmov q1[3], q1[1], r7, r5 -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s7, s1 -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s7, s10 -; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: vmov q0[2], q0[0], r12, r4 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: lsrl r4, r5, #31 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: lsrl r6, r5, #31 +; CHECK-NEXT: vmov.32 q1[1], r5 +; CHECK-NEXT: vmov.32 q2[0], r6 +; CHECK-NEXT: vmov q1[2], q1[0], r6, r4 +; CHECK-NEXT: vmov.f32 s7, s2 +; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vmov.f32 s10, s6 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: bne .LBB0_1 ; CHECK-NEXT: .LBB0_2: @ %for.cond.cleanup -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %0 = and i32 %n, 3 %cmp = icmp eq i32 %0, 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn.ll @@ -189,22 +189,22 @@ ; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 ; CHECK-NEXT: adr r1, .LCPI12_0 ; CHECK-NEXT: vldrw.u32 q2, [r1] -; CHECK-NEXT: vand q0, q0, q1 ; CHECK-NEXT: vbic q2, q2, q1 -; CHECK-NEXT: vorr q0, q0, q2 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: sbcs.w r1, r12, r1 +; CHECK-NEXT: vand q1, q0, q1 +; CHECK-NEXT: vorr q0, q1, q2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r12, r2 +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: mov.w r1, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r1, #1 ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: csetm r1, ne -; CHECK-NEXT: rsbs.w r3, r3, #-2147483648 -; CHECK-NEXT: sbcs.w r2, r12, r2 +; CHECK-NEXT: rsbs.w r2, r2, #-2147483648 +; CHECK-NEXT: sbcs.w r2, r12, r3 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll --- a/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vqshrn.ll @@ -211,22 +211,22 @@ ; CHECK-NEXT: vmov q0[3], q0[1], r0, r5 ; CHECK-NEXT: adr r0, .LCPI12_0 ; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vand q2, q2, q0 +; CHECK-NEXT: vmov 
r1, s11 ; CHECK-NEXT: vbic q1, q1, q0 -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: sbcs.w r0, r2, r0 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vorr q0, q2, q1 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: rsbs.w r0, r0, #-2147483648 +; CHECK-NEXT: sbcs.w r0, r2, r1 +; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: mov.w r0, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r0, #1 ; CHECK-NEXT: cmp r0, #0 ; CHECK-NEXT: csetm r0, ne -; CHECK-NEXT: rsbs.w r3, r3, #-2147483648 -; CHECK-NEXT: sbcs.w r1, r2, r1 +; CHECK-NEXT: rsbs.w r1, r1, #-2147483648 +; CHECK-NEXT: sbcs.w r1, r2, r3 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt.w lr, #1 ; CHECK-NEXT: cmp.w lr, #0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll @@ -6,20 +6,17 @@ define void @vst2_v2i32(<2 x i32> *%src, <4 x i32> *%dst) { ; CHECK-LABEL: vst2_v2i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrd r12, r3, [r0] -; CHECK-NEXT: ldrd r2, r0, [r0, #8] -; CHECK-NEXT: vmov q0[2], q0[0], r12, r3 -; CHECK-NEXT: vmov.f64 d2, d1 -; CHECK-NEXT: vmov q2[2], q2[0], r2, r0 -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s2, s8 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s7, s11 -; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: ldrd r2, r12, [r0] +; CHECK-NEXT: ldrd r3, r0, [r0, #8] +; CHECK-NEXT: vmov q0[2], q0[0], r2, r12 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r0 ; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vstrw.32 q2, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0 @@ -440,12 +437,12 @@ ; CHECK-NEXT: vmov.16 q2[1], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmov.16 q2[2], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strd r0, r2, [r1] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: strd r2, r0, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 @@ -460,33 +457,33 @@ define void @vst2_v4f16(<4 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vst2_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldm.w r0, {r2, r3, r12} -; CHECK-NEXT: vmov.32 q0[0], r12 -; CHECK-NEXT: ldr r0, [r0, #12] +; CHECK-NEXT: ldrd r2, r12, [r0] +; CHECK-NEXT: ldrd r3, r0, [r0, #8] ; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.32 q2[1], r3 -; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov.32 q1[0], r3 ; CHECK-NEXT: vmov r2, s8 ; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[0], r2 -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.32 q2[1], r12 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.16 q0[1], r3 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov r2, s9 +; 
CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov r0, s5 ; CHECK-NEXT: vmovx.f16 s8, s9 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.16 q0[5], r0 ; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -8,23 +8,27 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] +; CHECK-NEXT: ldm.w r0, {r2, r3, r12, lr} ; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 +; CHECK-NEXT: vmov q2[2], q2[0], r2, r12 ; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 -; CHECK-NEXT: vmov.32 q0[1], r0 -; CHECK-NEXT: vmov.f32 s8, s7 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.f64 d4, d2 +; CHECK-NEXT: vmov q2[3], q2[1], r3, lr +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.32 q1[1], r0 +; CHECK-NEXT: vmov.f32 s4, s11 +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: vmov q1[2], q1[0], r2, r12 ; CHECK-NEXT: vmov.f32 s9, s6 ; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vmov.f32 s11, s13 ; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: strd r2, r0, [r1, #16] +; CHECK-NEXT: strd r4, r0, [r1, #16] ; CHECK-NEXT: pop {r4, pc} entry: %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0 @@ -300,22 +304,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrh r2, [r0, #6] +; CHECK-NEXT: ldrh r4, [r0, #6] ; CHECK-NEXT: ldrh.w lr, [r0, #4] +; CHECK-NEXT: ldrh r3, [r0] +; CHECK-NEXT: vmov.16 q0[4], r4 ; CHECK-NEXT: ldrh.w r12, [r0, #8] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: ldrh r3, [r0, #2] -; CHECK-NEXT: vmov q1[2], q1[0], lr, r2 -; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: vmov.32 q1[0], lr +; CHECK-NEXT: ldrh r2, [r0, #2] +; CHECK-NEXT: vmov.32 q2[0], r3 ; CHECK-NEXT: ldrh r0, [r0, #10] +; CHECK-NEXT: vmov.f32 s9, s4 ; CHECK-NEXT: vmov.16 q0[5], r0 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vdup.32 q1, r12 -; CHECK-NEXT: vmov.f32 s3, s2 -; CHECK-NEXT: vmov.f32 s2, s6 -; CHECK-NEXT: vstrh.32 q0, [r1] +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vstrh.32 q2, [r1] ; CHECK-NEXT: str r0, [r1, #8] ; CHECK-NEXT: pop {r4, pc} entry: @@ -385,79 +390,100 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: vldrw.u32 q0, 
[r0, #32] ; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q4, [r0, #16] -; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vldrw.u32 q5, [r0, #16] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.16 q3[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[4] +; CHECK-NEXT: vmov.16 q3[1], r2 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov q0, q3 ; CHECK-NEXT: vmov.16 q0[6], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] ; CHECK-NEXT: vmov.16 q0[7], r2 ; CHECK-NEXT: vmov.u16 r2, q2[0] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.u16 r0, q4[0] -; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[1] -; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u16 r0, q5[0] +; CHECK-NEXT: vmov.16 q4[1], r0 +; CHECK-NEXT: vmov.u16 r0, q5[1] +; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vmov.f32 s17, s8 +; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q3[7], r0 -; CHECK-NEXT: vdup.32 q5, r2 -; CHECK-NEXT: vmov.f32 s13, s8 -; CHECK-NEXT: vmov.u16 r2, q5[2] -; CHECK-NEXT: vmov.u16 r0, q3[3] -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmov.u16 r0, q3[4] -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q5[5] -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q5[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[6] -; CHECK-NEXT: vmov.16 q5[3], r0 -; CHECK-NEXT: vmov.u16 r0, q4[7] -; CHECK-NEXT: vmov.f32 s13, s25 -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.f32 s14, s26 -; CHECK-NEXT: vmov.16 q5[7], r0 +; CHECK-NEXT: vmov.16 q7[6], r0 +; CHECK-NEXT: vmov.u16 r0, q5[2] +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vmov.16 q7[7], r0 ; CHECK-NEXT: vdup.32 q6, r2 -; CHECK-NEXT: vmov.f32 s1, s5 +; CHECK-NEXT: vmov.f32 s19, s31 ; CHECK-NEXT: vmov.u16 r2, q6[2] -; CHECK-NEXT: vmov.f32 s22, s7 -; CHECK-NEXT: vrev32.16 q4, q4 -; CHECK-NEXT: vmov.16 q7[2], r2 -; CHECK-NEXT: vmov.u16 r0, q5[3] -; CHECK-NEXT: vmov.u16 r2, q4[2] -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vmov.16 q7[3], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.u16 r0, q4[3] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, q4[4] +; CHECK-NEXT: vmov q7, q0 +; CHECK-NEXT: vmov.f32 s17, s1 ; CHECK-NEXT: vmov.16 q7[4], r0 ; CHECK-NEXT: vmov.u16 r0, q6[5] -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov.u16 r2, q0[4] ; CHECK-NEXT: vmov.16 q7[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov.f32 s21, s29 -; CHECK-NEXT: vmov.f32 s1, s13 -; CHECK-NEXT: vmov.f32 s22, s30 -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vstrw.32 q5, [r1, #32] -; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.u16 r0, q5[5] +; CHECK-NEXT: vmov.16 q6[0], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.16 q6[1], r0 +; CHECK-NEXT: vmov.u16 r0, q5[6] +; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vmov.f32 
s18, s30 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, q5[7] +; CHECK-NEXT: vmov q7, q0 +; CHECK-NEXT: vmov.f32 s25, s1 +; CHECK-NEXT: vmov.16 q7[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: vmov r2, s11 +; CHECK-NEXT: vmov.16 q7[7], r0 +; CHECK-NEXT: vmov.f32 s26, s7 +; CHECK-NEXT: vdup.32 q1, r2 +; CHECK-NEXT: vmov.f32 s27, s31 +; CHECK-NEXT: vmov.u16 r2, q1[2] +; CHECK-NEXT: vmov.u16 r0, q6[3] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u16 r0, q6[4] +; CHECK-NEXT: vmov q7, q0 +; CHECK-NEXT: vmov.f32 s25, s1 +; CHECK-NEXT: vmov.16 q7[4], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vrev32.16 q0, q5 +; CHECK-NEXT: vmov.16 q7[5], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [r1] +; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.f32 s26, s30 +; CHECK-NEXT: vmov.f32 s13, s1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s14, s10 +; CHECK-NEXT: vstrw.32 q6, [r1, #32] +; CHECK-NEXT: vmov.f32 s15, s3 +; CHECK-NEXT: vmov.u16 r2, q3[3] +; CHECK-NEXT: vmov.16 q4[3], r2 +; CHECK-NEXT: vmov.u16 r2, q3[4] +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s13, s17 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vstrw.32 q3, [r1, #16] +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -479,185 +505,218 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #160 -; CHECK-NEXT: sub sp, #160 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: .pad #240 +; CHECK-NEXT: sub sp, #240 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q7, [r0, #64] ; CHECK-NEXT: vmov.u16 r2, q1[0] -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov.u16 r2, q1[1] -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.16 q2[6], r2 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.u16 r2, q2[0] +; CHECK-NEXT: vmov.16 q4[1], r2 +; CHECK-NEXT: vmov.u16 r2, q2[1] +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s17, s12 +; CHECK-NEXT: vmov.16 q0[4], r2 ; CHECK-NEXT: vmov.u16 r2, q1[2] -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vstrw.32 q1, [sp, #208] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s18, s2 ; CHECK-NEXT: vldrw.u32 q0, [r0, #80] -; CHECK-NEXT: vmov.u16 r2, q2[3] -; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.u16 r2, q2[2] ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vstrw.32 q0, [sp, #224] @ 16-byte Spill ; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s19, s7 ; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[2], r3 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.u16 r2, q3[4] +; CHECK-NEXT: vmov.u16 r2, q4[3] +; CHECK-NEXT: vmov.16 q1[2], r3 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov.u16 
r2, q4[4] ; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vstrw.32 q1, [sp, #160] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[4], r2 ; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vmov.16 q2[5], r2 +; CHECK-NEXT: vmov.16 q1[5], r2 ; CHECK-NEXT: vmov.u16 r2, q3[5] -; CHECK-NEXT: vmov.16 q5[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.16 q5[1], r2 +; CHECK-NEXT: vmov.16 q6[0], r2 +; CHECK-NEXT: vmov.u16 r2, q7[5] +; CHECK-NEXT: vmov.16 q6[1], r2 ; CHECK-NEXT: vmov.u16 r2, q3[6] -; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[3], r2 ; CHECK-NEXT: vmov.u16 r2, q3[7] -; CHECK-NEXT: vmov.16 q5[6], r2 -; CHECK-NEXT: vmov.u16 r2, q0[7] -; CHECK-NEXT: vmov.16 q5[7], r2 -; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s22, s3 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s25, s1 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vstrw.32 q2, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.u16 r2, q5[3] +; CHECK-NEXT: vmov.16 q1[6], r2 +; CHECK-NEXT: vmov.u16 r2, q7[7] +; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov.f32 s26, s31 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill ; CHECK-NEXT: vdup.32 q0, r0 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s27, s7 ; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q4[2], r0 -; CHECK-NEXT: vmov.u16 r0, q5[4] -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.16 q1[2], r0 +; CHECK-NEXT: vmov.u16 r2, q6[3] +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov.u16 r2, q6[4] +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q4[5], r0 -; CHECK-NEXT: vmov.u16 r0, q2[0] -; CHECK-NEXT: vmov.16 q6[0], r0 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vstrw.32 q2, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.u16 r0, q4[0] +; CHECK-NEXT: vmov.16 q5[0], r0 ; CHECK-NEXT: vmov.u16 r0, q3[0] -; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q6[1], r0 +; CHECK-NEXT: vmov.16 q5[1], r0 ; CHECK-NEXT: vmov.u16 r0, q3[1] -; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov.u16 r0, q2[2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q6[6], r0 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov q1, q7 +; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.f32 s21, s16 +; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov.u16 r0, q3[2] +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.f32 s22, s2 +; CHECK-NEXT: vmov.16 q2[7], r0 ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.16 q6[7], r0 +; CHECK-NEXT: vmov.f32 s23, s11 ; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.f32 s25, s8 +; CHECK-NEXT: vmov.u16 r0, q5[3] ; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov.u16 r0, q6[3] -; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.u16 r2, q5[4] ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q6[4] -; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vldrw.u32 q4, [sp, #192] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vmov.u16 
r0, q0[5] +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: vldrw.u32 q7, [sp, #224] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q7[5] +; CHECK-NEXT: vmov.u16 r0, q4[5] ; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.u16 r0, q7[5] ; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q7[6] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q7[7] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vmov.f32 s2, s19 ; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov.u16 r0, q4[6] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.u16 r0, q4[7] +; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vmov.16 q4[6], r0 +; CHECK-NEXT: vmov.u16 r0, q7[7] +; CHECK-NEXT: vmov.f32 s2, s31 +; CHECK-NEXT: vldrw.u32 q7, [sp, #208] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q4[7], r0 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov r2, s31 +; CHECK-NEXT: vdup.32 q2, r2 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vmov.u16 r2, q2[2] ; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vdup.32 q7, r2 -; CHECK-NEXT: vrev32.16 q3, q3 -; CHECK-NEXT: vmov.u16 r2, q7[2] -; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[2], r2 +; CHECK-NEXT: vmov.16 q4[2], r2 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov.16 q4[3], r0 +; CHECK-NEXT: vmov.u16 r0, q2[5] +; CHECK-NEXT: vldrw.u32 q2, [sp, #192] @ 16-byte Reload ; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q7[5] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q4[2] +; CHECK-NEXT: vmov.16 q4[4], r2 +; CHECK-NEXT: vrev32.16 q2, q2 +; CHECK-NEXT: vmov.16 q4[5], r0 +; CHECK-NEXT: vstrw.32 q2, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: vrev32.16 q2, q3 +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #224] @ 16-byte Reload +; CHECK-NEXT: vmov.u16 r0, q2[2] ; CHECK-NEXT: vmov.16 q3[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.u16 r0, q7[3] ; CHECK-NEXT: vmov.16 q3[1], r0 -; CHECK-NEXT: vmov.u16 r0, q4[4] -; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q3[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vrev32.16 q7, q7 -; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.u16 r0, q7[5] +; CHECK-NEXT: vmov.16 q2[7], r0 ; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp, #176] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q7[0], r0 -; CHECK-NEXT: vmov.u16 r0, q4[3] -; CHECK-NEXT: vmov.f32 s1, s9 +; CHECK-NEXT: vmov.u16 r0, q1[3] ; CHECK-NEXT: vmov.16 q7[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov.f32 s25, s5 -; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.f32 
s26, s6 +; CHECK-NEXT: vmov.u16 r0, q2[4] +; CHECK-NEXT: vmov q2, q7 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov.f32 s1, s5 ; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [r1, #80] -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp, #128] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s21, s5 -; CHECK-NEXT: vmov.16 q7[7], r0 +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s25, s17 +; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s26, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp, #160] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [r1, #80] +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s5, s17 +; CHECK-NEXT: vldrw.u32 q4, [sp, #144] @ 16-byte Reload ; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.f32 s9, s5 +; CHECK-NEXT: vmov.f32 s6, s18 ; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.f32 s10, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q5, [r1, #32] -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s29, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s30, s18 -; CHECK-NEXT: vstrw.32 q6, [r1] +; CHECK-NEXT: vstrw.32 q1, [r1, #48] +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q6, [r1, #32] +; CHECK-NEXT: vstrw.32 q5, [r1] +; CHECK-NEXT: vmov.f32 s29, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #176] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s30, s6 +; CHECK-NEXT: vmov.f32 s31, s11 ; CHECK-NEXT: vmov.u16 r2, q7[3] -; CHECK-NEXT: vmov.f32 s13, s5 ; CHECK-NEXT: vmov.16 q0[3], r2 ; CHECK-NEXT: vmov.u16 r2, q7[4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov.f32 s29, s1 -; CHECK-NEXT: vmov.f32 s30, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s14, s6 -; CHECK-NEXT: vstrw.32 q7, [r1, #16] +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.f32 s30, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #224] @ 16-byte Reload ; CHECK-NEXT: vmov.u16 r2, q0[2] ; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.f32 s13, s5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #208] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vstrw.32 q7, [r1, #16] +; CHECK-NEXT: vmov.f32 s14, s6 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s15, s7 ; CHECK-NEXT: vmov.u16 r2, q3[3] ; CHECK-NEXT: vmov.16 q0[3], r2 ; CHECK-NEXT: vmov.u16 r2, q3[4] -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov.f32 s13, s1 -; CHECK-NEXT: vmov.f32 s14, s2 +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov.16 q1[5], r0 +; CHECK-NEXT: vmov.f32 s14, s6 ; CHECK-NEXT: vstrw.32 q3, [r1, #64] -; CHECK-NEXT: add sp, #160 +; CHECK-NEXT: add sp, #240 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -679,38 +738,36 @@ define void @vst3_v2i8(<2 x i8> *%src, <6 x i8> *%dst) { ; 
CHECK-LABEL: vst3_v2i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: ldrb.w lr, [r0, #3] -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: ldrb r5, [r0, #5] -; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: ldrb r3, [r0] +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: ldrb r4, [r0, #2] +; CHECK-NEXT: mov r7, sp +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrb r2, [r0, #1] +; CHECK-NEXT: ldrb r5, [r0, #3] +; CHECK-NEXT: vmov.16 q0[1], r4 +; CHECK-NEXT: ldrb r6, [r0, #5] ; CHECK-NEXT: ldrb r0, [r0, #4] -; CHECK-NEXT: vmov.16 q0[1], r12 -; CHECK-NEXT: mov r2, sp ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: add r0, sp, #8 -; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: vmov.16 q0[4], lr -; CHECK-NEXT: vmov.16 q0[5], r5 -; CHECK-NEXT: vmov.16 q0[6], r6 -; CHECK-NEXT: vmov.16 q0[7], r6 -; CHECK-NEXT: vstrb.16 q0, [r2] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov.16 q0[4], r5 +; CHECK-NEXT: vmov.16 q0[5], r6 +; CHECK-NEXT: vmov.16 q0[6], r12 +; CHECK-NEXT: vmov.16 q0[7], r12 +; CHECK-NEXT: vstrb.16 q0, [r7] ; CHECK-NEXT: vstrb.16 q0, [r0] ; CHECK-NEXT: vldrh.u32 q0, [r0] ; CHECK-NEXT: ldr r2, [sp] ; CHECK-NEXT: str r2, [r1] ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: strh r0, [r1, #4] -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: add sp, #20 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: %s1 = getelementptr <2 x i8>, <2 x i8>* %src, i32 0 %l1 = load <2 x i8>, <2 x i8>* %s1, align 4 @@ -779,32 +836,38 @@ define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) { ; CHECK-LABEL: vst3_v8i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vldrb.u16 q1, [r0, #8] ; CHECK-NEXT: vldrb.u16 q2, [r0, #16] -; CHECK-NEXT: vldrb.u16 q3, [r0] ; CHECK-NEXT: vmov.u16 r2, q1[5] ; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: vmov.u16 r2, q2[5] ; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov.u16 r2, q1[6] -; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: vmov.16 q3[3], r2 ; CHECK-NEXT: vmov.u16 r2, q1[7] -; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vldrb.u16 q3, [r0] +; CHECK-NEXT: vmov.16 q4[6], r2 ; CHECK-NEXT: vmov.u16 r2, q2[7] +; CHECK-NEXT: vmov.f32 s2, s11 +; CHECK-NEXT: vmov.16 q4[7], r2 ; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.16 q0[7], r2 +; CHECK-NEXT: vmov.f32 s3, s19 ; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov.f32 s2, s11 ; CHECK-NEXT: vmov.u16 r0, q4[2] ; CHECK-NEXT: vmov.u16 r2, q0[3] ; CHECK-NEXT: vmov.16 q5[2], r0 ; CHECK-NEXT: vmov.u16 r0, q0[4] ; CHECK-NEXT: vmov.16 q5[3], r2 -; CHECK-NEXT: vmov.16 q5[4], r0 +; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.f32 s1, s21 +; CHECK-NEXT: vmov.16 q6[4], r0 ; CHECK-NEXT: vmov.u16 r0, q4[5] -; CHECK-NEXT: vmov.16 q5[5], r0 +; CHECK-NEXT: vmov.16 q6[5], r0 ; CHECK-NEXT: vmov.u16 r0, q3[0] ; CHECK-NEXT: vmov.8 q4[0], r0 ; CHECK-NEXT: vmov.u16 r0, q1[0] @@ -834,14 +897,13 @@ ; CHECK-NEXT: vmov.u16 r0, q1[4] ; CHECK-NEXT: vmov.8 q4[13], r0 ; 
CHECK-NEXT: vmov.u16 r0, q2[4] -; CHECK-NEXT: vmov.f32 s1, s21 ; CHECK-NEXT: vmov.8 q4[14], r0 ; CHECK-NEXT: vmov.u16 r0, q3[5] -; CHECK-NEXT: vmov.f32 s2, s22 +; CHECK-NEXT: vmov.f32 s2, s26 ; CHECK-NEXT: vmov.8 q4[15], r0 ; CHECK-NEXT: vstrb.16 q0, [r1, #16] ; CHECK-NEXT: vstrw.32 q4, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0 @@ -860,15 +922,28 @@ define void @vst3_v16i8(<16 x i8> *%src, <48 x i8> *%dst) { ; CHECK-LABEL: vst3_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.u8 r2, q1[0] ; CHECK-NEXT: vmov.u8 r3, q3[0] -; CHECK-NEXT: vmov.u8 r0, q2[0] +; CHECK-NEXT: vmov.8 q4[2], r2 +; CHECK-NEXT: vmov.u8 r2, q1[2] +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.8 q0[8], r2 +; CHECK-NEXT: vmov.u8 r2, q1[3] +; CHECK-NEXT: vmov.8 q0[11], r2 +; CHECK-NEXT: vmov.u8 r2, q1[4] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vmov.8 q2[14], r2 ; CHECK-NEXT: vmov.8 q5[0], r3 -; CHECK-NEXT: vmov.u8 r2, q1[0] +; CHECK-NEXT: vmov.f32 s19, s11 +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov.u8 r2, q4[2] +; CHECK-NEXT: vmov.u8 r0, q2[0] ; CHECK-NEXT: vmov.8 q5[1], r0 ; CHECK-NEXT: vmov.u8 r0, q3[1] ; CHECK-NEXT: vmov.8 q5[3], r0 @@ -883,26 +958,17 @@ ; CHECK-NEXT: vmov.u8 r0, q2[3] ; CHECK-NEXT: vmov.8 q5[10], r0 ; CHECK-NEXT: vmov.u8 r0, q3[4] -; CHECK-NEXT: vmov.8 q4[2], r2 -; CHECK-NEXT: vmov.u8 r2, q1[2] ; CHECK-NEXT: vmov.8 q5[12], r0 ; CHECK-NEXT: vmov.u8 r0, q2[4] -; CHECK-NEXT: vmov.8 q4[8], r2 -; CHECK-NEXT: vmov.u8 r2, q1[3] ; CHECK-NEXT: vmov.8 q5[13], r0 ; CHECK-NEXT: vmov.u8 r0, q3[5] ; CHECK-NEXT: vmov.8 q5[15], r0 -; CHECK-NEXT: vmov.8 q4[11], r2 -; CHECK-NEXT: vmov.u8 r2, q1[4] ; CHECK-NEXT: vmov.u8 r0, q5[0] -; CHECK-NEXT: vmov.8 q4[14], r2 ; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.u8 r0, q5[1] ; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r2, q4[2] -; CHECK-NEXT: vmov.8 q0[2], r2 ; CHECK-NEXT: vmov.u8 r0, q5[3] +; CHECK-NEXT: vmov.8 q0[2], r2 ; CHECK-NEXT: vmov.8 q0[3], r0 ; CHECK-NEXT: vmov.u8 r0, q5[4] ; CHECK-NEXT: vmov.8 q0[4], r0 @@ -958,10 +1024,12 @@ ; CHECK-NEXT: vmov.u8 r0, q3[7] ; CHECK-NEXT: vmov.8 q6[5], r0 ; CHECK-NEXT: vmov.u8 r0, q3[8] -; CHECK-NEXT: vmov.8 q6[8], r0 -; CHECK-NEXT: vmov.u8 r0, q3[9] -; CHECK-NEXT: vmov.8 q6[11], r0 +; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vmov.f32 s24, s13 +; CHECK-NEXT: vmov.8 q7[8], r0 +; CHECK-NEXT: vmov.u8 r0, q3[9] +; CHECK-NEXT: vmov.8 q7[11], r0 +; CHECK-NEXT: vmov.f32 s26, s30 ; CHECK-NEXT: vmov.f32 s27, s14 ; CHECK-NEXT: vmov.u8 r0, q6[2] ; CHECK-NEXT: vmov.8 q4[2], r0 @@ -1057,7 +1125,7 @@ ; CHECK-NEXT: vmov.u8 r0, q5[15] ; CHECK-NEXT: vmov.8 q1[15], r0 ; CHECK-NEXT: vstrw.32 q1, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <16 x i8>, <16 x i8>* %src, i32 0 @@ -1426,19 +1494,20 @@ ; CHECK-NEXT: vmov.32 q2[0], r0 ; CHECK-NEXT: vmov.16 q0[1], r2 ; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r2, s0 ; 
CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov r0, s12 ; CHECK-NEXT: vmovx.f16 s4, s5 ; CHECK-NEXT: vmov.16 q0[3], r0 ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmovx.f16 s4, s8 +; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: vmov.16 q0[4], r0 ; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: stm r1!, {r0, r2, r3} +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: strd r2, r3, [r1] +; CHECK-NEXT: str r0, [r1, #8] ; CHECK-NEXT: bx lr entry: %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0 @@ -1457,50 +1526,59 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) { ; CHECK-LABEL: vst3_v4f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q0[2], q0[0], lr, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r12, r2 -; CHECK-NEXT: vmov.32 q1[0], r4 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.32 q1[1], r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: ldrd r3, r2, [r0] +; CHECK-NEXT: ldrd lr, r12, [r0, #8] +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov q3[2], q3[0], r3, lr +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov.16 q2[0], r5 +; CHECK-NEXT: vmov.16 q2[1], r4 +; CHECK-NEXT: ldr r4, [r0, #16] +; CHECK-NEXT: vmovx.f16 s16, s4 +; CHECK-NEXT: vmovx.f16 s12, s14 +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: ldr r0, [r0, #20] +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov.16 q2[2], r4 +; CHECK-NEXT: vmov r4, s16 +; CHECK-NEXT: vmov.16 q2[3], r4 +; CHECK-NEXT: vmov r4, s12 ; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s2 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov.16 q2[4], r4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov q3[2], q3[0], r3, lr +; CHECK-NEXT: vmov.32 q0[1], r0 +; CHECK-NEXT: vmov q3[3], q3[1], r2, r12 +; CHECK-NEXT: vmov.16 q2[5], r4 +; CHECK-NEXT: vmov r4, s5 +; CHECK-NEXT: vmov.16 q2[6], r4 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmovx.f16 s4, s5 ; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.16 q1[0], r0 +; CHECK-NEXT: vmovx.f16 s0, s1 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q1[1], r2 +; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s1 ; 
CHECK-NEXT: strd r2, r0, [r1, #16] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 %l1 = load <4 x half>, <4 x half>* %s1, align 4 @@ -1520,97 +1598,114 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q6, [r0, #32] +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmov.16 q1[0], r3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q1[1], r2 ; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vdup.32 q4, r0 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov r2, s17 +; CHECK-NEXT: vmov.16 q2[4], r3 +; CHECK-NEXT: vmov.f32 s5, s16 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov r12, s1 +; CHECK-NEXT: vmov.16 q3[6], r2 +; CHECK-NEXT: vmov.f32 s6, s10 +; CHECK-NEXT: vmov.16 q3[7], r12 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vmovx.f16 s8, s5 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vdup.32 q2, r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmovx.f16 s8, s10 ; CHECK-NEXT: vmov.16 q3[2], r0 -; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov r0, s6 ; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vmovx.f16 s16, s18 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmovx.f16 s8, s26 +; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q3[4], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s22 -; CHECK-NEXT: vmov.16 q3[5], r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vmovx.f16 s24, s7 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q4[1], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q4[3], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmovx.f16 s24, s23 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vdup.32 q7, r2 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov r2, s29 -; CHECK-NEXT: vmov.f32 s18, s23 -; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vmovx.f16 s28, s30 -; CHECK-NEXT: vmovx.f16 s4, s10 -; CHECK-NEXT: vmov.f32 s1, s13 -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmovx.f16 s28, s9 -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r2, s28 -; CHECK-NEXT: vmov.16 q7[0], r0 -; CHECK-NEXT: vmov.16 q7[1], r2 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.16 q7[6], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q7[7], r0 -; CHECK-NEXT: 
vmov.f32 s17, s25 -; CHECK-NEXT: vmov.f32 s29, s21 -; CHECK-NEXT: vmov.f32 s30, s10 -; CHECK-NEXT: vmovx.f16 s4, s29 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s26 -; CHECK-NEXT: vrev32.16 q2, q1 -; CHECK-NEXT: vstrw.32 q4, [r1, #32] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vmov.16 q3[5], r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vmov.16 q5[0], r2 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov.16 q5[1], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmovx.f16 s8, s27 +; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vmov.16 q2[3], r3 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmov.f32 s21, s9 +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov.f32 s22, s27 +; CHECK-NEXT: vmov.16 q3[7], r2 +; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vmov.f32 s23, s15 +; CHECK-NEXT: vmovx.f16 s8, s21 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vdup.32 q2, r2 ; CHECK-NEXT: vmov r2, s9 ; CHECK-NEXT: vmovx.f16 s8, s10 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov r0, s30 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.f32 s29, s5 -; CHECK-NEXT: vmov.f32 s30, s6 -; CHECK-NEXT: vstrw.32 q7, [r1, #16] -; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vmov.16 q7[2], r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q7[3], r0 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vstrw.32 q7, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q7[4], r0 +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmov.16 q7[5], r2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmovx.f16 s12, s18 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov.f32 s9, s25 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s10, s18 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vrev32.16 q3, q0 +; CHECK-NEXT: vmov r2, s13 +; CHECK-NEXT: vmovx.f16 s12, s14 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.f32 s5, s13 +; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q4[4], r0 +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q4[5], r2 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s21, s13 +; CHECK-NEXT: vmov.f32 s22, s30 +; CHECK-NEXT: vmov.f32 s10, s18 +; CHECK-NEXT: vstrw.32 q5, [r1, #32] +; CHECK-NEXT: vstrw.32 q2, [r1, #16] +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -1632,203 +1727,239 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #144 -; CHECK-NEXT: sub sp, #144 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 
q1[0], r3 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s12 -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: .pad #240 +; CHECK-NEXT: sub sp, #240 +; CHECK-NEXT: vldrw.u32 q6, [r0, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q7, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmovx.f16 s0, s26 +; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vmov.f32 s5, s12 -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov q4, q0 -; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q3[2], r3 -; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov.16 q3[3], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vmov.16 q3[5], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmovx.f16 s0, s27 ; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: vmov.16 q1[1], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vmov.16 q1[7], r2 -; CHECK-NEXT: vstrw.32 q3, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vldrw.u32 q5, [r0, #64] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s9, s1 +; CHECK-NEXT: vmov.16 q1[6], r12 +; CHECK-NEXT: vmov.f32 s10, s27 +; CHECK-NEXT: vmov.16 q1[7], r3 +; CHECK-NEXT: vmov r3, s31 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmovx.f16 s0, s9 ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q2[2], r3 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov.16 q7[0], r3 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vstrw.32 q1, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q2[5], r2 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #48] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vmov.16 q1[2], r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q1[3], r2 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vstrw.32 q1, [sp, #208] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[4], r2 +; CHECK-NEXT: vmov.16 q1[5], r3 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vstrw.32 q1, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vstrw.32 q2, [sp, 
#224] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q2[0], r2 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov.16 q2[1], r3 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmov.16 q7[4], r2 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmov.16 q7[6], r2 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: vmovx.f16 s0, s20 -; CHECK-NEXT: vmov.16 q7[7], r2 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmov r12, s13 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.16 q1[6], r12 +; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vmov.16 q1[7], r3 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vstrw.32 q2, [sp, #176] @ 16-byte Spill ; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vdup.32 q0, r0 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.f32 s29, s20 +; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vstrw.32 q1, [sp, #160] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmovx.f16 s0, s14 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q5 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q5[0], r0 ; CHECK-NEXT: vmovx.f16 s0, s7 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.16 q5[1], r2 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmovx.f16 s0, s15 -; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vmov r2, s19 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s21, s1 +; CHECK-NEXT: vmov.16 q1[6], r3 +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmov.f32 s23, s7 +; CHECK-NEXT: vmovx.f16 s0, s21 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.f32 s10, s15 +; CHECK-NEXT: vdup.32 q0, r2 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmovx.f16 s0, s12 +; CHECK-NEXT: vmov.16 q2[1], r2 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q0[4], r3 +; CHECK-NEXT: vmov r0, s29 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s9, s28 +; CHECK-NEXT: vmov r2, s13 +; CHECK-NEXT: vmov.16 q1[6], r0 +; CHECK-NEXT: vmov.16 q1[7], r2 +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s11, s7 ; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov r2, s24 +; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill ; 
CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vdup.32 q0, r2 ; CHECK-NEXT: vmov r2, s1 ; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vmov.16 q6[2], r2 -; CHECK-NEXT: vmov.16 q6[3], r0 +; CHECK-NEXT: vmov.16 q1[2], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q1[3], r0 ; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.16 q6[4], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q6[5], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmovx.f16 s0, s29 +; CHECK-NEXT: vmov.16 q1[5], r2 ; CHECK-NEXT: vmov.16 q4[0], r0 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vmov.16 q4[1], r2 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.16 q4[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: vmov.f32 s9, s25 -; CHECK-NEXT: vmov.f32 s17, s13 -; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmovx.f16 s4, s30 +; CHECK-NEXT: vmov.f32 s17, s25 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s30 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.f32 s19, s3 ; CHECK-NEXT: vmovx.f16 s0, s17 ; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vrev32.16 q0, q1 +; CHECK-NEXT: vrev32.16 q0, q3 ; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov.f32 s10, s26 +; CHECK-NEXT: vmovx.f16 s0, s2 ; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vldrw.u32 q6, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r1, #80] ; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov r2, s0 ; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmovx.f16 s0, s13 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov r0, s29 +; CHECK-NEXT: vmovx.f16 s0, s25 +; CHECK-NEXT: vmov.16 q1[5], r2 ; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vmovx.f16 s0, s14 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.16 q5[0], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q5[1], r2 -; CHECK-NEXT: vmov.16 q5[6], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q5[7], r0 -; CHECK-NEXT: vmov.f32 s21, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s22, s14 -; CHECK-NEXT: vmovx.f16 s0, s21 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vrev32.16 q3, q0 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmovx.f16 s12, s14 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s1, s29 +; CHECK-NEXT: vmovx.f16 s12, s26 +; CHECK-NEXT: vmov.f32 s2, s26 +; CHECK-NEXT: vldrw.u32 q6, [sp, #208] @ 16-byte Reload +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vldrw.u32 q7, [sp, #224] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[6], r0 ; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vldrw.u32 q3, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: 
vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.f32 s13, s25 -; CHECK-NEXT: vmov.f32 s14, s26 -; CHECK-NEXT: vldrw.u32 q6, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s25, s5 -; CHECK-NEXT: vstrw.32 q3, [r1] -; CHECK-NEXT: vmov.f32 s21, s1 -; CHECK-NEXT: vmov.f32 s26, s6 -; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s22, s2 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s29, s5 -; CHECK-NEXT: vstrw.32 q6, [r1, #32] +; CHECK-NEXT: vmov.f32 s29, s25 +; CHECK-NEXT: vldrw.u32 q6, [sp, #192] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[7], r0 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmov.f32 s30, s26 +; CHECK-NEXT: vldrw.u32 q6, [sp, #176] @ 16-byte Reload +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [sp, #224] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #160] @ 16-byte Reload +; CHECK-NEXT: vrev32.16 q1, q1 +; CHECK-NEXT: vmov.f32 s25, s29 +; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vmov.16 q3[2], r2 +; CHECK-NEXT: vmov.f32 s26, s30 +; CHECK-NEXT: vldrw.u32 q7, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vstrw.32 q6, [r1] +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: vmov.f32 s21, s29 +; CHECK-NEXT: vldrw.u32 q7, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vmov.16 q1[4], r0 +; CHECK-NEXT: vmov.f32 s1, s13 +; CHECK-NEXT: vmov.16 q1[5], r2 +; CHECK-NEXT: vmov.f32 s22, s30 +; CHECK-NEXT: vldrw.u32 q7, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s9, s29 +; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s17, s1 -; CHECK-NEXT: vstrw.32 q5, [r1, #16] -; CHECK-NEXT: vmov.f32 s30, s6 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s10, s30 +; CHECK-NEXT: vstrw.32 q5, [r1, #32] +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vldrw.u32 q2, [sp, #224] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s18, s2 -; CHECK-NEXT: vstrw.32 q7, [r1, #48] +; CHECK-NEXT: vstrw.32 q2, [r1, #80] ; CHECK-NEXT: vstrw.32 q4, [r1, #64] -; CHECK-NEXT: add sp, #144 +; CHECK-NEXT: add sp, #240 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -8,25 +8,29 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] +; CHECK-NEXT: ldrd r3, r2, [r0] +; CHECK-NEXT: ldrd r12, lr, [r0, #8] ; CHECK-NEXT: ldrd r4, r0, [r0, #16] +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov.f64 d6, d4 +; CHECK-NEXT: vmov.32 q2[1], r2 ; CHECK-NEXT: vmov q1[2], q1[0], r4, r0 -; CHECK-NEXT: vmov.f64 d0, d2 +; CHECK-NEXT: vmov.32 q0[0], r4 ; CHECK-NEXT: vmov.f32 s1, s6 -; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s2, s0 ; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vstrw.32 q2, [r1] -; CHECK-NEXT: vmov.f32 s8, s5 +; 
CHECK-NEXT: vmov.32 q1[1], r2 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov.f32 s8, s9 +; CHECK-NEXT: vmov.f32 s13, s6 +; CHECK-NEXT: vmov q1[2], q1[0], r3, r12 +; CHECK-NEXT: vmov q1[3], q1[1], r2, lr +; CHECK-NEXT: vmov.f32 s14, s0 ; CHECK-NEXT: vmov.f32 s9, s7 ; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s15, s2 ; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vstrw.32 q3, [r1] ; CHECK-NEXT: vstrw.32 q2, [r1, #16] ; CHECK-NEXT: pop {r4, pc} entry: @@ -207,22 +211,20 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrh r2, [r0, #2] -; CHECK-NEXT: ldrh.w r12, [r0, #4] -; CHECK-NEXT: ldrh r3, [r0, #8] -; CHECK-NEXT: ldrh.w lr, [r0, #6] -; CHECK-NEXT: ldrh r4, [r0, #10] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.16 q0[1], r12 -; CHECK-NEXT: vmov.16 q0[2], r3 -; CHECK-NEXT: vmov.16 q0[3], r3 -; CHECK-NEXT: vmov.16 q0[4], r2 -; CHECK-NEXT: vmov.16 q0[5], lr -; CHECK-NEXT: vmov.16 q0[6], r4 -; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: ldrh r2, [r0, #4] +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: ldrh.w lr, [r0, #2] +; CHECK-NEXT: ldrh.w r12, [r0, #6] +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: ldrh r3, [r0, #10] +; CHECK-NEXT: ldrh r0, [r0, #8] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.16 q0[4], lr +; CHECK-NEXT: vmov.16 q0[5], r12 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r3 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: pop {r4, pc} entry: @@ -372,18 +374,16 @@ ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: ldrb r2, [r0] -; CHECK-NEXT: ldrb r3, [r0, #1] -; CHECK-NEXT: ldrb.w r12, [r0, #2] -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 +; CHECK-NEXT: ldrb r3, [r0, #2] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrb.w r12, [r0, #1] ; CHECK-NEXT: ldrb.w lr, [r0, #3] -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.16 q0[1], r3 ; CHECK-NEXT: ldrb r4, [r0, #5] -; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: ldrb r0, [r0, #4] -; CHECK-NEXT: vmov.16 q0[1], r12 ; CHECK-NEXT: vmov.16 q0[2], r0 ; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.16 q0[4], r3 +; CHECK-NEXT: vmov.16 q0[4], r12 ; CHECK-NEXT: vmov.16 q0[5], lr ; CHECK-NEXT: vmov.16 q0[6], r4 ; CHECK-NEXT: vmov.16 q0[7], r4 @@ -909,55 +909,68 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: ldrd lr, r12, [r0] -; CHECK-NEXT: ldrd r3, r2, [r0, #8] -; CHECK-NEXT: ldrd r4, r0, [r0, #16] -; CHECK-NEXT: vmov q1[2], q1[0], lr, r3 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r2 -; CHECK-NEXT: vmov q0[2], q0[0], r4, r4 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r0 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: ldrd lr, r3, [r0] +; CHECK-NEXT: ldrd r12, r2, [r0, #8] +; CHECK-NEXT: vmov.32 q1[0], lr +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vmov q3[2], q3[0], lr, r12 +; CHECK-NEXT: vmov.32 q6[1], r3 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r2 +; CHECK-NEXT: vmov r4, s25 +; CHECK-NEXT: vmovx.f16 s24, s25 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmov.16 q2[0], r4 ; CHECK-NEXT: vmov.16 q2[1], r2 -; CHECK-NEXT: vmov r0, s1 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: vmovx.f16 s12, s5 -; CHECK-NEXT: vmov.16 q2[3], r0 -; 
CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s1 -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s12, s3 -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: ldrd r2, r0, [r0, #16] +; CHECK-NEXT: vmovx.f16 s12, s15 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r2 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov q5[3], q5[1], r0, r0 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov.16 q2[2], r4 +; CHECK-NEXT: vmov r4, s23 +; CHECK-NEXT: vmov.16 q2[3], r4 +; CHECK-NEXT: vmov r4, s24 +; CHECK-NEXT: vmov.16 q2[4], r4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmovx.f16 s12, s17 +; CHECK-NEXT: vmov.16 q2[5], r4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmovx.f16 s12, s23 +; CHECK-NEXT: vmov.16 q2[6], r4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov.16 q2[7], r4 +; CHECK-NEXT: vmov q3[2], q3[0], lr, r12 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: vmov.32 q4[1], r0 ; CHECK-NEXT: vstrw.32 q2, [r1, #16] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov.16 q2[0], r4 +; CHECK-NEXT: vmov.16 q2[1], r3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov q4[2], q4[0], r2, r2 +; CHECK-NEXT: vmov.16 q2[2], r3 +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmovx.f16 s4, s4 ; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmovx.f16 s4, s14 ; CHECK-NEXT: vmov.16 q2[4], r0 ; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmovx.f16 s0, s18 ; CHECK-NEXT: vmov.16 q2[6], r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.16 q2[7], r0 ; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r4, pc} entry: %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll @@ -50,48 +50,39 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(<4 x i32>* nocapture readonly %pSrc, i32 %blockSize, <4 x i32> %a) { ; CHECK-LABEL: foo_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} ; CHECK-NEXT: vpt.s32 lt, q0, zr ; CHECK-NEXT: vldrwt.u32 q4, [r0] -; CHECK-NEXT: vmov.f64 d0, d8 -; CHECK-NEXT: vmov.i64 q5, #0xffffffff -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vand q6, q0, q5 -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov r1, s25 +; CHECK-NEXT: vmov.f64 d10, d9 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: vmov.f32 s22, s19 +; CHECK-NEXT: vmov r0, s20 ; 
CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vmov r0, s22 ; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov r1, s27 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bl __aeabi_ul2d -; CHECK-NEXT: vmov.f64 d0, d9 -; CHECK-NEXT: vmov.f32 s2, s19 -; CHECK-NEXT: vand q0, q0, q5 +; CHECK-NEXT: vmov.f32 s18, s17 +; CHECK-NEXT: vmov r6, s16 +; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r7, s1 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: vmov d8, r4, r5 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: vmov d11, r0, r1 ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: movs r1, #0 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: vmov d10, r0, r1 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %active.lane.mask = icmp slt <4 x i32> %a, zeroinitializer %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %pSrc, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) diff --git a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll --- a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -49,14 +49,12 @@ define void @zero_test() { ; X86-LABEL: zero_test: ; X86: # %bb.0: # %entry -; X86-NEXT: xorps %xmm0, %xmm0 -; X86-NEXT: movlps %xmm0, (%eax) +; X86-NEXT: movl $0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: zero_test: ; X64: # %bb.0: # %entry -; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: movlps %xmm0, (%rax) +; X64-NEXT: movq $0, (%rax) ; X64-NEXT: retq entry: %0 = select <2 x i1> undef, <2 x float> undef, <2 x float> zeroinitializer diff --git a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll --- a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll +++ b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll @@ -29,8 +29,8 @@ ; X86-LABEL: store_64: ; X86: # %bb.0: # %BB ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorps %xmm0, %xmm0 -; X86-NEXT: movlps %xmm0, (%eax) +; X86-NEXT: movl $0, 4(%eax) +; X86-NEXT: movl $0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: store_64: diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -513,7 +513,6 @@ ; SSE2-SSSE3-NEXT: shll $16, %edx ; SSE2-SSSE3-NEXT: orl %eax, %edx ; SSE2-SSSE3-NEXT: shlq $32, %rdx -; SSE2-SSSE3-NEXT: orq %rcx, %rdx ; SSE2-SSSE3-NEXT: movq %rdx, %xmm0 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-SSSE3-NEXT: movd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -773,6 +773,7 @@ define <4 x i32> @ossfuzz5688(i32 %a0) { ; CHECK-LABEL: ossfuzz5688: ; CHECK: # %bb.0: +; CHECK-NEXT: movl $0, (%rax) ; CHECK-NEXT: retq %1 = insertelement <4 x i32> zeroinitializer, i32 -2147483648, i32 %a0 %2 = extractelement <4 x i32> %1, i32 %a0 diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll 
b/llvm/test/CodeGen/X86/fold-load-vec.ll --- a/llvm/test/CodeGen/X86/fold-load-vec.ll +++ b/llvm/test/CodeGen/X86/fold-load-vec.ll @@ -10,8 +10,8 @@ ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq $0, (%rsp) ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movlps %xmm0, (%rsp) ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movlps %xmm0, (%rsp) ; CHECK-NEXT: movlps %xmm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -2054,22 +2054,22 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,7,42,32] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [32768,4294934528,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm6 -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[0,2] -; SSE2-NEXT: pmuludq %xmm3, %xmm0 -; SSE2-NEXT: pmuludq %xmm5, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] -; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pmuludq %xmm0, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2] +; SSE2-NEXT: pmuludq %xmm3, %xmm2 +; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,4294967295] +; SSE2-NEXT: andps %xmm2, %xmm1 +; SSE2-NEXT: andps %xmm2, %xmm0 +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_negative2: diff --git a/llvm/test/CodeGen/X86/nontemporal-3.ll b/llvm/test/CodeGen/X86/nontemporal-3.ll --- a/llvm/test/CodeGen/X86/nontemporal-3.ll +++ b/llvm/test/CodeGen/X86/nontemporal-3.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=SSE,SSE4A -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=AVX512 @@ -195,33 +195,14 @@ } define void @test_zero_v8f32_align1(<8 x float>* %dst) nounwind { -; SSE2-LABEL: 
test_zero_v8f32_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v8f32_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorl %eax, %eax -; SSE4A-NEXT: movntiq %rax, 8(%rdi) -; SSE4A-NEXT: movntiq %rax, 24(%rdi) -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v8f32_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: test_zero_v8f32_align1: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8f32_align1: ; AVX: # %bb.0: @@ -245,32 +226,14 @@ } define void @test_zero_v4i64_align1(<4 x i64>* %dst) nounwind { -; SSE2-LABEL: test_zero_v4i64_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v4i64_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v4i64_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: test_zero_v4i64_align1: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v4i64_align1: ; AVX: # %bb.0: @@ -294,32 +257,14 @@ } define void @test_zero_v8i32_align1(<8 x i32>* %dst) nounwind { -; SSE2-LABEL: test_zero_v8i32_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v8i32_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v8i32_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: test_zero_v8i32_align1: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8i32_align1: ; AVX: # %bb.0: @@ -343,32 +288,14 @@ } define void @test_zero_v16i16_align1(<16 x i16>* %dst) nounwind { -; SSE2-LABEL: 
test_zero_v16i16_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v16i16_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v16i16_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: test_zero_v16i16_align1: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v16i16_align1: ; AVX: # %bb.0: @@ -392,32 +319,14 @@ } define void @test_zero_v32i8_align1(<32 x i8>* %dst) nounwind { -; SSE2-LABEL: test_zero_v32i8_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v32i8_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v32i8_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: test_zero_v32i8_align1: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v32i8_align1: ; AVX: # %bb.0: @@ -636,45 +545,18 @@ } define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind { -; SSE2-LABEL: test_zero_v16f32_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) -; SSE2-NEXT: movntiq %rax, 40(%rdi) -; SSE2-NEXT: movntiq %rax, 32(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v16f32_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorl %eax, %eax -; SSE4A-NEXT: movntiq %rax, 24(%rdi) -; SSE4A-NEXT: movntiq %rax, 8(%rdi) -; SSE4A-NEXT: movntiq %rax, 56(%rdi) -; SSE4A-NEXT: movntiq %rax, 40(%rdi) -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v16f32_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) -; SSE41-NEXT: movntiq %rax, 40(%rdi) -; SSE41-NEXT: 
movntiq %rax, 32(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: test_zero_v16f32_align1: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movntiq %rax, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rax, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rax, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v16f32_align1: ; AVX: # %bb.0: @@ -706,44 +588,18 @@ } define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind { -; SSE2-LABEL: test_zero_v8i64_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) -; SSE2-NEXT: movntiq %rax, 40(%rdi) -; SSE2-NEXT: movntiq %rax, 32(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v8i64_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v8i64_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) -; SSE41-NEXT: movntiq %rax, 40(%rdi) -; SSE41-NEXT: movntiq %rax, 32(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: test_zero_v8i64_align1: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movntiq %rax, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rax, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rax, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8i64_align1: ; AVX: # %bb.0: @@ -775,44 +631,18 @@ } define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind { -; SSE2-LABEL: test_zero_v16i32_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) -; SSE2-NEXT: movntiq %rax, 40(%rdi) -; SSE2-NEXT: movntiq %rax, 32(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v16i32_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v16i32_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) -; SSE41-NEXT: movntiq %rax, 40(%rdi) -; SSE41-NEXT: movntiq 
%rax, 32(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: test_zero_v16i32_align1: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movntiq %rax, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rax, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rax, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v16i32_align1: ; AVX: # %bb.0: @@ -844,44 +674,18 @@ } define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind { -; SSE2-LABEL: test_zero_v32i16_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) -; SSE2-NEXT: movntiq %rax, 40(%rdi) -; SSE2-NEXT: movntiq %rax, 32(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v32i16_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v32i16_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) -; SSE41-NEXT: movntiq %rax, 40(%rdi) -; SSE41-NEXT: movntiq %rax, 32(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: test_zero_v32i16_align1: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movntiq %rax, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rax, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rax, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v32i16_align1: ; AVX: # %bb.0: @@ -913,44 +717,18 @@ } define void @test_zero_v64i8_align1(<64 x i8>* %dst) nounwind { -; SSE2-LABEL: test_zero_v64i8_align1: -; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: movntiq %rax, 24(%rdi) -; SSE2-NEXT: movntiq %rax, 16(%rdi) -; SSE2-NEXT: movntiq %rax, 8(%rdi) -; SSE2-NEXT: movntiq %rax, (%rdi) -; SSE2-NEXT: movntiq %rax, 56(%rdi) -; SSE2-NEXT: movntiq %rax, 48(%rdi) -; SSE2-NEXT: movntiq %rax, 40(%rdi) -; SSE2-NEXT: movntiq %rax, 32(%rdi) -; SSE2-NEXT: retq -; -; SSE4A-LABEL: test_zero_v64i8_align1: -; SSE4A: # %bb.0: -; SSE4A-NEXT: xorps %xmm0, %xmm0 -; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movntsd %xmm0, (%rdi) -; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) -; SSE4A-NEXT: retq -; -; SSE41-LABEL: test_zero_v64i8_align1: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: movntiq %rax, 24(%rdi) -; SSE41-NEXT: movntiq %rax, 16(%rdi) -; SSE41-NEXT: movntiq %rax, 8(%rdi) -; SSE41-NEXT: movntiq %rax, (%rdi) -; SSE41-NEXT: movntiq %rax, 56(%rdi) -; SSE41-NEXT: movntiq %rax, 48(%rdi) -; SSE41-NEXT: movntiq %rax, 40(%rdi) -; SSE41-NEXT: movntiq %rax, 
32(%rdi) -; SSE41-NEXT: retq +; SSE-LABEL: test_zero_v64i8_align1: +; SSE: # %bb.0: +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movntiq %rax, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rax, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rax, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v64i8_align1: ; AVX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll --- a/llvm/test/CodeGen/X86/pr41619.ll +++ b/llvm/test/CodeGen/X86/pr41619.ll @@ -7,10 +7,9 @@ ; CHECK: ## %bb.0: ## %bb ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: vmovd %eax, %xmm0 -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: movl %eax, (%rax) -; CHECK-NEXT: vmovlps %xmm1, (%rax) +; CHECK-NEXT: movq $0, (%rax) ; CHECK-NEXT: retq bb: %tmp = bitcast double %arg to i64 diff --git a/llvm/test/CodeGen/X86/promote-cmp.ll b/llvm/test/CodeGen/X86/promote-cmp.ll --- a/llvm/test/CodeGen/X86/promote-cmp.ll +++ b/llvm/test/CodeGen/X86/promote-cmp.ll @@ -14,9 +14,10 @@ ; SSE2-NEXT: pxor %xmm4, %xmm6 ; SSE2-NEXT: movdqa %xmm6, %xmm7 ; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm5 +; SSE2-NEXT: pand %xmm8, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] ; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: movdqa %xmm2, %xmm5 @@ -31,12 +32,9 @@ ; SSE2-NEXT: por %xmm4, %xmm5 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] ; SSE2-NEXT: pxor {{.*}}(%rip), %xmm4 -; SSE2-NEXT: psllq $63, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: por %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,1,3] ; SSE2-NEXT: psllq $63, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 @@ -56,7 +54,6 @@ ; SSE4-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] ; SSE4-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE4-NEXT: pxor %xmm5, %xmm6 -; SSE4-NEXT: psllq $63, %xmm0 ; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE4-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero ; SSE4-NEXT: psllq $63, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_setcc.ll b/llvm/test/CodeGen/X86/vec_setcc.ll --- a/llvm/test/CodeGen/X86/vec_setcc.ll +++ b/llvm/test/CodeGen/X86/vec_setcc.ll @@ -206,11 +206,12 @@ ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl +; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_setcc_v3i1_v3i16: @@ -218,9 +219,10 @@ ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: pextrb $2, %xmm1, %edx -; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: pextrb $4, %xmm0, %edx +; SSE41-NEXT: pextrb $8, %xmm0, %ecx ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; 
SSE41-NEXT: # kill: def $dl killed $dl killed $edx ; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx @@ -231,9 +233,10 @@ ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: vpextrb $2, %xmm0, %edx -; AVX-NEXT: vpextrb $4, %xmm0, %ecx +; AVX-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpextrb $4, %xmm1, %edx +; AVX-NEXT: vpextrb $8, %xmm1, %ecx ; AVX-NEXT: # kill: def $al killed $al killed $eax ; AVX-NEXT: # kill: def $dl killed $dl killed $edx ; AVX-NEXT: # kill: def $cl killed $cl killed $ecx diff --git a/llvm/test/CodeGen/X86/vec_zero_cse.ll b/llvm/test/CodeGen/X86/vec_zero_cse.ll --- a/llvm/test/CodeGen/X86/vec_zero_cse.ll +++ b/llvm/test/CodeGen/X86/vec_zero_cse.ll @@ -15,8 +15,8 @@ ; X32: # %bb.0: ; X32-NEXT: movl $0, M1+4 ; X32-NEXT: movl $0, M1 -; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: movlps %xmm0, M2 +; X32-NEXT: movl $0, M2+4 +; X32-NEXT: movl $0, M2 ; X32-NEXT: retl ; ; X64-LABEL: test1: @@ -34,8 +34,8 @@ ; X32: # %bb.0: ; X32-NEXT: movl $-1, M1+4 ; X32-NEXT: movl $-1, M1 -; X32-NEXT: pcmpeqd %xmm0, %xmm0 -; X32-NEXT: movq %xmm0, M2 +; X32-NEXT: movl $-1, M2+4 +; X32-NEXT: movl $-1, M2 ; X32-NEXT: retl ; ; X64-LABEL: test2: diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -184,13 +184,14 @@ ; X86-SSE2-NEXT: psrlq $1, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm6, %xmm6 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: psllq %xmm2, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; X86-SSE2-NEXT: orpd %xmm1, %xmm0 @@ -1233,13 +1234,14 @@ ; X86-SSE2-NEXT: psrlq $1, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm6, %xmm6 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: psllq %xmm2, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; X86-SSE2-NEXT: orpd %xmm1, %xmm0 @@ -2494,17 +2496,15 @@ ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <4,u,14,u> ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pandn %xmm2, %xmm4 -; X86-SSE2-NEXT: psrlq $1, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: psrlq $1, %xmm5 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 +; X86-SSE2-NEXT: psrlq $50, %xmm1 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; 
X86-SSE2-NEXT: psllq %xmm3, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psllq %xmm3, %xmm0 +; X86-SSE2-NEXT: psllq $14, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; X86-SSE2-NEXT: orpd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl @@ -3024,10 +3024,8 @@ ; X86-SSE2-LABEL: splatconstant_funnnel_v2i64: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: psrlq $50, %xmm1 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1] ; X86-SSE2-NEXT: psllq $14, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1] -; X86-SSE2-NEXT: orpd %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> ) ret <2 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -152,17 +152,18 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: psllq %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 -; X86-SSE2-NEXT: psllq %xmm1, %xmm5 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; X86-SSE2-NEXT: pxor %xmm5, %xmm5 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: psllq %xmm1, %xmm6 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrlq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X86-SSE2-NEXT: psrlq %xmm3, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm5, %xmm0 +; X86-SSE2-NEXT: orpd %xmm6, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %amt) ret <2 x i64> %res @@ -839,17 +840,18 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: psllq %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 -; X86-SSE2-NEXT: psllq %xmm1, %xmm5 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; X86-SSE2-NEXT: pxor %xmm5, %xmm5 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: psllq %xmm1, %xmm6 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrlq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X86-SSE2-NEXT: psrlq %xmm3, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm5, %xmm0 +; X86-SSE2-NEXT: orpd %xmm6, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat) @@ -1472,17 +1474,15 @@ ; X86-SSE2-NEXT: pand %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: psllq %xmm2, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 -; X86-SSE2-NEXT: psllq %xmm2, %xmm5 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = 
xmm4[0],xmm5[1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psllq $14, %xmm2 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] ; X86-SSE2-NEXT: pand %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrlq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 +; X86-SSE2-NEXT: psrlq $50, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm5, %xmm0 +; X86-SSE2-NEXT: orpd %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> ) ret <2 x i64> %res @@ -1938,10 +1938,8 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psrlq $50, %xmm1 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1] ; X86-SSE2-NEXT: psllq $14, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1] -; X86-SSE2-NEXT: orpd %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> ) ret <2 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -184,14 +184,15 @@ ; X86-SSE2-NEXT: pand %xmm3, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm6, %xmm6 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 ; X86-SSE2-NEXT: psllq $1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: psllq %xmm2, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; X86-SSE2-NEXT: orpd %xmm1, %xmm0 @@ -1218,14 +1219,15 @@ ; X86-SSE2-NEXT: pand %xmm3, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm6, %xmm6 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; X86-SSE2-NEXT: pandn %xmm3, %xmm2 ; X86-SSE2-NEXT: psllq $1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: psllq %xmm2, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; X86-SSE2-NEXT: psllq %xmm2, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] ; X86-SSE2-NEXT: orpd %xmm1, %xmm0 @@ -2113,15 +2115,13 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 ; X86-SSE2-NEXT: psrlq %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; X86-SSE2-NEXT: psrlq %xmm4, %xmm1 +; X86-SSE2-NEXT: psrlq $14, %xmm1 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] ; X86-SSE2-NEXT: pandn %xmm2, %xmm3 -; X86-SSE2-NEXT: psllq $1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psllq $1, %xmm2 ; X86-SSE2-NEXT: psllq %xmm3, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psllq %xmm3, %xmm0 +; X86-SSE2-NEXT: psllq $50, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; X86-SSE2-NEXT: orpd %xmm1, %xmm0 ; 
X86-SSE2-NEXT: retl @@ -2654,10 +2654,8 @@ ; X86-SSE2-LABEL: splatconstant_funnnel_v2i64: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: psrlq $14, %xmm1 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1] ; X86-SSE2-NEXT: psllq $50, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1] -; X86-SSE2-NEXT: orpd %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> ) ret <2 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -154,17 +154,18 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: psrlq %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 -; X86-SSE2-NEXT: psrlq %xmm1, %xmm5 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; X86-SSE2-NEXT: pxor %xmm5, %xmm5 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: psrlq %xmm1, %xmm6 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psllq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psllq %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X86-SSE2-NEXT: psllq %xmm3, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm5, %xmm0 +; X86-SSE2-NEXT: orpd %xmm6, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %amt) ret <2 x i64> %res @@ -883,17 +884,18 @@ ; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: psrlq %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 -; X86-SSE2-NEXT: psrlq %xmm1, %xmm5 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; X86-SSE2-NEXT: pxor %xmm5, %xmm5 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: psrlq %xmm1, %xmm6 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psllq %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psllq %xmm2, %xmm0 +; X86-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X86-SSE2-NEXT: psllq %xmm3, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm5, %xmm0 +; X86-SSE2-NEXT: orpd %xmm6, %xmm0 ; X86-SSE2-NEXT: retl %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat) @@ -1558,17 +1560,15 @@ ; X86-SSE2-NEXT: pand %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: psrlq %xmm2, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 -; X86-SSE2-NEXT: psrlq %xmm2, %xmm5 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: psrlq $14, %xmm2 +; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] ; X86-SSE2-NEXT: pand %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psllq %xmm3, %xmm1 -; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; X86-SSE2-NEXT: psllq %xmm2, %xmm0 +; X86-SSE2-NEXT: psllq $50, %xmm0 ; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; X86-SSE2-NEXT: orpd %xmm5, %xmm0 +; X86-SSE2-NEXT: orpd %xmm2, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> ) ret <2 x i64> %res @@ -2024,10 +2024,8 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: psllq $50, %xmm1 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm1[0,1] ; X86-SSE2-NEXT: psrlq $14, %xmm0 -; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm0[0,1] -; X86-SSE2-NEXT: orpd %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> ) ret <2 x i64> %res diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3065,22 +3065,14 @@ ; SSE: # %bb.0: ; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; SSE-NEXT: movaps %xmm0, (%rax) -; SSE-NEXT: addss {{.*}}(%rip), %xmm0 -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: addss %xmm1, %xmm0 -; SSE-NEXT: movss %xmm0, (%rax) +; SSE-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000 ; SSE-NEXT: retq ; ; AVX-LABEL: PR43024: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; AVX-NEXT: vmovaps %xmm0, (%rax) -; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovss %xmm0, (%rax) +; AVX-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000 ; AVX-NEXT: retq store <4 x float> , <4 x float>* undef, align 16 %1 = load <4 x float>, <4 x float>* undef, align 16 diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -2313,11 +2313,9 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] -; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -569,8 +569,7 @@ ; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 @@ -583,23 +582,32 @@ ; SSE41-NEXT: movd %edi, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] ; SSE41-NEXT: por %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE41-NEXT: pinsrd $1, %edi, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; 
SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: simplify_select: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX-NEXT: vmovd %edi, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 -; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: simplify_select: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %edi, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: simplify_select: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: retq %a = insertelement <2 x i32> , i32 %x, i32 1 %b = insertelement <2 x i32> , i32 %x, i32 0 %y = or <2 x i32> %a, %b diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll --- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll +++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll @@ -105,8 +105,8 @@ ; X86-LABEL: shuf5: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movsd %xmm0, (%eax) +; X86-NEXT: movl $555819297, 4(%eax) # imm = 0x21212121 +; X86-NEXT: movl $555819297, (%eax) # imm = 0x21212121 ; X86-NEXT: retl ; ; X64-LABEL: shuf5: