diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll --- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll +++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll @@ -601,3 +601,41 @@ %3 = and i64 %2, 32640 ; #0x7f80 ret i64 %3 } + +; orr with a left-shifted operand is better than bfi/bfxil here, since it shortens the data +; dependency chain, and orr has lower latency and higher throughput than bfm on +; some AArch64 processors (on the rest, orr is at least as good as bfm). +; +; ubfx x8, x0, #8, #7 +; and x9, x0, #0x7f +; orr x0, x9, x8, lsl #7 +define i64 @test_orr_not_bfxil_i64(i64 %0) { +; CHECK-LABEL: test_orr_not_bfxil_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr x8, x0, #1 +; CHECK-NEXT: and x8, x8, #0x3f80 +; CHECK-NEXT: bfxil x8, x0, #0, #7 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret + %2 = and i64 %0, 127 + %3 = lshr i64 %0, 1 + %4 = and i64 %3, 16256 ; 0x3f80 + %5 = or i64 %4, %2 + ret i64 %5 +} + +; The 32-bit test for `test_orr_not_bfxil_i64`. +define i32 @test_orr_not_bfxil_i32(i32 %0) { +; CHECK-LABEL: test_orr_not_bfxil_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr w8, w0, #1 +; CHECK-NEXT: and w8, w8, #0x3f80 +; CHECK-NEXT: bfxil w8, w0, #0, #7 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + %2 = and i32 %0, 127 + %3 = lshr i32 %0, 1 + %4 = and i32 %3, 16256 ; 0x3f80 + %5 = or i32 %4, %2 + ret i32 %5 +} diff --git a/llvm/test/CodeGen/AArch64/build-pair-isel.ll b/llvm/test/CodeGen/AArch64/build-pair-isel.ll --- a/llvm/test/CodeGen/AArch64/build-pair-isel.ll +++ b/llvm/test/CodeGen/AArch64/build-pair-isel.ll @@ -1,10 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64 -o - -O0 %s | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios5.0.0" ; This test checks we don't fail isel due to unhandled build_pair nodes. 
-; CHECK: bfi define void @compare_and_swap128() { +; CHECK-LABEL: compare_and_swap128: +; CHECK: // %bb.0: +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: // implicit-def: $x9 +; CHECK-NEXT: mov w9, w10 +; CHECK-NEXT: mov w8, w8 +; CHECK-NEXT: // kill: def $x8 killed $w8 +; CHECK-NEXT: bfi x8, x9, #32, #32 +; CHECK-NEXT: // implicit-def: $x9 +; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: ret %1 = call i128 asm sideeffect "nop", "=r,~{memory}"() store i128 %1, i128* undef, align 16 ret void diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll --- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll @@ -236,65 +236,66 @@ } define void @trunc_v16i64_to_v16i8_in_loop(ptr %A, ptr %dst) { -; CHECK-LABEL: trunc_v16i64_to_v16i8_in_loop: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: LBB3_1: ; %loop -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x9, x0, x8, lsl #7 -; CHECK-NEXT: ldp q3, q2, [x9, #96] -; CHECK-NEXT: ldp q1, q0, [x9, #32] -; CHECK-NEXT: uzp1.4s v2, v3, v2 -; CHECK-NEXT: ldp q5, q4, [x9, #64] -; CHECK-NEXT: uzp1.4s v0, v1, v0 -; CHECK-NEXT: ldp q3, q6, [x9] -; CHECK-NEXT: uzp1.4s v4, v5, v4 -; CHECK-NEXT: uzp1.8h v2, v4, v2 -; CHECK-NEXT: uzp1.4s v1, v3, v6 -; CHECK-NEXT: uzp1.8h v0, v1, v0 -; CHECK-NEXT: uzp1.16b v0, v0, v2 -; CHECK-NEXT: str q0, [x1, x8, lsl #4] -; CHECK-NEXT: add x8, x8, #1 -; CHECK-NEXT: cmp x8, #1000 -; CHECK-NEXT: b.eq LBB3_1 -; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: ret - -; CHECK-BE-LABEL: trunc_v16i64_to_v16i8_in_loop: -; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: mov x8, xzr -; CHECK-BE-NEXT: .LBB3_1: // %loop -; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: add x9, x0, x8, lsl #7 -; CHECK-BE-NEXT: add x10, x9, #48 -; CHECK-BE-NEXT: add x11, x9, #32 -; CHECK-BE-NEXT: ld1 { v5.2d }, [x9] -; CHECK-BE-NEXT: ld1 { v0.2d }, [x10] -; CHECK-BE-NEXT: 
add x10, x9, #80 -; CHECK-BE-NEXT: ld1 { v1.2d }, [x11] -; CHECK-BE-NEXT: add x11, x9, #112 -; CHECK-BE-NEXT: ld1 { v2.2d }, [x10] -; CHECK-BE-NEXT: add x10, x9, #96 -; CHECK-BE-NEXT: ld1 { v3.2d }, [x11] -; CHECK-BE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; CHECK-BE-NEXT: ld1 { v4.2d }, [x10] -; CHECK-BE-NEXT: add x10, x9, #64 -; CHECK-BE-NEXT: add x9, x9, #16 -; CHECK-BE-NEXT: ld1 { v6.2d }, [x10] -; CHECK-BE-NEXT: ld1 { v7.2d }, [x9] -; CHECK-BE-NEXT: add x9, x1, x8, lsl #4 -; CHECK-BE-NEXT: uzp1 v3.4s, v4.4s, v3.4s -; CHECK-BE-NEXT: add x8, x8, #1 -; CHECK-BE-NEXT: cmp x8, #1000 -; CHECK-BE-NEXT: uzp1 v2.4s, v6.4s, v2.4s -; CHECK-BE-NEXT: uzp1 v1.4s, v5.4s, v7.4s -; CHECK-BE-NEXT: uzp1 v2.8h, v2.8h, v3.8h -; CHECK-BE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-BE-NEXT: st1 { v0.16b }, [x9] -; CHECK-BE-NEXT: b.eq .LBB3_1 -; CHECK-BE-NEXT: // %bb.2: // %exit -; CHECK-BE-NEXT: ret +; CHECK-LABEL: trunc_v16i64_to_v16i8_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: LBB3_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x9, x0, x8, lsl #7 +; CHECK-NEXT: ldp q3, q2, [x9, #96] +; CHECK-NEXT: ldp q1, q0, [x9, #32] +; CHECK-NEXT: uzp1.4s v2, v3, v2 +; CHECK-NEXT: ldp q5, q4, [x9, #64] +; CHECK-NEXT: uzp1.4s v0, v1, v0 +; CHECK-NEXT: ldp q3, q6, [x9] +; CHECK-NEXT: uzp1.4s v4, v5, v4 +; CHECK-NEXT: uzp1.8h v2, v4, v2 +; CHECK-NEXT: uzp1.4s v1, v3, v6 +; CHECK-NEXT: uzp1.8h v0, v1, v0 +; CHECK-NEXT: uzp1.16b v0, v0, v2 +; CHECK-NEXT: str q0, [x1, x8, lsl #4] +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: cmp x8, #1000 +; CHECK-NEXT: b.eq LBB3_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret +; +; CHECK-BE-LABEL: trunc_v16i64_to_v16i8_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB3_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8, lsl #7 +; CHECK-BE-NEXT: add x10, x9, #48 +; 
CHECK-BE-NEXT: add x11, x9, #32 +; CHECK-BE-NEXT: ld1 { v5.2d }, [x9] +; CHECK-BE-NEXT: ld1 { v0.2d }, [x10] +; CHECK-BE-NEXT: add x10, x9, #80 +; CHECK-BE-NEXT: ld1 { v1.2d }, [x11] +; CHECK-BE-NEXT: add x11, x9, #112 +; CHECK-BE-NEXT: ld1 { v2.2d }, [x10] +; CHECK-BE-NEXT: add x10, x9, #96 +; CHECK-BE-NEXT: ld1 { v3.2d }, [x11] +; CHECK-BE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; CHECK-BE-NEXT: ld1 { v4.2d }, [x10] +; CHECK-BE-NEXT: add x10, x9, #64 +; CHECK-BE-NEXT: add x9, x9, #16 +; CHECK-BE-NEXT: ld1 { v6.2d }, [x10] +; CHECK-BE-NEXT: ld1 { v7.2d }, [x9] +; CHECK-BE-NEXT: add x9, x1, x8, lsl #4 +; CHECK-BE-NEXT: uzp1 v3.4s, v4.4s, v3.4s +; CHECK-BE-NEXT: add x8, x8, #1 +; CHECK-BE-NEXT: cmp x8, #1000 +; CHECK-BE-NEXT: uzp1 v2.4s, v6.4s, v2.4s +; CHECK-BE-NEXT: uzp1 v1.4s, v5.4s, v7.4s +; CHECK-BE-NEXT: uzp1 v2.8h, v2.8h, v3.8h +; CHECK-BE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; CHECK-BE-NEXT: st1 { v0.16b }, [x9] +; CHECK-BE-NEXT: b.eq .LBB3_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + entry: br label %loop @@ -315,49 +316,50 @@ } define void @trunc_v8i64_to_v8i8_in_loop(ptr %A, ptr %dst) { -; CHECK-LABEL: trunc_v8i64_to_v8i8_in_loop: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: LBB4_1: ; %loop -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x9, x0, x8, lsl #6 -; CHECK-NEXT: ldp q1, q0, [x9, #32] -; CHECK-NEXT: ldp q3, q2, [x9] -; CHECK-NEXT: uzp1.4s v0, v1, v0 -; CHECK-NEXT: uzp1.4s v1, v3, v2 -; CHECK-NEXT: uzp1.8h v0, v1, v0 -; CHECK-NEXT: xtn.8b v0, v0 -; CHECK-NEXT: str d0, [x1, x8, lsl #3] -; CHECK-NEXT: add x8, x8, #1 -; CHECK-NEXT: cmp x8, #1000 -; CHECK-NEXT: b.eq LBB4_1 -; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: ret - -; CHECK-BE-LABEL: trunc_v8i64_to_v8i8_in_loop: -; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: mov x8, xzr -; CHECK-BE-NEXT: .LBB4_1: // %loop -; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: add x9, x0, x8, 
lsl #6 -; CHECK-BE-NEXT: add x10, x9, #48 -; CHECK-BE-NEXT: ld1 { v1.2d }, [x9] -; CHECK-BE-NEXT: ld1 { v0.2d }, [x10] -; CHECK-BE-NEXT: add x10, x9, #32 -; CHECK-BE-NEXT: add x9, x9, #16 -; CHECK-BE-NEXT: ld1 { v2.2d }, [x10] -; CHECK-BE-NEXT: ld1 { v3.2d }, [x9] -; CHECK-BE-NEXT: add x9, x1, x8, lsl #3 -; CHECK-BE-NEXT: add x8, x8, #1 -; CHECK-BE-NEXT: cmp x8, #1000 -; CHECK-BE-NEXT: uzp1 v0.4s, v2.4s, v0.4s -; CHECK-BE-NEXT: uzp1 v1.4s, v1.4s, v3.4s -; CHECK-BE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; CHECK-BE-NEXT: xtn v0.8b, v0.8h -; CHECK-BE-NEXT: st1 { v0.8b }, [x9] -; CHECK-BE-NEXT: b.eq .LBB4_1 -; CHECK-BE-NEXT: // %bb.2: // %exit -; CHECK-BE-NEXT: ret +; CHECK-LABEL: trunc_v8i64_to_v8i8_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: LBB4_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x9, x0, x8, lsl #6 +; CHECK-NEXT: ldp q1, q0, [x9, #32] +; CHECK-NEXT: ldp q3, q2, [x9] +; CHECK-NEXT: uzp1.4s v0, v1, v0 +; CHECK-NEXT: uzp1.4s v1, v3, v2 +; CHECK-NEXT: uzp1.8h v0, v1, v0 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: str d0, [x1, x8, lsl #3] +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: cmp x8, #1000 +; CHECK-NEXT: b.eq LBB4_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret +; +; CHECK-BE-LABEL: trunc_v8i64_to_v8i8_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB4_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, x8, lsl #6 +; CHECK-BE-NEXT: add x10, x9, #48 +; CHECK-BE-NEXT: ld1 { v1.2d }, [x9] +; CHECK-BE-NEXT: ld1 { v0.2d }, [x10] +; CHECK-BE-NEXT: add x10, x9, #32 +; CHECK-BE-NEXT: add x9, x9, #16 +; CHECK-BE-NEXT: ld1 { v2.2d }, [x10] +; CHECK-BE-NEXT: ld1 { v3.2d }, [x9] +; CHECK-BE-NEXT: add x9, x1, x8, lsl #3 +; CHECK-BE-NEXT: add x8, x8, #1 +; CHECK-BE-NEXT: cmp x8, #1000 +; CHECK-BE-NEXT: uzp1 v0.4s, v2.4s, v0.4s +; CHECK-BE-NEXT: uzp1 v1.4s, v1.4s, v3.4s +; CHECK-BE-NEXT: uzp1 v0.8h, v1.8h, v0.8h 
+; CHECK-BE-NEXT: xtn v0.8b, v0.8h +; CHECK-BE-NEXT: st1 { v0.8b }, [x9] +; CHECK-BE-NEXT: b.eq .LBB4_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + entry: br label %loop @@ -378,83 +380,84 @@ } define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) { -; CHECK-LABEL: trunc_v8i19_to_v8i8_in_loop: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: LBB5_1: ; %loop -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp x10, x9, [x0] -; CHECK-NEXT: ldrb w11, [x0, #18] -; CHECK-NEXT: ldrh w13, [x0, #16] -; CHECK-NEXT: add x0, x0, #32 -; CHECK-NEXT: lsr x14, x10, #19 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: ubfx x12, x9, #12, #20 -; CHECK-NEXT: lsr x15, x9, #31 -; CHECK-NEXT: bfi w13, w11, #16, #8 -; CHECK-NEXT: lsr x11, x9, #50 -; CHECK-NEXT: mov.s v0[1], w14 -; CHECK-NEXT: fmov s1, w12 -; CHECK-NEXT: lsr x12, x10, #38 -; CHECK-NEXT: bfi w11, w13, #14, #18 -; CHECK-NEXT: lsr x10, x10, #57 -; CHECK-NEXT: bfi w10, w9, #7, #25 -; CHECK-NEXT: lsr w9, w13, #5 -; CHECK-NEXT: mov.s v1[1], w15 -; CHECK-NEXT: mov.s v0[2], w12 -; CHECK-NEXT: mov.s v1[2], w11 -; CHECK-NEXT: mov.s v0[3], w10 -; CHECK-NEXT: mov.s v1[3], w9 -; CHECK-NEXT: uzp1.8h v0, v0, v1 -; CHECK-NEXT: xtn.8b v0, v0 -; CHECK-NEXT: str d0, [x1, x8, lsl #3] -; CHECK-NEXT: add x8, x8, #1 -; CHECK-NEXT: cmp x8, #1000 -; CHECK-NEXT: b.eq LBB5_1 -; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: ret - -; CHECK-BE-LABEL: trunc_v8i19_to_v8i8_in_loop: -; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: mov x8, xzr -; CHECK-BE-NEXT: .LBB5_1: // %loop -; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: ldp x10, x9, [x0] -; CHECK-BE-NEXT: ldrh w15, [x0, #16] -; CHECK-BE-NEXT: lsr x12, x10, #40 -; CHECK-BE-NEXT: lsr x13, x10, #45 -; CHECK-BE-NEXT: lsr x11, x9, #40 -; CHECK-BE-NEXT: ubfx x14, x9, #33, #7 -; CHECK-BE-NEXT: ubfx x16, x10, #26, #14 -; CHECK-BE-NEXT: bfi w16, w12, #14, #18 -; CHECK-BE-NEXT: ubfx x12, x9, #14, #18 -; CHECK-BE-NEXT: bfi w14, w11, 
#7, #24 -; CHECK-BE-NEXT: ldrb w11, [x0, #18] -; CHECK-BE-NEXT: fmov s0, w13 -; CHECK-BE-NEXT: add x0, x0, #32 -; CHECK-BE-NEXT: fmov s1, w14 -; CHECK-BE-NEXT: bfi w11, w15, #8, #16 -; CHECK-BE-NEXT: mov v0.s[1], w16 -; CHECK-BE-NEXT: mov v1.s[1], w12 -; CHECK-BE-NEXT: extr x12, x10, x9, #40 -; CHECK-BE-NEXT: lsl x9, x9, #24 -; CHECK-BE-NEXT: ubfx x10, x10, #7, #25 -; CHECK-BE-NEXT: orr w9, w11, w9 -; CHECK-BE-NEXT: lsr w9, w9, #19 -; CHECK-BE-NEXT: mov v0.s[2], w10 -; CHECK-BE-NEXT: ubfx x10, x12, #12, #20 -; CHECK-BE-NEXT: mov v1.s[2], w9 -; CHECK-BE-NEXT: add x9, x1, x8, lsl #3 -; CHECK-BE-NEXT: add x8, x8, #1 -; CHECK-BE-NEXT: mov v0.s[3], w10 -; CHECK-BE-NEXT: cmp x8, #1000 -; CHECK-BE-NEXT: mov v1.s[3], w11 -; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-BE-NEXT: xtn v0.8b, v0.8h -; CHECK-BE-NEXT: st1 { v0.8b }, [x9] -; CHECK-BE-NEXT: b.eq .LBB5_1 -; CHECK-BE-NEXT: // %bb.2: // %exit -; CHECK-BE-NEXT: ret +; CHECK-LABEL: trunc_v8i19_to_v8i8_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: LBB5_1: ; %loop +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldp x10, x9, [x0] +; CHECK-NEXT: ldrb w11, [x0, #18] +; CHECK-NEXT: ldrh w13, [x0, #16] +; CHECK-NEXT: add x0, x0, #32 +; CHECK-NEXT: lsr x14, x10, #19 +; CHECK-NEXT: fmov s0, w10 +; CHECK-NEXT: ubfx x12, x9, #12, #20 +; CHECK-NEXT: lsr x15, x9, #31 +; CHECK-NEXT: bfi w13, w11, #16, #8 +; CHECK-NEXT: lsr x11, x9, #50 +; CHECK-NEXT: mov.s v0[1], w14 +; CHECK-NEXT: fmov s1, w12 +; CHECK-NEXT: lsr x12, x10, #38 +; CHECK-NEXT: bfi w11, w13, #14, #18 +; CHECK-NEXT: lsr x10, x10, #57 +; CHECK-NEXT: bfi w10, w9, #7, #25 +; CHECK-NEXT: lsr w9, w13, #5 +; CHECK-NEXT: mov.s v1[1], w15 +; CHECK-NEXT: mov.s v0[2], w12 +; CHECK-NEXT: mov.s v1[2], w11 +; CHECK-NEXT: mov.s v0[3], w10 +; CHECK-NEXT: mov.s v1[3], w9 +; CHECK-NEXT: uzp1.8h v0, v0, v1 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: str d0, [x1, x8, lsl #3] +; CHECK-NEXT: add x8, x8, #1 +; CHECK-NEXT: cmp x8, #1000 +; 
CHECK-NEXT: b.eq LBB5_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret +; +; CHECK-BE-LABEL: trunc_v8i19_to_v8i8_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: .LBB5_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: ldp x10, x9, [x0] +; CHECK-BE-NEXT: ldrh w15, [x0, #16] +; CHECK-BE-NEXT: lsr x12, x10, #40 +; CHECK-BE-NEXT: lsr x13, x10, #45 +; CHECK-BE-NEXT: lsr x11, x9, #40 +; CHECK-BE-NEXT: ubfx x14, x9, #33, #7 +; CHECK-BE-NEXT: ubfx x16, x10, #26, #14 +; CHECK-BE-NEXT: bfi w16, w12, #14, #18 +; CHECK-BE-NEXT: ubfx x12, x9, #14, #18 +; CHECK-BE-NEXT: bfi w14, w11, #7, #24 +; CHECK-BE-NEXT: ldrb w11, [x0, #18] +; CHECK-BE-NEXT: fmov s0, w13 +; CHECK-BE-NEXT: add x0, x0, #32 +; CHECK-BE-NEXT: fmov s1, w14 +; CHECK-BE-NEXT: bfi w11, w15, #8, #16 +; CHECK-BE-NEXT: mov v0.s[1], w16 +; CHECK-BE-NEXT: mov v1.s[1], w12 +; CHECK-BE-NEXT: extr x12, x10, x9, #40 +; CHECK-BE-NEXT: lsl x9, x9, #24 +; CHECK-BE-NEXT: ubfx x10, x10, #7, #25 +; CHECK-BE-NEXT: orr w9, w11, w9 +; CHECK-BE-NEXT: lsr w9, w9, #19 +; CHECK-BE-NEXT: mov v0.s[2], w10 +; CHECK-BE-NEXT: ubfx x10, x12, #12, #20 +; CHECK-BE-NEXT: mov v1.s[2], w9 +; CHECK-BE-NEXT: add x9, x1, x8, lsl #3 +; CHECK-BE-NEXT: add x8, x8, #1 +; CHECK-BE-NEXT: mov v0.s[3], w10 +; CHECK-BE-NEXT: cmp x8, #1000 +; CHECK-BE-NEXT: mov v1.s[3], w11 +; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-BE-NEXT: xtn v0.8b, v0.8h +; CHECK-BE-NEXT: st1 { v0.8b }, [x9] +; CHECK-BE-NEXT: b.eq .LBB5_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + entry: br label %loop @@ -475,67 +478,68 @@ } define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) { -; CHECK-LABEL: trunc_v11i64_to_v11i8_in_loop: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mov w8, #1000 -; CHECK-NEXT: LBB6_1: ; %loop -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldp q1, q0, [x0, #32] -; CHECK-NEXT: add x9, x1, #8 -; CHECK-NEXT: add x10, x1, #10 -; CHECK-NEXT: 
subs x8, x8, #1 -; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: uzp1.4s v0, v1, v0 -; CHECK-NEXT: ldr d4, [x0, #80] -; CHECK-NEXT: ldr q1, [x0, #64] -; CHECK-NEXT: add x0, x0, #128 -; CHECK-NEXT: uzp1.4s v2, v3, v2 -; CHECK-NEXT: uzp1.4s v1, v1, v4 -; CHECK-NEXT: uzp1.8h v0, v2, v0 -; CHECK-NEXT: xtn.4h v1, v1 -; CHECK-NEXT: uzp1.16b v0, v0, v1 -; CHECK-NEXT: xtn.8b v1, v1 -; CHECK-NEXT: st1.b { v1 }[2], [x10] -; CHECK-NEXT: str d0, [x1], #16 -; CHECK-NEXT: st1.h { v0 }[4], [x9] -; CHECK-NEXT: b.eq LBB6_1 -; CHECK-NEXT: ; %bb.2: ; %exit -; CHECK-NEXT: ret - -; CHECK-BE-LABEL: trunc_v11i64_to_v11i8_in_loop: -; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: mov w8, #1000 -; CHECK-BE-NEXT:.LBB6_1: // %loop -; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 -; CHECK-BE-NEXT: ld1 { v0.2d }, [x0] -; CHECK-BE-NEXT: subs x8, x8, #1 -; CHECK-BE-NEXT: ld1 { v1.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #16 -; CHECK-BE-NEXT: ld1 { v2.2d }, [x10] -; CHECK-BE-NEXT: add x10, x0, #64 -; CHECK-BE-NEXT: ld1 { v3.2d }, [x9] -; CHECK-BE-NEXT: add x9, x1, #10 -; CHECK-BE-NEXT: ld1 { v4.2d }, [x10] -; CHECK-BE-NEXT: add x10, x1, #8 -; CHECK-BE-NEXT: uzp1 v1.4s, v2.4s, v1.4s -; CHECK-BE-NEXT: ldr d2, [x0, #80] -; CHECK-BE-NEXT: add x0, x0, #128 -; CHECK-BE-NEXT: uzp1 v0.4s, v0.4s, v3.4s -; CHECK-BE-NEXT: uzp1 v2.4s, v4.4s, v2.4s -; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-BE-NEXT: xtn v1.4h, v2.4s -; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v1.16b -; CHECK-BE-NEXT: xtn v1.8b, v1.8h -; CHECK-BE-NEXT: st1 { v1.b }[2], [x9] -; CHECK-BE-NEXT: rev64 v2.16b, v0.16b -; CHECK-BE-NEXT: rev16 v0.16b, v0.16b -; CHECK-BE-NEXT: str d2, [x1], #16 -; CHECK-BE-NEXT: st1 { v0.h }[4], [x10] -; CHECK-BE-NEXT: b.eq .LBB6_1 -; CHECK-BE-NEXT:// %bb.2: // %exit -; CHECK-BE-NEXT: ret +; CHECK-LABEL: trunc_v11i64_to_v11i8_in_loop: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: mov w8, #1000 +; CHECK-NEXT: LBB6_1: ; %loop +; CHECK-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: add x9, x1, #8 +; CHECK-NEXT: add x10, x1, #10 +; CHECK-NEXT: subs x8, x8, #1 +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: uzp1.4s v0, v1, v0 +; CHECK-NEXT: ldr d4, [x0, #80] +; CHECK-NEXT: ldr q1, [x0, #64] +; CHECK-NEXT: add x0, x0, #128 +; CHECK-NEXT: uzp1.4s v2, v3, v2 +; CHECK-NEXT: uzp1.4s v1, v1, v4 +; CHECK-NEXT: uzp1.8h v0, v2, v0 +; CHECK-NEXT: xtn.4h v1, v1 +; CHECK-NEXT: uzp1.16b v0, v0, v1 +; CHECK-NEXT: xtn.8b v1, v1 +; CHECK-NEXT: st1.b { v1 }[2], [x10] +; CHECK-NEXT: str d0, [x1], #16 +; CHECK-NEXT: st1.h { v0 }[4], [x9] +; CHECK-NEXT: b.eq LBB6_1 +; CHECK-NEXT: ; %bb.2: ; %exit +; CHECK-NEXT: ret +; +; CHECK-BE-LABEL: trunc_v11i64_to_v11i8_in_loop: +; CHECK-BE: // %bb.0: // %entry +; CHECK-BE-NEXT: mov w8, #1000 +; CHECK-BE-NEXT: .LBB6_1: // %loop +; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-BE-NEXT: add x9, x0, #48 +; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: ld1 { v0.2d }, [x0] +; CHECK-BE-NEXT: subs x8, x8, #1 +; CHECK-BE-NEXT: ld1 { v1.2d }, [x9] +; CHECK-BE-NEXT: add x9, x0, #16 +; CHECK-BE-NEXT: ld1 { v2.2d }, [x10] +; CHECK-BE-NEXT: add x10, x0, #64 +; CHECK-BE-NEXT: ld1 { v3.2d }, [x9] +; CHECK-BE-NEXT: add x9, x1, #10 +; CHECK-BE-NEXT: ld1 { v4.2d }, [x10] +; CHECK-BE-NEXT: add x10, x1, #8 +; CHECK-BE-NEXT: uzp1 v1.4s, v2.4s, v1.4s +; CHECK-BE-NEXT: ldr d2, [x0, #80] +; CHECK-BE-NEXT: add x0, x0, #128 +; CHECK-BE-NEXT: uzp1 v0.4s, v0.4s, v3.4s +; CHECK-BE-NEXT: uzp1 v2.4s, v4.4s, v2.4s +; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-BE-NEXT: xtn v1.4h, v2.4s +; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-BE-NEXT: xtn v1.8b, v1.8h +; CHECK-BE-NEXT: st1 { v1.b }[2], [x9] +; CHECK-BE-NEXT: rev64 v2.16b, v0.16b +; CHECK-BE-NEXT: rev16 v0.16b, v0.16b +; CHECK-BE-NEXT: str d2, [x1], #16 +; CHECK-BE-NEXT: st1 { v0.h }[4], [x10] +; CHECK-BE-NEXT: b.eq .LBB6_1 +; CHECK-BE-NEXT: // %bb.2: // %exit +; CHECK-BE-NEXT: ret + 
entry: br label %loop