diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
--- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -791,3 +791,678 @@
 exit:
   ret void
 }
+
+; zext <8 x i8> -> <8 x i64> in a loop; the checks expect the widening to be done with a chain of ushll/ushll2 (no tbl).
+define void @zext_v8i8_to_v8i64_in_loop(i8* %src, i64* %dst) {
+; CHECK-LABEL: _zext_v8i8_to_v8i64_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB8_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll2.2d v2, v1, #0
+; CHECK-NEXT: ushll.2d v1, v1, #0
+; CHECK-NEXT: ushll2.2d v3, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: stp q1, q2, [x1, #32]
+; CHECK-NEXT: stp q0, q3, [x1], #128
+; CHECK-NEXT: b.ne LBB8_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i64_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB8_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #32
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #48
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll2 v2.2d, v1.4s, #0
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: st1 { v2.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: ushll v2.2d, v0.2s, #0
+; CHECK-BE-NEXT: st1 { v1.2d }, [x10]
+; CHECK-BE-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-BE-NEXT: st1 { v2.2d }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #128
+; CHECK-BE-NEXT: st1 { v0.2d }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB8_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %src.gep = getelementptr i8, i8* %src, i64 %iv
+  %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+  %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+  %ext = zext <8 x i8> %load to <8 x i64>
+  %dst.gep = getelementptr i64, i64* %dst, i64 %iv
+  %dst.gep.cast = bitcast i64* %dst.gep to <8 x i64>*
+  store <8 x i64> %ext, <8 x i64>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; zext <8 x i8> -> <8 x i16> in a loop; the checks expect a single ushll per iteration.
+define void @zext_v8i8_to_v8i16_in_loop(i8* %src, i16* %dst) {
+; CHECK-LABEL: _zext_v8i8_to_v8i16_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB9_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: str q0, [x1], #32
+; CHECK-NEXT: b.ne LBB9_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i16_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB9_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #32
+; CHECK-BE-NEXT: b.ne .LBB9_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %src.gep = getelementptr i8, i8* %src, i64 %iv
+  %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+  %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+  %ext = zext <8 x i8> %load to <8 x i16>
+  %dst.gep = getelementptr i16, i16* %dst, i64 %iv
+  %dst.gep.cast = bitcast i16* %dst.gep to <8 x i16>*
+  store <8 x i16> %ext, <8 x i16>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; zext <8 x i8> -> <8 x i20> (non-power-of-2 element width) in a loop; the checks expect ushll to 32-bit lanes, then scalar bit-packing (bfi, plus extr on CHECK-BE) and scalar stores.
+define void @zext_v8i8_to_v8i20_in_loop(i8* %src, i20* %dst) {
+; CHECK-LABEL: _zext_v8i8_to_v8i20_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB10_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr d0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: mov.s w10, v1[1]
+; CHECK-NEXT: mov.s w13, v0[1]
+; CHECK-NEXT: fmov w11, s1
+; CHECK-NEXT: mov.s w12, v1[2]
+; CHECK-NEXT: fmov w15, s0
+; CHECK-NEXT: mov.s w16, v0[2]
+; CHECK-NEXT: mov.s w9, v1[3]
+; CHECK-NEXT: mov.s w14, v0[3]
+; CHECK-NEXT: bfi x11, x10, #20, #8
+; CHECK-NEXT: bfi x15, x13, #20, #8
+; CHECK-NEXT: bfi x11, x12, #40, #8
+; CHECK-NEXT: bfi x15, x16, #40, #8
+; CHECK-NEXT: lsr x10, x9, #4
+; CHECK-NEXT: lsr x12, x14, #4
+; CHECK-NEXT: bfi x11, x9, #60, #4
+; CHECK-NEXT: bfi x15, x14, #60, #4
+; CHECK-NEXT: strh w10, [x1, #18]
+; CHECK-NEXT: strh w12, [x1, #8]
+; CHECK-NEXT: stur x11, [x1, #10]
+; CHECK-NEXT: str x15, [x1], #64
+; CHECK-NEXT: b.ne LBB10_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v8i8_to_v8i20_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB10_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: mov w9, v1.s[1]
+; CHECK-BE-NEXT: mov w11, v0.s[1]
+; CHECK-BE-NEXT: mov w12, v1.s[2]
+; CHECK-BE-NEXT: fmov w13, s1
+; CHECK-BE-NEXT: mov w14, v0.s[2]
+; CHECK-BE-NEXT: fmov w15, s0
+; CHECK-BE-NEXT: mov w10, v1.s[3]
+; CHECK-BE-NEXT: lsl x9, x9, #40
+; CHECK-BE-NEXT: mov w16, v0.s[3]
+; CHECK-BE-NEXT: lsl x11, x11, #40
+; CHECK-BE-NEXT: bfi x9, x13, #60, #4
+; CHECK-BE-NEXT: bfi x11, x15, #60, #4
+; CHECK-BE-NEXT: bfi x9, x12, #20, #8
+; CHECK-BE-NEXT: bfi x11, x14, #20, #8
+; CHECK-BE-NEXT: lsr w12, w13, #4
+; CHECK-BE-NEXT: lsr w13, w15, #4
+; CHECK-BE-NEXT: strh w10, [x1, #18]
+; CHECK-BE-NEXT: extr x9, x12, x9, #16
+; CHECK-BE-NEXT: strh w16, [x1, #8]
+; CHECK-BE-NEXT: extr x10, x13, x11, #16
+; CHECK-BE-NEXT: stur x9, [x1, #10]
+; CHECK-BE-NEXT: str x10, [x1], #64
+; CHECK-BE-NEXT: b.ne .LBB10_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %src.gep = getelementptr i8, i8* %src, i64 %iv
+  %src.gep.cast = bitcast i8* %src.gep to <8 x i8>*
+  %load = load <8 x i8>, <8 x i8>* %src.gep.cast
+  %ext = zext <8 x i8> %load to <8 x i20>
+  %dst.gep = getelementptr i20, i20* %dst, i64 %iv
+  %dst.gep.cast = bitcast i20* %dst.gep to <8 x i20>*
+  store <8 x i20> %ext, <8 x i20>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; zext <4 x i8> -> <4 x i32> in a loop; the checks expect a 32-bit load (ldr s0) followed by two ushll steps (CHECK-BE adds a rev32 first).
+define void @zext_v4i8_to_v4i32_in_loop(i8* %src, i32* %dst) {
+; CHECK-LABEL: _zext_v4i8_to_v4i32_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB11_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr s0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: str q0, [x1], #64
+; CHECK-NEXT: b.ne LBB11_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB11_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldr s0, [x0, x8]
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: rev32 v0.8b, v0.8b
+; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: b.ne .LBB11_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %src.gep = getelementptr i8, i8* %src, i64 %iv
+  %src.gep.cast = bitcast i8* %src.gep to <4 x i8>*
+  %load = load <4 x i8>, <4 x i8>* %src.gep.cast
+  %ext = zext <4 x i8> %load to <4 x i32>
+  %dst.gep = getelementptr i32, i32* %dst, i64 %iv
+  %dst.gep.cast = bitcast i32* %dst.gep to <4 x i32>*
+  store <4 x i32> %ext, <4 x i32>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; zext <12 x i8> -> <12 x i32> (non-power-of-2 element count) in a loop; the checks expect a full 16-byte load with only three 4 x i32 results stored.
+define void @zext_v12i8_to_v12i32_in_loop(i8* %src, i32* %dst) {
+; CHECK-LABEL: _zext_v12i8_to_v12i32_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB12_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr q0, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: ushll2.8h v1, v0, #0
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: ushll2.4s v2, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: stp q2, q1, [x1, #16]
+; CHECK-NEXT: str q0, [x1], #64
+; CHECK-NEXT: b.ne LBB12_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v12i8_to_v12i32_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB12_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #16
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #32
+; CHECK-BE-NEXT: ushll v1.8h, v0.8b, #0
+; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-BE-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-BE-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-BE-NEXT: st1 { v2.4s }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: st1 { v1.4s }, [x10]
+; CHECK-BE-NEXT: st1 { v0.4s }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB12_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %src.gep = getelementptr i8, i8* %src, i64 %iv
+  %src.gep.cast = bitcast i8* %src.gep to <12 x i8>*
+  %load = load <12 x i8>, <12 x i8>* %src.gep.cast
+  %ext = zext <12 x i8> %load to <12 x i32>
+  %dst.gep = getelementptr i32, i32* %dst, i64 %iv
+  %dst.gep.cast = bitcast i32* %dst.gep to <12 x i32>*
+  store <12 x i32> %ext, <12 x i32>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; zext <16 x i4> -> <16 x i32> in a loop; the checks expect a 64-bit scalar load, per-nibble ubfx extraction into byte lanes, then zip/ushll/and widening.
+define void @zext_v16i4_to_v16i32_in_loop(i4* %src, i32* %dst) {
+; CHECK-LABEL: _zext_v16i4_to_v16i32_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: movi.4s v0, #15
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB13_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr x9, [x0, x8]
+; CHECK-NEXT: add x8, x8, #16
+; CHECK-NEXT: cmp x8, #128
+; CHECK-NEXT: and w10, w9, #0xf
+; CHECK-NEXT: ubfx w11, w9, #4, #4
+; CHECK-NEXT: fmov s1, w10
+; CHECK-NEXT: ubfx w10, w9, #8, #4
+; CHECK-NEXT: mov.b v1[1], w11
+; CHECK-NEXT: mov.b v1[2], w10
+; CHECK-NEXT: ubfx w10, w9, #12, #4
+; CHECK-NEXT: mov.b v1[3], w10
+; CHECK-NEXT: ubfx w10, w9, #16, #4
+; CHECK-NEXT: mov.b v1[4], w10
+; CHECK-NEXT: ubfx w10, w9, #20, #4
+; CHECK-NEXT: mov.b v1[5], w10
+; CHECK-NEXT: ubfx w10, w9, #24, #4
+; CHECK-NEXT: mov.b v1[6], w10
+; CHECK-NEXT: ubfx x10, x9, #28, #4
+; CHECK-NEXT: mov.b v1[7], w10
+; CHECK-NEXT: ubfx x10, x9, #32, #4
+; CHECK-NEXT: mov.b v1[8], w10
+; CHECK-NEXT: ubfx x10, x9, #36, #4
+; CHECK-NEXT: mov.b v1[9], w10
+; CHECK-NEXT: ubfx x10, x9, #40, #4
+; CHECK-NEXT: mov.b v1[10], w10
+; CHECK-NEXT: ubfx x10, x9, #44, #4
+; CHECK-NEXT: mov.b v1[11], w10
+; CHECK-NEXT: ubfx x10, x9, #48, #4
+; CHECK-NEXT: mov.b v1[12], w10
+; CHECK-NEXT: ubfx x10, x9, #52, #4
+; CHECK-NEXT: mov.b v1[13], w10
+; CHECK-NEXT: ubfx x10, x9, #56, #4
+; CHECK-NEXT: lsr x9, x9, #60
+; CHECK-NEXT: mov.b v1[14], w10
+; CHECK-NEXT: mov.b v1[15], w9
+; CHECK-NEXT: ext.16b v2, v1, v1, #8
+; CHECK-NEXT: zip2.8b v3, v1, v0
+; CHECK-NEXT: zip1.8b v1, v1, v0
+; CHECK-NEXT: zip1.8b v4, v2, v0
+; CHECK-NEXT: zip2.8b v2, v2, v0
+; CHECK-NEXT: ushll.4s v3, v3, #0
+; CHECK-NEXT: ushll.4s v1, v1, #0
+; CHECK-NEXT: and.16b v3, v3, v0
+; CHECK-NEXT: and.16b v1, v1, v0
+; CHECK-NEXT: stp q1, q3, [x1]
+; CHECK-NEXT: ushll.4s v1, v2, #0
+; CHECK-NEXT: ushll.4s v2, v4, #0
+; CHECK-NEXT: and.16b v1, v1, v0
+; CHECK-NEXT: and.16b v2, v2, v0
+; CHECK-NEXT: stp q2, q1, [x1, #32]
+; CHECK-NEXT: add x1, x1, #64
+; CHECK-NEXT: b.ne LBB13_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v16i4_to_v16i32_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: movi v0.4s, #15
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB13_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: ldr x9, [x0, x8]
+; CHECK-BE-NEXT: add x8, x8, #16
+; CHECK-BE-NEXT: cmp x8, #128
+; CHECK-BE-NEXT: lsr x10, x9, #60
+; CHECK-BE-NEXT: ubfx x11, x9, #56, #4
+; CHECK-BE-NEXT: fmov s1, w10
+; CHECK-BE-NEXT: ubfx x10, x9, #52, #4
+; CHECK-BE-NEXT: mov v1.b[1], w11
+; CHECK-BE-NEXT: add x11, x1, #32
+; CHECK-BE-NEXT: mov v1.b[2], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #48, #4
+; CHECK-BE-NEXT: mov v1.b[3], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #44, #4
+; CHECK-BE-NEXT: mov v1.b[4], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #40, #4
+; CHECK-BE-NEXT: mov v1.b[5], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #36, #4
+; CHECK-BE-NEXT: mov v1.b[6], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #32, #4
+; CHECK-BE-NEXT: mov v1.b[7], w10
+; CHECK-BE-NEXT: ubfx x10, x9, #28, #4
+; CHECK-BE-NEXT: mov v1.b[8], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #24, #4
+; CHECK-BE-NEXT: mov v1.b[9], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #20, #4
+; CHECK-BE-NEXT: mov v1.b[10], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #16, #4
+; CHECK-BE-NEXT: mov v1.b[11], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #12, #4
+; CHECK-BE-NEXT: mov v1.b[12], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #8, #4
+; CHECK-BE-NEXT: mov v1.b[13], w10
+; CHECK-BE-NEXT: ubfx w10, w9, #4, #4
+; CHECK-BE-NEXT: and w9, w9, #0xf
+; CHECK-BE-NEXT: mov v1.b[14], w10
+; CHECK-BE-NEXT: add x10, x1, #48
+; CHECK-BE-NEXT: mov v1.b[15], w9
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT: zip2 v3.8b, v1.8b, v0.8b
+; CHECK-BE-NEXT: zip1 v1.8b, v1.8b, v0.8b
+; CHECK-BE-NEXT: zip1 v4.8b, v2.8b, v0.8b
+; CHECK-BE-NEXT: zip2 v2.8b, v2.8b, v0.8b
+; CHECK-BE-NEXT: rev16 v1.8b, v1.8b
+; CHECK-BE-NEXT: rev16 v3.8b, v3.8b
+; CHECK-BE-NEXT: rev16 v4.8b, v4.8b
+; CHECK-BE-NEXT: rev16 v2.8b, v2.8b
+; CHECK-BE-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BE-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-BE-NEXT: and v1.16b, v1.16b, v0.16b
+; CHECK-BE-NEXT: st1 { v1.4s }, [x1]
+; CHECK-BE-NEXT: add x1, x1, #64
+; CHECK-BE-NEXT: ushll v1.4s, v2.4h, #0
+; CHECK-BE-NEXT: ushll v2.4s, v4.4h, #0
+; CHECK-BE-NEXT: and v3.16b, v3.16b, v0.16b
+; CHECK-BE-NEXT: and v1.16b, v1.16b, v0.16b
+; CHECK-BE-NEXT: st1 { v3.4s }, [x9]
+; CHECK-BE-NEXT: and v2.16b, v2.16b, v0.16b
+; CHECK-BE-NEXT: st1 { v1.4s }, [x10]
+; CHECK-BE-NEXT: st1 { v2.4s }, [x11]
+; CHECK-BE-NEXT: b.ne .LBB13_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %src.gep = getelementptr i4, i4* %src, i64 %iv
+  %src.gep.cast = bitcast i4* %src.gep to <16 x i4>*
+  %load = load <16 x i4>, <16 x i4>* %src.gep.cast
+  %ext = zext <16 x i4> %load to <16 x i32>
+  %dst.gep = getelementptr i32, i32* %dst, i64 %iv
+  %dst.gep.cast = bitcast i32* %dst.gep to <16 x i32>*
+  store <16 x i32> %ext, <16 x i32>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; zext <16 x i16> -> <16 x i64> in a loop; the checks expect two 128-bit loads widened with ushll/ushll2 to .4s and then to .2d.
+define void @zext_v16i16_to_v16i64_in_loop(i16* %src, i64* %dst) {
+; CHECK-LABEL: _zext_v16i16_to_v16i64_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB14_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8
+; CHECK-NEXT: add x8, x8, #32
+; CHECK-NEXT: cmp x8, #256
+; CHECK-NEXT: ldp q0, q1, [x9]
+; CHECK-NEXT: ushll.4s v2, v0, #0
+; CHECK-NEXT: ushll2.4s v0, v0, #0
+; CHECK-NEXT: ushll.4s v3, v1, #0
+; CHECK-NEXT: ushll2.4s v1, v1, #0
+; CHECK-NEXT: ushll2.2d v5, v0, #0
+; CHECK-NEXT: ushll2.2d v4, v1, #0
+; CHECK-NEXT: ushll.2d v1, v1, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: stp q1, q4, [x1, #96]
+; CHECK-NEXT: ushll2.2d v1, v3, #0
+; CHECK-NEXT: stp q0, q5, [x1, #32]
+; CHECK-NEXT: ushll.2d v3, v3, #0
+; CHECK-NEXT: ushll2.2d v0, v2, #0
+; CHECK-NEXT: stp q3, q1, [x1, #64]
+; CHECK-NEXT: ushll.2d v1, v2, #0
+; CHECK-NEXT: stp q1, q0, [x1], #128
+; CHECK-NEXT: b.ne LBB14_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v16i16_to_v16i64_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB14_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x10, x1, #48
+; CHECK-BE-NEXT: add x8, x8, #32
+; CHECK-BE-NEXT: cmp x8, #256
+; CHECK-BE-NEXT: ld1 { v0.8h }, [x9]
+; CHECK-BE-NEXT: add x9, x9, #16
+; CHECK-BE-NEXT: ld1 { v2.8h }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #32
+; CHECK-BE-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BE-NEXT: ushll2 v3.2d, v1.4s, #0
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #112
+; CHECK-BE-NEXT: st1 { v1.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #16
+; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0
+; CHECK-BE-NEXT: ushll2 v1.4s, v2.8h, #0
+; CHECK-BE-NEXT: st1 { v3.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #96
+; CHECK-BE-NEXT: ushll2 v4.2d, v1.4s, #0
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: st1 { v4.2d }, [x10]
+; CHECK-BE-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-BE-NEXT: add x10, x1, #80
+; CHECK-BE-NEXT: st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT: st1 { v1.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #64
+; CHECK-BE-NEXT: add x1, x1, #128
+; CHECK-BE-NEXT: ushll v3.2d, v2.2s, #0
+; CHECK-BE-NEXT: ushll2 v2.2d, v2.4s, #0
+; CHECK-BE-NEXT: st1 { v3.2d }, [x9]
+; CHECK-BE-NEXT: st1 { v2.2d }, [x10]
+; CHECK-BE-NEXT: b.ne .LBB14_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %src.gep = getelementptr i16, i16* %src, i64 %iv
+  %src.gep.cast = bitcast i16* %src.gep to <16 x i16>*
+  %load = load <16 x i16>, <16 x i16>* %src.gep.cast
+  %ext = zext <16 x i16> %load to <16 x i64>
+  %dst.gep = getelementptr i64, i64* %dst, i64 %iv
+  %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
+  store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; zext <16 x i32> -> <16 x i64> in a loop; the checks expect four 128-bit loads, each widened with one ushll/ushll2 pair.
+define void @zext_v16i32_to_v16i64_in_loop(i32* %src, i64* %dst) {
+; CHECK-LABEL: _zext_v16i32_to_v16i64_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB15_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8
+; CHECK-NEXT: add x8, x8, #64
+; CHECK-NEXT: cmp x8, #512
+; CHECK-NEXT: ldp q1, q0, [x9, #32]
+; CHECK-NEXT: ushll2.2d v5, v1, #0
+; CHECK-NEXT: ushll.2d v1, v1, #0
+; CHECK-NEXT: ldp q3, q2, [x9]
+; CHECK-NEXT: ushll2.2d v4, v0, #0
+; CHECK-NEXT: stp q1, q5, [x1, #64]
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: stp q0, q4, [x1, #96]
+; CHECK-NEXT: ushll2.2d v1, v3, #0
+; CHECK-NEXT: ushll2.2d v0, v2, #0
+; CHECK-NEXT: ushll.2d v2, v2, #0
+; CHECK-NEXT: stp q2, q0, [x1, #32]
+; CHECK-NEXT: ushll.2d v0, v3, #0
+; CHECK-NEXT: stp q0, q1, [x1], #128
+; CHECK-NEXT: b.ne LBB15_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: zext_v16i32_to_v16i64_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB15_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8
+; CHECK-BE-NEXT: add x8, x8, #64
+; CHECK-BE-NEXT: add x10, x9, #48
+; CHECK-BE-NEXT: add x11, x9, #32
+; CHECK-BE-NEXT: cmp x8, #512
+; CHECK-BE-NEXT: ld1 { v0.4s }, [x9]
+; CHECK-BE-NEXT: add x9, x9, #16
+; CHECK-BE-NEXT: ld1 { v1.4s }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #16
+; CHECK-BE-NEXT: ld1 { v2.4s }, [x11]
+; CHECK-BE-NEXT: ushll2 v3.2d, v0.4s, #0
+; CHECK-BE-NEXT: ld1 { v4.4s }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #112
+; CHECK-BE-NEXT: st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #80
+; CHECK-BE-NEXT: ushll2 v3.2d, v1.4s, #0
+; CHECK-BE-NEXT: ushll2 v5.2d, v2.4s, #0
+; CHECK-BE-NEXT: st1 { v3.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #48
+; CHECK-BE-NEXT: st1 { v5.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #96
+; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-BE-NEXT: ushll v3.2d, v4.2s, #0
+; CHECK-BE-NEXT: ushll2 v4.2d, v4.4s, #0
+; CHECK-BE-NEXT: st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-BE-NEXT: st1 { v4.2d }, [x9]
+; CHECK-BE-NEXT: add x9, x1, #64
+; CHECK-BE-NEXT: st1 { v1.2d }, [x10]
+; CHECK-BE-NEXT: add x10, x1, #32
+; CHECK-BE-NEXT: add x1, x1, #128
+; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-BE-NEXT: st1 { v3.2d }, [x10]
+; CHECK-BE-NEXT: st1 { v2.2d }, [x9]
+; CHECK-BE-NEXT: b.ne .LBB15_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %src.gep = getelementptr i32, i32* %src, i64 %iv
+  %src.gep.cast = bitcast i32* %src.gep to <16 x i32>*
+  %load = load <16 x i32>, <16 x i32>* %src.gep.cast
+  %ext = zext <16 x i32> %load to <16 x i64>
+  %dst.gep = getelementptr i64, i64* %dst, i64 %iv
+  %dst.gep.cast = bitcast i64* %dst.gep to <16 x i64>*
+  store <16 x i64> %ext, <16 x i64>* %dst.gep.cast
+  %iv.next = add nuw i64 %iv, 16
+  %ec = icmp eq i64 %iv.next, 128
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}